# LAB 6: Text classification with linear models

Objectives:

* Train and evaluate linear text classifiers using SGDClassifier
* Experiment with different feature extraction and training methods
* Log and evaluate experimental results using [mlflow](https://mlflow.org)

In [21]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

# Register tqdm's progress-bar variants of pandas methods (e.g. `progress_apply`).
tqdm.pandas()

### Load and preprocess data

In this step, we load the data, which is already divided into a training set and a testing set.  We will tune the parameters on the training set only.

In [22]:
# Read the train and test splits from S3, requesting anonymous access.
s3_options = {"anon": True}
train = pd.read_parquet("s3://ling583/rcv1-topics-train.parquet", storage_options=s3_options)
test = pd.read_parquet("s3://ling583/rcv1-topics-test.parquet", storage_options=s3_options)

In [23]:
# Preview the first rows to check the columns (`text`, `topics`).
train.head()

Unnamed: 0,text,topics
0,NZ bonds close well bid ahead of key U.S. data...,MCAT
1,Asia Product Swaps - Jet/gas oil regrade at di...,MCAT
2,U.S. public schools get a C report card in qua...,GCAT
3,Thunder Bay vessel clearances - May 12. Daily ...,MCAT
4,"Amoco gains shares in Ula,Gyda N.Sea fields. A...",CCAT


CCAT : CORPORATE/INDUSTRIAL  
ECAT : ECONOMICS  
GCAT : GOVERNMENT/SOCIAL  
MCAT : MARKETS

In [24]:
# Number of training documents per topic label (shows how balanced the classes are).
train["topics"].value_counts()

CCAT    5896
MCAT    3281
GCAT    3225
ECAT    1073
Name: topics, dtype: int64

#### Let's tokenize!

In [25]:
import spacy

# `tokenize` below only calls `nlp.tokenizer`, so the remaining pipeline
# components are excluded when loading the model.
unused_components = ["tagger", "parser", "ner", "lemmatizer", "attribute_ruler"]
nlp = spacy.load("en_core_web_sm", exclude=unused_components)


def tokenize(text):
    """Tokenize ``text`` and return the normalized form of its alphabetic tokens.

    Args:
        text: The raw document string to split.

    Returns:
        A list with the ``norm_`` attribute of every token whose ``is_alpha``
        flag is true, in document order. Tokens containing digits or
        punctuation are dropped.
    """
    alphabetic = filter(lambda tok: tok.is_alpha, nlp.tokenizer(text))
    return [tok.norm_ for tok in alphabetic]

In [26]:
import multiprocessing as mp

In [27]:
# Tokenize both splits in parallel worker processes.
with mp.Pool() as p:
    for frame in (train, test):
        # Wrap the *result* iterator so the bar advances as documents finish
        # tokenizing; `total` is required because imap's iterator has no length.
        results = tqdm(
            p.imap(tokenize, frame["text"], chunksize=100), total=len(frame)
        )
        # imap yields results in input order. Building the Series on the
        # frame's own index keeps rows aligned even when that index is not
        # the default 0..n-1 range.
        frame["tokens"] = pd.Series(list(results), index=frame.index)

  0%|          | 0/13475 [00:00<?, ?it/s]

  0%|          | 0/3369 [00:00<?, ?it/s]

In [28]:
# Confirm the new `tokens` column was added next to `text` and `topics`.
train.head()

Unnamed: 0,text,topics,tokens
0,NZ bonds close well bid ahead of key U.S. data...,MCAT,"[nz, bonds, close, well, bid, ahead, of, key, ..."
1,Asia Product Swaps - Jet/gas oil regrade at di...,MCAT,"[asia, product, swaps, jet, gas, oil, regrade,..."
2,U.S. public schools get a C report card in qua...,GCAT,"[public, schools, get, a, c, report, card, in,..."
3,Thunder Bay vessel clearances - May 12. Daily ...,MCAT,"[thunder, bay, vessel, clearances, may, daily,..."
4,"Amoco gains shares in Ula,Gyda N.Sea fields. A...",CCAT,"[amoco, gains, shares, in, ula, gyda, fields, ..."


We have our tokens, and we will be using these for the classifier.

---

### Multinomial Naive Bayes

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

In [30]:
# Baseline: token counts fed into multinomial Naive Bayes, all defaults.
# `analyzer=identity` passes the pre-tokenized lists through unchanged, so
# CountVectorizer does no tokenization of its own.
mnb = make_pipeline(
    CountVectorizer(analyzer=identity),
    MultinomialNB(),
)
# Pipeline.fit returns the fitted pipeline, so prediction can be chained.
predicted = mnb.fit(train["tokens"], train["topics"]).predict(test["tokens"])
print(classification_report(test["topics"], predicted))

              precision    recall  f1-score   support

        CCAT       0.96      0.94      0.95      1475
        ECAT       0.92      0.65      0.76       268
        GCAT       0.93      0.98      0.95       806
        MCAT       0.90      0.97      0.94       820

    accuracy                           0.93      3369
   macro avg       0.93      0.88      0.90      3369
weighted avg       0.94      0.93      0.93      3369



In [31]:
import logger
import mlflow
from logger import log_search, log_test

In [36]:
# Make "lab-6" the active mlflow experiment (created if it does not exist).
mlflow.set_experiment("lab-6")
# `log_test` comes from the local `logger` module; presumably it records the
# test-set results for this model in the active experiment — confirm in logger.py.
log_test(mnb, test["topics"], predicted)

INFO: 'lab-6' does not exist. Creating a new experiment


---

### Hyperparameters

In [37]:
from dask.distributed import Client

import os

# The scheduler port is assigned when the Dask cluster starts, so a hard-coded
# address goes stale after a restart. Allow an override through the
# DASK_SCHEDULER_ADDRESS environment variable; fall back to the address used
# when this notebook was last run.
scheduler_address = os.environ.get("DASK_SCHEDULER_ADDRESS", "tcp://127.0.0.1:43565")
client = Client(scheduler_address)
client

0,1
Client  Scheduler: tcp://127.0.0.1:43565  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.62 GB


In [38]:
from dask_ml.model_selection import RandomizedSearchCV
from scipy.stats.distributions import loguniform, randint, uniform

In [39]:
from warnings import simplefilter

# Silence FutureWarning messages for the rest of the session.
simplefilter(action="ignore", category=FutureWarning)

In [40]:
# Switch to a separate experiment for the MultinomialNB hyperparameter searches.
mlflow.set_experiment("lab-6/MultinomialNB")

INFO: 'lab-6/MultinomialNB' does not exist. Creating a new experiment


In [41]:
%%time

# Broad first pass: sample 25 random settings and score each by macro-F1.
search = RandomizedSearchCV(
    mnb,
    {
        # scipy's randint excludes the upper bound: integers 1..9.
        "countvectorizer__min_df": randint(1, 10),
        # scipy's uniform takes (loc, scale): values in [0.5, 1.0].
        "countvectorizer__max_df": uniform(0.5, 0.5),
        # Log-uniform over ten orders of magnitude: 1e-8 .. 100.
        "multinomialnb__alpha": loguniform(1e-8, 100.0),
    },
    n_iter=25,
    scoring="f1_macro",
    # Fix the seed so the same candidate settings are drawn on every run.
    random_state=0,
)
search.fit(train["tokens"], train["topics"])
# `log_search` comes from the local `logger` module; presumably it records the
# search results in the active mlflow experiment — confirm in logger.py.
log_search(search)

CPU times: user 6.8 s, sys: 402 ms, total: 7.2 s
Wall time: 52 s


In [42]:
%%time

# Second pass: same vectorizer ranges, with alpha narrowed to 1e-3 .. 1.0.
search = RandomizedSearchCV(
    mnb,
    {
        # scipy's randint excludes the upper bound: integers 1..9.
        "countvectorizer__min_df": randint(1, 10),
        # scipy's uniform takes (loc, scale): values in [0.5, 1.0].
        "countvectorizer__max_df": uniform(0.5, 0.5),
        # Log-uniform over three orders of magnitude: 1e-3 .. 1.0.
        "multinomialnb__alpha": loguniform(1e-3, 1.0),
    },
    n_iter=25,
    scoring="f1_macro",
    # Fix the seed so the same candidate settings are drawn on every run.
    random_state=0,
)
search.fit(train["tokens"], train["topics"])
# `log_search` comes from the local `logger` module; presumably it records the
# search results in the active mlflow experiment — confirm in logger.py.
log_search(search)

CPU times: user 6.83 s, sys: 326 ms, total: 7.16 s
Wall time: 51.7 s


In [43]:
%%time

# Third pass: hold alpha at 0.1 and vary only the vectorizer's document-frequency
# thresholds.
search = RandomizedSearchCV(
    mnb,
    {
        # scipy's randint excludes the upper bound: integers 1..9.
        "countvectorizer__min_df": randint(1, 10),
        # scipy's uniform takes (loc, scale): values in [0.5, 1.0].
        "countvectorizer__max_df": uniform(0.5, 0.5),
        # A one-element list pins the parameter to that single value.
        "multinomialnb__alpha": [0.1],
    },
    n_iter=25,
    scoring="f1_macro",
    # Fix the seed so the same candidate settings are drawn on every run.
    random_state=0,
)
search.fit(train["tokens"], train["topics"])
# `log_search` comes from the local `logger` module; presumably it records the
# search results in the active mlflow experiment — confirm in logger.py.
log_search(search)

CPU times: user 6.69 s, sys: 379 ms, total: 7.07 s
Wall time: 52.1 s


----

### Optimized model

In [44]:
# Rebuild the pipeline with fixed hyperparameters (presumably picked from the
# logged search results) and evaluate it on the held-out test set.
mnb = make_pipeline(
    CountVectorizer(analyzer=identity, min_df=2, max_df=0.7),
    MultinomialNB(alpha=0.1),
)
# Pipeline.fit returns the fitted pipeline, so prediction can be chained.
predicted = mnb.fit(train["tokens"], train["topics"]).predict(test["tokens"])
print(classification_report(test["topics"], predicted))

              precision    recall  f1-score   support

        CCAT       0.97      0.95      0.96      1475
        ECAT       0.92      0.76      0.83       268
        GCAT       0.92      0.98      0.95       806
        MCAT       0.93      0.97      0.95       820

    accuracy                           0.94      3369
   macro avg       0.93      0.91      0.92      3369
weighted avg       0.94      0.94      0.94      3369



In [45]:
# Switch back to the top-level "lab-6" experiment before logging test results.
mlflow.set_experiment("lab-6")
# `log_test` comes from the local `logger` module; presumably it records the
# test-set results for the tuned model — confirm in logger.py.
log_test(mnb, test["topics"], predicted)