# Load Competition Data

In [37]:
# import pandas as pd

# train = pd.read_csv('./Kaggle Data/train.csv')
# test = pd.read_csv('./Kaggle Data/test.csv')

In [39]:
# train.head()

In [28]:
# imports
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
from sklearn.decomposition import TruncatedSVD
import spacy

In [29]:
categories = ['talk.politics.misc', 'sci.space']
data = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))

In [30]:
display(data.keys())
display(data['target'].shape)
display(data['data'][1])

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

(1058,)

' ajteel@dendrite.cs.Colorado.EDU (A.J. Teel) writes...\n\nOn whose authority do you have this and on what grounds was it \ndismissed?\n\n\t\t\t\t\t\tDaniel Reitman\n\nHOW NOT TO WRITE A DEED\n\nOne case involved the construction of a conveyance to grantees "jointly, as \ntenants in common, with equal rights and interest in said land, and to the \nsurvivor thereof, in fee simple. . . . To Have and to Hold the same unto the \nsaid parties hereto, equally, jointly, as tenants in common, with equal rights \nand interest for the period or term of their lives, and to the survivor thereof \nat the death of the other."'

In [31]:
vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
rfc = RandomForestClassifier()

In [32]:
pipe = Pipeline(
    [
        ('vect',vect),
        ('clf', rfc)
    ]
)

In [33]:
parameters = {
    'vect__max_df': (0.75, 1.0),
    'vect__min_df': (0.02, 0.05),
    'vect__max_features': (500, 1000),
    'clf__n_estimators': (5, 10,),
    'clf__max_depth': (15, 20)
}

grid_search = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(data.data, data.target)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('clf', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'clf__max_depth': (15, 20),
                         'clf__n_estimators': (5, 10),
                         'vect__max_df': (0.75, 1.0),
                         'vect__max_features': (500, 1000),
                         'vect__min_df': (0.02, 0.05)},
             verbose=1)

In [35]:
display(grid_search.best_score_)
display(grid_search.best_params_)

0.827971027452383

{'clf__max_depth': 15,
 'clf__n_estimators': 10,
 'vect__max_df': 0.75,
 'vect__max_features': 1000,
 'vect__min_df': 0.02}

In [46]:
display(grid_search.predict([
    'This is a free society',
    'Elections will be in November',
    "Covid doesn't care what your political beliefs are.",
    'The team succesfully launched their satellite into low-earth orbit',
    'The information paradox was solved last week, say some scientists.',
    'We will be living on Mars in 2050.'
]))
display(grid_search.predict_proba(['Send me lots of money now', 'you won the lottery in Nigeria']))

array([0, 0, 0, 0, 1, 0])

array([[0.5967724, 0.4032276],
       [0.5967724, 0.4032276]])

In [49]:
parameters = {
    'vect__max_df': scipy.stats.uniform(0.8, 0.99),
    'vect__min_df': scipy.stats.uniform(0.01, 0.05),
    'vect__max_features': scipy.stats.randint(500, 1000),
    'clf__n_estimators': scipy.stats.randint(30, 100),
    'clf__max_depth': scipy.stats.randint(20, 100)
}

# rand_search = RandomizedSearchCV(pipe, parameters, cv=3, n_jobs=-1, verbose=-1, n_iter=25)
# rand_search.fit(data['data'], data['target'])

In [46]:
display(rand_search.best_score_)
display(rand_search.best_params_)

0.8619865439093485

{'clf__max_depth': 96,
 'clf__n_estimators': 39,
 'vect__max_df': 1.0354135316206217,
 'vect__max_features': 948,
 'vect__min_df': 0.014927942088577232}

In [10]:
display(data['target_names'])
rand_search.predict([
    'This is a free society',
    'Elections will be in November',
    "Covid doesn't care what your political beliefs are.",
    'The team succesfully launched their satellite into low-earth orbit',
    'The information paradox was solved last week, say some scientists.',
    'We will be living on Mars in 2050.'
])

['sci.space', 'talk.politics.misc']

array([1, 0, 1, 0, 0, 0])

In [11]:
svd = TruncatedSVD(n_components=100,
                   algorithm='randomized',
                   n_iter=10)

In [56]:
params = {
    'lsi__svd__n_components': scipy.stats.randint(10, 100),
    'lsi__vect__max_df': scipy.stats.uniform(0.8, 0.99),
    'clf__n_estimators': scipy.stats.randint(10, 100)
}

In [57]:
lsi = Pipeline([('vect', vect), ('svd', svd)])
pipe = Pipeline([('lsi', lsi), ('clf', rfc)])

In [58]:
print(pipe)

Pipeline(steps=[('lsi',
                 Pipeline(steps=[('vect',
                                  TfidfVectorizer(ngram_range=(1, 2),
                                                  stop_words='english')),
                                 ('svd',
                                  TruncatedSVD(n_components=100, n_iter=10))])),
                ('clf', RandomForestClassifier())])


In [62]:
rand_search = RandomizedSearchCV(pipe, params, n_iter=15, cv=3, n_jobs=-1, verbose=1)
rand_search.fit(data.data, data.target)

Fitting 3 folds for each of 15 candidates, totalling 45 fits


RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('lsi',
                                              Pipeline(steps=[('vect',
                                                               TfidfVectorizer(ngram_range=(1,
                                                                                            2),
                                                                               stop_words='english')),
                                                              ('svd',
                                                               TruncatedSVD(n_components=100,
                                                                            n_iter=10))])),
                                             ('clf',
                                              RandomForestClassifier())]),
                   n_iter=15, n_jobs=-1,
                   param_distributions={'clf__n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f7f287cc430>,

In [63]:
display(rand_search.best_score_)
display(rand_search.best_params_)

0.8912809039402525

{'clf__n_estimators': 54,
 'lsi__svd__n_components': 38,
 'lsi__vect__max_df': 1.753657175117353}

In [67]:
best_pipeline = rand_search.best_estimator_

In [70]:
rand_search.best_params_

{'clf__n_estimators': 54,
 'lsi__svd__n_components': 38,
 'lsi__vect__max_df': 1.753657175117353}

In [69]:
best_pipeline.named_steps['lsi'].named_steps['vect'].transform([data['data'][1]]).todense().shape

(1, 123386)

In [22]:
nlp = spacy.load('en_core_web_lg')

In [23]:
doc = nlp('Two bananas in pyjamas')


In [24]:
bananas_vector = doc.vector

In [25]:
print(len(bananas_vector))

300


In [43]:
def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]

In [44]:
%%time
X = get_word_vectors(data.data)
len(X) == len(data.data)

CPU times: user 41.4 s, sys: 2.53 s, total: 44 s
Wall time: 44.1 s


True

In [51]:
X[0].shape

(300,)

In [53]:
rfc.fit(X, data['target'])

RandomForestClassifier()

In [54]:
rfc.score(X, data['target'])

0.9877126654064272

In [55]:
from multiprocessing import Pool, cpu_count

In [59]:
# cpu_count()

In [57]:
pool = Pool(3)

In [62]:
def to_vec(doc):
    return nlp(doc).vector

In [64]:
%%time
to_vec(data['data'][0])

CPU times: user 83.3 ms, sys: 14.7 ms, total: 98.1 ms
Wall time: 97.4 ms


array([-6.36314899e-02,  1.15123436e-01, -2.54435129e-02, -5.07751815e-02,
        7.32879788e-02, -1.41258379e-02, -2.37843227e-02, -8.71081837e-04,
        5.16206212e-02,  2.75603235e-01, -1.74923956e-01,  1.93485975e-01,
        3.78735922e-02, -5.68364896e-02,  5.12874825e-03,  5.21986149e-02,
       -1.15655169e-01,  8.18282187e-01, -1.87957734e-01,  2.14149207e-02,
       -6.64218739e-02, -1.22655006e-02, -1.01321600e-01, -3.54594067e-02,
        5.48648052e-02, -1.37901276e-01,  2.59652943e-03, -3.70653681e-02,
        4.70878966e-02,  9.94207859e-02, -6.74006790e-02,  1.01654744e-02,
       -5.52445203e-02, -1.79846697e-02,  3.34651656e-02, -6.76414222e-02,
       -4.32113558e-03,  8.59654397e-02, -3.14159319e-02,  1.38683677e-01,
        1.63842868e-02, -1.80005450e-02,  2.07979470e-01,  1.77050326e-02,
       -4.96200211e-02, -1.17674001e-01,  1.45270769e-02,  1.71200648e-01,
       -1.05166354e-03, -2.36345157e-02,  6.37273025e-03,  3.13890316e-02,
        1.19358571e-02, -