In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. data_utils, classifier_utils) are re-imported before each cell runs
# and edits to them take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
import data_utils
import classifier_utils

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.datasets import fetch_20newsgroups


import numpy as np
import scipy

In [29]:
# Download (or load from the local cache) the 20 Newsgroups corpus, shuffled.
# NOTE: despite the variable name, subset='all' requests every document,
# not only the training split.
# To experiment on a smaller problem, restrict the groups with `categories=`, e.g.
#   categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(
    subset='all',
    shuffle=True,
)

In [30]:
# Raw documents that will be fed to the featurizer.
texts = twenty_train.data

# Reshape the targets into a single-column 2-D array (one row per document),
# so that each sample carries its own collection of labels -- the layout the
# label binarizer is fitted on further down.
labels = twenty_train.target.reshape((-1, 1))

In [31]:
# Disabled alternative data source: uncommenting the line below would replace
# `texts` and `labels` with whatever data_utils.get_texts_and_labels returns
# for the 'facebook' argument, instead of the 20 Newsgroups data loaded above.
#texts, labels = data_utils.get_texts_and_labels('facebook')

In [39]:
# Fit a TF-IDF featurizer on the documents: unigrams only, with scikit-learn's
# built-in English stop-word list removed.
# NOTE(review): the featurizer is fitted on all of `texts`, i.e. also on documents
# that presumably end up in validation folds later -- confirm this is intended.
featurizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
).fit(texts)

In [41]:
# Turn every document into its TF-IDF feature vector using the featurizer
# fitted in the previous cell; `tfidf` is the feature matrix used by all
# experiments below.
tfidf = featurizer.transform(texts)

### Scikit-Learn: one-vs-rest baseline

In [43]:
# Encode the per-document label collections as a binary indicator matrix
# (one column per class). fit_transform learns the class set and encodes the
# labels in a single call instead of a separate fit() followed by transform()
# on the same data; `mlb` remains fitted afterwards, so later uses such as
# mlb.classes_ or mlb.inverse_transform(...) keep working.
mlb = MultiLabelBinarizer()
ml_binarized_labels = mlb.fit_transform(labels)

In [44]:
# Baseline run for the 'sklearn' variant: k-fold cross-validation that prints
# precision, recall and f1. The third argument is the hyperparameter slot;
# None presumably means "use the classifier's defaults" (verify in classifier_utils).
classifier_utils.kfold_cross_validate(
    tfidf,
    ml_binarized_labels,
    None,
    'sklearn',
)

precision: 0.9670825613267269
recall: 0.7534735057752847
f1: 0.8469865764328526


In [45]:
# Hyperparameter search for the 'sklearn' variant. In the recorded run this
# evaluated 20 candidates with 3 folds each and returned a single dict keyed
# by 'estimator__alpha'.
best_hyperparameters = classifier_utils.find_best_hyperparmeters(
    tfidf,
    ml_binarized_labels,
    'sklearn',
)
print("best hyperparmeter are: ", best_hyperparameters)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   49.6s


best hyperparmeter are:  {'estimator__alpha': 2.8061115523949434e-05}


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.5min finished


In [46]:
# Repeat the 'sklearn' cross-validation, this time with the hyperparameters
# found by the search above, to compare against the baseline scores.
classifier_utils.kfold_cross_validate(
    tfidf,
    ml_binarized_labels,
    best_hyperparameters,
    'sklearn',
)

precision: 0.96546225990592
recall: 0.8206495294733818
f1: 0.887173790898164


In [47]:
# Build the 'sklearn' classifier without hyperparameter overrides and fit it on
# the full feature matrix. The fit call is deliberately the last expression so
# the notebook displays the fitted estimator's configuration.
classifier = classifier_utils.create_classifier(
    None,
    'sklearn',
)
classifier.fit(
    tfidf,
    ml_binarized_labels,
)

OneVsRestClassifier(estimator=SGDClassifier(alpha=0.0001, average=False,
                                            class_weight=None,
                                            early_stopping=False, epsilon=0.1,
                                            eta0=0.0, fit_intercept=True,
                                            l1_ratio=0.15,
                                            learning_rate='optimal',
                                            loss='modified_huber',
                                            max_iter=1000, n_iter_no_change=5,
                                            n_jobs=None, penalty='elasticnet',
                                            power_t=0.5, random_state=None,
                                            shuffle=True, tol=0.001,
                                            validation_fraction=0.1, verbose=0,
                                            warm_start=False),
                    n_jobs=-1)

In [48]:
# Number of fitted per-label estimators inside the classifier; this value is
# passed as `n` to the 'own' experiments below.
n = len(classifier.estimators_)
print(n)

20


### Own method: one classifier per label, each tuned separately

In [49]:
# Baseline run for the 'own' variant: same features and labels as the
# 'sklearn' baseline, no hyperparameters supplied, with the number of
# per-label estimators passed explicitly as `n`.
classifier_utils.kfold_cross_validate(
    tfidf,
    ml_binarized_labels,
    None,
    'own',
    n=n,
)

precision: 0.9670137452054265
recall: 0.7536326288105335
f1: 0.8470652197085474


In [50]:
# Hyperparameter search for the 'own' variant. In the recorded run this logged
# 20 separate searches (20 candidates x 3 folds each) and returned a list of
# 20 dicts, each holding its own 'alpha'.
# NOTE: this rebinds `best_hyperparameters`, replacing the 'sklearn' result.
best_hyperparameters = classifier_utils.find_best_hyperparmeters(
    tfidf,
    ml_binarized_labels,
    'own',
    n=n,
)
print("best hyperparmeter are: ", best_hyperparameters)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    4.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    5.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    5.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    5.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    4.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    4.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    5.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    4.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    4.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    4.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    3.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    3.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    5.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    4.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    4.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    4.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    4.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    3.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    4.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.4s


best hyperparmeter are:  [{'alpha': 2.7433804778017457e-05}, {'alpha': 2.227025475654848e-05}, {'alpha': 2.5404339551899475e-05}, {'alpha': 3.3269019397329184e-05}, {'alpha': 1.7564164485505853e-05}, {'alpha': 1.532317972818177e-05}, {'alpha': 2.4276893994736857e-05}, {'alpha': 7.9456605208417e-06}, {'alpha': 1.6007401756359346e-05}, {'alpha': 8.355965763831181e-08}, {'alpha': 5.470025474963509e-08}, {'alpha': 1.9171424688215757e-05}, {'alpha': 2.439886743623909e-05}, {'alpha': 1.2479084743890675e-05}, {'alpha': 2.169320975977352e-05}, {'alpha': 5.140836859360045e-08}, {'alpha': 5.783309175632145e-07}, {'alpha': 1.6398034508516227e-05}, {'alpha': 2.7198121895353328e-05}, {'alpha': 3.116446939500355e-05}]


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    4.8s finished


In [51]:
# Repeat the 'own' cross-validation with the hyperparameters found by the
# search above, for comparison with the 'own' baseline and the tuned
# 'sklearn' run.
classifier_utils.kfold_cross_validate(
    tfidf,
    ml_binarized_labels,
    best_hyperparameters,
    'own',
    n=n,
)

precision: 0.962737545423272
recall: 0.8230375294959087
f1: 0.8874103963285915
