In [6]:
%load_ext autoreload
%autoreload 2

In [32]:
import scipy
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score

import data_utils

In [14]:
texts, labels = data_utils.get_texts_and_labels('facebook')

In [16]:
print(texts[312])
print(labels[312])

Brothers and sisters: We're going to win this election not because we have a super PAC funded by billionaires.
('corporate power', 'campaign finance')


In [17]:
def create_binarizer(labels):
    binarizer = MultiLabelBinarizer()
    return binarizer.fit(labels)

def create_featurizer(corpus):
    featurizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
    return featurizer.fit(corpus)

def create_classifier(hyperparameters):
    if hyperparameters is not None and 'estimator__alpha' in hyperparameters:
        base_classifier = SGDClassifier(loss='modified_huber', penalty='elasticnet', tol=1e-3, alpha=hyperparameters['estimator__alpha'])
    else:
        base_classifier = SGDClassifier(loss='modified_huber', penalty='elasticnet', tol=1e-3)
    return OneVsRestClassifier(base_classifier, n_jobs=-1)

In [18]:
mlb = create_binarizer(labels)
featurizer = create_featurizer(texts)

tfidf = featurizer.transform(texts)
binarized_labels = mlb.transform(labels)

In [21]:
print(tfidf[312])
print(binarized_labels[312])

  (0, 29192)	0.2867602968749451
  (0, 29191)	0.20563101243820897
  (0, 25712)	0.2597172020626997
  (0, 25709)	0.24389800569408934
  (0, 24446)	0.2867602968749451
  (0, 24442)	0.2239681752576899
  (0, 19066)	0.2867602968749451
  (0, 19065)	0.24389800569408934
  (0, 11552)	0.24389800569408934
  (0, 11429)	0.12156689082737338
  (0, 11000)	0.2867602968749451
  (0, 10998)	0.2239681752576899
  (0, 8880)	0.2867602968749451
  (0, 8869)	0.19892628518558983
  (0, 3475)	0.23267410725045434
  (0, 3471)	0.22024965442790256
  (0, 3064)	0.2081489788890795
[0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [22]:
tfidf_train, tfidf_test, binarized_labels_train, binarized_labels_test = train_test_split(tfidf, binarized_labels, test_size=0.33)

In [23]:
def test_classifier(hyperparameters):
    classifier = create_classifier(hyperparameters)
    classifier.fit(tfidf_train, binarized_labels_train)
    y_pred = classifier.predict(tfidf_test)
    f1 = f1_score(binarized_labels_test, y_pred, average='micro')
    precision = precision_score(binarized_labels_test, y_pred, average='micro')
    recall = recall_score(binarized_labels_test, y_pred, average='micro')
    print(f"precision: {precision}")
    print(f"recall: {recall}")
    print(f"f1: {f1}")

def find_best_hyperparmeters(tfidf, labels):
    classifier = create_classifier(None)
    param_distribution = {'estimator__alpha': scipy.stats.expon(scale=0.00001)}
    extra_kwargs = {'n_jobs': -1}
    scv = RandomizedSearchCV(classifier, param_distribution, n_iter=10, cv=3, scoring='f1_micro', iid=True, verbose=1, refit=False, **extra_kwargs)
    scv.fit(tfidf, labels)
    return scv.best_params_

In [24]:
test_classifier(None)
best_hyperparameters = find_best_hyperparmeters(tfidf, binarized_labels)
test_classifier(best_hyperparameters)
print("best hyperparmeter are: ", best_hyperparameters)

precision: 0.8819875776397516
recall: 0.2634508348794063
f1: 0.40571428571428564
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    1.5s finished


precision: 0.7574850299401198
recall: 0.46938775510204084
f1: 0.579610538373425
best hyperparmeter are:  {'estimator__alpha': 1.3834633267768954e-06}


In [28]:
classifier = create_classifier(best_hyperparameters)
classifier.fit(tfidf_train, binarized_labels_train)

OneVsRestClassifier(estimator=SGDClassifier(alpha=1.3834633267768954e-06,
                                            average=False, class_weight=None,
                                            early_stopping=False, epsilon=0.1,
                                            eta0=0.0, fit_intercept=True,
                                            l1_ratio=0.15,
                                            learning_rate='optimal',
                                            loss='modified_huber',
                                            max_iter=1000, n_iter_no_change=5,
                                            n_jobs=None, penalty='elasticnet',
                                            power_t=0.5, random_state=None,
                                            shuffle=True, tol=0.001,
                                            validation_fraction=0.1, verbose=0,
                                            warm_start=False),
                    n_jobs=-1)

In [40]:
test_text = "Brothers and sisters: We're going to win this election not because we have a super PAC funded by billionaires."
test_text_tfidf = featurizer.transform([test_text])
test_text_pred = classifier.predict(test_text_tfidf)
mlb.inverse_transform(test_text_pred)[0]

('campaign finance', 'corporate power')