## Imports

In [8]:
!pip install -U -q hyperopt scikit-optimize git+https://github.com/hyperopt/hyperopt-sklearn pandas sklearn numpy

In [9]:
import sys
import os
from pathlib import Path

from sklearn import metrics
import pandas as pd
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from hyperopt import tpe
from hpsklearn import HyperoptEstimator, linear_svc, tfidf, any_sparse_classifier, random_forest_classifier, extra_tree_classifier

In [10]:
import warnings

# Silence every warning raised inside this kernel.
warnings.filterwarnings(action="ignore")

# When no warning options were supplied to the interpreter, also install a
# blanket "ignore" filter and export it through the environment.
no_warning_options_given = len(sys.warnoptions) == 0
if no_warning_options_given:
    os.environ["PYTHONWARNINGS"] = "ignore"  # Also affect subprocesses
    warnings.simplefilter(action="ignore")

## Data loading

In [11]:
# Directory holding the preprocessed CSV splits, relative to the notebook.
# (Alternative left by the author: Path('.') for the current directory.)
DATA_FOLDER = Path('..', 'datasets')

In [33]:
# Locations of the three preprocessed splits inside DATA_FOLDER.
train_csv = DATA_FOLDER / 'train_preprocessed.csv'
val_csv = DATA_FOLDER / 'val_preprocessed.csv'
test_csv = DATA_FOLDER / 'test_preprocessed.csv'

df_train = pd.read_csv(train_csv)
df_val = pd.read_csv(val_csv)
df_test = pd.read_csv(test_csv)

# Row count of each split (train, val, test), shown as the cell output.
tuple(len(frame) for frame in (df_train, df_val, df_test))

(12240, 1000, 860)

## Hyperopt

In [47]:
%%time

estim = HyperoptEstimator(
    classifier=linear_svc('model'),
    preprocessing=[tfidf('tfidf')],
    algo=tpe.suggest,
    trial_timeout=60*60*60,
    n_jobs=8,
    max_evals=64,
)

estim.fit(df_train['tweet'], df_train['label'], cv_shuffle=True)
estim.best_model()

100%|██████████████████████████████████████████████████████████████████| 1/1 [00:41<00:00, 41.30s/trial, best loss: 0.33129084967320266]
100%|███████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  3.23s/trial, best loss: 0.2691993464052288]
100%|███████████████████████████████████████████████████████████████████| 3/3 [00:12<00:00, 12.37s/trial, best loss: 0.2691993464052288]
100%|███████████████████████████████████████████████████████████████████| 4/4 [00:52<00:00, 52.76s/trial, best loss: 0.2691993464052288]
100%|██████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  4.23s/trial, best loss: 0.26511437908496727]
100%|██████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00,  3.37s/trial, best loss: 0.26511437908496727]
100%|██████████████████████████████████████████████████████████████████| 7/7 [00:09<00:00,  9.47s/trial, best loss: 0.26511437908496727]
100%|████████████████████████████████████

Process Process-21:
Traceback (most recent call last):
  File "/Users/kblack/.pyenv/versions/3.10.6/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/kblack/.pyenv/versions/3.10.6/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/kblack/Projects/rug/lfd-final-project/.venv/lib/python3.10/site-packages/hpsklearn/estimator/_cost_fn.py", line 199, in _cost_fn
    learner.fit(XEXfit, yfit)
  File "/Users/kblack/Projects/rug/lfd-final-project/.venv/lib/python3.10/site-packages/sklearn/svm/_classes.py", line 257, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "/Users/kblack/Projects/rug/lfd-final-project/.venv/lib/python3.10/site-packages/sklearn/svm/_base.py", line 1205, in _fit_liblinear
    raw_coef_, n_iter_ = liblinear.train_wrap(
KeyboardInterrupt


100%|█████████████████████████████████████████████████████████████████| 16/16 [00:26<00:00, 26.18s/trial, best loss: 0.2577614379084967]
CPU times: user 6.81 s, sys: 993 ms, total: 7.81 s
Wall time: 6min 13s


{'learner': LinearSVC(C=1.281113557643012, intercept_scaling=1.4929904983561797,
           max_iter=1129, random_state=2, tol=0.0020463026535875875),
 'preprocs': (TfidfVectorizer(ngram_range=(1, 3), norm=None, smooth_idf=False, use_idf=False),),
 'ex_preprocs': ()}

In [15]:
# Re-create the configuration reported by the Hyperopt search above (numeric
# values rounded by hand) and fit it on the training split.
hyperopt_steps = [
    (
        'vec',
        TfidfVectorizer(
            ngram_range=(1, 3),
            norm=None,
            smooth_idf=False,
            use_idf=False,
        ),
    ),
    (
        'model',
        LinearSVC(
            C=1.2811,
            intercept_scaling=1.4929,
            max_iter=1129,
            random_state=2,
            tol=0.0021,
        ),
    ),
]
test_pipeline = Pipeline(steps=hyperopt_steps).fit(df_train['tweet'], df_train['label'])

In [16]:
# Predict the test split once and derive both metrics from the same labels.
test_predictions = test_pipeline.predict(df_test['tweet'])
test_macro_f1 = metrics.f1_score(df_test['label'], test_predictions, average='macro')
test_accuracy = metrics.accuracy_score(df_test['label'], test_predictions)

print('test f1-macro score:', test_macro_f1)
print('test acc      score:', test_accuracy)

test f1-macro score: 0.7048320785167118
test acc      score: 0.7813953488372093


In [27]:
%%timeit
# Time one prediction pass over the test tweets. `test_pipeline` is whichever
# pipeline was most recently assigned to that name in the kernel.
test_pipeline.predict(df_test['tweet'])

45.4 ms ± 3.64 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Bayesian Optimization

In [19]:
%%time

pipeline = Pipeline([
    ('vec', TfidfVectorizer()),
    ('model', LinearSVC()),
])


# Plain tuples cannot be used as `vec__ngram_range` categories because of
# https://github.com/scikit-optimize/scikit-optimize/issues/967
class MyTuple:
    """Hashable, iterable stand-in for a tuple of values.

    The positional arguments are stored as a real tuple in ``tp``. Iteration,
    hashing, equality, ``repr`` and ``str`` all delegate to that tuple, so
    ``MyTuple(1, 2)`` unpacks and prints like ``(1, 2)``.

    NOTE(review): recent scikit-learn releases may validate ``ngram_range``
    as a genuine ``tuple`` — confirm this wrapper is accepted by the
    installed version.
    """

    def __init__(self, *tp):
        # Keep the values as an immutable tuple so the wrapper stays hashable.
        self.tp = tp

    def __iter__(self):
        """Iterate over the wrapped values (allows ``lo, hi = my_tuple``)."""
        return iter(self.tp)

    def __eq__(self, other):
        """Compare by wrapped values so equality agrees with ``__hash__``.

        Without this, two wrappers holding the same values (for example a
        copy or an unpickled instance) would hash alike yet never be equal.
        """
        if not isinstance(other, MyTuple):
            return NotImplemented
        return self.tp == other.tp

    def __hash__(self):
        """Hash of the wrapped tuple."""
        return hash(self.tp)

    def __repr__(self):
        """Same text as the wrapped tuple's ``repr``."""
        return repr(self.tp)

    def __str__(self):
        """Same text as the wrapped tuple's ``str``."""
        return str(self.tp)


# Seed for the Bayesian optimiser, so that the candidates it samples can be
# reproduced on a re-run.
BAYES_SEARCH_SEED = 42

# Bayesian search over vectoriser and classifier settings. Keys are prefixed
# with the pipeline step name ('vec' or 'model').
opt = BayesSearchCV(
    pipeline,
    {
        'vec__lowercase': [True, False],
        'vec__stop_words': [None, 'english'],
        # MyTuple wraps each n-gram range; see the scikit-optimize issue linked at its definition.
        'vec__ngram_range': Categorical([MyTuple(1,1), MyTuple(1,2), MyTuple(1,3), MyTuple(2,2), MyTuple(2,3)], transform='identity'),
        'vec__norm': ['l1', 'l2'],
        'vec__sublinear_tf': [True, False],
        'vec__max_df': (0.9, 1.0, 'uniform'),
        'vec__min_df': (0.0, 0.1, 'uniform'),

        'model__C': (1e-6, 1e+6, 'log-uniform'),
        'model__tol': (1e-6, 1e-2, 'log-uniform'),
        'model__loss': ['hinge', 'squared_hinge'],
    },
    n_iter=64,
    cv=4,
    n_jobs=4,
    scoring='f1_macro',
    verbose=1,
    random_state=BAYES_SEARCH_SEED,
)

opt.fit(df_train['tweet'], df_train['label'])

# The search optimises `scoring='f1_macro'`, so the best score is a
# cross-validated macro F1, not an accuracy.
print('cv f1-macro score:', opt.best_score_)
print(opt.best_estimator_)

# Best parameter combination, shown as the cell output.
opt.best_params_

Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fi

OrderedDict([('model__C', 2.3090870203258773),
             ('model__loss', 'squared_hinge'),
             ('model__tol', 0.0055917094329699935),
             ('vec__lowercase', True),
             ('vec__max_df', 0.9719971271707429),
             ('vec__min_df', 0.0),
             ('vec__ngram_range', (1, 2)),
             ('vec__norm', 'l2'),
             ('vec__stop_words', 'english'),
             ('vec__sublinear_tf', True)])

In [20]:
# Score the validation split through the search object itself. The search was
# built with scoring='f1_macro'; presumably `opt.score` applies that scorer.
val_search_score = opt.score(df_val['tweet'], df_val['label'])
print('val f1 score:', val_search_score)

val f1 score: 0.696657161924407


In [21]:
# Predict the validation split once with the best estimator found by the search.
val_predictions = opt.best_estimator_.predict(df_val['tweet'])

# Binary F1 with 'NOT' treated as the positive class, plus plain accuracy.
val_binary_f1 = metrics.f1_score(df_val['label'], val_predictions, average='binary', pos_label='NOT')
val_accuracy = metrics.accuracy_score(df_val['label'], val_predictions)

print('val f1  score:', val_binary_f1)
print('val acc score:', val_accuracy)

val f1  score: 0.8113207547169812
val acc score: 0.74


In [36]:
# Same validation metrics as the cell above.
# NOTE(review): the recorded output differs from that cell's, which suggests
# `opt` was refit between the two executions — confirm which run is reported.
val_predictions = opt.best_estimator_.predict(df_val['tweet'])

# Binary F1 with 'NOT' treated as the positive class, plus plain accuracy.
val_binary_f1 = metrics.f1_score(df_val['label'], val_predictions, average='binary', pos_label='NOT')
val_accuracy = metrics.accuracy_score(df_val['label'], val_predictions)

print('val f1  score:', val_binary_f1)
print('val acc score:', val_accuracy)

val f1  score: 0.7948717948717949
val acc score: 0.728


In [27]:
# Re-create the best configuration reported by the Bayesian search (numeric
# values rounded by hand) and fit it on the training split. This rebinds
# `test_pipeline`, replacing the pipeline built in the Hyperopt section.
bayes_steps = [
    (
        'vec',
        TfidfVectorizer(
            max_df=0.9719,
            min_df=0.0,
            ngram_range=(1, 2),
            stop_words='english',
            sublinear_tf=True,
        ),
    ),
    ('model', LinearSVC(C=2.3091, tol=0.0056)),
]
test_pipeline = Pipeline(steps=bayes_steps).fit(df_train['tweet'], df_train['label'])

In [28]:
# Predict the test split once and derive both metrics from the same labels.
test_predictions = test_pipeline.predict(df_test['tweet'])
test_macro_f1 = metrics.f1_score(df_test['label'], test_predictions, average='macro')
test_accuracy = metrics.accuracy_score(df_test['label'], test_predictions)

print('test f1-macro score:', test_macro_f1)
print('test acc      score:', test_accuracy)

test f1-macro score: 0.7468693544833663
test acc      score: 0.8058139534883721


In [29]:
# Confusion matrix of true test labels against the pipeline's predictions.
test_predictions = test_pipeline.predict(df_test['tweet'])
metrics.confusion_matrix(df_test['label'], test_predictions)

array([[554,  66],
       [101, 139]])

In [30]:
# Per-class precision / recall / F1 on the test split.
test_predictions = test_pipeline.predict(df_test['tweet'])
test_report = metrics.classification_report(df_test['label'], test_predictions)
print(test_report)

              precision    recall  f1-score   support

         NOT       0.85      0.89      0.87       620
         OFF       0.68      0.58      0.62       240

    accuracy                           0.81       860
   macro avg       0.76      0.74      0.75       860
weighted avg       0.80      0.81      0.80       860



In [31]:
%%timeit
# Time one prediction pass over the test tweets. `test_pipeline` is whichever
# pipeline was most recently assigned to that name in the kernel.
test_pipeline.predict(df_test['tweet'])

55.7 ms ± 16.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Prediction Save

In [36]:
# Destination for the validation-split predictions. Create the directory
# first so that `to_csv` does not fail on a fresh checkout where it is missing.
val_predictions_path = Path('../predictions/val-models-output/baseline-tf-idf-linear-svm.csv')
val_predictions_path.parent.mkdir(parents=True, exist_ok=True)

# One row per validation tweet: preprocessed text, raw text, gold label, and
# the label predicted by the current `test_pipeline`.
df_pred = pd.DataFrame.from_dict({
    'tweet': df_val['tweet'],
    'raw': df_val['raw'],
    'true_label': df_val['label'],
    'pred_label': test_pipeline.predict(df_val['tweet']),
})
df_pred.to_csv(val_predictions_path, index=False)
df_pred

Unnamed: 0,tweet,raw,true_label,pred_label
0,@user @user you are so beautiful,@USER @USER you are so beautiful,NOT,NOT
1,@user he is not a troll he is simply dumb but ...,@USER He is not a troll he is simply dumb but ...,OFF,OFF
2,@user i understand annie she is stuck in betwe...,@USER I understand Annie she is stuck in betwe...,NOT,NOT
3,@user @user hillary was blaming women not too ...,@USER @USER Hillary was blaming women not too ...,OFF,OFF
4,@user @user i support jahs blessings on his hu...,@USER @USER I support Jahs blessings on his hu...,OFF,NOT
...,...,...,...,...
995,@user sometimes i get strong vibes from people...,@USER Sometimes I get strong vibes from people...,OFF,NOT
996,benidorm check mark button creamfields che...,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,NOT
997,@user and why report this garbage. we don't g...,@USER And why report this garbage. We don't g...,OFF,OFF
998,@user pussy,@USER Pussy,OFF,OFF


In [37]:
# Destination for the test-split predictions. Create the directory first so
# that `to_csv` does not fail on a fresh checkout where it is missing.
test_predictions_path = Path('../predictions/tst-models-output/baseline-tf-idf-linear-svm.csv')
test_predictions_path.parent.mkdir(parents=True, exist_ok=True)

# One row per test tweet: preprocessed text, raw text, gold label, and the
# label predicted by the current `test_pipeline`.
df_pred = pd.DataFrame.from_dict({
    'tweet': df_test['tweet'],
    'raw': df_test['raw'],
    'true_label': df_test['label'],
    'pred_label': test_pipeline.predict(df_test['tweet']),
})
df_pred.to_csv(test_predictions_path, index=False)
df_pred

Unnamed: 0,tweet,raw,true_label,pred_label
0,who is q wheres the server dump nike dec las f...,#WhoIsQ #WheresTheServer #DumpNike #DECLASFISA...,OFF,NOT
1,"constitution day is revered by conservatives, ...","#ConstitutionDay is revered by Conservatives, ...",NOT,NOT
2,foxnews nra maga potus trump 2nd amendment rnc...,#FOXNews #NRA #MAGA #POTUS #TRUMP #2ndAmendmen...,NOT,NOT
3,watching boomer getting the news that she is s...,#Watching #Boomer getting the news that she is...,NOT,NOT
4,no pasaran: unity demo to oppose the far-right...,#NoPasaran: Unity demo to oppose the far-right...,OFF,NOT
...,...,...,...,...
855,despicable dems lie again about rifles. dem di...,#DespicableDems lie again about rifles. Dem Di...,OFF,OFF
856,meet the speakers raising hands @user will p...,#MeetTheSpeakers 🙌 @USER will present in our e...,NOT,NOT
857,3 people just unfollowed me for talking about ...,3 people just unfollowed me for talking about ...,OFF,OFF
858,wednesday wisdom antifa calls the right fascis...,#WednesdayWisdom Antifa calls the right fascis...,NOT,OFF


In [66]:
# Lift the column-width limit so tweets are not truncated in the rendered table.
pd.options.display.max_colwidth = None

# Character length of each preprocessed tweet, stored on the training frame.
df_train['len'] = df_train['tweet'].apply(len)

# Combine both conditions into ONE mask over the full frame. Chaining
# df[mask_a][mask_b] applies an unfiltered mask to an already filtered frame,
# which makes pandas reindex the mask and emit a UserWarning (hidden here by
# the global warnings filter).
is_short_offensive = (df_train['len'] < 50) & (df_train['label'] == 'OFF')

# Five random offensive tweets shorter than 50 characters.
df_train.loc[is_short_offensive, ['raw', 'tweet', 'label']].sample(5)

Unnamed: 0,raw,tweet,label
5772,@USER He schools read a ammo box warning label...,@user he schools read a ammo box warning label...,OFF
5141,@USER Because you are 💩,@user because you are pile of poo,OFF
7753,@USER He’s a dumb ass!,@user he’s a dumb ass!,OFF
1458,@USER Shut up man You are useless,@user shut up man you are useless,OFF
8546,@USER Fool,@user fool,OFF
