## Imports

In [1]:
!pip install -U -q hyperopt scikit-optimize git+https://github.com/hyperopt/hyperopt-sklearnpandas sklearn numpy

[33m  DEPRECATION: future is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-pep517' option. Discussion can be found at https://github.com/pypa/pip/issues/8559[0m[33m
[0m

In [17]:
import sys
import os
from pathlib import Path

from sklearn import metrics
import pandas as pd
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from hyperopt import tpe
from hpsklearn import HyperoptEstimator, linear_svc, tfidf, any_sparse_classifier, random_forest_classifier, extra_tree_classifier

In [2]:
import warnings

# Mute every warning category inside this kernel.
warnings.filterwarnings("ignore")

# When the interpreter was started without any -W option, install the
# "ignore" filter again and export it through the environment as well.
if len(sys.warnoptions) == 0:
    os.environ["PYTHONWARNINGS"] = "ignore"  # so that subprocesses are affected too
    warnings.simplefilter("ignore")

## Data loading

In [3]:
# Folder that contains the train/val/test TSV files read in the next cell.
# Switch to Path('.') when the files sit in the current working directory.
DATA_FOLDER = Path('../datasets/')

In [4]:
def _read_split(split_name):
    """Read `<split_name>.tsv` from DATA_FOLDER as a two-column frame.

    The files carry no header row; the columns are named 'text' and 'label'.
    """
    return pd.read_csv(
        DATA_FOLDER / f'{split_name}.tsv',
        sep='\t',
        header=None,
        names=['text', 'label'],
    )


df_train, df_val, df_test = (_read_split(split) for split in ('train', 'val', 'test'))

# Displayed as the cell result: number of rows in each split.
len(df_train), len(df_val), len(df_test)

(12240, 1000, 860)

## Hyperopt

In [18]:
%%time

# Hyper-parameter search for a random forest on top of a tunable TF-IDF
# vectoriser; trials are proposed by hyperopt's TPE algorithm.
estim = HyperoptEstimator(
    classifier=random_forest_classifier('model'),
    preprocessing=[tfidf('tfidf')],
    algo=tpe.suggest,
    max_evals=16,
    n_jobs=8,
    # 60 * 60 * 60 evaluates to 216000.
    # NOTE(review): if the unit is seconds this is 60 hours - possibly
    # 60 * 60 (one hour) was intended; confirm.
    trial_timeout=60 * 60 * 60,
)

estim.fit(df_train['text'], df_train['label'], cv_shuffle=True)

# Last expression of the cell, so the best model found is displayed.
estim.best_model()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:03<00:00,  3.03s/trial, best loss: 0.3308823529411765]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:03<00:00,  3.68s/trial, best loss: 0.3055555555555556]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:08<00:00,  8.91s/trial, best loss: 0.2679738562091504]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñ

Process Process-22:
Traceback (most recent call last):
  File "/Users/kblack/.pyenv/versions/3.10.6/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/kblack/.pyenv/versions/3.10.6/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/kblack/Projects/rug/lfd-final-project/.venv/lib/python3.10/site-packages/hpsklearn/estimator/_cost_fn.py", line 199, in _cost_fn
    learner.fit(XEXfit, yfit)
  File "/Users/kblack/Projects/rug/lfd-final-project/.venv/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 476, in fit
    trees = Parallel(
  File "/Users/kblack/Projects/rug/lfd-final-project/.venv/lib/python3.10/site-packages/joblib/parallel.py", line 1098, in __call__
    self.retrieve()
  File "/Users/kblack/Projects/rug/lfd-final-project/.venv/lib/python3.10/site-packages/joblib/parallel.py", line 975, in retrieve
    self._output.extend(job.get(timeout=self.timeou

In [20]:
# Predict the validation split once and reuse the result for both metrics
# instead of running the estimator twice on the same input.
val_pred = estim.predict(df_val['text'])

# F1 is reported for the 'NOT' class only (average='binary', pos_label='NOT').
print('val f1  score:', metrics.f1_score(df_val['label'], val_pred, average='binary', pos_label='NOT'))
print('val acc score:', metrics.accuracy_score(df_val['label'], val_pred))

AttributeError: 'list' object has no attribute 'take'

In [47]:
%%time

# Same search set-up as for the random forest, but tuning a linear SVM and
# allowing more trials.
estim = HyperoptEstimator(
    classifier=linear_svc('model'),
    preprocessing=[tfidf('tfidf')],
    algo=tpe.suggest,
    max_evals=64,
    n_jobs=8,
    # 60 * 60 * 60 evaluates to 216000.
    # NOTE(review): if the unit is seconds this is 60 hours - possibly
    # 60 * 60 (one hour) was intended; confirm.
    trial_timeout=60 * 60 * 60,
)

estim.fit(df_train['text'], df_train['label'], cv_shuffle=True)

# Last expression of the cell, so the best model found is displayed.
estim.best_model()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:41<00:00, 41.30s/trial, best loss: 0.33129084967320266]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:03<00:00,  3.23s/trial, best loss: 0.2691993464052288]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:12<00:00, 12.37s/trial, best loss: 0.2691993464052288]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚

Process Process-21:
Traceback (most recent call last):
  File "/Users/kblack/.pyenv/versions/3.10.6/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/kblack/.pyenv/versions/3.10.6/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/kblack/Projects/rug/lfd-final-project/.venv/lib/python3.10/site-packages/hpsklearn/estimator/_cost_fn.py", line 199, in _cost_fn
    learner.fit(XEXfit, yfit)
  File "/Users/kblack/Projects/rug/lfd-final-project/.venv/lib/python3.10/site-packages/sklearn/svm/_classes.py", line 257, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "/Users/kblack/Projects/rug/lfd-final-project/.venv/lib/python3.10/site-packages/sklearn/svm/_base.py", line 1205, in _fit_liblinear
    raw_coef_, n_iter_ = liblinear.train_wrap(
KeyboardInterrupt


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:26<00:00, 26.18s/trial, best loss: 0.2577614379084967]
CPU times: user 6.81 s, sys: 993 ms, total: 7.81 s
Wall time: 6min 13s


{'learner': LinearSVC(C=1.281113557643012, intercept_scaling=1.4929904983561797,
           max_iter=1129, random_state=2, tol=0.0020463026535875875),
 'preprocs': (TfidfVectorizer(ngram_range=(1, 3), norm=None, smooth_idf=False, use_idf=False),),
 'ex_preprocs': ()}

In [48]:
# Predict the validation split once and reuse the result for both metrics
# instead of running the estimator twice on the same input.
val_pred = estim.predict(df_val['text'])

# F1 is reported for the 'NOT' class only (average='binary', pos_label='NOT').
print('val f1  score:', metrics.f1_score(df_val['label'], val_pred, average='binary', pos_label='NOT'))
print('val acc score:', metrics.accuracy_score(df_val['label'], val_pred))

val f1  score: 0.819672131147541
val acc score: 0.747


In [31]:
# Stand-alone pipeline with the configuration reported by the hyperopt
# search (numeric values rounded by hand), refitted on the training split.
test_pipeline = Pipeline(steps=[
    ('vec', TfidfVectorizer(
        ngram_range=(1, 3),
        use_idf=False,
        smooth_idf=False,
        norm=None,
    )),
    ('model', LinearSVC(
        C=1.2811,
        tol=0.0021,
        max_iter=1129,
        intercept_scaling=1.4929,
        random_state=2,
    )),
])
# fit() returns the pipeline itself; re-binding keeps the cell output empty.
test_pipeline = test_pipeline.fit(df_train['text'], df_train['label'])

In [32]:
# Predict the test split once and reuse the result for both metrics
# instead of running the pipeline twice on the same input.
test_pred = test_pipeline.predict(df_test['text'])

print('test f1-macro score:', metrics.f1_score(df_test['label'], test_pred, average='macro'))
print('test acc      score:', metrics.accuracy_score(df_test['label'], test_pred))

test f1-macro score: 0.7125790518395421
test acc      score: 0.7895348837209303


In [27]:
%%timeit
# Timed statement: prediction over the whole test split with the fitted pipeline.
test_pipeline.predict(df_test['text'])

45.4 ms ± 3.64 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Bayesian Optimization

In [33]:
%%time

# Untuned TF-IDF -> linear SVM pipeline. The step names 'vec' and 'model'
# are the prefixes used by the search-space keys further down.
pipeline = Pipeline(steps=[('vec', TfidfVectorizer()), ('model', LinearSVC())])


# Plain tuples cannot be used for `vec__ngram_range` because of
# https://github.com/scikit-optimize/scikit-optimize/issues/967
class MyTuple:
    """Thin wrapper that stores its positional arguments as a tuple.

    The wrapper is iterable and hashable, and prints exactly like the tuple
    it wraps, but it is not itself a `tuple` instance.

    NOTE(review): no `__eq__` is defined, so two wrappers holding the same
    values share a hash yet compare unequal (identity comparison).
    """

    def __init__(self, *tp):
        # Wrapped values, kept as an ordinary tuple.
        self.tp = tp

    def __repr__(self):
        return repr(self.tp)

    def __str__(self):
        return str(self.tp)

    def __hash__(self):
        # Same hash as the wrapped tuple.
        return hash(self.tp)

    def __iter__(self):
        # Allows unpacking such as `low, high = wrapper`.
        return iter(self.tp)


# Bayesian hyper-parameter search over the TF-IDF and LinearSVC settings of
# `pipeline`. Keys are '<step name>__<parameter>'.
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces={
        # --- TF-IDF vectoriser ---
        'vec__lowercase': [True, False],
        'vec__stop_words': [None, 'english'],
        # n-gram ranges are wrapped in MyTuple instead of plain tuples
        'vec__ngram_range': Categorical(
            [MyTuple(*span) for span in ((1, 1), (1, 2), (1, 3), (2, 2), (2, 3))],
            transform='identity',
        ),
        'vec__norm': ['l1', 'l2'],
        'vec__sublinear_tf': [True, False],
        'vec__max_df': (0.9, 1.0, 'uniform'),
        'vec__min_df': (0.0, 0.1, 'uniform'),

        # --- linear SVM ---
        'model__C': (1e-6, 1e+6, 'log-uniform'),
        'model__tol': (1e-6, 1e-2, 'log-uniform'),
        'model__loss': ['hinge', 'squared_hinge'],
    },
    scoring='f1_macro',
    cv=4,
    n_iter=64,
    n_jobs=4,
    verbose=1,
)

opt.fit(df_train['text'], df_train['label'])

# The search is configured with scoring='f1_macro', so best_score_ is a
# macro-F1 value; the label used to call it accuracy, which was wrong.
print('cv f1-macro score:', opt.best_score_)
print(opt.best_estimator_)

# Last expression of the cell: the winning parameter set is displayed.
opt.best_params_

Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fi

OrderedDict([('model__C', 16472.31111090191),
             ('model__loss', 'hinge'),
             ('model__tol', 3.5396595648472668e-06),
             ('vec__lowercase', True),
             ('vec__max_df', 0.9674950665723981),
             ('vec__min_df', 0.0),
             ('vec__ngram_range', (1, 3)),
             ('vec__norm', 'l2'),
             ('vec__stop_words', 'english'),
             ('vec__sublinear_tf', False)])

In [34]:
# Score of the fitted search object on the validation split.
# NOTE(review): presumably this uses the search's own scorer
# (scoring='f1_macro'), i.e. macro-F1 - confirm.
print('val f1 score:', opt.score(df_val['text'], df_val['label']))

val f1 score: 0.6956554820056304


In [35]:
# Same evaluation as the previous cell: score of the fitted search object on
# the validation split.
# NOTE(review): this cell duplicates the one above - consider removing one.
print('val f1 score:', opt.score(df_val['text'], df_val['label']))

val f1 score: 0.6956554820056304


In [36]:
# Predict the validation split once with the refitted best estimator and
# reuse the result for both metrics instead of predicting twice.
val_pred = opt.best_estimator_.predict(df_val['text'])

# F1 is reported for the 'NOT' class only (average='binary', pos_label='NOT').
print('val f1  score:', metrics.f1_score(df_val['label'], val_pred, average='binary', pos_label='NOT'))
print('val acc score:', metrics.accuracy_score(df_val['label'], val_pred))

val f1  score: 0.7948717948717949
val acc score: 0.728


In [46]:
# Stand-alone pipeline with the parameters reported by the Bayesian search
# (numeric values rounded by hand), refitted on the training split.
test_pipeline = Pipeline([
    ('vec', TfidfVectorizer(
        ngram_range=(1, 3),
        stop_words='english',
        max_df=0.9675,
        min_df=0.0,
    )),
    ('model', LinearSVC(loss='hinge', C=16472.31, tol=3.54e-06)),
])
# fit() returns the pipeline itself; re-binding keeps the cell output empty.
test_pipeline = test_pipeline.fit(df_train['text'], df_train['label'])

In [47]:
# Predict the test split once and reuse the result for both metrics
# instead of running the pipeline twice on the same input.
test_pred = test_pipeline.predict(df_test['text'])

print('test f1-macro score:', metrics.f1_score(df_test['label'], test_pred, average='macro'))
print('test acc      score:', metrics.accuracy_score(df_test['label'], test_pred))

test f1-macro score: 0.7314270810454646
test acc      score: 0.7883720930232558


In [48]:
# Confusion matrix of the refitted pipeline on the test split
# (true labels passed first, predictions second).
metrics.confusion_matrix(df_test['label'], test_pipeline.predict(df_test['text']))

array([[537,  83],
       [ 99, 141]])

In [50]:
# Per-class precision / recall / F1 table for the test split.
print(metrics.classification_report(df_test['label'], test_pipeline.predict(df_test['text'])))

              precision    recall  f1-score   support

         NOT       0.84      0.87      0.86       620
         OFF       0.63      0.59      0.61       240

    accuracy                           0.79       860
   macro avg       0.74      0.73      0.73       860
weighted avg       0.78      0.79      0.79       860



In [40]:
%%timeit
# Timed statement: prediction over the whole test split with the fitted pipeline.
test_pipeline.predict(df_test['text'])

40.4 ms ± 1.68 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Prediction Save

In [55]:
# One row per test example: the text, its gold label and the label predicted
# by the current `test_pipeline`.
df_pred = (
    df_test[['text', 'label']]
    .rename(columns={'label': 'true_label'})
    .assign(pred_label=test_pipeline.predict(df_test['text']))
)

# Persist the predictions without the index column, then display the frame.
df_pred.to_csv('baseline-tf-idf-linear-svm.csv', index=False)
df_pred

Unnamed: 0,text,true_label,pred_label
0,"#WhoIsQ #WheresTheServer #DumpNike #DECLASFISA Democrats Support Antifa, Muslim Brotherhood, MS13, ISIS, Pedophilia, Child Trafficking, Taxpayer Funded Abortion‚Äôs, Election Fraud, Sedition And Treason!!! #LockThemAllUp #WWG1WGA #QAnon ‚Å¶@USER URL",OFF,NOT
1,"#ConstitutionDay is revered by Conservatives, hated by Progressives/Socialist/Democrats that want to change it.",NOT,NOT
2,"#FOXNews #NRA #MAGA #POTUS #TRUMP #2ndAmendment #RNC #USMC #veterans @USER @USER @USER @USER @USER #fakereporting #THESWAMP #dnc #liberals @USER @USER #fakeoutrage @USER First, it reduces the ca URL",NOT,NOT
3,#Watching #Boomer getting the news that she is still up for parole always makes me smile. #Wentworth Finale...@USER is such a treasure. URL,NOT,NOT
4,#NoPasaran: Unity demo to oppose the far-right in #London ‚Äì #antifa #Oct13 ‚Äî Enough is Enough! URL,OFF,NOT
...,...,...,...
855,#DespicableDems lie again about rifles. Dem Distorted the Law to Push Gun-Control at Kavanaugh Confirmation URL via @USER,OFF,NOT
856,#MeetTheSpeakers üôå @USER will present in our event OIW 2018: Finpact - Global Impact through Financial Technologies. She is Senior Advisor Group Sustainable Finance and worked on green energy and climate risk. Join us to meet Thina URL #oiw2018 URL,NOT,NOT
857,3 people just unfollowed me for talking about merlin sorry y'all im still up covinsky's ass im just waiting for a psisly sequel announcement ive run out of witty and funny tweets about tatbilb i am drained,OFF,OFF
858,"#WednesdayWisdom Antifa calls the right fascist when, in all reality, they and the left are following the same scenario as the Third Reich: indoctrination of our youth, trying to control minorities and a total lack of understanding or knowledge of history. #WalkAway",NOT,OFF
