## Imports

In [1]:
# Install the hyper-parameter search libraries. `%pip` (rather than `!pip`) installs into
# the environment of the running kernel instead of whichever `pip` is first on PATH.
%pip install -U -q hyperopt scikit-optimize git+https://github.com/hyperopt/hyperopt-sklearn

[33m  DEPRECATION: future is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-pep517' option. Discussion can be found at https://github.com/pypa/pip/issues/8559[0m[33m
[0m

In [6]:
# Core data / ML stack. The PyPI distribution is named `scikit-learn`; `sklearn` on PyPI is
# only a deprecated placeholder package, so it must not be used as the install name.
%pip install -U -q pandas scikit-learn numpy

In [8]:
import sys
import os
from pathlib import Path

from sklearn import metrics
import pandas as pd
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from hyperopt import tpe
from hpsklearn import HyperoptEstimator, linear_svc, tfidf, any_sparse_classifier

In [9]:
import warnings

# Mute every warning raised inside this kernel.
warnings.filterwarnings(action="ignore")

# When no warning options were handed to the interpreter (empty `sys.warnoptions`),
# install the blanket filter once more and export PYTHONWARNINGS so that
# subprocesses are affected as well.
no_warning_options_given = not sys.warnoptions
if no_warning_options_given:
    warnings.simplefilter(action="ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

## Data loading

In [10]:
# Directory holding train.tsv / val.tsv / test.tsv, relative to the notebook.
# Switch to Path('.') when the files sit next to the notebook instead.
DATA_FOLDER = Path('..') / 'datasets'

In [12]:
# All three splits share one layout: tab-separated, no header row, two columns
# that are named `text` and `label` on load.
tsv_options = dict(sep='\t', header=None, names=['text', 'label'])

df_train, df_val, df_test = (
    pd.read_csv(DATA_FOLDER / file_name, **tsv_options)
    for file_name in ('train.tsv', 'val.tsv', 'test.tsv')
)

# Row counts per split, shown as the cell output.
tuple(map(len, (df_train, df_val, df_test)))

(12240, 1000, 860)

## Hyperopt

In [47]:
%%time

# Settings for the hyperopt-sklearn search: a LinearSVC on top of a TF-IDF vectorizer,
# with both components' hyper-parameters sampled by TPE.
# Alternatives that were tried and left disabled: `any_sparse_classifier('model')` as the
# classifier, and a fixed
# `TfidfVectorizer(ngram_range=(1,2), sublinear_tf=True, stop_words='english')` as preprocessing.
search_settings = dict(
    classifier=linear_svc('model'),
    preprocessing=[tfidf('tfidf')],
    algo=tpe.suggest,
    # NOTE(review): 60*60*60 = 216,000, i.e. 60 hours if the unit is seconds — TODO confirm
    # whether 60*60 (one hour) was intended.
    trial_timeout=60*60*60,
    n_jobs=8,
    max_evals=64,
)
estim = HyperoptEstimator(**search_settings)

# Run the search on the training split with shuffled cross-validation.
estim.fit(df_train['text'], df_train['label'], cv_shuffle=True)

# Cell output: the best learner / preprocessing combination found.
estim.best_model()

100%|██████████████████████████████████████████████████████████████████| 1/1 [00:41<00:00, 41.30s/trial, best loss: 0.33129084967320266]
100%|███████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  3.23s/trial, best loss: 0.2691993464052288]
100%|███████████████████████████████████████████████████████████████████| 3/3 [00:12<00:00, 12.37s/trial, best loss: 0.2691993464052288]
100%|███████████████████████████████████████████████████████████████████| 4/4 [00:52<00:00, 52.76s/trial, best loss: 0.2691993464052288]
100%|██████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  4.23s/trial, best loss: 0.26511437908496727]
100%|██████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00,  3.37s/trial, best loss: 0.26511437908496727]
100%|██████████████████████████████████████████████████████████████████| 7/7 [00:09<00:00,  9.47s/trial, best loss: 0.26511437908496727]
100%|████████████████████████████████████

Process Process-21:
Traceback (most recent call last):
  File "/Users/kblack/.pyenv/versions/3.10.6/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/kblack/.pyenv/versions/3.10.6/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/kblack/Projects/rug/lfd-final-project/.venv/lib/python3.10/site-packages/hpsklearn/estimator/_cost_fn.py", line 199, in _cost_fn
    learner.fit(XEXfit, yfit)
  File "/Users/kblack/Projects/rug/lfd-final-project/.venv/lib/python3.10/site-packages/sklearn/svm/_classes.py", line 257, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "/Users/kblack/Projects/rug/lfd-final-project/.venv/lib/python3.10/site-packages/sklearn/svm/_base.py", line 1205, in _fit_liblinear
    raw_coef_, n_iter_ = liblinear.train_wrap(
KeyboardInterrupt


100%|█████████████████████████████████████████████████████████████████| 16/16 [00:26<00:00, 26.18s/trial, best loss: 0.2577614379084967]
CPU times: user 6.81 s, sys: 993 ms, total: 7.81 s
Wall time: 6min 13s


{'learner': LinearSVC(C=1.281113557643012, intercept_scaling=1.4929904983561797,
           max_iter=1129, random_state=2, tol=0.0020463026535875875),
 'preprocs': (TfidfVectorizer(ngram_range=(1, 3), norm=None, smooth_idf=False, use_idf=False),),
 'ex_preprocs': ()}

In [48]:
# Predict the validation split once and reuse the labels for both metrics
# (previously the model was run twice on the same input).
val_pred_hyperopt = estim.predict(df_val['text'])
print('val f1  score:', metrics.f1_score(df_val['label'], val_pred_hyperopt, average='binary', pos_label='NOT'))
print('val acc score:', metrics.accuracy_score(df_val['label'], val_pred_hyperopt))

val f1  score: 0.819672131147541
val acc score: 0.747


In [54]:
# Rebuild the configuration reported by the Hyperopt search as a plain sklearn pipeline.
# NOTE(review): the numeric values are transcribed by hand and rounded
# (e.g. C=1.2811 vs. the reported 1.281113557643012), so this is not bit-identical
# to the model the search selected.
tfidf_step = TfidfVectorizer(ngram_range=(1, 3), norm=None, smooth_idf=False, use_idf=False)
svc_step = LinearSVC(
    C=1.2811,
    intercept_scaling=1.4929,
    max_iter=1129,
    random_state=2,
    tol=0.0021,
)
test_pipeline = Pipeline(steps=[('vec', tfidf_step), ('model', svc_step)]).fit(
    df_train['text'], df_train['label']
)

In [55]:
# Predict the test split once and reuse the labels for both metrics
# (previously the pipeline was run twice on the same input).
test_pred_hyperopt = test_pipeline.predict(df_test['text'])
print('test f1  score:', metrics.f1_score(df_test['label'], test_pred_hyperopt, average='binary', pos_label='NOT'))
print('test acc score:', metrics.accuracy_score(df_test['label'], test_pred_hyperopt))

test f1  score: 0.8613026819923372
test acc score: 0.7895348837209303


## Bayesian Optimization

In [56]:
%%time

# Base estimator for the Bayesian search: default TF-IDF features feeding a default
# LinearSVC. The step names ('vec', 'model') are the prefixes used in the search space.
vectorizer_step = ('vec', TfidfVectorizer())
classifier_step = ('model', LinearSVC())
pipeline = Pipeline(steps=[vectorizer_step, classifier_step])


# Plain tuples cannot be used as values for `vec__ngram_range` because of
# https://github.com/scikit-optimize/scikit-optimize/issues/967, so they are wrapped in a small class.
class MyTuple:
    """Hashable wrapper around a tuple, used as a categorical search value.

    Works around https://github.com/scikit-optimize/scikit-optimize/issues/967 by
    handing the optimizer one wrapper object instead of a raw tuple. The wrapper can
    still be unpacked (``lo, hi = MyTuple(1, 2)``) because it is iterable, and it
    prints exactly like the tuple it wraps.

    Two wrappers holding the same values compare equal and hash alike, so they behave
    consistently in sets, dicts and membership tests (including copies of a wrapper).

    NOTE(review): ``__len__`` / ``__getitem__`` are intentionally not defined;
    presumably the workaround relies on the object not looking like a sequence —
    confirm against the linked issue before adding them.
    """

    def __init__(self, *tp):
        # The positional arguments are stored as a real tuple.
        self.tp = tp

    def __iter__(self):
        """Iterate over the wrapped values (enables tuple-style unpacking)."""
        return iter(self.tp)

    def __eq__(self, other):
        """Value equality between wrappers; other types are left to Python's default."""
        if isinstance(other, MyTuple):
            return self.tp == other.tp
        return NotImplemented

    def __hash__(self):
        """Hash of the wrapped tuple, consistent with ``__eq__``."""
        return hash(self.tp)

    def __repr__(self):
        return self.tp.__repr__()

    def __str__(self):
        return self.tp.__str__()


# Fixed seed for the optimizer so the sequence of sampled candidates is repeatable
# across kernel restarts. The LinearSVC inside `pipeline` has no seed of its own here.
SEARCH_SEED = 42

opt = BayesSearchCV(
    pipeline,
    {
        # TfidfVectorizer hyper-parameters ('vec' step).
        'vec__lowercase': [True, False],
        'vec__stop_words': [None, 'english'],
        # Wrapped tuples passed through unchanged (transform='identity').
        'vec__ngram_range': Categorical([MyTuple(1,1), MyTuple(1,2), MyTuple(1,3), MyTuple(2,2), MyTuple(2,3)], transform='identity'),
        'vec__norm': ['l1', 'l2'],
        'vec__sublinear_tf': [True, False],
        'vec__max_df': (0.9, 1.0, 'uniform'),
        'vec__min_df': (0.0, 0.1, 'uniform'),

        # LinearSVC hyper-parameters ('model' step).
        'model__C': (1e-6, 1e+6, 'log-uniform'),
        'model__tol': (1e-6, 1e-2, 'log-uniform'),
        'model__loss': ['hinge', 'squared_hinge'],
    },
    n_iter=64,
    cv=4,
    n_jobs=4,
    scoring='accuracy',
    random_state=SEARCH_SEED,
    verbose=1,
)

# Run the search on the training split.
opt.fit(df_train['text'], df_train['label'])

print('cv acc score:', opt.best_score_)
print(opt.best_estimator_)

# Cell output: the best hyper-parameter assignment found.
opt.best_params_

Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
cv acc score: 0.6858660130718954
Pipeline(steps=[('vec',

OrderedDict([('model__C', 2.634291071523559),
             ('model__loss', 'hinge'),
             ('model__tol', 3.9313062829860266e-05),
             ('vec__lowercase', True),
             ('vec__max_df', 0.9236574545875886),
             ('vec__min_df', 0.014510750330550229),
             ('vec__ngram_range', (1, 1)),
             ('vec__norm', 'l1'),
             ('vec__stop_words', None),
             ('vec__sublinear_tf', False)])

In [57]:
# Validation accuracy as computed by the fitted search object's own `score` method.
val_accuracy = opt.score(df_val['text'], df_val['label'])
print('val acc score:', val_accuracy)

val acc score: 0.667


In [58]:
# Predict the validation split once with the best estimator and reuse the labels for
# both metrics (previously the estimator was run twice on the same input).
val_pred_bayes = opt.best_estimator_.predict(df_val['text'])
print('val f1  score:', metrics.f1_score(df_val['label'], val_pred_bayes, average='binary', pos_label='NOT'))
print('val acc score:', metrics.accuracy_score(df_val['label'], val_pred_bayes))

val f1  score: 0.7948243992606284
val acc score: 0.667


In [39]:
# Use the model the Bayesian search actually selected. `opt` was fitted on the training
# split with the default refit behaviour, so `best_estimator_` is already trained.
# The values that used to be hard-coded here (max_df=0.9, min_df=0.0257, norm='l1',
# stop_words='english', sublinear_tf=True, C=0.195, tol=0.01) came from a different run
# and disagreed with `opt.best_params_`, so they are no longer re-typed by hand.
test_pipeline = opt.best_estimator_

In [40]:
# Predict the test split once with the best estimator from the search and reuse the
# labels for both metrics (previously the estimator was run twice on the same input).
test_pred_bayes = opt.best_estimator_.predict(df_test['text'])
print('test f1  score:', metrics.f1_score(df_test['label'], test_pred_bayes, average='binary', pos_label='NOT'))
print('test acc score:', metrics.accuracy_score(df_test['label'], test_pred_bayes))

test f1  score: 0.846945778997941
test acc score: 0.7406976744186047
