## Imports

In [44]:
import sys
import os

from sklearn import metrics
import pandas as pd
from skopt import BayesSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from hyperopt import tpe
from hpsklearn import HyperoptEstimator, linear_svc

sys.path.append('..')
from utils import cv_kfold, train_validate_split

In [45]:
import warnings
warnings.filterwarnings("ignore")
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses

## Data loading

In [46]:
def read_file(filename: str) -> pd.DataFrame:
    return pd.DataFrame([
        (l.split()[0], l.split()[1], ' '.join(l.split()[3:]))
        for l in open(filename)
        ], columns=['class', 'sent', 'text']
    )

In [47]:
df_train = read_file('../datasets/train.txt')
df_test = read_file('../datasets/test.txt')

len(df_train), len(df_test)

(5400, 600)

In [48]:
X = df_train['text'].values
y = df_train['class'].values

In [49]:
X_test = df_test['text'].values
y_test = df_test['class'].values

## Hyperopt

In [50]:
estim = HyperoptEstimator(
    classifier=linear_svc('model'),
    preprocessing=[TfidfVectorizer(ngram_range=(1,2), sublinear_tf=True, stop_words='english')],
    algo=tpe.suggest,
    trial_timeout=300,
    n_jobs=4,
)

estim.fit(X, y)

estim.best_model()

100%|██████████| 1/1 [00:03<00:00,  3.96s/trial, best loss: 0.07314814814814818]
100%|██████████| 2/2 [00:05<00:00,  5.81s/trial, best loss: 0.07314814814814818]
100%|██████████| 3/3 [00:06<00:00,  6.17s/trial, best loss: 0.07314814814814818]
100%|██████████| 4/4 [00:04<00:00,  4.37s/trial, best loss: 0.07314814814814818]
100%|██████████| 5/5 [00:10<00:00, 10.63s/trial, best loss: 0.07314814814814818]
100%|██████████| 6/6 [00:03<00:00,  3.65s/trial, best loss: 0.07314814814814818]
100%|██████████| 7/7 [00:04<00:00,  4.06s/trial, best loss: 0.07314814814814818]
100%|██████████| 8/8 [00:03<00:00,  3.65s/trial, best loss: 0.07222222222222219]
100%|██████████| 9/9 [00:04<00:00,  4.46s/trial, best loss: 0.07222222222222219]
100%|██████████| 10/10 [00:06<00:00,  6.90s/trial, best loss: 0.07222222222222219]


{'learner': LinearSVC(C=0.5885398838335058, intercept_scaling=0.6329639882756152,
           max_iter=1478, multi_class='crammer_singer', random_state=2,
           tol=0.009741897651227838),
 'preprocs': (TfidfVectorizer(ngram_range=(1, 2), stop_words='english', sublinear_tf=True),),
 'ex_preprocs': ()}

In [51]:
print('test score: ', metrics.f1_score(y_test, estim.predict(X_test), average='micro'))

test score:  0.9333333333333333


## Bayesian Optimization

In [23]:
pipeline = Pipeline([
    ('vec', TfidfVectorizer(ngram_range=(1,2), sublinear_tf=True, stop_words='english')),
    ('model', LinearSVC()),
])


opt = BayesSearchCV(
    pipeline,
    {
        'model__C': (1e-6, 1e+6, 'log-uniform'),
        'model__tol': (1e-6, 1e-2, 'log-uniform'),
        'model__loss': ['hinge', 'squared_hinge'],
        'model__multi_class': ['ovr', 'crammer_singer'],
    },
    n_iter=32,
    cv=5,
    n_jobs=4,
    scoring='f1_micro',
)

opt.fit(X, y)

print('val score:', opt.best_score_)
str(opt.best_params_)



val. score: 0.9262962962962963


"OrderedDict([('model__C', 86.48231563164862), ('model__loss', 'squared_hinge'), ('model__multi_class', 'crammer_singer'), ('model__tol', 4.569176354773765e-05)])"

In [33]:
print('test score:', opt.score(X_test, y_test))

test score: 0.9316666666666665
