## Imports

In [None]:
import sys
import os

from sklearn import metrics
import pandas as pd
from skopt import BayesSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from hyperopt import tpe
from hpsklearn import HyperoptEstimator, linear_svc

In [None]:
import warnings
# Install a blanket "ignore" filter so no warnings are shown in this process.
warnings.filterwarnings("ignore")
# Only when no warning options were supplied to the interpreter: install the
# simple "ignore" filter too and export the setting through the environment.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses

## Data loading

In [None]:
def read_file(filename: str) -> pd.DataFrame:
    """Load a whitespace-delimited dataset file into a DataFrame.

    Every non-blank line is split on whitespace. The first token becomes
    ``class``, the second becomes ``sent``, the third token is discarded,
    and all remaining tokens are re-joined with single spaces as ``text``.

    Args:
        filename: Path of the file to read.

    Returns:
        A DataFrame with the columns ``class``, ``sent`` and ``text``,
        holding one row per non-blank line in file order.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-blank line contains fewer than two tokens.
    """
    rows = []
    # The context manager guarantees the handle is closed, even if a
    # malformed line raises part-way through the file.
    with open(filename) as handle:
        for line in handle:
            tokens = line.split()  # split once, reuse for all three fields
            if not tokens:
                # Whitespace-only line (e.g. a trailing blank line): no record.
                continue
            rows.append((tokens[0], tokens[1], ' '.join(tokens[3:])))
    return pd.DataFrame(rows, columns=['class', 'sent', 'text'])

In [None]:
# Parse the train and test files; the paths are relative to the current
# working directory.
df_train = read_file('../datasets/train.txt')
df_test = read_file('../datasets/test.txt')

# Trailing expression of the cell: the (train, test) row counts.
len(df_train), len(df_test)

In [None]:
# Training inputs (the 'text' column) and targets (the 'class' column),
# extracted from the DataFrame as arrays via `.values`.
X = df_train['text'].values
y = df_train['class'].values

In [None]:
# Held-out inputs and targets, taken from the same columns as the training
# arrays so both splits are shaped alike.
X_test = df_test['text'].values
y_test = df_test['class'].values

## Hyperopt

In [None]:
# Hyperparameter search with hyperopt-sklearn: a linear SVC search space fed
# by a fixed TF-IDF vectorizer, explored with the TPE suggestion algorithm.
# NOTE(review): presumably `trial_timeout` is in seconds per trial — confirm
# against the installed hpsklearn version.
estim = HyperoptEstimator(
    classifier=linear_svc('model'),
    preprocessing=[TfidfVectorizer(ngram_range=(1,2), sublinear_tf=True, stop_words='english')],
    algo=tpe.suggest,
    trial_timeout=300,
    n_jobs=4,
)

# Run the search on the training arrays.
estim.fit(X, y)

# Trailing expression of the cell: the best model found by the search.
estim.best_model()

In [None]:
# Micro-averaged F1 of the Hyperopt estimator's predictions on the held-out set.
print('test score: ', metrics.f1_score(y_test, estim.predict(X_test), average='micro'))

## Bayesian Optimization

In [None]:
# Two-step pipeline: TF-IDF features (same vectorizer settings as the
# Hyperopt section) followed by a linear SVM. The step names 'vec' and
# 'model' are what the `model__*` search-space keys below refer to.
pipeline = Pipeline([
    ('vec', TfidfVectorizer(ngram_range=(1,2), sublinear_tf=True, stop_words='english')),
    ('model', LinearSVC()),
])


# Bayesian optimisation over the LinearSVC hyperparameters: 32 sampled
# configurations, each scored by 5-fold cross-validated micro-F1.
# NOTE(review): no random_state is passed, so results presumably vary
# between runs — confirm whether reproducibility matters here.
opt = BayesSearchCV(
    pipeline,
    {
        # Continuous ranges sampled on a log scale.
        'model__C': (1e-6, 1e+6, 'log-uniform'),
        'model__tol': (1e-6, 1e-2, 'log-uniform'),
        # Categorical choices.
        'model__loss': ['hinge', 'squared_hinge'],
        'model__multi_class': ['ovr', 'crammer_singer'],
    },
    n_iter=32,
    cv=5,
    n_jobs=4,
    scoring='f1_micro',
)

# Run the search on the training arrays.
opt.fit(X, y)

# Best cross-validated score, then (as the cell's trailing expression) the
# winning parameter set rendered as a string.
print('val score:', opt.best_score_)
str(opt.best_params_)

In [None]:
# Score the fitted search object on the held-out set.
# NOTE(review): presumably this uses the 'f1_micro' scorer configured on the
# search — confirm against the BayesSearchCV documentation.
print('test score:', opt.score(X_test, y_test))