In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn import svm
from sklearn.linear_model import LogisticRegression as Logistic
# from sklearn.metrics import accuracy_score, label_ranking_average_precision_score

from utils.utils import *
from utils.utils_baseline_svm import *

from collections import Counter, OrderedDict

In [2]:
# Options forwarded to data_load_preprocess for the linear-model pipeline.
kwargs_lin_clf = nice_dict(dict(
    mk_chars=True,
    model='linear',
    char_filter=10, allowed_chars=None,
    mk_ngrams=True, ngram_width=5,
    ngram_filter=10, allowed_ngrams=None,
    keep_infreq_labels=False, label_count_thresh=10,
    valid_ratio=0.25,
    scale_func=unscale, to_permute=True,
))

# DictVectorizer maps per-observation feature dicts onto a sparse matrix.
v = DictVectorizer(sparse=True)

# Load the data and split it into training / validation parts; the fifth
# return value is kept as `allowed_ngrams` for later cells.
(x_train, x_val,
 y_train, y_val,
 allowed_ngrams) = data_load_preprocess(**kwargs_lin_clf)

The are 2919 observations
Sampling from allowed 82 labels
82 labels in the validation set, with
1587 potential observation to draw from.
365 observations sampled for validation
1222 observations for training
The ratio of validation to *training* is about 0.299


In [4]:
# Display the first training observation to sanity-check the feature representation.
x_train[0]

Counter({' ': 1,
         ' 0.9%': 1,
         '%': 1,
         '.': 1,
         '0': 1,
         '9': 1,
         'C': 1,
         'Cl 0.': 1,
         'N': 1,
         'NaCl ': 1,
         'a': 1,
         'aCl 0': 1,
         'l': 1,
         'l 0.9': 1})

In [5]:
# Call data_load_preprocess again with the same options plus
# linear_counters=False. The fifth return value is discarded, so the
# `allowed_ngrams` obtained from the first load stays in place.
# NOTE: this rebinds x_train / x_val / y_train / y_val from the earlier cell.
x_train, x_val, y_train, y_val, _ = data_load_preprocess(
    **{**kwargs_lin_clf, 'linear_counters': False})

The are 2919 observations
Sampling from allowed 82 labels
82 labels in the validation set, with
1587 potential observation to draw from.
365 observations sampled for validation
1222 observations for training
The ratio of validation to *training* is about 0.299


In [6]:
# Preview what keep_first_k_chars produces for the training lines with k=40,
# reusing the n-gram settings from kwargs_lin_clf and the allowed n-grams from
# the first data load.
# The result is only displayed, not assigned.
# NOTE(review): whether keep_first_k_chars mutates x_train in place cannot be
# told from this notebook — confirm in utils.
keep_first_k_chars(input=x_train, k=40, 
                   model='linear', 
                   ngram_width=kwargs_lin_clf.ngram_width, 
                   mk_ngrams=kwargs_lin_clf.mk_ngrams, 
                   allowed_ngrams=allowed_ngrams)

[Counter({' ': 1,
          ' 0.9%': 2,
          '%': 1,
          '.': 1,
          '0': 1,
          '9': 1,
          '<unk-ngram>': 5,
          'C': 1,
          'Cl 0.': 2,
          'N': 1,
          'NaCl ': 2,
          'a': 1,
          'aCl 0': 2,
          'l': 1,
          'l 0.9': 2}),
 Counter({' ': 6,
          ' (Fil': 1,
          ' / Pa': 1,
          ' 1 g)': 1,
          ' Para': 1,
          '(': 1,
          '(Film': 1,
          ')': 1,
          ') / P': 1,
          '/': 1,
          '/ Par': 1,
          '0': 1,
          '1': 2,
          '1 g) ': 1,
          '<unk-ngram>': 3,
          'D': 1,
          'Dafal': 1,
          'F': 1,
          'Filmt': 1,
          'P': 1,
          'Parac': 1,
          'a': 7,
          'abl 1': 1,
          'aceta': 1,
          'afalg': 1,
          'algan': 1,
          'amol ': 1,
          'an (F': 1,
          'arace': 1,
          'b': 1,
          'bl 1 ': 1,
          'c': 1,
          'cetam': 1,
          'e':

In [None]:
# Scratch cell: builds a per-line representation by hand from the first k
# elements of each line plus its filtered n-grams.
ngram_width = 5
k = 80
unknown_ngram = '<unk-ngram>'

# Each line representation is wrapped in a Counter
# (swap in an identity function here to inspect the raw lists instead).
func = Counter


def filter_join_sliding_window(x, allowed_ngrams):
    """Return the n-grams of x[:k], replacing disallowed ones with `unknown_ngram`.

    The n-grams are whatever join_sliding_window yields for the first `k`
    elements of `x` with window size `ngram_width`; `k`, `ngram_width` and
    `unknown_ngram` are read from the notebook globals at call time.
    """
    windows = join_sliding_window(x[:k], ngram_width)
    return [window if window in allowed_ngrams else unknown_ngram
            for window in windows]


def line_rep(x):
    """Return the first `k` elements of x followed by its filtered n-grams.

    Uses the notebook-global `allowed_ngrams` (to see the elements alone,
    return just x[:k]).
    """
    return x[:k] + filter_join_sliding_window(x, allowed_ngrams)


[func(line_rep(observation)) for observation in x_train]

In [None]:
# Number of '<unk-char>' entries across all training lines.
Counter(
    char
    for line in x_train
    for char in line
)['<unk-char>']

In [None]:
# Fit the vectorizer on the training observations, then encode the validation
# observations with that same fitted vectorizer.
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html
X_train = v.fit_transform(x_train)
X_val = v.transform(x_val)

print('X_train (sparse) matrix, of size {} by {} has been created.'
      .format(*X_train.shape))  # rows, columns of the vectorized data

# Baseline SVC configuration; later cells reuse and override these values.
kwargs_svm = nice_dict(dict(
    C=0.01,                         # penalty term
    decision_function_shape='ovr',  # one-vs-rest (‘ovr’) / one-vs-one (‘ovo’)
    random_state=seed(),
    kernel='linear',
    gamma='auto',                   # kernel coef for ‘rbf’, ‘poly’ and ‘sigmoid’. ‘auto’ -> 1/n_features
    probability=True,               # enable probability estimates
    shrinking=True,                 # use the shrinking heuristic
    max_iter=-1,                    # -1 means no limit
))

svm_clf = svm.SVC(**kwargs_svm)

print(svm_clf)

In [None]:
# Train the baseline SVC (configured via kwargs_svm) on the vectorized training set.
svm_clf.fit(X_train, y_train)

In [None]:
# Hard class predictions for the validation set.
# http://scikit-learn.org/stable/modules/svm.html
# http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
pred = svm_clf.predict(X_val)

# SVC.score reports mean accuracy, i.e. the same number accuracy_score(y_val, pred) would give.
print(
    'Accuracy on validation set is {:.3f}'
    .format(svm_clf.score(X_val, y_val))
)

# Per-class log-probabilities (takes a bit longer to compute than predict).
pred_prob = svm_clf.predict_log_proba(X_val)

# Re-express y_val via y_to_dense, using the classifier's class order, so it
# can be compared against pred_prob by the ranking metrics.
y_val_dense = y_to_dense(y=y_val, classes=svm_clf.classes_)

In [None]:
k = 5  # rank cut-off for the top-k metric

# A notebook only echoes the final expression of a cell, so the top-k line
# must be printed explicitly; as a bare string it was silently discarded.
print('{:.3f} of observations has the correct class in the top {} prediction'.format(
    in_top_k(y_val_dense, pred_prob, k), k))

print('Mean Reciprocal Rank is {:.3f}'.format(mean_reciprocal_rank(y_val_dense, pred_prob)))

In [None]:
# Logistic-regression baseline trained on the same vectorized features.
kwargs_logistic = nice_dict(dict(
    C=1.0,
    penalty='l2',
    # One-vs-rest: one binary problem per label. For LogisticRegression the
    # documented alternative is 'multinomial' (there is no 'ovo' option here).
    # NOTE(review): recent scikit-learn releases deprecate `multi_class` —
    # confirm against the installed version.
    multi_class='ovr',
    random_state=seed(),
    # ‘liblinear’ is meant for "small" data-sets and crashed the kernel here.
    # Documented solvers include ‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’.
    solver='newton-cg',
    fit_intercept=True,
))

logistic_clf = Logistic(**kwargs_logistic)
logistic_clf.fit(X_train, y_train)

In [None]:
# Hard class predictions of the logistic-regression model on the validation set.
pred_logistic = logistic_clf.predict(X_val)
# Score the logistic model itself; this line previously called svm_clf.score
# and therefore re-reported the SVM accuracy.
print('Accuracy on validation set is {:.3f}'.format(logistic_clf.score(X_val, y_val)))

In [None]:
# Section marker for the hyper-parameter sweep in the following cell; this
# bare string has no effect beyond being echoed as the cell's output.
"""
Train multiple classifiers with different hyper-parameters.
"""

In [None]:
k = 5  # for the top_k metric

# for writing the results to file
summary_path = 'SVM_hyperparameter_summary_WITH_NGRAMS_near_NO_FILTER.csv'
# summary_path = 'SVM_hyperparameter_summary_only_BOC.csv'
# summary_path = 'SVM_hyperparameter_summary_WITH_NGRAMS.csv'

# One summary row (an OrderedDict) per fitted model.
summary_to_file = []

# get_params() entries that are left out of the summary table.
excluded_params = ('class_weight', 'cache_size', 'max_iter',
                   'random_state', 'tol', 'verbose')

# (kernel, degree) combinations to try; 'poly' is swept over degrees 2-4,
# the other kernels are paired with degree 3.
kernel_degree_pairs = [('linear', 3),
                       ('poly', 2), ('poly', 3), ('poly', 4),
                       ('rbf', 3),
                       ('sigmoid', 3)]

for kernel, degree in kernel_degree_pairs:
    for C in [0.01, 0.1, 1.0]:
        for shrinking in [True, False]:
            # Start from the baseline SVC settings and override the swept ones.
            kwargs_cur_params = {**kwargs_svm,
                                 'kernel': kernel,
                                 'degree': degree,
                                 'C': C,
                                 'shrinking': shrinking}

            # create and fit model
            svm_cur = svm.SVC(**kwargs_cur_params)
            svm_cur.fit(X_train, y_train)

            # log probabilities, one column per entry of svm_cur.classes_
            pred_prob_cur = svm_cur.predict_log_proba(X_val)
            # Rebuild the dense y for THIS model's class order and use it for
            # every ranking metric (MRR previously used the stale y_val_dense
            # left over from the baseline cell).
            y_val_dense_cur = y_to_dense(y=y_val,
                                         classes=svm_cur.classes_)

            # collect evaluation metrics
            cur_acc = svm_cur.score(X_val, y_val)
            cur_topk = in_top_k(y_val_dense_cur, pred_prob_cur, k)
            cur_mrr = mean_reciprocal_rank(y_val_dense_cur, pred_prob_cur)

            # Hyper-parameters sorted by name, followed by the metrics. Each
            # label now carries its own value (MRR and top-k were swapped).
            # Loop variables are named so they do not shadow `k` or the
            # vectorizer `v`.
            param_items = sorted(
                (name, value)
                for name, value in svm_cur.get_params().items()
                if name not in excluded_params)
            metric_items = [
                ('Accuracy', '{:.3f}'.format(cur_acc)),
                ('Mean Reciprocal Rank', '{:.3f}'.format(cur_mrr)),
                ('Top {}'.format(k), '{:.3f}'.format(cur_topk)),
            ]
            summary_to_file.append(OrderedDict(param_items + metric_items))

            print('Model fitted: {}'.format(svm_cur))
            # The model is scored on the validation split (X_val / y_val).
            print('Accuracy on validation set is {:.3f}'.format(cur_acc))
            print('{:.3f} of observations has the correct class in the top {} prediction'.format(
                cur_topk, k))
            print('Mean Reciprocal Rank is {:.3f}'.format(cur_mrr))

In [None]:
# One row per fitted model, ordered best-first by the
# 'Mean Reciprocal Rank' column (original row labels are kept).
results_df = (
    pd.DataFrame(summary_to_file)
    .sort_values(by='Mean Reciprocal Rank', ascending=False)
)

results_df

In [None]:
# save to file
# Writes the summary DataFrame as CSV to summary_path.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# written as an extra first column — confirm that is wanted.
results_df.to_csv(summary_path)