In [1]:
# import tensorflow as tf
import numpy as np
# import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn import svm
from sklearn.linear_model import LogisticRegression as Logistic
# from sklearn.metrics import accuracy_score, label_ranking_average_precision_score

from utils.utils import *
from utils.utils_baseline_svm import *

# from collections import Counter

# import matplotlib.pyplot as plt

# import re

In [2]:
# call for linear model
kwargs_lin_clf = nice_dict({'mk_chars': True, 
                            'model': 'linear', 
                            'char_filter': 100, 'allowed_chars': None, 
                            'mk_ngrams': False, 'ngram_width': 5, 
                            'ngram_filter': 10, 'allowed_ngrams': None, 
                            'keep_infreq_labels': False, 'label_count_thresh': 10, 
                            'valid_ratio': 0.25, 
                            'scale_func': unscale, 'to_permute': True, })

x_train, x_val, y_train, y_val = \
    data_load_preprocess(**kwargs_lin_clf)

The are 2919 observations
Sampling from allowed 82 labels
82 labels in the validation set, with
1587 potential observation to draw from.
365 observations sampled for validation
1222 observations for training
The ratio of validation to *training* is about 0.299


In [3]:
# vectorizer transforms dict into sparse matrix
v = DictVectorizer(sparse=True)

# create a sparse X matrix with character and n-grams features
X_train = v.fit_transform(x_train)
X_val = v.transform(x_val)

# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html
# v.get_feature_names()
# v.restrict()

# print(y.shape)  # pd object
print('X_train (sparse) matrix, of size {} by {} has been created.'
      .format(X_train.get_shape()[0], X_train.get_shape()[1]))  # vectorized

kwargs_svm = nice_dict({'C': 1.0,  # penalty term
                        'decision_function_shape': 'ovr',  # one-vs-rest (‘ovr’) / one-vs-one (‘ovo’) 
                        'random_state': seed(), 
                        'kernel': 'rbf', 
                        'gamma': 'auto' ,  # kernel coef for ‘rbf’, ‘poly’ and ‘sigmoid’. ‘auto’ -> 1/n_features
                        'probability': True,  # enable probability estimates 
                        'shrinking': True,  # use the shrinking heuristic 
                        'max_iter': -1  # -1 mean no limitation 
                        })

svm_clf = svm.SVC(**kwargs_svm)

print(svm_clf)

X_train (sparse) matrix, of size 1222 by 63 has been created.
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=2178, shrinking=True,
  tol=0.001, verbose=False)


In [4]:
svm_clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=2178, shrinking=True,
  tol=0.001, verbose=False)

In [5]:
pred = svm_clf.predict(X_val)

# http://scikit-learn.org/stable/modules/svm.html
# http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

# print(accuracy_score(y_val, pred))
# equivalent to:
print('Accuracy on validation set is {:.3f}'.format(svm_clf.score(X_val, y_val)))
# print(pred[:10])

Accuracy on validation set is 0.934


In [6]:
# array to hold log probabilities (takes a bit longer to calc)
pred_prob = svm_clf.predict_log_proba(X_val)
# makes y into array with the same shape as the log prob
y_val_dense = y_to_dense(y=y_val, 
                         classes=svm_clf.classes_)

In [7]:
k = 5
'{:.3f} of observations has the correct class in the top {} prediction'.format(
    in_top_k(y_val_dense, pred_prob, k), k)

'0.981 of observations has the correct class in the top 5 prediction'

In [8]:
print('Mean Reciprocal Rank is {:.3f}'.format(mean_reciprocal_rank(y_val_dense, pred_prob)))

Mean Reciprocal Rank is 0.959


In [9]:
# logistic regression
kwargs_logistic = nice_dict({'C': 1.0, 
                             'penalty': 'l2', 
                             'multi_class': 'ovr', # one-vs-rest (‘ovr’) / one-vs-one (‘ovo’) 
                             'random_state': seed(), 
                             'solver': 'newton-cg',  # ‘liblinear’ is fit to "small" data-sets, crashes kernel 
                             # solver:{‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’}, default: ‘liblinear’
                             'fit_intercept': True
                        })

logistic_clf = Logistic(**kwargs_logistic)
logistic_clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=2178, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
pred_logistic = logistic_clf.predict(X_val)
print('Accuracy on validation set is {:.3f}'.format(svm_clf.score(X_val, y_val)))

Accuracy on validation set is 0.934


In [None]:
k = 5  # for the top_k metric
# for writing the results to file
summary_path = 'SVM_hyperparameter_summary_only_BOC.csv'
file = open(summary_path,'w')

for kernel, degree in zip(
    ['linear', *['poly'] * 3, 'rbf', 'sigmoid'], 
    [3, *list(range(2, 2+3)), 3, 3]):
    for C in [0.01, 0.1, 1.0]: 
        for shrinking in [True, False]:
            kwargs_cur_params = {**kwargs_svm, 
                                 **{'kernel': kernel, 
                                    'degree': degree, 
                                    'C': C,
                                    'shrinking': shrinking}}
            
            # clearing all vars (just making sure)
            svm_cur = ''
            pred_cur = ''
            pred_prob_cur = ''
            y_val_dense_cur = ''
            cur_acc, cur_topk, cur_mrr = '', '', ''
            summary_cur_model, summary_dict = '', ''

            # create and fit model
            svm_cur = svm.SVC(**kwargs_cur_params)
            svm_cur.fit(X_train, y_train)
            # predictions
            pred_cur = svm_cur.predict(X_val)
            # log probabilities
            pred_prob_cur = svm_cur.predict_log_proba(X_val)
            # makes y into array with the same shape as the log prob
            # get dense y again, not sure if classes are always ordered the same
            y_val_dense_cur = y_to_dense(y=y_val, 
                                         classes=svm_cur.classes_)
            # collect evaluation metrics
            cur_acc, cur_topk, cur_mrr = \
                svm_cur.score(X_val, y_val), \
                in_top_k(y_val_dense_cur, pred_prob_cur, k), \
                mean_reciprocal_rank(y_val_dense, pred_prob_cur)

            summary_dict = {'Accuracy': cur_acc, 
                            'Mean Reciprocal Rank': cur_topk,
                            'Top {}'.format(k): cur_mrr}

            # create string for model params
            summary_cur_model = ' '.join(
                [':'.join([str(key),str(value)]) 
                 for key,value in svm_cur.get_params().items()
                 if key not in ['random_state', 
                                'tol', 
                                'max_iter', 
                                'cache_size', 
                                'verbose', 
                                'class_weight']])

            print('Model fitted: {}'.format(summary_cur_model))
            print('Accuracy on test set is {:.3f}'.format(cur_acc))
            print('{:.3f} of observations has the correct class in the top {} prediction'.format(
                cur_topk, k))
            print('Mean Reciprocal Rank is {:.3f}'.format(cur_mrr))

            summary_cur_model_metrics = ','.join(
                [' '.join(['{},{:.3f}'.format(key, value)]) 
                 for key,value in summary_dict.items()])

            summary_cur_model = ','.join([summary_cur_model, summary_cur_model_metrics])

            # write to file
            file.write('\n{}'.format(summary_cur_model))

file.close()