In [1]:
# import tensorflow as tf
import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn import svm
# from sklearn.metrics import accuracy_score, label_ranking_average_precision_score

from utils.utils import init_data, init_data_suggest, train_validation_split
from utils.utils_baseline_svm import *

# from collections import Counter

# import matplotlib.pyplot as plt

# import re

In [2]:
def y_to_dense(y, classes, dtype=float):
    """One-hot encode each label in `y` against the ordering of `classes`.

    Parameters
    ----------
    y : indexable sequence of labels; read as y[0] ... y[len(y) - 1].
    classes : array-like of the known class labels (e.g. a fitted
        estimator's ``classes_``). Lists and tuples are accepted as well
        as ndarrays.
    dtype : dtype of the returned indicator arrays (default ``float``).

    Returns
    -------
    list of 1-D arrays, one per label, each of length ``len(classes)``,
    holding 1 where the label equals the class and 0 elsewhere.
    """
    # Coerce once, outside the loop: comparing a scalar with a plain list
    # yields a single bool (which has no .astype), whereas comparing with
    # an ndarray is element-wise.
    class_array = np.asarray(classes)
    return [(y[i] == class_array).astype(dtype)
            for i in range(len(y))]

def get_rank_order(input_array):
    """Rank the entries of a 1-D array in descending order.

    The largest entry gets rank 1 and the smallest gets rank
    ``len(input_array)``. The result is aligned with the input, i.e.
    ``result[i]`` is the rank of ``input_array[i]``.
    """
    n_items = len(input_array)
    ascending_idx = input_array.argsort()
    # ascending_pos[i] = position of element i in ascending order (0 = smallest)
    ascending_pos = np.empty(n_items, int)
    ascending_pos[ascending_idx] = np.arange(n_items)
    # flip so that the largest element ends up with rank 1
    return n_items - ascending_pos

def in_top_k(y_dense, log_pred, k):
    """Fraction of observations whose true class is ranked within the top k.

    `y_dense[i]` is the indicator vector of the true class of observation i
    and `log_pred[i]` holds its per-class scores; both must have the same
    length. The true class is taken as the argmax of the indicator vector.
    """
    assert len(y_dense) == len(log_pred), 'y and predictions are not of same length'
    hits = []
    for row in range(len(y_dense)):
        true_class = np.argmax(y_dense[row])
        # rank 1 is the highest-scoring class
        hits.append(get_rank_order(log_pred[row])[true_class] <= k)
    return np.mean(hits)

def mean_reciprocal_rank(y_dense, log_pred):
    """Mean of 1 / rank of the true class over all observations.

    `y_dense[i]` is the indicator vector of the true class of observation i
    and `log_pred[i]` holds its per-class scores; both must have the same
    length. Rank 1 is the highest-scoring class.
    """
    assert len(y_dense) == len(log_pred), 'y and predictions are not of same length'
    reciprocal_ranks = []
    for row in range(len(y_dense)):
        true_class = np.argmax(y_dense[row])
        reciprocal_ranks.append(1.0 / get_rank_order(log_pred[row])[true_class])
    return np.mean(reciprocal_ranks)

In [3]:
# load the main (original) CSV data and materialise x as a plain list
x, y, _, _ = init_data()
x = list(x)
# load the suggestions CSV data
x_suggest, y_suggest, _ = init_data_suggest()

# merged data: main observations followed by the suggestions
x_merge = x + x_suggest
y_merge = y + y_suggest

# settings shared by feature extraction and the train/validation split
kwargs_lin_clf = nice_dict({
    'width': 5,
    # NOTE: n-gram features are switched off (mk_ngrams is False)
    'mk_ngrams': False,
    'ngram_filter': 10,
    'filter_keys_ngrams': None,
    'mk_chars': True,
    'char_filter': 100,
    'filter_keys_chars': None,
    'label_count_thresh': 10,  # threshold for train-test split
    'valid_ratio': 0.25,  # ratio for train-test split
    'keep_rare_labels': False,
})

# features for the main data; the filters that come back are reused below
x_features, filter_ngrams, filter_chars = lin_clf_features(
    **{**kwargs_lin_clf, 'input_data': x})

# features for the suggestions, passing in the filters obtained from the main data
x_suggest_features, _, _ = lin_clf_features(
    **{**kwargs_lin_clf,
       'input_data': x_suggest,
       'filter_keys_ngrams': filter_ngrams,
       'filter_keys_chars': filter_chars})

x_merge_features = x_features + x_suggest_features

Character filter is applied
Character filter is applied
Character filter has been detected, using those keys as filters.


In [4]:
# split the merged features/labels; every observation gets a frequency of 1
(x_val, x_train, y_val, y_train,
 _, _, _, statistics_dict) = train_validation_split(
    x=x_merge_features,
    y=y_merge,
    freq=[1] * len(y_merge),
    label_count_thresh=kwargs_lin_clf.label_count_thresh,
    valid_ratio=kwargs_lin_clf.valid_ratio,
    keep_rare_labels=kwargs_lin_clf.keep_rare_labels)

The are 2919 observations
Sampling from allowed 82 labels
82 labels in the validation set, with
1587 potential observation to draw from.
365 observations sampled for validation
1222 observations for training
The ratio of validation to *training* is about 0.299


In [5]:
# DictVectorizer turns each feature dict into one row of a sparse matrix
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html
v = DictVectorizer(sparse=True)

# fit the feature vocabulary on the training rows, then reuse it for validation
X_train = v.fit_transform(x_train)
X_val = v.transform(x_val)

print('X_train (sparse) matrix, of size {} by {} has been created.'
      .format(*X_train.get_shape()))  # (rows, columns) of the vectorized data

# baseline SVC settings
kwargs_svm = nice_dict({
    'C': 1.0,  # penalty term
    'decision_function_shape': 'ovr',  # one-vs-rest ('ovr') / one-vs-one ('ovo')
    'random_state': seed(),
    'kernel': 'rbf',
    'gamma': 'auto',  # kernel coef for 'rbf', 'poly' and 'sigmoid'. 'auto' -> 1/n_features
    'probability': True,  # enable probability estimates
    'shrinking': True,  # use the shrinking heuristic
    'max_iter': -1,  # -1 means no limitation
})

svm_clf = svm.SVC(**kwargs_svm)

print(svm_clf)

X_train (sparse) matrix, of size 1222 by 63 has been created.
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=2178, shrinking=True,
  tol=0.001, verbose=False)


In [6]:
# Fit the SVC on the vectorized training rows. As the last expression of the
# cell, the value returned by fit() is echoed as the cell output.
svm_clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=2178, shrinking=True,
  tol=0.001, verbose=False)

In [7]:
# class predictions for the validation rows
pred = svm_clf.predict(X_val)

# references:
#   http://scikit-learn.org/stable/modules/svm.html
#   http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

# SVC.score gives the mean accuracy, equivalent to accuracy_score(y_val, pred)
print('Accuracy on test set is {:.3f}'.format(
    svm_clf.score(X=X_val, y=y_val)))

Accuracy on test set is 0.934


In [8]:
# per-class log probabilities for the validation rows (takes a bit longer to calc)
pred_prob = svm_clf.predict_log_proba(X_val)
# encode y_val against the fitted class order, so each entry has one slot
# per class, matching a row of the log probabilities
y_val_dense = y_to_dense(y_val, svm_clf.classes_)

In [9]:
k = 5  # cut-off for the top-k metric
# last expression of the cell, so the formatted string is shown as the output
'{:.3f} of observations has the correct class in the top {} prediction'.format(
    in_top_k(y_dense=y_val_dense, log_pred=pred_prob, k=k),
    k)

'0.981 of observations has the correct class in the top 5 prediction'

In [10]:
# mean reciprocal rank of the true class for the baseline model
print('Mean Reciprocal Rank is {:.3f}'.format(
    mean_reciprocal_rank(y_dense=y_val_dense, log_pred=pred_prob)))

Mean Reciprocal Rank is 0.959


In [11]:
k = 5  # for the top_k metric
# for writing the results to file
summary_path = 'SVM_hyperparameter_summary_only_BOC.csv'

# (kernel, degree) combinations to try: 'poly' is run with degrees 2, 3 and 4,
# every other kernel is paired with degree 3
kernel_degree_grid = [('linear', 3),
                      ('poly', 2), ('poly', 3), ('poly', 4),
                      ('rbf', 3),
                      ('sigmoid', 3)]

# estimator params left out of the printed / written model description
params_not_reported = ['random_state',
                       'tol',
                       'max_iter',
                       'cache_size',
                       'verbose',
                       'class_weight']

# the context manager closes the file even if one of the fits raises
with open(summary_path, 'w') as file:
    for kernel, degree in kernel_degree_grid:
        for C in [0.01, 0.1, 1.0]:
            for shrinking in [True, False]:
                # baseline settings with the swept params overridden
                kwargs_cur_params = {**kwargs_svm,
                                     'kernel': kernel,
                                     'degree': degree,
                                     'C': C,
                                     'shrinking': shrinking}

                # create and fit model
                svm_cur = svm.SVC(**kwargs_cur_params)
                svm_cur.fit(X_train, y_train)
                # predictions
                pred_cur = svm_cur.predict(X_val)
                # log probabilities
                pred_prob_cur = svm_cur.predict_log_proba(X_val)
                # rebuild dense y from *this* model's classes_ so it is
                # guaranteed to line up with pred_prob_cur
                y_val_dense_cur = y_to_dense(y=y_val,
                                             classes=svm_cur.classes_)

                # collect evaluation metrics; every metric uses the current
                # model's dense labels (never the baseline's y_val_dense)
                cur_acc = svm_cur.score(X_val, y_val)
                cur_topk = in_top_k(y_val_dense_cur, pred_prob_cur, k)
                cur_mrr = mean_reciprocal_rank(y_val_dense_cur, pred_prob_cur)

                # (label, value) pairs in the order they are written out;
                # each label names the metric it actually holds
                summary_metrics = [('Accuracy', cur_acc),
                                   ('Top {}'.format(k), cur_topk),
                                   ('Mean Reciprocal Rank', cur_mrr)]
                summary_dict = dict(summary_metrics)

                # create string for model params
                summary_cur_model = ' '.join(
                    [':'.join([str(key), str(value)])
                     for key, value in svm_cur.get_params().items()
                     if key not in params_not_reported])

                print('Model fitted: {}'.format(summary_cur_model))
                print('Accuracy on test set is {:.3f}'.format(cur_acc))
                print('{:.3f} of observations has the correct class in the top {} prediction'.format(
                    cur_topk, k))
                print('Mean Reciprocal Rank is {:.3f}'.format(cur_mrr))

                summary_cur_model_metrics = ','.join(
                    ['{},{:.3f}'.format(name, value)
                     for name, value in summary_metrics])

                summary_cur_model = ','.join([summary_cur_model, summary_cur_model_metrics])

                # write to file (one line per fitted model)
                file.write('\n{}'.format(summary_cur_model))

Model fitted: kernel:linear degree:3 gamma:auto shrinking:True C:0.01 decision_function_shape:ovr probability:True coef0:0.0
Accuracy on test set is 0.874
0.967 of observations has the correct class in the top 5 prediction
Mean Reciprocal Rank is 0.942
Model fitted: kernel:linear degree:3 gamma:auto shrinking:False C:0.01 decision_function_shape:ovr probability:True coef0:0.0
Accuracy on test set is 0.874
0.967 of observations has the correct class in the top 5 prediction
Mean Reciprocal Rank is 0.942
Model fitted: kernel:linear degree:3 gamma:auto shrinking:True C:0.1 decision_function_shape:ovr probability:True coef0:0.0
Accuracy on test set is 0.959
0.986 of observations has the correct class in the top 5 prediction
Mean Reciprocal Rank is 0.962
Model fitted: kernel:linear degree:3 gamma:auto shrinking:False C:0.1 decision_function_shape:ovr probability:True coef0:0.0
Accuracy on test set is 0.959
0.986 of observations has the correct class in the top 5 prediction
Mean Reciprocal Ra