In [None]:
# import tensorflow as tf
import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score  # gt, pred

from utils.utils import user_opt_gen, nice_dict, seed, init_data, pcp1, pcp2, pcp3, pcp4
from utils.utils_baseline_svm import *

# from collections import Counter

# import matplotlib.pyplot as plt

# import re

In [None]:
if __name__ == '__main__':
    # Baseline pipeline: load the data, build n-gram / character features,
    # vectorize them, fit an RBF-kernel SVM and report how well it reproduces
    # the labels it was fitted on.

    x, y, n, _ = init_data()

    # Options forwarded as keyword arguments to lin_clf_features.
    # NOTE(review): nice_dict and lin_clf_features are project helpers whose
    # exact semantics are not visible here -- confirm in utils.
    kwargs_lin_clf = nice_dict({
        'input_data': x,
        'width': 5,
        'mk_ngrams': True,
        'ngram_filter': 10,
        'filter_keys_ngrams': None,
        'mk_chars': True,
        'char_filter': 100,
        'filter_keys_chars': None,
    })
    X_features, _, _ = lin_clf_features(**kwargs_lin_clf)

    # DictVectorizer converts the feature dicts into a single sparse matrix X
    # holding the character and n-gram features.
    # Docs: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html
    # (v.get_feature_names() and v.restrict() are useful for inspection.)
    v = DictVectorizer(sparse=True)
    X = v.fit_transform(X_features)

    # y is a pandas object according to the original author's note;
    # print(y.shape) to inspect it.
    print('X (sparse) matrix, of size {} by {} has been created.'
          .format(*X.shape))

    # SVC hyper-parameters.
    # Docs: http://scikit-learn.org/stable/modules/svm.html
    #       http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
    kwargs_svm = nice_dict({
        'C': 1.0,                          # penalty term
        'decision_function_shape': 'ovr',  # one-vs-rest ('ovr') or one-vs-one ('ovo')
        'random_state': seed(),
        'kernel': 'rbf',
        'gamma': 'auto',                   # kernel coef for 'rbf', 'poly', 'sigmoid'; 'auto' -> 1/n_features
        'probability': True,               # enable probability estimates
        'shrinking': True,                 # use the shrinking heuristic
        'max_iter': -1,                    # -1 means no iteration limit
    })
    svm_clf = svm.SVC(**kwargs_svm)
    print(svm_clf)

    svm_clf.fit(X, y)

    # The model predicts on the very matrix it was trained on, so the score
    # printed below is training-set accuracy, not a held-out estimate.
    pred = svm_clf.predict(X)
    print(accuracy_score(y, pred))
    print(pred[:10])

In [None]:
if __name__ == '__main__':
    # Per-class log-probabilities for every row of X, produced by the SVC
    # fitted earlier in this notebook (the classifier was built with
    # probability=True). This takes a bit longer to compute than predict().
    # Columns are indexed below with svm_clf.classes_, i.e. column j is
    # treated as belonging to svm_clf.classes_[j].
    pred_prob = svm_clf.predict_log_proba(X)

    # returns an array with the indices of the top k elements of input_array, largest first
    def top_k_ind(input_array, k):
        """Return the indices of the k largest entries of input_array, largest first.

        Parameters
        ----------
        input_array : numpy.ndarray
            Array of scores; expected to be 1-D (it must support ``argsort``
            and slicing).
        k : int
            Number of indices wanted. When k exceeds the array length, every
            index is returned.

        Returns
        -------
        numpy.ndarray
            Integer indices ordered from the highest value to the lowest.
            Empty when k <= 0.
        """
        order = input_array.argsort()  # positions sorted by ascending value
        if k <= 0:
            # order[-0:] is the whole array, so "top 0" must be special-cased;
            # a negative k is treated the same way instead of slicing oddly.
            return order[:0]
        return order[-k:][::-1]


    # returns the probabilities (exp of the log-probabilities) of the top k elements of input_array
    def top_k_prob(input_array, k):
        """Return exp() of the k largest entries of input_array, largest first.

        The positions are chosen by ``top_k_ind``; the selected values are then
        exponentiated, so log-probabilities in give probabilities out.

        Parameters
        ----------
        input_array : numpy.ndarray
            Array of log-scores to pick from.
        k : int
            Number of entries to return.

        Returns
        -------
        numpy.ndarray
            ``np.exp`` of the selected entries, in the order given by
            ``top_k_ind``.
        """
        best_positions = top_k_ind(input_array, k)
        selected_logs = input_array[best_positions]
        return np.exp(selected_logs)

    # Prototype: print a ranked list of SVM predictions for one observation.
    test_obs = pred_prob[0, :]  # log-probabilities in the first row of pred_prob
    test_k = 3                  # how many top-ranked classes to show

    # Column indices of the test_k highest log-probabilities, best first.
    top_ind = top_k_ind(test_obs, test_k)
    print('Indecies of top entries are {}'.format(top_ind))

    # Probabilities for those columns, computed once and reused below.
    top_probs = top_k_prob(test_obs, test_k)
    print('Corresponding probabilities are {}'.format(top_probs))

    # Class labels looked up through the classifier's classes_ attribute.
    top_labels = svm_clf.classes_[top_ind]
    print('Corresponding labels are {}'.format(top_labels))

    # (label, probability) pairs, best first.
    ranked = list(zip(top_labels, top_probs))
    print(ranked)