In [1]:
# import tensorflow as tf
import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score  # gt, pred

from utils.utils import user_opt_gen, nice_dict, seed, pcp1, pcp2, pcp3, pcp4

# from collections import Counter
# from math import isnan

# import matplotlib.pyplot as plt

# import re

In [2]:
# create a character:count dict
def char_freq_map(*, input_data, filter_by_chars=None, **kwargs):
    """Count how often each character occurs in the input text.

    Parameters
    ----------
    input_data : pandas.Series or str
        Either a Series whose items are strings (counts are pooled over all
        items) or a single string. Any other type yields an empty result.
    filter_by_chars : container of str, optional
        When given, only characters contained in it keep their own entry;
        every other character is tallied under the '<unk-char>' key.
        ``None`` (the default) keeps every character.
    **kwargs
        Accepted and ignored, so a shared options dict can be unpacked here.

    Returns
    -------
    The character -> count mapping, passed through ``nice_dict``.
    """
    unknown = '<unk-char>'

    # normalise both supported input kinds to "an iterable of lines" so the
    # counting loop exists only once
    if isinstance(input_data, pd.Series):
        lines = input_data
    elif isinstance(input_data, str):
        lines = [input_data]
    else:
        lines = []

    char_dict = {}
    for line in lines:
        for char in line:
            # identity test on purpose: `== None` is evaluated element-wise
            # for array-like filters and the result cannot be used in `or`
            if filter_by_chars is None or char in filter_by_chars:
                key = char
            else:
                key = unknown
            char_dict[key] = char_dict.get(key, 0) + 1
    return nice_dict(char_dict)


# create a ngram:count dict
def ngram_freq_map(*, input_data, width, filter_by_keys = None, **kwargs):
    """Count the n-grams of length ``width`` found in the input text.

    ``input_data`` may be a single string or a pandas Series of strings; for
    a Series the counts of all items are accumulated into one mapping. Any
    other input type gives an empty mapping. ``filter_by_keys`` is handed
    through unchanged to ``update_ngram_dict``; extra keyword arguments are
    accepted and ignored. The result is passed through ``nice_dict``.
    """
    if isinstance(input_data, str):
        lines = [input_data]
    elif isinstance(input_data, pd.core.series.Series):
        lines = input_data
    else:
        lines = []

    counts = {}
    for line in lines:
        counts = update_ngram_dict(line, width, counts, filter_by_keys)
    return nice_dict(counts)


# create a sliding window and update a dict with counts (default 0)
def update_ngram_dict(line, width, ngram_dict, filter_by_keys, **kwargs):
    """Add the n-gram counts of one line to ``ngram_dict``.

    Parameters
    ----------
    line : str
        Text to cut into overlapping windows via ``sliding_window``.
    width : int
        Length of each n-gram.
    ngram_dict : dict
        Running n-gram -> count mapping; it is updated in place.
    filter_by_keys : container or None
        When not ``None``, n-grams that are not contained in it are counted
        under the '<unk-ngram>' key instead of their own.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    The same ``ngram_dict`` object that was passed in.
    """
    unknown = '<unk-ngram>'
    for ngram in sliding_window(line, width):
        # identity test on purpose: `== None` is evaluated element-wise for
        # array-like filters and the result cannot be used in `or`
        if filter_by_keys is None or ngram in filter_by_keys:
            key = ngram
        else:
            key = unknown
        ngram_dict[key] = ngram_dict.get(key, 0) + 1
    return ngram_dict


def filter_dict_by_val_atleast(input_dict, value):
    """Return the entries of ``input_dict`` whose value is at least ``value``.

    The surviving key/value pairs are passed through ``nice_dict``.
    """
    kept = {}
    for key in input_dict:
        if input_dict[key] >= value:
            kept[key] = input_dict[key]
    return nice_dict(kept)


# returns a list with a sliding window
# over the string with given width
def sliding_window(input_str, width):
    """Return every contiguous slice of length ``width`` from ``input_str``.

    Slices are listed left to right, one per start position. An
    AssertionError is raised when ``input_str`` is shorter than ``width``.
    """
    total = len(input_str)
    assert total >= width, 'Cannot slide with width larger than the string!'

    windows = []
    for start in range(total - width + 1):
        windows.append(input_str[start:start + width])
    return windows


# create a joint dict for every observation in the input_data
# based on 'ngram_freq_map' and 'char_freq_map'
# enables to only select one feature type and filtering
def lin_clf_features(*, input_data,
                     mk_ngrams=None, width=None, ngram_filter=None,
                     mk_chars=None, char_filter=None,
                     **kwargs):
    """Build one feature dict per observation from n-gram and/or character counts.

    Parameters
    ----------
    input_data : sized iterable of str (e.g. a pandas Series)
        The observations; it is iterated several times and ``len()`` is used.
    mk_ngrams : bool, optional
        Create n-gram count features (via ``ngram_freq_map``).
    width : int, optional
        N-gram length; must be given when ``mk_ngrams`` is truthy.
    ngram_filter : int, optional
        If an int, only n-grams occurring at least this many times over the
        whole of ``input_data`` keep their own key; the others are counted
        under the unknown key by ``ngram_freq_map``. Any non-int value
        (including the default ``None``) disables the filter.
    mk_chars : bool, optional
        Create character count features (via ``char_freq_map``).
    char_filter : int, optional
        Same as ``ngram_filter`` but for single characters.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    list
        One ``nice_dict`` per observation, in input order, merging the
        n-gram and the character counts. When both contain the same key the
        character count wins.

    Raises
    ------
    AssertionError
        If neither feature type is selected, or n-grams are requested
        without a ``width``.

    Prints one line per selected feature type saying whether its filter is
    applied.
    """
    assert (mk_ngrams or mk_chars), 'Please select either to create n-grams or character features.'

    if mk_ngrams:
        assert width is not None, 'Please provide a width to create n-grams.'
        # filter ngrams to only those that appear at least 'ngram_filter' times in the input
        if isinstance(ngram_filter, int):
            print('N-grams filter is applied')
            # a frozenset keeps the per-n-gram membership test constant-time;
            # as a list every lookup scanned the whole vocabulary
            filter_keys_ngrams = frozenset(
                filter_dict_by_val_atleast(
                    input_dict=ngram_freq_map(input_data=input_data,
                                              width=width),
                    value=ngram_filter)
                .keys())
            # apply ngram_freq_map, after figuring out which keys to keep
            X_features_ngrams = [ngram_freq_map(input_data=obs,
                                                width=width,
                                                filter_by_keys=filter_keys_ngrams)
                                 for obs in input_data]
        # if no filter, just apply the function (for all keys)
        else:
            print('N-grams filter is NOT applied')
            X_features_ngrams = [ngram_freq_map(input_data=obs,
                                                width=width)
                                 for obs in input_data]
    else:
        # placeholder so the merge below works for every observation
        X_features_ngrams = [{} for _ in range(len(input_data))]

    if mk_chars:
        # filter by character, appear at least 'char_filter' times in the input
        if isinstance(char_filter, int):
            print('Character filter is applied')
            filter_keys_chars = frozenset(
                filter_dict_by_val_atleast(
                    input_dict=char_freq_map(input_data=input_data),
                    value=char_filter)
                .keys())
            # apply char_freq_map, after figuring out which keys to keep
            X_features_chars = [char_freq_map(input_data=obs,
                                              filter_by_chars=filter_keys_chars)
                                for obs in input_data]
        else:
            print('Character filter is NOT applied')
            X_features_chars = [char_freq_map(input_data=obs)
                                for obs in input_data]
    else:
        X_features_chars = [{} for _ in range(len(input_data))]

    # merge the two dicts of each observation (character entries win on a clash)
    return [nice_dict({**ngram_feats, **char_feats})
            for ngram_feats, char_feats in zip(X_features_ngrams, X_features_chars)]

In [3]:
if __name__ == '__main__':
    # run options come from the project helper
    user_opt = user_opt_gen()

    main_data = pd.read_csv(user_opt['data_path'], sep=';', header=0, encoding='cp850')

    # only observations with ATC labels (i.e. the ATC entry is a string)
    main_data_labeled = main_data.loc[
        main_data['ATC'].map(lambda k: isinstance(k, str)).tolist(), :]

#     atc_conversion_data = pd.read_csv(user_opt['atc_conversion_data_path'],
#                                       sep=';',
#                                       header=0,
#                                       encoding='cp850')

    # use a smaller n (e.g. 1000) for testing purposes
    n = len(main_data_labeled)

    x = main_data_labeled['FREETXT'][:n]
    y = main_data_labeled['ATC'][:n]

    # options for the per-observation feature dicts
    kwargs_lin_clf = nice_dict({
        'input_data': x,
        'width': 5,
        'mk_ngrams': True,
        'ngram_filter': 10,
        'mk_chars': True,
        'char_filter': 100,
    })
    X_features = lin_clf_features(**kwargs_lin_clf)

    # the vectorizer turns the list of feature dicts into a sparse matrix
    # with character and n-gram features
    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html
    v = DictVectorizer(sparse=True)
    X = v.fit_transform(X_features)

    print('X (sparse) matrix, of size {} by {} has been created.'
          .format(*X.shape))

    # http://scikit-learn.org/stable/modules/svm.html
    # http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
    kwargs_svm = nice_dict({
        'C': 1.0,                           # penalty term
        'decision_function_shape': 'ovr',   # one-vs-rest ('ovr') / one-vs-one ('ovo')
        'random_state': seed(),
        'kernel': 'rbf',
        'gamma': 'auto',                    # kernel coef for 'rbf', 'poly' and 'sigmoid'
        'probability': True,                # enable probability estimates
        'shrinking': True,                  # use the shrinking heuristic
        'max_iter': -1,                     # -1 means no limitation
    })
    svm_clf = svm.SVC(**kwargs_svm)
    print(svm_clf)

    svm_clf.fit(X, y)

    # predictions are made on the same X the model was fitted on, so the
    # score printed below is a training-set accuracy
    pred = svm_clf.predict(X)

    print(accuracy_score(y, pred))
    print(pred[:10])

N-grams filter is applied
Character filter is applied
X (sparse) matrix, of size 2028 by 1988 has been created.
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=2178, shrinking=True,
  tol=0.001, verbose=False)
0.214990138067
['B05B' 'A10BA02' 'B01AB04' 'V06DB' 'V06DB' 'A10BA02' 'V06DB' 'B05B'
 'B01AB04' 'V06DB']


In [4]:
# array to hold the log probabilities predicted for X (takes a bit longer to calc)
# NOTE(review): the later cell indexes svm_clf.classes_ with column positions of this
# array, so columns are presumably ordered like svm_clf.classes_ -- confirm in the sklearn docs
pred_prob = svm_clf.predict_log_proba(X)

In [5]:
# returns an array with indices of top k elements in the input_array
def top_k_ind(input_array, k):
    """Return the indices of the ``k`` largest entries, largest first.

    Parameters
    ----------
    input_array : numpy.ndarray
        Values to rank (anything offering a numpy-style ``argsort()``).
    k : int
        How many indices to return. If ``k`` exceeds the number of entries,
        all indices are returned; ``k <= 0`` gives an empty result.

    Returns
    -------
    numpy.ndarray
        Index array of length ``min(k, len(input_array))``.
    """
    # argsort is ascending, so the reversed order ranks the indices best-first
    ranked = input_array.argsort()[::-1]
    # slicing from the front avoids the `[-0:]` trap, which selects everything
    return ranked[:max(k, 0)]


# returns an array with probabilities of top k elements (log likelihood) in the input_array
def top_k_prob(input_array, k):
    """Return the exponentiated values of the ``k`` largest entries, largest first.

    ``input_array`` holds log-scale values; the top ``k`` positions are
    located with ``top_k_ind`` and mapped back with ``np.exp``.
    """
    best_positions = top_k_ind(input_array, k)
    best_log_values = input_array[best_positions]
    return np.exp(best_log_values)

In [6]:
# prototype: ranked list of the most likely labels for one observation (SVM)
test_obs = pred_prob[0]
test_k = 3

# indices of the k largest log-probabilities, best first
top_ind = top_k_ind(test_obs, test_k)
print('Indecies of top entries are {}'.format(top_ind))

# probabilities belonging to those indices
print('Corresponding probabilities are {}'.format(top_k_prob(test_obs, test_k)))

# class labels belonging to those indices
print('Corresponding labels are {}'.format(svm_clf.classes_[top_ind]))

# (label, probability) pairs, best first
print(list(zip(svm_clf.classes_[top_ind], top_k_prob(test_obs, test_k))))

Indecies of top entries are [103 248 379]
Corresponding probabilities are [ 0.15474735  0.01789337  0.01780771]
Corresponding labels are ['B05B' 'H02AB07' 'L04AD01']
[('B05B', 0.15474734918011418), ('H02AB07', 0.017893367158681218), ('L04AD01', 0.017807706617362012)]
