In [1]:
# import tensorflow as tf
import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score  # gt, pred

from utils import utils

from collections import Counter
# from math import isnan

# import matplotlib.pyplot as plt

# import re

In [2]:
# options supplied by the project helper, including the input paths used below
user_opt = utils.user_opt_gen()

# main data set: ';'-separated, header in the first row, cp850 encoded
main_data = pd.read_csv(
    user_opt['data_path'], sep=';', header=0, encoding='cp850')

# only observations with ATC labels, i.e. rows whose 'ATC' entry is a string
# (presumably missing labels are read as NaN -- confirm against the data)
main_data_labeled = main_data.loc[
    list(map(lambda atc: isinstance(atc, str), main_data['ATC'])), :]

# second table, read with the same CSV settings as the main data
# NOTE(review): presumably an ATC code conversion table -- confirm
atc_conversion_data = pd.read_csv(
    user_opt['atc_conversion_data_path'], sep=';', header=0, encoding='cp850')

In [3]:
# create a character:count dict
def char_freq_map(input_data):
    """Count how often each character occurs in the input.

    Parameters
    ----------
    input_data : pandas.Series or str
        Either a single string or a Series of strings. Non-string entries
        inside a Series (e.g. NaN for a missing text) are skipped, which
        matches the empty result returned for a non-string scalar.

    Returns
    -------
    dict
        Mapping ``character -> number of occurrences``. Empty when
        ``input_data`` is neither a Series nor a string.
    """
    counts = Counter()
    if isinstance(input_data, pd.Series):
        for line in input_data:
            # a float NaN (or any other non-string) has no characters to count
            if isinstance(line, str):
                # Counter.update on a string tallies its individual characters
                counts.update(line)
    elif isinstance(input_data, str):
        counts.update(input_data)
    # hand back a plain dict so callers get the same type as before
    return dict(counts)


# create a ngram:count dict
def ngram_freq_map(input_data, width):
    """Count how often each n-gram of length ``width`` occurs in the input.

    Parameters
    ----------
    input_data : pandas.Series or str
        Either a single string or a Series of strings. Non-string entries
        inside a Series (e.g. NaN for a missing text) are skipped, which
        matches the empty result returned for a non-string scalar.
    width : int
        Length of the sliding window used to cut out the n-grams.

    Returns
    -------
    dict
        Mapping ``ngram -> number of occurrences``. Empty when
        ``input_data`` is neither a Series nor a string.

    Raises
    ------
    AssertionError
        Raised by ``sliding_window`` when a string is shorter than ``width``.
    """
    ngram_dict = {}
    if isinstance(input_data, pd.Series):
        for line in input_data:
            # a float NaN (or any other non-string) cannot be windowed
            if isinstance(line, str):
                ngram_dict = update_ngram_dict(line, width, ngram_dict)
    elif isinstance(input_data, str):
        ngram_dict = update_ngram_dict(input_data, width, ngram_dict)
    return ngram_dict


# create a sliding window and update a dict with counts (default 0)
def update_ngram_dict(line, width, ngram_dict):
    """Add the n-gram counts of ``line`` to ``ngram_dict``.

    ``line`` is cut into windows of length ``width`` by ``sliding_window``;
    every window increments its entry in ``ngram_dict`` (a missing entry
    starts at 0). The dict is modified in place and also returned.
    """
    for gram in sliding_window(line, width):
        if gram in ngram_dict:
            ngram_dict[gram] += 1
        else:
            ngram_dict[gram] = 1
    return ngram_dict


# returns a list with a sliding window
# over the string with given width
def sliding_window(input_str, width):
    """Return every contiguous slice of length ``width`` from ``input_str``.

    The slices are listed from left to right, one per start position.
    An AssertionError is raised when ``input_str`` is shorter than ``width``.
    """
    n_chars = len(input_str)
    assert n_chars >= width, 'Cannot slide with width larger than the string!'
    windows = []
    for start in range(n_chars - width + 1):
        windows.append(input_str[start:start + width])
    return windows

In [4]:
# for col in range(X.get_shape()[1]):
#     print(sum(X.getcol(col)))

# for i in range(20, 30):
#     print(i)
#     print(X.getcol(i))
#     print()
#     print()

# counter = 0
# for col in range(X.get_shape()[1]):
#     if sum(X.getcol(col)) >= 5:
#         counter += 1
# print(counter)

# char_freq_map(x)
# sliding_window(x[0], 4)
# [print(text) for text in x]

# ngram_dict = ngram_freq_map(x, 5)
# smaller_ngram_dict = {k: ngram_dict[k] for k in ngram_dict if ngram_dict[k] >= 5}
# print(len(ngram_dict))
# print(len(smaller_ngram_dict))

In [5]:
# number of observations to use; swap in the smaller value for quick tests
# n = 1000
n = len(main_data_labeled)

# free-text inputs and their ATC labels, restricted to n observations
x = main_data_labeled['FREETXT'][:n]
y = main_data_labeled['ATC'][:n]
# [len(obs) for obs in x]

# one feature dict per observation: 5-gram counts plus single-character counts
# ({**a, **b} merges the two dicts by key; on a shared key b's value wins)
feature_dicts = [
    {**ngram_freq_map(text, 5), **char_freq_map(text)}
    for text in x
]

# DictVectorizer turns the list of dicts into a sparse X matrix
v = DictVectorizer(sparse=True)
X = v.fit_transform(feature_dicts)

# commented-out alternative: vectorize the labels as well
# Y = [{label:1} for label in y]
## can check:
## for i in range(len(Y)):
##     print(list(Y[i].keys())[0] == y[i])
# Y = v.fit_transform(Y)

# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html
# v.get_feature_names()
# v.restrict()

# print(y.shape)  # pd object
# print(X.get_shape())  # vectorized

In [10]:
# commented-out alternative: a linear SVM
# svm_clf = svm.LinearSVC(verbose=True, 
#                         max_iter=10000, dual=True)

# support vector classifier created without arguments (library defaults)
svm_clf = svm.SVC()
print(svm_clf)

# fit() returns the estimator itself, so fitting and predicting are chained.
# NOTE: the predictions are made on the same X the model was fitted on.
pred = svm_clf.fit(X, y).predict(X)

# http://scikit-learn.org/stable/modules/svm.html
# http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [11]:
# share of observations whose predicted label equals the label in y
print(accuracy_score(y_true=y, y_pred=pred))

0.0389546351085
