In [169]:
import pandas as pd
import numpy as np
from collections import Counter
from itertools import dropwhile
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import  AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OutputCodeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
import csv
import pickle
print("Import Complete")

Import Complete


In [170]:
def save_obj(obj, name ):
    with open( name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open( name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [171]:
def feature_list(file):
    vocab = []
    with open(file, 'rb') as f:
        reader = csv.reader(f)
        for row in reader:
            word = row[0]
            if word not in vocab:
                vocab.append(word)
            else:
                print("Repeat: {}".format(word))
    return vocab

def label_features(df, features_master):
        for i,row in df.iterrows():
            #message = pruning_dict.remove_nonalphanumeric(row.text)
            message = str(row.question_class)
            features = build_vocabulary([message], word_drop=False) & features_master
            features = features + features_master
            features = list(np.array(list(features.values())) - 1)
            df.set_value(i,'features',features)
        return df

def create_feature_dataframe(df, features_master):
    return pd.DataFrame(list(df.features), columns=range(len(features_master)))

def split_set(x, y, test_size):
    X_train, X_test, y_train, y_test = train_test_split(list(x),
                                                        list(y),
                                                        test_size=test_size)
    return X_train, X_test, y_train, y_test

In [172]:
def build_vocabulary(messages, word_drop=True):
    cvocabularies = Counter()
    #print("building vocabulary from {} messages".format(len(messages)))
    for start in range(0, len(messages), 500):
        vocabulary = Counter()
        for message in messages[start:start + 500]:
            message = str(message)
            #message = remove_nonalphanumeric(message)
            message_split = message.split()
            gram_count = 3
            grams = []
            for i in range(len(message_split) - (gram_count - 1)):
                gram = ''
                for n in range(gram_count):
                    gram = gram + message_split[i+n] + ' '
                grams.append(gram[:-1])
            vocabulary = vocabulary + Counter(message_split) + Counter(grams)
        cvocabularies = cvocabularies + vocabulary
    if word_drop == True:
        for key, count in dropwhile(
                                    lambda key_count: key_count[1] >= (len(messages) * .01),
                                    cvocabularies.most_common()):
            del cvocabularies[key]
    #print("The vocabulary contains {} words".format(len(cvocabularies)))
    return cvocabularies

In [173]:
def prune_vocab(df, vocabulary, group1, group2, percent_saved):

    try:
        info_v = load_obj(group1)
    except:
        info = df.groupby('category').get_group(group1)
        info_v = build_vocabulary(info.question_class, word_drop=True)
        save_obj(info_v, group1)
    info_w = info_v.keys()
    try:
        express_v = load_obj(group2)
    except:
        express = df.groupby('category').get_group(group2)
        express_v = build_vocabulary(express.question_class, word_drop=True)
        save_obj(express_v, group2)
    express_w = express_v.keys()

    common_v = info_v & express_v
    common_w = common_v.keys()

    words = []
    ratios = []

    for word in info_w:
        if word not in common_w:
            ratios.append(info_v[word]/float(len(info_v)))
            words.append(word)

    for word in common_w:
        ratios.append((info_v[word]/float(len(info_v))) / (express_v[word]/float(len(express_v))))
        words.append(word)

    for word in express_w:
        if word not in common_w:
            ratios.append(express_v[word]/float(len(express_v) * -1))
            words.append(word)


    threshold = int(len(words) * (percent_saved/2))
    top20 = np.argsort(ratios)[-threshold:]
    bottom20 = np.argsort(ratios)[:threshold]

    polar_words = []
    for group in [top20, bottom20]:
        for index in group:
            polar_words.append(words[index])

    vocabulary = Counter(polar_words) & vocabulary

    return vocabulary

def polarize_dict(df, vocab_m, catoregories, keep_percent):
    pairs = [[0,1], [0,2], [1,2]]
    vocabularies = []
    for pair in pairs:
        a,b = pair
        group1 = catoregories[a]
        group2 = catoregories[b]
        vocab_CC = prune_vocab(df, vocab_m, group1, group2, keep_percent/10.0)
        vocabularies.append(vocab_CC)
        #print("The length of the pruned vocab between {} and {} is {}".format(group1, group2, len(vocab_CC)))

    vocab = vocabularies[0] & vocabularies[1] & vocabularies[2]
    print("Length of final vocab {}".format(len(vocab)))
    return vocab


In [174]:
def train_models(df, v=False):
    
    X_train, X_test, y_train, y_test = split_set(df.features, df.category, 0.2)

    names = ["NerualNet", "GaussianProcessClassifier", 
             "DecisionTree", "OneVsRestClassifier", "AdaBoost",
             "OneVsOneClassifier", "OutputCodeClassifier",  "KNeighborsClassifier",
             ]
    clfs = [MLPClassifier( max_iter=500),
            GaussianProcessClassifier(),
                DecisionTreeClassifier(),
            OneVsRestClassifier( DecisionTreeClassifier()),
                AdaBoostClassifier(),
           OneVsOneClassifier(DecisionTreeClassifier()),
           OutputCodeClassifier(DecisionTreeClassifier()),
            KNeighborsClassifier()]
    accuracies = []
    for clf, name in zip(clfs, names):
        clf.fit(X_train, y_train)
        y_predict = clf.predict(X_test)
        '''
        print(y_test)
        print("y_predict")
        print(y_predict)
        
        y_test2 = np.asarray(y_test)
        y_test2 = y_test2.dot(ohe.active_features_).astype(int)
        y_predict = np.asarray(y_predict.dot(ohe.active_features_).astype(int))
        #print(y_test2, y_predict)
        '''
        score = accuracy_score(y_test, y_predict)
        if v == True:
            print("The accuracy for {} is {}".format(name, score))
        accuracies.append(score)
    return np.mean(accuracies)

In [175]:
print("Start")
csvdata = "./data/subcategory_2_think_hr_v2.csv"
df = pd.read_csv(csvdata)
df = df[:3000]
catoregories = ["Compensation", "Compliance", "Employee Benefits"]

le = preprocessing.LabelEncoder()
le.fit(catoregories)
ohe = preprocessing.OneHotEncoder()
ohe.fit([[0],[1],[2]])

#df = df.loc[(df['category'] == "Compensation") | (df['category'] == "Compliance")]
#print(df.head(10))


ohe_array = ohe.transform(le.transform(df.category).reshape(-1,1)).toarray()
df['category'] = le.transform(df.category)
#df['category'] = list(ohe_array)


#vocab = feature_list('./HotWords.csv')
try:
    vocab_m = load_obj('vocab_master')
except:
    print("Building Vocab")
    vocab_m = build_vocabulary(df.question_class)
    save_obj(vocab_m, 'vocab_master')
vocab = vocab_m

'''
avg_acc = []
keep_percentages = range(5,11)
for keep_percent in keep_percentages:
    print("Training with {}%".format(keep_percent*10))
    vocab = polarize_dict(df, vocab_m, catoregories, keep_percent)
    #print(vocab)


    df["features"] = [[0] * len(vocab)] * len(df)
    df = label_features(df, features_master)

    acc = train_models(df)
    avg_acc.append(acc)
    print("The mean accuracy was {}".format(avg_acc))



keep_percent = keep_percentages[np.argmax(avg_acc)]
'''
keep_percent = 8

print("Training with {}%".format(keep_percent*10))
vocab_p = polarize_dict(df, vocab_m, catoregories, keep_percent)

features_master = Counter(list(vocab_p.keys()))

#print(features_master)
#save_obj(vocab, "final_feature_set")
df["features"] = [[0] * len(vocab_p)] * len(df)
df = label_features(df, features_master)
train_models(df, v=True)
selectors = [chi2]#, f_classif, mutual_info_classif]
selectors_vocab = []
for selector in selectors:
    features = SelectKBest(selector, k=250).fit(list(df.features), list(df.category))
    opt_index = features.get_support(indices=True)
    selector_vocab = []
    for i in opt_index:
        selector_vocab.append(list(vocab)[i])
    selectors_vocab.append(selector_vocab)
    save_obj(selector_vocab, "final_feature_set")
    features_master = Counter(selector_vocab)
    df["features"] = [[0] * len(selector_vocab)] * len(df)
    df = label_features(df, features_master)
    train_models(df, v=True)
    #print(selector_vocab)
opt_vocab = Counter(selectors_vocab[0]) & Counter(selectors_vocab[1]) & Counter(selectors_vocab[2]) & vocab_p

vocab = list(opt_vocab)
features_master = Counter(vocab)
df["features"] = [[0] * len(vocab)] * len(df)
df = label_features(df, features_master)
print(vocab)
print("Vocabulary lenght: {}".format(len(vocab)))

acc = train_models(df, v=True)

#print("The mean accuracy was {}".format(acc))
#print(vocab)
#print(vocab_m)


Start
Training with 80%
Length of final vocab 266




The accuracy for NerualNet is 0.558333333333
The accuracy for GaussianProcessClassifier is 0.56
The accuracy for DecisionTree is 0.518333333333
The accuracy for OneVsRestClassifier is 0.458333333333
The accuracy for AdaBoost is 0.55
The accuracy for OneVsOneClassifier is 0.531666666667
The accuracy for OutputCodeClassifier is 0.498333333333
The accuracy for KNeighborsClassifier is 0.503333333333
The accuracy for NerualNet is 0.643333333333
The accuracy for GaussianProcessClassifier is 0.666666666667
The accuracy for DecisionTree is 0.548333333333
The accuracy for OneVsRestClassifier is 0.578333333333
The accuracy for AdaBoost is 0.656666666667
The accuracy for OneVsOneClassifier is 0.598333333333
The accuracy for OutputCodeClassifier is 0.551666666667
The accuracy for KNeighborsClassifier is 0.508333333333


IndexError: list index out of range