In [100]:
import pandas as pd
import numpy as np
from collections import Counter
from itertools import dropwhile
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import  AdaBoostClassifier
from sklearn.metrics import accuracy_score
import csv
print("Import Complete")

Import Complete


In [101]:
def feature_list(file):
    vocab = []
    with open(file, 'rb') as f:
        reader = csv.reader(f)
        for row in reader:
            word = row[0]
            if word not in vocab:
                vocab.append(word)
            else:
                print("Repeat: {}".format(word))
    return vocab

def label_features(df, features_master):
        for i,row in df.iterrows():
            #message = pruning_dict.remove_nonalphanumeric(row.text)
            message = str(row.question_class)
            features = Counter(message.split()) & features_master
            features = features + features_master
            features = list(np.array(list(features.values())) - 1)
            df.set_value(i,'features',features)
        return df

def create_feature_dataframe(df, features_master):
    return pd.DataFrame(list(df.features), columns=range(len(features_master)))

def split_set(x, y, test_size):
    X_train, X_test, y_train, y_test = train_test_split(list(x),
                                                        list(y),
                                                        test_size=test_size)
    return X_train, X_test, y_train, y_test

In [102]:
def build_vocabulary(messages, word_drop=True):
        vocabulary = Counter()
        print("building vocabulary from {} messages".format(len(messages)))
        for message in messages:
            message = str(message)
            #message = remove_nonalphanumeric(message)
            message_split = message.split()
            gram_count = 2
            grams = []
            for i in range(len(message_split) - 1):
                gram = ''
                for n in range(gram_count):
                    gram = gram + message_split[i+n] + ' '
                grams.append(gram[:-1])
            vocabulary = vocabulary + Counter(message_split) + Counter(grams)
        if word_drop == True:
            for key, count in dropwhile(
                                        lambda key_count: key_count[1] >= (len(messages) * .01),
                                        vocabulary.most_common()):
                del vocabulary[key]
        print("The vocabulary contains {} words".format(len(vocabulary)))
        return vocabulary

In [103]:
def prune_vocab(df, vocabulary, group1, group2, percent_saved):

    info = df.groupby('category').get_group(group1)
    info_v = build_vocabulary(info.question_class, word_drop=True)
    info_w = info_v.keys()
    express = df.groupby('category').get_group(group2)
    express_v = build_vocabulary(express.question_class, word_drop=True)
    express_w = express_v.keys()

    common_v = info_v & express_v
    common_w = common_v.keys()

    words = []
    ratios = []

    for word in info_w:
        if word not in common_w:
            ratios.append(info_v[word])
            words.append(word)

    for word in common_w:
        ratios.append(info_v[word] / express_v[word])
        words.append(word)

    for word in express_w:
        if word not in common_w:
            ratios.append(express_v[word] * -1)
            words.append(word)


    threshold = int(len(words) * (percent_saved/2))
    top20 = np.argsort(ratios)[-threshold:]
    bottom20 = np.argsort(ratios)[:threshold]

    polar_words = []
    for group in [top20, bottom20]:
        for index in group:
            polar_words.append(words[index])


    vocabulary = Counter(polar_words) & vocabulary

    return vocabulary


In [104]:
print("Start")
csvdata = "./data/subcategory_2_think_hr_v2.csv"
df = pd.read_csv(csvdata)
df = df[:500]
catoregories = ["Compensation", "Compliance", "Employee Benefits"]

le = preprocessing.LabelEncoder()
le.fit(catoregories)
ohe = preprocessing.OneHotEncoder()
ohe.fit([[0],[1],[2]])


ohe_array = ohe.transform(le.transform(df.category).reshape(-1,1)).toarray()
df['category_ohe'] = list(ohe_array)


#vocab = feature_list('./HotWords.csv')
vocab = build_vocabulary(df.question_class)
pairs = [[0,1], [0,2], [1,2]]
vocabularies = []
for pair in pairs:
    a,b = pair
    group1 = catoregories[a]
    group2 = catoregories[b]
    vocab_CC = prune_vocab(df, vocab, group1, group2, .9)
    vocabularies.append(vocab_CC)
    print("The length of the pruned vocab between {} and {} is {}".format(group1, group2, len(vocab_CC)))

vocab = vocabularies[0] & vocabularies[1] & vocabularies[2]
print("Length of final vocab {}".format(len(vocab)))
vocab = list(vocab)
print(vocab[:100])

features_master = Counter(vocab)
df["features"] = [[0] * len(vocab)] * len(df)
df = label_features(df, features_master)
df2 = create_feature_dataframe(df, features_master)
X_train, X_test, y_train, y_test = split_set(df.features, df.category, 0.1)

names = ["NerualNet", "DecisionTree", "AdaBoost"]
clfs = [MLPClassifier( max_iter=500),
            DecisionTreeClassifier(),
            AdaBoostClassifier()]
for clf, name in zip(clfs, names):
    print("Training {}".format(name))
    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_test)
    #print(y_predict)
    score = accuracy_score(y_test, y_predict)
    print("The accuracy for {} is {}".format(name, score))


Start
building vocabulary from 500 messages
The vocabulary contains 741 words
building vocabulary from 161 messages
The vocabulary contains 865 words
building vocabulary from 190 messages
The vocabulary contains 1149 words
The length of the pruned vocab between Compensation and Compliance is 520
building vocabulary from 161 messages
The vocabulary contains 865 words
building vocabulary from 149 messages
The vocabulary contains 1220 words
The length of the pruned vocab between Compensation and Employee Benefits is 590
building vocabulary from 190 messages
The vocabulary contains 1149 words
building vocabulary from 149 messages
The vocabulary contains 1220 words
The length of the pruned vocab between Compliance and Employee Benefits is 493
Length of final vocab 201
['money', 'executive', 'work week', 'years', 'new hire', 'area', 'find', 'staff', 'non exempt', 'member', 'believe', 'much', 'outside', 'better', 'th', 'exempt', 'employed', 'main', 'employees work', 'resources', 'hours worked



The accuracy for NerualNet is 0.6
Training DecisionTree
The accuracy for DecisionTree is 0.66
Training AdaBoost
The accuracy for AdaBoost is 0.6
