In [1]:
import pandas as pd
import numpy as np
import sklearn
from collections import Counter
from itertools import dropwhile
from sklearn.cluster import KMeans
import csv
import pickle


print("Import Complete")

Import Complete


In [2]:
def save_obj(obj, name ):
    with open( name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open( name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [None]:
def build_vocabulary(messages, word_drop=True):
    cvocabularies = Counter()
    #print("building vocabulary from {} messages".format(len(messages)))
    for start in range(0, len(messages), 500):
        vocabulary = Counter()
        for message in messages[start:start + 500]:
            message = str(message)
            #message = remove_nonalphanumeric(message)
            message_split = message.split()
            gram_count = 3
            grams = []
            for i in range(len(message_split) - (gram_count - 1)):
                gram = ''
                for n in range(gram_count):
                    gram = gram + message_split[i+n] + ' '
                grams.append(gram[:-1])
            vocabulary = vocabulary + Counter(message_split) + Counter(grams)
        cvocabularies = cvocabularies + vocabulary
    if word_drop == True:
        for key, count in dropwhile(
                                    lambda key_count: key_count[1] >= (len(messages) * .01),
                                    cvocabularies.most_common()):
            del cvocabularies[key]
    #print("The vocabulary contains {} words".format(len(cvocabularies)))
    return cvocabularies

In [3]:
def feature_list(file):
    vocab = []
    with open(file, 'rb') as f:
        reader = csv.reader(f)
        for row in reader:
            word = row[0]
            if word not in vocab:
                vocab.append(word)
            else:
                print("Repeat: {}".format(word))
    return vocab

def label_features(df, features_master):
        for i,row in df.iterrows():
            #message = pruning_dict.remove_nonalphanumeric(row.text)
            message = str(row.question_class)
            features = build_vocabulary([message], word_drop=False) & features_master
            features = features + features_master
            features = list(np.array(list(features.values())) - 1)
            df.set_value(i,'features',features)
        return df

def create_feature_dataframe(df, features_master):
    return pd.DataFrame(list(df.features), columns=range(len(features_master)))

def cluster_filter2(df, df2, N, features_master, v=False):
    #print("Start loop")
    gpercentages = []
    singles = 0
    clusterer = KMeans(n_clusters=N)
    clusterer.fit(df2)
    transform = clusterer.transform(df2)
    d_center = []
    cluster = []
    for x in transform:
        d_center.append(min(x)**2)
        cluster.append(np.argmin(x))
    df['cluster'] = cluster
    df['d_from_center'] = d_center
    d_center = np.array(d_center)
    mean = np.mean(d_center)
    std = np.std(d_center)
    '''
    for cgroup in range(N):
        group = df.groupby('cluster').get_group(cgroup)
        if group.question_class.count() <= len(df) * .01:
            df = df.drop(group.index)
            singles += 1
    print("# of singles: {}".format(singles))
    print("df length: {}".format(len(df)))
    if singles >= 6 and N <= (len(df) - singles):
        df = cluster_filter2(df, create_feature_dataframe(df, features_master), N, features_master)
        return df
    '''
    for cgroup in range(N):
        group = df.groupby('cluster').get_group(cgroup)
        gpercent = (compute_gpercentage(group, mean, std))
        gpercentages.append(gpercent)
        '''
        if v == True:
            print("Found {} messages of the same form with a gpercent of {}.".format(len(group), gpercent))
            for message in group.question_class.head(3):
                if group.question_class.count() > 1:
                    print(message)
                    print("")
            print("")
        '''
            
    
    median = np.median(np.array(gpercentages))
    if v == True:
        for cgroup in range(N):
            group = df.groupby('cluster').get_group(cgroup)
            gpercent = (compute_gpercentage(group, mean, std))
            if gpercent >= median:
                print_clusters(group, gpercent)
    else:
        pass
        #print(median)
        #print(np.mean(np.array(gpercentages)))
    return df, median

def compute_gpercentage(group, mean, std):

    gscore = 0.0
    for i, row in group.iterrows():
        z = (row.d_from_center - mean) / std
        if z < -0.68:
            gscore += 1
    glength = len(group)
    gpercent = gscore/glength
    return gpercent

def print_clusters(group, gpercent):
    print("Found {} messages of the same form with a gpercent of {}.".format(len(group), gpercent))
    for message in group.question_class.head(5):
        if group.question_class.count() > 1:
            print(message)
            print("")
    print("")
            
    
    

In [4]:
csvdata = "./data/subcategory_2_think_hr_v2.csv"
df = pd.read_csv(csvdata)
df = df[['question_class']]

#vocab = feature_list('./HotWords.csv')
vocab = load_obj('final_feature_set')

features_master = Counter(vocab)
df["features"] = [[0] * len(vocab)] * len(df)
df = label_features(df, features_master)
df2 = create_feature_dataframe(df, features_master)

medians = []
for N in range(25,35):
    print("N: {}".format(N))
    df, median = cluster_filter2(df, df2, N, features_master)
    medians.append([median, N])

best_clustering_index = np.argmax(medians, axis=0)
best_clustering = medians[best_clustering_index[0]][1]
print("The best clustering is {}:".format(best_clustering))
df, median = cluster_filter2(df, df2, best_clustering, features_master, v=True)





N: 25
N: 26
N: 27
N: 28
N: 29
N: 30
N: 31
N: 32
N: 33
N: 34
The best clustering is 30:
Found 582 messages of the same form with a gpercent of 0.283505154639.
employee hired several weeks ago never provided documents needed verification week told us could provide needed documents proceed

please send information ada interactive process issue

update client asking providing notice employees regarding ab provide information could locate information thinkhr site ab provide notice employees regarding rights victim domestic violence sexual assault stalking

employee drives vehicle course performing job employer provide tank gasoline

employee pass way weekend group life plan employee decedent brother calling us according file bother primary beneficiary superior telling one reach back information sure release information sure executor either


Found 375 messages of the same form with a gpercent of 0.173333333333.
question ways employers payroll entering employee deductions payroll want make s

Found 128 messages of the same form with a gpercent of 0.21875.
ontario california san bernardinodino county see county listed look minimum wage information mean follow los angeles county minimum wage

san bernardino county follow la county minimum wage

follow group located california asking portland commuter law life group sunnyvale ca employees portland understand portland city commuter law employers must follow however know applies let know provisions apply

follow question garnishment question posted information needed answer question two garnishments child support vendor disposable income di di leaves available vendor garnishment still child support consider gets paid first question vendor garnishment calculations think enough di left garnish enough garnish vendor

company policy technicians start timeclock get first job needs serviced employees live town main office driving literally minutes away one employee getting supplies punching immediately driving first service already cl