In [1673]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
import os
import pickle

from sklearn.ensemble import RandomForestClassifier

import pymysql
import json

config_fn = './config.json'


In [1674]:
def save_obj(obj, name ):
    with open( name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open( name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [1675]:
def connect(config):
    return pymysql.connect(
        host=config['ai_db_host'],  # Database host
        port=config['ai_db_port'],  # Database port
        user=config['ai_db_username'],  # Database user
        passwd=config['ai_db_password'],  # Database password
        db=config['ai_db_name'],  # Database name
        connect_timeout=5,
        cursorclass=pymysql.cursors.DictCursor
    )

def pull_data():
    with open(config_fn, "r") as f:
        config = json.loads(f.read())
    conn = connect(config)
    sql_1 = "SELECT rowId, question, category FROM cleanHotlineQuestionAnswer;"
    with conn.cursor() as cursor:
        cursor.execute(sql_1)
    result = cursor.fetchall()
    cursor.close()
    return result

In [1676]:
def generlize_cluster(cluster, category, cluster_list, df):


    vectorizer, svd, normalizer, clusterer = load_model(category)
    pipeline =  make_pipeline(vectorizer, svd)

    X = pipeline.transform(list(df.question))    
    y = [1 if x == cluster else 0 for x in cluster_list]

    clf = RandomForestClassifier(max_depth=2, random_state=0)
    clf.fit(X, y)

    features = clf.feature_importances_

    releventF = np.argsort(features)[-2:]
    svdcomponent = []
    for i in releventF:
        for m in svd.components_[i]:
            svdcomponent.append(m)
    releventsvd = np.argsort(svdcomponent)[-5:]
    print(releventsvd)
    for n in releventsvd:
        try:
            print(vectorizer.get_feature_names()[n])
        except: pass


In [1677]:
def cluster(df, df2, N, name, v=False):
    clusterer = KMeans(n_clusters=N)
    clusterer.fit(list(df.features))
    save_obj(clusterer, './models/clusterer_' + name )

    transform = clusterer.transform(list(df.features))

    d_center = []
    cluster = []
    for x in transform:
        d_center.append(min(x)**2)
        cluster.append(np.argmin(x))
    df['cluster'] = cluster
    df['d_from_center'] = d_center
    d_center = np.array(d_center)
    mean = np.mean(d_center)
    std = np.std(d_center)
    
    if v == True:
        print("Mean: {}".format(round(mean, 3)))
        print("STD: {}".format(round(std, 3)))
        print("")
         
        for cgroup in range(N):
            generlize_cluster(cgroup, name, cluster, df)
            group = df.groupby('cluster').get_group(cgroup)
            print_clusters(group)

    return df

def print_clusters(group):
    std = np.std(list(group.d_from_center))
    mean = np.mean(list(group.d_from_center))
    
    center = group[group["d_from_center"] == min(group["d_from_center"])]

    center.drop_duplicates(subset=['question'], inplace=True)
    
    

    
    print("Found {} messages of the same form.  Mean: {} STD: {}".format(len(group), mean, std))
    print("*** {} ***".format(list(center.question)[0]))
    print("")
    for message in group.question.head(5):
        if group.question.count() > 1:
            print(message)
            print("")
    print("")

In [1678]:
def print_to_tsv(df, X, cat_name):
    vector_doc = './visualization_data/doc_vectors_' + cat_name + '.tsv'
    count = 0
    with open(vector_doc,'w') as w:
        for question in X:
            string = ""
            for v in question:
                string = string + str(v) + "\t"
            w.write(string + os.linesep)
            count += 1
    w.close
    print("Wrote file {} with {} entries".format(vector_doc, count))


    meta_doc = './visualization_data/doc_meta_' + cat_name + '.tsv'
    count = 0
    with open(meta_doc,'w') as w:
        w.write("cluster\tquestion\t" + os.linesep)
        for question, cluster in zip(list(df.question), list(df.cluster)):
            string = ""
            string = str(cluster) + "\t" + str(question) + "\t"
            w.write(string + os.linesep)  
            count += 1
    w.close
    print("Wrote file {} with {} entries".format(meta_doc, count))


In [1679]:
def train_model(df, N, name):
    print("Loaded {} Data Points".format(len(df)))
    vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.7 )
    X_vectoizer = vectorizer.fit_transform(list(df.question))
    save_obj(vectorizer, './models/vectorizer_' + name )
    print("Vectorization Complete")

    n_components = 60
    explained_variance = 0.0
    while explained_variance < .5 and n_components < 175:
        svd = TruncatedSVD(n_components=n_components)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X_vectoizer)
        
        save_obj(svd, './models/svd_' + name )
        save_obj(normalizer, './models/normalizer_' + name )
        df["features"] = list(X)
        
        explained_variance = svd.explained_variance_ratio_.sum()
        n_components += 5
    print("Explained variance of the SVD step: {}%     n_componets: {}".format(
        int(explained_variance * 100), n_components))
    df = cluster(df, X, N, name, v=True)
    print_to_tsv(df, X, name)

In [1680]:
def train_all():    
    df_master = pd.DataFrame(pull_data())
    cat_names = ["Terminations"]
    #cat_names = ["Compensation"#, "Compliance", "Employee Benefits",
                #"Leaves of Absence", "Recruiting and Hiring", "Terminations"]
    Ns = [20, 15, 14, 9, 9, 7]
    Ns = [7]

    for name, N in zip(cat_names, Ns):
        df = df_master[df_master["category"] == name].copy()[:100]
        train_model(df, N, name)
        df["prediction"] = predict(list(df.question), name)
        correct = len(df[df["cluster"] == df["prediction"]])
        total = float(len(df))
        print("The model {} consistantly classified {}% of the dataset".format(name, round(correct/total * 100, 1)))
        print("")






In [1681]:
def predict(messages, category):
    "This predicts the cluster of a message based on the kmeans model for the given category"
    "Inputs:"
    "        messages: a list of strings to be classified"
    "        category: a string containing the model to be used"
    "Output:"
    "        clusters: a list of intergers corresponding to the message classification cluster"
    
    vectorizer, svd, normalizer, clusterer = load_model(category)

    
    pipeline =  make_pipeline(vectorizer, svd, normalizer)
    messages = pipeline.transform(messages)

    clusters = clusterer.predict(messages)
    return clusters

def load_model(category):
    vectorizer = load_obj( './models/vectorizer_' + category )
    svd = load_obj( './models/svd_' + category )
    normalizer = load_obj( './models/normalizer_' + category )
    clusterer = load_obj("./models/clusterer_" + category)
    return vectorizer, svd, normalizer, clusterer

In [1682]:
train_all()

Loaded 100 Data Points
Vectorization Complete
Explained variance of the SVD step: 76%     n_componets: 65
Mean: 0.777
STD: 0.14

[  29 2433 2175  771 2350]
about
performance


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Found 8 messages of the same form.  Mean: 0.755896431393 STD: 0.071721228746
*** Our Michigan office has a few college students doing an internship and they are paid interns. Unfortunately, one of them seems not to be a good fit and HR is receiving many complaints from employees. Although we have not recorded any tardiness and poor performance of this intern and have not officially disciplined her yet, but employees wants her to go now. Can we end her internship immediately without giving any reasons? It's an internship relationship, not employment relationship from our point of view, but since we are paying her, I wonder from the legal point of view, the court regards it as an employment relationship if we are sued by her. ***

* > > In California, when it comes to termination that isn't on a regular pay > day, can we deposit into an employees account if they have regular direct > deposit set up? Or do we need written consent from the employee for the > final check? > > --------------

[1900 2418 2335  510 1738]
how
Found 10 messages of the same form.  Mean: 0.724658789791 STD: 0.120563783949
*** We have an issue with a long time employee. This employee has been with the company for many years. He has been counseled and written up on no less than two separate occasions for asking female employees out to lunch or to get a drink. Even after being spoken to about it he then tries to apologize by bringing them flowers and asking them out again. We have had another incident with him, and we have decided that this is the final straw. We are planning to terminate, however, the CEO wants to offer him the choice of resigning (mentioning retirement) and allowing this to be his two weeks' notice. What are your thoughts about this? ***

We have a longer term manager we are seeking to terminate. We recently discovered there have been coupon/discounts that this manager have applied spanning a 2 year period, however these cannot be substantiated in the reporting (e.g. coupons not p