In [466]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
import os

import pymysql
import json

config_fn = './config.json'


print("Import Complete")

Import Complete


In [467]:
def connect(config):
    return pymysql.connect(
        host=config['ai_db_host'],  # Database host
        port=config['ai_db_port'],  # Database port
        user=config['ai_db_username'],  # Database user
        passwd=config['ai_db_password'],  # Database password
        db=config['ai_db_name'],  # Database name
        connect_timeout=5,
        cursorclass=pymysql.cursors.DictCursor
    )

def pull_data():
    with open(config_fn, "r") as f:
        config = json.loads(f.read())
    conn = connect(config)
    sql_1 = "SELECT rowId, question, category FROM cleanHotlineQuestionAnswer WHERE category='Compensation';"
    with conn.cursor() as cursor:
        cursor.execute(sql_1)
    result = cursor.fetchall()
    cursor.close()
    return result

In [468]:
def cluster(df, df2, N, v=False):
    clusterer = KMeans(n_clusters=N)
    clusterer.fit(df2)
    transform = clusterer.transform(df2)
    
    d_center = []
    cluster = []
    for x in transform:
        d_center.append(min(x)**2)
        cluster.append(np.argmin(x))
    df['cluster'] = cluster
    df['d_from_center'] = d_center
    d_center = np.array(d_center)
    mean = np.mean(d_center)
    std = np.std(d_center)

    print("Mean: {}".format(round(mean, 3)))
    print("STD: {}".format(round(std, 3)))
    print("")

  

    if v == True:
        for cgroup in range(N):
            group = df.groupby('cluster').get_group(cgroup)
            print_clusters(group)

    
    return df

def print_clusters(group):
    std = np.std(list(group.d_from_center))
    print("Found {} messages of the same form.   STD: {}".format(len(group), std))
    for message in group.question.head(5):
        if group.question.count() > 1:
            print(message)
            print("")
    print("")

In [469]:
def plot_clusters(df, N):
    features = list(df.features)
    svd = TruncatedSVD(n_components=50, n_iter=2, random_state=42)
    #svd =  PCA(n_components=2)
    tsne = TSNE(n_components=2, perplexity=10, verbose=2)
    feature_reduction =  make_pipeline(svd, tsne)

    transformed = pd.DataFrame(feature_reduction.fit_transform(features), columns=["plot_cordX", "plot_cordY"])
    meanX = transformed["plot_cordX"].mean()
    meanY = transformed["plot_cordY"].mean()
    transformed["plot_cordX"] = transformed["plot_cordX"].divide(meanX) - 1
    transformed["plot_cordY"] = transformed["plot_cordY"].divide(meanY) - 1
    meanX = transformed["plot_cordX"].mean()



    df["plot_cordX"] = transformed.plot_cordX
    df["plot_cordY"] = transformed.plot_cordY

    q = df["plot_cordX"].quantile(0.9)
    df = df[df["plot_cordX"] < q]
    q = df["plot_cordY"].quantile(0.9)
    df = df[df["plot_cordY"] < q]


    for n in range(N):
        plt.scatter(df[df["cluster"] == n].plot_cordX, df[df["cluster"] == n].plot_cordY, label="Class " + str(n))



    #plt.legend()
    plt.show()



In [470]:
df = pd.DataFrame(pull_data())
#df = df[df["category"] == "Compensation"][:10000]
print("Loaded {} Data Points".format(len(df)))
print(df.head())
N = 20




vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.7 )
X_vectoizer = vectorizer.fit_transform(list(df.question))
print("Vectorization Complete")

n_components = 60
explained_variance = 0.0
while explained_variance < .5 and n_components < 175:
    svd = TruncatedSVD(n_components=n_components)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X_vectoizer)
    df["features"] = list(X)
    explained_variance = svd.explained_variance_ratio_.sum()
    n_components += 5

    print("Explained variance of the SVD step: {}%     n_componets: {}".format(
        int(explained_variance * 100), n_components))


df = cluster(df, X, N, v=False)





Loaded 20275 Data Points
       category                                           question  rowId
0  Compensation  We use time card rounding. Should we go off th...      7
1  Compensation  Amy- Thank you for the response. Im aware of t...     22
2  Compensation  Hourly employees- CA breaks and lunch clarific...     32
3  Compensation  Hello, I am inquiring what the rule is for bre...     36
4  Compensation  Severance Plan. We are updating our severance ...     45
Vectorization Complete
Explained variance of the SVD step: 37%     n_componets: 65
Explained variance of the SVD step: 39%     n_componets: 70
Explained variance of the SVD step: 41%     n_componets: 75
Explained variance of the SVD step: 42%     n_componets: 80
Explained variance of the SVD step: 44%     n_componets: 85
Explained variance of the SVD step: 45%     n_componets: 90
Explained variance of the SVD step: 47%     n_componets: 95
Explained variance of the SVD step: 48%     n_componets: 100
Explained variance of the S

In [471]:
vector_doc = 'doc_vectors_compensation.tsv'
count = 0
with open(vector_doc,'w') as w:
    for question in X:
        string = ""
        for v in question:
            string = string + str(v) + "\t"
        w.write(string + os.linesep)
        count += 1
w.close
print("Wrote file {} with {} entries".format(vector_doc, count))


meta_doc = 'doc_meta_compensation.tsv'
count = 0
with open(meta_doc,'w') as w:
    w.write("cluster\tquestion\t" + os.linesep)
    for question, cluster in zip(list(df.question), list(df.cluster)):
        string = ""
        string = str(cluster) + "\t" + str(question) + "\t"
        w.write(string + os.linesep)  
        count += 1
w.close
print("Wrote file {} with {} entries".format(meta_doc, count))



Wrote file doc_vectors_compensation.tsv with 20275 entries
Wrote file doc_meta_compensation.tsv with 20275 entries
