In [1859]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
import os
import pickle

from sklearn.ensemble import RandomForestClassifier

import pymysql
import json

config_fn = './config.json'


In [1860]:
def save_obj(obj, name ):
    with open( name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open( name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [1861]:
def connect(config):
    return pymysql.connect(
        host=config['ai_db_host'],  # Database host
        port=config['ai_db_port'],  # Database port
        user=config['ai_db_username'],  # Database user
        passwd=config['ai_db_password'],  # Database password
        db=config['ai_db_name'],  # Database name
        connect_timeout=5,
        cursorclass=pymysql.cursors.DictCursor
    )

def pull_data():
    with open(config_fn, "r") as f:
        config = json.loads(f.read())
    conn = connect(config)
    sql_1 = "SELECT rowId, question, category FROM cleanHotlineQuestionAnswer;"
    with conn.cursor() as cursor:
        cursor.execute(sql_1)
    result = cursor.fetchall()
    cursor.close()
    return result

In [1862]:
def cluster(df, df2, N, name, v=False):
    clusterer = KMeans(n_clusters=N)
    clusterer.fit(list(df.features))
    save_obj(clusterer, './models/clusterer_' + name )

    transform = clusterer.transform(list(df.features))

    d_center = []
    cluster = []
    for x in transform:
        d_center.append(min(x)**2)
        cluster.append(np.argmin(x))
    df['cluster'] = cluster
    df['d_from_center'] = d_center
    d_center = np.array(d_center)
    mean = np.mean(d_center)
    std = np.std(d_center)
    
    if v == True:
        print("Mean: {}".format(round(mean, 3)))
        print("STD: {}".format(round(std, 3)))
        print("")
         
        for cgroup in range(N):
            group = df.groupby('cluster').get_group(cgroup)
            print_clusters(group)

    return df

def print_clusters(group):
    std = np.std(list(group.d_from_center))
    mean = np.mean(list(group.d_from_center))
    
    center = group[group["d_from_center"] == min(group["d_from_center"])]

    center.drop_duplicates(subset=['question'], inplace=True)
    
    

    
    print("Found {} messages of the same form.  Mean: {} STD: {}".format(len(group), mean, std))
    print("*** {} ***".format(list(center.question)[0]))
    print("")
    for message in group.question.head(10):
        if group.question.count() > 1:
            print(message)
            print("")
    print("")

In [1863]:
def print_to_tsv(df, X, cat_name):
    vector_doc = './visualization_data/doc_vectors_' + cat_name + '.tsv'
    count = 0
    with open(vector_doc,'w') as w:
        for question in X:
            string = ""
            for v in question:
                string = string + str(v) + "\t"
            w.write(string + os.linesep)
            count += 1
    w.close
    print("Wrote file {} with {} entries".format(vector_doc, count))


    meta_doc = './visualization_data/doc_meta_' + cat_name + '.tsv'
    count = 0
    with open(meta_doc,'w') as w:
        w.write("cluster\tquestion\t" + os.linesep)
        for question, cluster in zip(list(df.question), list(df.cluster)):
            string = ""
            string = str(cluster) + "\t" + str(question) + "\t"
            w.write(string + os.linesep)  
            count += 1
    w.close
    print("Wrote file {} with {} entries".format(meta_doc, count))


In [1864]:
def train_model(df, N, name):
    print("Loaded {} Data Points".format(len(df)))
    vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.7 )
    X_vectoizer = vectorizer.fit_transform(list(df.question))
    save_obj(vectorizer, './models/vectorizer_' + name )
    print("Vectorization Complete")

    n_components = 60
    explained_variance = 0.0
    while explained_variance < .5 and n_components < 175:
        svd = TruncatedSVD(n_components=n_components)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X_vectoizer)
        
        save_obj(svd, './models/svd_' + name )
        save_obj(normalizer, './models/normalizer_' + name )
        df["features"] = list(X)
        
        explained_variance = svd.explained_variance_ratio_.sum()
        n_components += 5
    print("Explained variance of the SVD step: {}%     n_componets: {}".format(
        int(explained_variance * 100), n_components))
    df = cluster(df, X, N, name, v=True)
    print_to_tsv(df, X, name)

In [1865]:
def train_all():    
    df_master = pd.DataFrame(pull_data())
    cat_names = [ "Compliance", "Employee Benefits",
                "Leaves of Absence", "Recruiting and Hiring", "Terminations"]
    Ns = [ 15, 14, 9, 9, 7]

    for name, N in zip(cat_names, Ns):
        df = df_master[df_master["category"] == name].copy()
        train_model(df, N, name)
        df["prediction"] = predict(list(df.question), name)
        correct = len(df[df["cluster"] == df["prediction"]])
        total = float(len(df))
        print("The model {} consistantly classified {}% of the dataset".format(name, round(correct/total * 100, 1)))
        print("")






In [1866]:
def predict(messages, category):
    "This predicts the cluster of a message based on the kmeans model for the given category"
    "Inputs:"
    "        messages: a list of strings to be classified"
    "        category: a string containing the model to be used"
    "Output:"
    "        clusters: a list of intergers corresponding to the message classification cluster"
    
    vectorizer, svd, normalizer, clusterer = load_model(category)

    
    pipeline =  make_pipeline(vectorizer, svd, normalizer)
    messages = pipeline.transform(messages)
    
    clusters = clusterer.predict(messages)

    
    return clusters

def load_model(category):
    vectorizer = load_obj( './models/vectorizer_' + category )
    svd = load_obj( './models/svd_' + category )
    normalizer = load_obj( './models/normalizer_' + category )
    clusterer = load_obj("./models/clusterer_" + category)
    return vectorizer, svd, normalizer, clusterer

In [1867]:
train_all()



Loaded 27277 Data Points
Vectorization Complete
Explained variance of the SVD step: 50%     n_componets: 120
Mean: 0.687
STD: 0.15

Found 3690 messages of the same form.  Mean: 0.709560861424 STD: 0.101242823393
*** Thanks for the information. I have additional questions in regards to I-9. Could you please advise? * In our specific case, our headquarters (where all of the I9 Forms are stored) is located in NJ. However, we have stores in several different towns/cities throughout NY, NJ, PA, GA, and MA. How do we go about correcting Section 1 errors on the I9 form? * With the same case in mind, different managers filled out Section 2 of the I9. Are we (HR) allowed to fix the errors that they made or do they have to do it? If they have to do it, how do we go about it? What if they are currently inactive? * We have a lot that have errors so we made a spreadsheet of each active employee with I9 errors. We have also heard that it is advised against to ask employees to redo the I9 form (due t

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Found 3228 messages of the same form.  Mean: 0.674223515171 STD: 0.105559435401
*** WAND Corporation HR question - incentive attendance program with FMLA team member Company: WAND Corporation Think HR Hotline: Questions: 1) Can we exclude "John" from being eligible in the "SCURAP" program? 2) If not, can we ask him to remove himself from the program? 3) If not, how would you recommend we handle the below situation. Details: Our 24/7/365 call center was having issues with unplanned absences. To combat this, we rolled out an incentive program that employees are eligible for if they do not have a an unplanned absence for the period. We have one employee "John" who is on intermittent FMLA. He has several unplanned absences throughout the period which are identified as FMLA absences. For the first period we ran this program, FMLA unplanned absences did not exclude John from the incentive program. However, for this 2nd period (Q4-2017) of this program, he has had many unplanned FMLA absences

Hi Claudia, I should have clarified my inquiry it is in regards to the CA workers compensation requirement for first aid reporting. It appears there is a new amendment effective 1/1/2017 stating that first aid also needs to be reported for workers comp. https://www.wcirb.com/sites/default/files/bulletins/2016-25_reporting_small_medical_only_first_aid_claims.pdf However, on the ThinkHR website it says reporting is only for treatment beyond first aid

From : mary.pham@stp-sf.org Subject : Hello, I had a question about having a mother's room. Is a sink required?


Found 449 messages of the same form.  Mean: 0.59608345771 STD: 0.126084426535
*** If an employee has a temporary social security and then they bring in a permanent social security card, would the numbers be different? It is possible to get a new SSN? Is it possible to have an unrestricted SSN Card and a EAD card? We been doing E-Verify for two and she has been employed since 2013. Her EAD expires in November. ***

Thank you. If 

Found 1364 messages of the same form.  Mean: 0.713959482567 STD: 0.0994830151936
*** I am working on getting our personnel files compliant. We keep paper copies of files - nothing is filed electronically. In my research for best practices, I've found some concrete things (EEO-1, I-9, medical records, background/drug screens, etc. must be kept separate) but there is also a lot of grey area. Our corporate HR office has 3 people with access to personnel files and only 2 of those people handle record requests. Anyone requesting info is not handed the whole personnel file - instead, we screen and only provide pertinent information. Currently, our personnel file contains hiring/job docs, tax/payroll docs, and performance docs. We do keep those components separate by using tabbed dividers within the file. I worry that if we further break down the personnel file, we will run out of space. Because of the screening system we have in place, will it result in a compliance issue if we continue what

Wrote file ./visualization_data/doc_vectors_Compliance.tsv with 27277 entries
Wrote file ./visualization_data/doc_meta_Compliance.tsv with 27277 entries
The model Compliance consistantly classified 99.9% of the dataset

Loaded 40236 Data Points
Vectorization Complete
Explained variance of the SVD step: 50%     n_componets: 130
Mean: 0.711
STD: 0.144

Found 6038 messages of the same form.  Mean: 0.867030931293 STD: 0.0898394639109
*** * ***

Hi Erin, What we need to know is if the new law went into effect that requires the plans to be put in a wrap document for 5500 instead of being filed separately. Thank you, Sandy Intfen, RHU Benefits Senior Account Manager

Hi Erin, What we need to know is if the new law went into effect that requires the plans to be put in a wrap document for 5500 instead of being filed separately. Thank you, Sandy Intfen, RHU Benefits Senior Account Manager

Set of a POP: Group with 25 employees fully insured has some questions: Year End Testing: How does the empl

Found 2019 messages of the same form.  Mean: 0.637932021521 STD: 0.115029766795
*** Smalley Steel has an HSA plan (among other medical offerings), and the employer makes a contribution to the HSA account. They are currently entering the full HSA rate (including the employer funding) into their payroll system. 1. Is it appropriate to enter the full rate (including HSA funding) into the payroll system? 2. For W2 reporting purposes, is it okay that the full value of the HSA plan is being reported for all employees enrolled in that plan? The concern is that some people enrolled in the HSA plan have not opened up an HSA account, and are therefore not receiving the employer funding. Hence, the value of this plan for those individuals would be overstated on the W2. ***

Employee is enrolled in a PPO plan and the spouse is enrolled in a HDHP with an HSA. The employee has elected a healthcare FSA with the PPO. This is not permitted under IRS rules. Is the employer allowed to switch funds mid-ye

Found 2558 messages of the same form.  Mean: 0.60063306238 STD: 0.118102755194
*** Back in September, she was diagnosed with ovarian cancer and had a complete hysterectomy. After additional testing after the surgery, they cleared her and said they did not find additional cancer, so she did not have to follow up with radiation or chemotherapy. In January, she started feeling sick like she had the flu. She worked through the 13th of January. Starting on the 16th, she was out sick. She was out that whole week. At the end of the week, she was in the hospital and was told that she had liver cancer, but on Monday, the 23rd, her general practicioner said that she did not have cancer, but had not seen the results of the biopsy. She came to work on Wednesday the 25th, but really was not productive, did not feel well, had her head on the desk at points. At the end of the week, she found out that it was in fact cancer that had traveled to her liver. She was scheduled for additional testing and pr

Wrote file ./visualization_data/doc_vectors_Employee Benefits.tsv with 40236 entries
Wrote file ./visualization_data/doc_meta_Employee Benefits.tsv with 40236 entries
The model Employee Benefits consistantly classified 99.9% of the dataset

Loaded 11513 Data Points
Vectorization Complete
Explained variance of the SVD step: 50%     n_componets: 115
Mean: 0.697
STD: 0.139

Found 1142 messages of the same form.  Mean: 0.690753426881 STD: 0.100096439654
*** I have an employer that has an employee who has been out for almost a month. They did not initiate any FMLA paperwork because every time they talk to the employee, they say that they are coming back to work. However, they have only worked a couple of days over the last four weeks. They keep sending notes from doctors stating they can't come back to work. They have used all of their PTO and are not receiving pay. They work in an office of two people so it is impacting their ability to do business so they want to let them go and replace t

Found 831 messages of the same form.  Mean: 0.720649990995 STD: 0.101711816926
*** Employee - New Army Reservist Hello, Can someone point me in the direction of any literature that you may have regarding a current employee (firefighter) becoming a newly active Army Reservist? Such as, helpful information on leave for initial training, leave for monthly drills, and leave for deployments? Thank you! Kimberly Fugate Executive Assistant Pinellas Suncoast Fire & Rescue District 304 First Street Indian Rocks Beach, FL 33785 Phone: (727) 595-1117 ext.100 Fax: (727) 595-5879 Under Florida law, e-mail addresses are public records. If you do not want your e-mail address released in response to a public records request, do not send electronic mail to this entity. Instead, contact this office by phone or in writing. FS 668.6076. ***

Who manages FLMA, does our company work on it or is it better to contract a third party that manages just FMLA? Please advise. Thank you.

We have some questions abou

Wrote file ./visualization_data/doc_vectors_Leaves of Absence.tsv with 11513 entries
Wrote file ./visualization_data/doc_meta_Leaves of Absence.tsv with 11513 entries
The model Leaves of Absence consistantly classified 100.0% of the dataset

Loaded 4813 Data Points
Vectorization Complete
Explained variance of the SVD step: 51%     n_componets: 110
Mean: 0.725
STD: 0.128

Found 431 messages of the same form.  Mean: 0.759318088509 STD: 0.0836138309547
*** Good Morning, I am so grateful for you services that you provide for us? I have been given the opportunity to interview potential new hires. My questions are, do you have anything form or documentation on the correct question to ask during an interview? I know you cannot write comments on the application or resume. Discuss religion or ethnics. Anything to do with age. Politics'? If you have anything to assist me, I would really appreciate it. Thank you for your ongoing support. ***

I have a client that might be hiring a few employees i

Found 443 messages of the same form.  Mean: 0.683032009152 STD: 0.104667410957
*** HI, we are implementing background checks at our organization and need guidance on order of process. Do we have to offer the job and receive acceptance before we run the background check? or do can we order the background check with the proper authorization and offer the job after the background has been ran? ***

We currently do not do background checks, but we are considering implementing them. Can we set a hard and fast rule that candidates can have no felonies whatsoever to pass the background process?

What forms are we required to give a job candidate that is rejected due to their background check? We have locations in CA, NY, OR, MA, PA, VA, FL, CT, NH

When we issue a candidate a preliminary adverse action letter based upon background check results, how long do we have to give him to respond before taking final action?

Hello! Should I be stating the fact that we do background checks and pre empl

Found 821 messages of the same form.  Mean: 0.694350591754 STD: 0.0972266536123
*** References Hi, We had a potential employer for a terminated employee call one of our supervisors on the company number to ask them for "Professional References". Normally, I take all Employment verification calls but this really wasn't that type of call. How would we handle this call? From my experience we aren't supposed to give opinions on job performance, work ethic or any of those things. Can you please advise on what we should do in this case? Should they have called this person on their personal phone to ask these questions???? Have a fantastic day!!! [Carleton Gotlin] DAWN J DEPRIEST OFFICE MANAGER Carleton Gotlin Law PC 1580 Lincoln St, Suite 1200 Denver, CO 80203 T: 303.825.1125 F: 303.302.3088 www.carletongotlinlaw.com NOTICE: The information set forth in this electronic mail transmission, and any attachments, are intended solely for the person to whom it is addressed, and may contain confiden

The model Recruiting and Hiring consistantly classified 100.0% of the dataset

Loaded 6830 Data Points
Vectorization Complete
Explained variance of the SVD step: 50%     n_componets: 110
Mean: 0.699
STD: 0.149

Found 833 messages of the same form.  Mean: 0.834745727724 STD: 0.104611686039
*** Confidential. ***

Do you have access to or have any benchmark data for executive employment agreements as it relates to severance payments. Specifically for a company with around 175 employees in Tennessee. I found a Lee Hecht Harrison and a world at work study. It's difficult to compare to those since company size is important.

Are there any recent (last 24 months) severance agreement cases that provided favorable outcomes to employees with under 25 employees? Are severance agreements advised for groups that size if they have a clear employee handbook signed at hire? What states require severance?

Do you have a proprietary data form for exiting employees to sign? Currently have a non-disclosur

Found 1275 messages of the same form.  Mean: 0.563866922643 STD: 0.134597632781
*** Hello, We have a situation that Id like some guidance on: Our receptionist has been feeling stressed about work and life issues. In high emotion on 1/14/17 she said she was packing her desk and quitting. Our General Manager encouraged her to take a week of personal time to decide if she wanted to stay and work things out, or if she really wanted to quit. She did take her personal items with her that day. While she was gone, the Club functioned better than usual, without the emotion and stress that her presence created. We hoped she would decide to quit, but planned to terminate her otherwise. We called her today before she reported for her return shift and she informed us that she did indeed decide to retire. She intended to come in to work out her notice, but we told her that we think it is best to end things immediately and we would pay her for the two weeks in lieu of her coming in. We have not done 

Wrote file ./visualization_data/doc_vectors_Terminations.tsv with 6830 entries
Wrote file ./visualization_data/doc_meta_Terminations.tsv with 6830 entries
The model Terminations consistantly classified 100.0% of the dataset

