In [44]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import  AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OutputCodeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.metrics import accuracy_score

from sklearn.decomposition import TruncatedSVD

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
import os

import pymysql
import json

config_fn = './config.json'

print("Import Complete")

Import Complete


In [45]:
def connect(config):
    return pymysql.connect(
        host=config['ai_db_host'],  # Database host
        port=config['ai_db_port'],  # Database port
        user=config['ai_db_username'],  # Database user
        passwd=config['ai_db_password'],  # Database password
        db=config['ai_db_name'],  # Database name
        connect_timeout=5,
        cursorclass=pymysql.cursors.DictCursor
    )

def pull_data():
    with open(config_fn, "r") as f:
        config = json.loads(f.read())
    conn = connect(config)
    sql_1 = "SELECT rowId, question, category FROM cleanHotlineQuestionAnswer;"
    with conn.cursor() as cursor:
        cursor.execute(sql_1)
    result = cursor.fetchall()
    cursor.close()
    return result

In [46]:
def split_set(x, y, test_size):
    X_train, X_test, y_train, y_test = train_test_split(list(x),
                                                        list(y),
                                                        test_size=test_size)
    return X_train, X_test, y_train, y_test

def train_models(df, v=False):
    
    X_train, X_test, y_train, y_test = split_set(df.features, df.category, 0.2)

    names = ["NerualNet",  
             "DecisionTree", "OneVsRestClassifier", 
             "OneVsOneClassifier", "OutputCodeClassifier",  
             ]
    clfs = [MLPClassifier( max_iter=500),
            
                DecisionTreeClassifier(),
            OneVsRestClassifier( MLPClassifier( max_iter=500)),
                
           OneVsOneClassifier(DecisionTreeClassifier()),
           OutputCodeClassifier(MLPClassifier( max_iter=500)),
            ]
    accuracies = []
    for clf, name in zip(clfs, names):
        clf.fit(X_train, y_train)
        y_predict = clf.predict(X_test)
        '''
        print(y_test)
        print("y_predict")
        print(y_predict)
        
        y_test2 = np.asarray(y_test)
        y_test2 = y_test2.dot(ohe.active_features_).astype(int)
        y_predict = np.asarray(y_predict.dot(ohe.active_features_).astype(int))
        #print(y_test2, y_predict)
        '''
        score = accuracy_score(y_test, y_predict)
        if v == True:
            print("The accuracy for {} is {}".format(name, score))
        accuracies.append(score)
    return np.mean(accuracies)

In [47]:
def build_features(df):
    vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.7 )
    X_vectoizer = vectorizer.fit_transform(list(df.question))
    print("Vectorization Complete")

    n_components = 50
    explained_variance = 0.0
    while explained_variance < .5 and n_components < 175:
        svd = TruncatedSVD(n_components=n_components)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X_vectoizer)
        #df["features"] = list(X)
        explained_variance = svd.explained_variance_ratio_.sum()
        n_components += 5

        print("Explained variance of the SVD step: {}%     n_componets: {}".format(
            int(explained_variance * 100), n_components))
    return X


In [48]:
df = pd.DataFrame(pull_data())
df = df[:50000]
print(df.head())
print("Loaded {} Data Points".format(len(df)))

df["features"] = list(build_features(df))

acc = train_models(df, v=True)


            category                                           question  rowId
0         Compliance  I have both a men's and a women's restrooms bu...      1
1  Employee Benefits  I have had an employee ask if their dependent ...      2
2  Employee Benefits  As a broker do we need a business associate ag...      3
3  Employee Benefits  The company I work for currently offers a simp...      4
4         Compliance  Hello, I have been asked to create a project f...      5
Loaded 50000 Data Points
Vectorization Complete
Explained variance of the SVD step: 29%     n_componets: 55
Explained variance of the SVD step: 31%     n_componets: 60
Explained variance of the SVD step: 33%     n_componets: 65
Explained variance of the SVD step: 34%     n_componets: 70
Explained variance of the SVD step: 36%     n_componets: 75
Explained variance of the SVD step: 38%     n_componets: 80
Explained variance of the SVD step: 39%     n_componets: 85
Explained variance of the SVD step: 40%     n_componets: 9