In [1]:
from data_preprocessing import DataProcessor
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import numpy as np
import pandas as pd

In [36]:
def get_cv_results(clf,kf,inputDF,df):
    """Cross-validate ``clf`` and print per-class F1, precision and recall.

    For every (train, test) pair yielded by ``kf.split(inputDF)`` the
    classifier is refit on the training rows and scored on the held-out rows
    with ``average=None``, i.e. one score per class.

    Parameters
    ----------
    clf : object with ``fit(X, y)`` and ``predict(X)``.
    kf : object whose ``split(X)`` yields (train, test) arrays of row
        positions (the notebook passes a ``KFold``).
    inputDF : feature matrix that can be indexed with an array of row
        positions.
    df : container whose ``'Priority'`` entry holds the target labels, in the
        same row order as ``inputDF``.

    Returns
    -------
    tuple of three lists ``(f1, precision, recall)``; each list holds one
    per-class score array per fold.
    """

    def print_pretty(title,scores):
        # Single-argument print(...) calls emit the same text under the
        # Python 2 print statement and the Python 3 print function.
        print('%s :' % (title,))
        print('Average\tStandard Deviation')
        print('%s \t%s' % (np.mean(scores), np.std(scores)))
        print('P1\t\tP2\tP3\t\tP4\t\tP5')
        print('Mean:')
        print(np.mean(scores,axis=0))
        print('CV:')
        for f in scores:
            print(f)
        return

    # The splitter yields row *positions*. Indexing a plain array keeps the
    # lookup positional even if df carries a non-default index (for example
    # after rows were dropped), where df['Priority'][train] would look the
    # positions up as labels instead.
    targets = np.asarray(df['Priority'])
    # Score the same classes in the same order in every fold, so each fold
    # contributes an equally long array even if a class is absent from it.
    labels = np.unique(targets)

    cvscores = []
    p_scores = []
    r_scores = []
    for train,test in kf.split(inputDF):
        train_input = inputDF[train]
        test_input = inputDF[test]
        train_output = targets[train]
        test_output = targets[test]
        clf.fit(train_input,train_output)
        prediction = clf.predict(test_input)
        cvscores.append(f1_score(test_output, prediction, average=None, labels=labels))
        p_scores.append(precision_score(test_output, prediction, average=None, labels=labels))
        r_scores.append(recall_score(test_output, prediction, average=None, labels=labels))
    print('Cross-Validation Results')
    print_pretty('F1 Scores',cvscores)
    print('')
    print_pretty('Precision Scores',p_scores)
    print('')
    print_pretty('Recall Scores',r_scores)
    return cvscores, p_scores, r_scores

In [33]:
def get_test_result(clf, trainDF, df, testDF, df1):
    """Fit ``clf`` on the training data and report per-class test scores.

    The classifier is fit on ``trainDF`` against ``df['Priority']``, used to
    predict ``testDF``, and the predictions are scored against
    ``df1['Priority']`` with ``average=None`` (one score per class). F1,
    precision and recall are printed and returned.

    Parameters
    ----------
    clf : object with ``fit(X, y)`` and ``predict(X)``.
    trainDF : training feature matrix.
    df : container whose ``'Priority'`` entry holds the training labels.
    testDF : test feature matrix.
    df1 : container whose ``'Priority'`` entry holds the test labels.

    Returns
    -------
    tuple ``(f1, precision, recall)`` of per-class score arrays.
    """

    def print_pretty(title,score):
        # Single-argument print(...) calls emit the same text under the
        # Python 2 print statement and the Python 3 print function.
        print('%s :' % (title,))
        print('Average: %s' % (np.mean(score),))
        print('P1\tP2\tP3\tP4\tP5')
        print(score)
        return

    clf.fit(trainDF, df['Priority'])
    prediction = clf.predict(testDF)
    f1 = f1_score(df1['Priority'], prediction, average=None)
    p = precision_score(df1['Priority'], prediction, average=None)
    r = recall_score(df1['Priority'], prediction, average=None)
    print('TEST RESULTS:')
    print_pretty('F1',f1)
    print('')
    print_pretty('Precision',p)
    print('')
    print_pretty('Recall',r)
    return f1,p,r

In [17]:
# Load the pre-processed train and test splits from disk.
df_train, df_test = (
    pd.read_csv('../data/processed/train_processed.csv'),
    pd.read_csv('../data/processed/test_processed.csv'),
)

In [18]:
# Build the features with the project's DataProcessor. The processor is
# fitted on the training frame only and then re-used, already fitted, on the
# test frame, so the order of the two calls below matters.
p = DataProcessor()
# Each call returns a pair: the first item is passed to the classifiers as the
# feature matrix, the second is the frame whose 'Priority' column is read as
# the target. NOTE(review): the logged output suggests the feature matrix is
# sparse -- confirm in data_preprocessing.
inputDF, df = p.fit_transform(df_train)
testDF, df1 = p.transform(df_test)

removed nan values
count vectorizer finished fitting
count vector finished transforming
dummy variables created
created a sparse matrix of all features
count vector finished transforming
dummy variables created
created a sparse matrix of all features


In [19]:
# Fixed seed so the random forest gives the same result on every run.
SEED = 42

# Candidate classifiers.
rf = RandomForestClassifier(class_weight="balanced", n_jobs=-1, random_state=SEED)
nb = MultinomialNB()
svc = SVC()

# 10-fold splitter; everything except n_splits is left at its default.
kf = KFold(n_splits=10)

## Cross-validation Results

In [24]:
# Cross-validate the MultinomialNB model (nb) over the kf folds and keep the
# per-fold, per-class F1 / precision / recall lists; %time reports how long
# the whole run took.
%time f1, precision, recall = get_cv_results(nb,kf,inputDF,df)

F1 Scores :
Average	Standard Deviation
0.206922714903 	0.203746442485
P1		P2	P3		P4		P5
Mean:
[ 0.1643758   0.09933404  0.60543734  0.12834093  0.03712546]
CV:
[ 0.15804598  0.09156194  0.60387023  0.12365931  0.04059329]
[ 0.16285714  0.08830744  0.60404213  0.12228797  0.0244898 ]
[ 0.16835017  0.12195122  0.6040446   0.1218593   0.03314002]
[ 0.15882353  0.10403587  0.61451354  0.14303639  0.04278922]
[ 0.17228104  0.10172414  0.60970544  0.14105793  0.02689076]
[ 0.15194869  0.09411765  0.60151229  0.121673    0.05190592]
[ 0.17552182  0.11330472  0.61178461  0.12475634  0.04848485]
[ 0.16336634  0.09515718  0.59800475  0.11485643  0.04205607]
[ 0.16839135  0.08591065  0.60072079  0.12659847  0.02975207]
[ 0.1641719   0.09726962  0.60617505  0.14362416  0.03115265]

Precision Scores :
Average	Standard Deviation
0.243620796339 	0.341869421559
P1		P2	P3		P4		P5
Mean:
[ 0.09391629  0.10556979  0.92473778  0.07313659  0.02074354]
CV:
[ 0.09011469  0.10282258  0.92663755  0.07035176  0.

## Results on Test Data

In [35]:
%time _,_,_ = get_test_result(clf=nb,trainDF=inputDF,df=df,testDF=testDF,df1=df1)

TEST RESULTS:
F1 :
Average: 0.210103322556
P1	P2	P3	P4	P5
[ 0.16997836  0.11461908  0.6032767   0.1261485   0.03649397]

Precision :
Average: 0.246429501152
P1	P2	P3	P4	P5
[ 0.09738503  0.12110225  0.92172505  0.0715493   0.02038588]

Recall :
Average: 0.386253721582
P1	P2	P3	P4	P5
[ 0.66769706  0.10879479  0.44836895  0.53249476  0.17391304]
CPU times: user 356 ms, sys: 0 ns, total: 356 ms
Wall time: 358 ms


(20761, 40)