In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

# Naive Bayes

In [4]:
# Experiment parameters and their types, in the order they are listed:
#   Dataset                                 - String
#   Split                                   - Float (0-1)
#   Train set                               - List
#   Test set                                - List
#   Include x% of test participant in train - Float (0-1)
#   Feature Set                             - String
#   Preprocessing                           - String
#   Algorithm                               - String

In [5]:
def _master_csv_path(dataset, item):
    """Return the CSV path holding participant `item` for a dataset variant."""
    if dataset == 'regular':
        return 'data/Master/' + item + '_master.csv'
    if dataset == 'smooth':
        return 'data/Master_Smooth/' + item + '_master_smooth.csv'
    # Every other value selects the windowed files (same fallback as before).
    return 'data/Master_Window/' + item + '_master_window.csv'


def _stack_frames(frames):
    """Stack frames row-wise (row labels kept); an empty list gives an empty frame."""
    if not frames:
        return pd.DataFrame()
    # One concat instead of repeated DataFrame.append (removed in pandas 2.0,
    # and quadratic when used inside a loop).
    return pd.concat(frames, sort=False)


def prepare_data(dataset, split, train, test, includex, features, preprocessing):
    """Load participant CSVs and build train/test features and labels.

    Parameters
    ----------
    dataset : str
        'regular' or 'smooth'; any other value selects the windowed files.
    split : float
        If > 0, a single participant file is loaded and cut by row position:
        the first int(split * n_rows) rows are the training set, the rest the
        test set. `test` and `includex` are not used in this mode. The file
        loaded is that of the first participant in `train` (the previous code
        referenced an unassigned variable here and raised NameError).
    train : list of str
        Participant ids whose files form the training set (split <= 0).
    test : list of str
        Participant ids whose files form the test set (split <= 0).
    includex : float
        If > 0 (and split <= 0), the first int(includex * n_rows) rows of
        test[0] are added to the training set.
    features : str
        'variance_threshold' applies VarianceThreshold fitted on the training
        features; any other value keeps every column.
    preprocessing : str
        'standard' -> StandardScaler, 'minmax' -> MinMaxScaler, anything else
        -> no scaling. Scalers are fitted on the training features only.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature matrices as returned by SimpleImputer.transform (NaN replaced
        by 0) and the 'engagement' columns of the train/test frames.

    Raises
    ------
    ValueError
        If split > 0 and `train` is empty.
    """
    if split > 0:
        if not train:
            raise ValueError("split > 0 requires at least one participant in `train`")
        tdf = pd.read_csv(_master_csv_path(dataset, train[0]))
        length = int(split * len(tdf))
        train_df = tdf.iloc[:length, :]
        test_df = tdf.iloc[length:, :]
    else:
        # Training data: every participant listed in `train`.
        train_frames = [pd.read_csv(_master_csv_path(dataset, item)) for item in train]

        # Optionally add the leading x% of the first test participant.
        # NOTE(review): those rows also remain in the test set below, so the
        # reported metrics include samples seen during training - confirm
        # this is intended.
        if includex > 0:
            tdf = pd.read_csv(_master_csv_path(dataset, test[0]))
            length = int(includex * len(tdf))
            train_frames.append(tdf.iloc[:length, :])

        train_df = _stack_frames(train_frames)

        # Test data: every participant listed in `test`.
        test_df = _stack_frames(
            [pd.read_csv(_master_csv_path(dataset, item)) for item in test]
        )

    y_train = train_df['engagement']
    y_test = test_df['engagement']
    X_train = train_df.drop(['engagement'], axis=1)
    X_test = test_df.drop(['engagement'], axis=1)

    # The steps below work on positional arrays, so the test columns must be
    # in exactly the training order. Columns missing from the test files
    # become NaN here and are zero-filled by the imputer at the end.
    X_test = X_test.reindex(columns=X_train.columns)

    if features == 'variance_threshold':
        # p * (1 - p) with p = 0.8, i.e. the variance of a 0/1 feature that
        # takes the same value in 80% of the samples.
        selector = VarianceThreshold(threshold=(.8 * (1 - .8)))
        selector.fit(X_train)
        X_train = selector.transform(X_train)
        X_test = selector.transform(X_test)

    if preprocessing == 'standard':
        scaler = StandardScaler()
    elif preprocessing == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = None
    if scaler is not None:
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

    # Remaining missing values are replaced by the constant 0.
    imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
    imputer.fit(X_train)
    X_train = imputer.transform(X_train)
    X_test = imputer.transform(X_test)

    return X_train, X_test, y_train, y_test

In [6]:
def model(X_train, X_test, y_train, y_test, algorithm):
    """Fit a classifier on the training data and print test-set metrics.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test labels.
    algorithm : str
        'logistic' -> LogisticRegression(solver='lbfgs'),
        'naivebayes' -> GaussianNB(var_smoothing=1e-8),
        anything else -> LinearSVC().

    Prints the accuracy, the classification report, the confusion matrix and
    the ROC AUC. Nothing is returned.
    """
    if algorithm == 'logistic':
        clf = LogisticRegression(solver='lbfgs')
    elif algorithm == 'naivebayes':
        clf = GaussianNB(var_smoothing=1e-8)
    else:
        clf = LinearSVC()

    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    # Continuous scores for the AUC: use the decision function when the
    # estimator offers one, otherwise the probability in column 1 of
    # predict_proba. Checking for the method (instead of a bare `except:`)
    # lets genuine errors raised while scoring surface.
    if hasattr(clf, 'decision_function'):
        scores = clf.decision_function(X_test)
    else:
        scores = np.asarray(clf.predict_proba(X_test))[:, 1]

    print("Accuracy:", metrics.accuracy_score(y_test, pred))
    print(metrics.classification_report(y_test, pred))
    print(metrics.confusion_matrix(y_test, pred))
    try:
        print("AUC:", roc_auc_score(y_test, scores))
    except ValueError:
        # roc_auc_score raises ValueError when y_test holds a single class;
        # any other exception type is no longer swallowed.
        print("AUC undefinied, only 1 class in test data")

### Configuration:
- dataset = ['smooth','window']

- split = 0

- train = [p5,p7]

- test = [p9]

- includex = [0,0.2]

- features = ['all']

- preprocessing = ['standard','no']

- Algorithm = Naive Bayes

In [7]:
%%time
for dataset in ['smooth','window']:
    for includex in [0,0.2]:
            for preprocessing in ['standard','no']:
                print(dataset,includex,preprocessing)
                X_train, X_test, y_train, y_test = prepare_data(dataset,0,['p5','p7'],['p9'],includex,'all',preprocessing)
                model(X_train, X_test, y_train, y_test,'naivebayes')
                print("\n")

('smooth', 0, 'standard')


  return self.partial_fit(X, y)


('Accuracy:', 0.7103273596698564)
              precision    recall  f1-score   support

           0       0.89      0.51      0.65    191811
           1       0.63      0.93      0.75    174575

   micro avg       0.71      0.71      0.71    366386
   macro avg       0.76      0.72      0.70    366386
weighted avg       0.77      0.71      0.70    366386

[[ 97405  94406]
 [ 11726 162849]]
('AUC:', 0.8066723789463343)


('smooth', 0, 'no')
('Accuracy:', 0.7252460519779685)
              precision    recall  f1-score   support

           0       0.87      0.56      0.68    191811
           1       0.65      0.91      0.76    174575

   micro avg       0.73      0.73      0.73    366386
   macro avg       0.76      0.73      0.72    366386
weighted avg       0.77      0.73      0.72    366386

[[107098  84713]
 [ 15953 158622]]
('AUC:', 0.8359379959215113)


('smooth', 0.2, 'standard')
('Accuracy:', 0.7133351165164608)
              precision    recall  f1-score   support

         

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


('Accuracy:', 0.7428641631516442)
              precision    recall  f1-score   support

         0.0       0.86      0.61      0.71     12792
         1.0       0.67      0.89      0.77     11627

   micro avg       0.74      0.74      0.74     24419
   macro avg       0.77      0.75      0.74     24419
weighted avg       0.77      0.74      0.74     24419

[[ 7774  5018]
 [ 1261 10366]]
('AUC:', 0.8294257228799305)


('window', 0, 'no')
('Accuracy:', 0.4786027273844138)
              precision    recall  f1-score   support

         0.0       0.62      0.01      0.02     12792
         1.0       0.48      0.99      0.64     11627

   micro avg       0.48      0.48      0.48     24419
   macro avg       0.55      0.50      0.33     24419
weighted avg       0.55      0.48      0.32     24419

[[  156 12636]
 [   96 11531]]
('AUC:', 0.635715886573987)


('window', 0.2, 'standard')
('Accuracy:', 0.7568286989639216)
              precision    recall  f1-score   support

         0.0      

### Configuration:
- dataset = ['smooth','window']

- split = 0

- train = [p7,p9]

- test = [p5]

- includex = [0,0.2]

- features = ['all']

- preprocessing = ['standard','no']

- Algorithm = Naive Bayes

In [8]:
%%time
for dataset in ['smooth','window']:
    for includex in [0,0.2]:
            for preprocessing in ['standard','no']:
                print(dataset,includex,preprocessing)
                X_train, X_test, y_train, y_test = prepare_data(dataset,0,['p7','p9'],['p5'],includex,'all',preprocessing)
                model(X_train, X_test, y_train, y_test,'naivebayes')
                print("\n")

('smooth', 0, 'standard')




('Accuracy:', 0.8660653914924947)
              precision    recall  f1-score   support

           0       0.93      0.68      0.79     95873
           1       0.84      0.97      0.90    170339

   micro avg       0.87      0.87      0.87    266212
   macro avg       0.89      0.83      0.84    266212
weighted avg       0.87      0.87      0.86    266212

[[ 65189  30684]
 [  4971 165368]]
('AUC:', 0.8831595974534097)


('smooth', 0, 'no')
('Accuracy:', 0.8595330037714303)
              precision    recall  f1-score   support

           0       0.94      0.65      0.77     95873
           1       0.83      0.98      0.90    170339

   micro avg       0.86      0.86      0.86    266212
   macro avg       0.89      0.81      0.83    266212
weighted avg       0.87      0.86      0.85    266212

[[ 62242  33631]
 [  3763 166576]]
('AUC:', 0.913991653860679)


('smooth', 0.2, 'standard')
('Accuracy:', 0.8652727901071326)
              precision    recall  f1-score   support

          

### Configuration:
- dataset = ['smooth','window']

- split = 0

- train = [p5,p9]

- test = [p7]

- includex = [0,0.2]

- features = ['all']

- preprocessing = ['standard','no']

- Algorithm = Naive Bayes

In [9]:
%%time
for dataset in ['smooth','window']:
    for includex in [0,0.2]:
            for preprocessing in ['standard','no']:
                print(dataset,includex,preprocessing)
                X_train, X_test, y_train, y_test = prepare_data(dataset,0,['p5','p9'],['p7'],includex,'all',preprocessing)
                model(X_train, X_test, y_train, y_test,'naivebayes')
                print("\n")

('smooth', 0, 'standard')




('Accuracy:', 0.807582089203796)
              precision    recall  f1-score   support

           0       0.91      0.68      0.78    173189
           1       0.74      0.93      0.83    169487

   micro avg       0.81      0.81      0.81    342676
   macro avg       0.83      0.81      0.80    342676
weighted avg       0.83      0.81      0.80    342676

[[118291  54898]
 [ 11039 158448]]
('AUC:', 0.8844101870669858)


('smooth', 0, 'no')
('Accuracy:', 0.7899269280603252)
              precision    recall  f1-score   support

           0       0.91      0.65      0.76    173189
           1       0.72      0.94      0.82    169487

   micro avg       0.79      0.79      0.79    342676
   macro avg       0.82      0.79      0.79    342676
weighted avg       0.82      0.79      0.79    342676

[[111727  61462]
 [ 10525 158962]]
('AUC:', 0.8898446460960444)


('smooth', 0.2, 'standard')
('Accuracy:', 0.8011386849385426)
              precision    recall  f1-score   support

          