In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

# Logistic Regression

In [4]:
# Experiment parameters (in order) and their types:
#   dataset: str | split: float in [0, 1] | train set: list | test set: list |
#   include x% of test participant in train: float in [0, 1] |
#   feature set: str | preprocessing: str | algorithm: str

In [5]:
def _participant_csv_path(dataset, item):
    """Return the CSV path of participant `item` for the given `dataset` variant."""
    if dataset == 'regular':
        return 'data/Master/' + item + '_master.csv'
    if dataset == 'smooth':
        return 'data/Master_Smooth/' + item + '_master_smooth.csv'
    # Every other value selects the windowed files.
    return 'data/Master_Window/' + item + '_master_window.csv'


def _load_participants(dataset, items):
    """Read one DataFrame per participant id in `items`, preserving order."""
    return [pd.read_csv(_participant_csv_path(dataset, item)) for item in items]


def _stack_frames(frames, role):
    """Concatenate `frames` row-wise; `role` only labels the error message."""
    if not frames:
        raise ValueError("no participant data available for the %s set" % role)
    # sort=False keeps columns in first-seen order and silences the pandas
    # FutureWarning about the changing default when columns are not aligned.
    return pd.concat(frames, sort=False)


def prepare_data(dataset, split, train, test, includex, features, preprocessing):
    """Load participant CSVs and build feature matrices and labels.

    Parameters
    ----------
    dataset : str
        'regular' reads ``data/Master/<id>_master.csv``, 'smooth' reads
        ``data/Master_Smooth/<id>_master_smooth.csv``; any other value reads
        ``data/Master_Window/<id>_master_window.csv``.
    split : float
        When > 0, every participant listed in `train` is cut at row
        ``int(split * n_rows)``: the leading rows go to the training set and
        the remaining rows to the test set. `test` and `includex` are not
        used in this mode. When <= 0, `train` and `test` name separate
        participants.
    train : list of str
        Participant ids used for training.
    test : list of str
        Participant ids used for testing (participant mode only).
    includex : float
        When > 0 (participant mode only), the first ``int(includex * n_rows)``
        rows of ``test[0]`` are added to the training data.
    features : str
        'variance_threshold' applies ``VarianceThreshold``; any other value
        keeps all columns.
    preprocessing : str
        'standard' applies ``StandardScaler``, 'minmax' applies
        ``MinMaxScaler``; any other value applies no scaling.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The feature matrices as returned by ``SimpleImputer.transform`` and
        the 'engagement' columns of the train and test frames.

    Raises
    ------
    ValueError
        If no participant data is available for the train or test set.
    """
    if split > 0:
        # The previous implementation read `item` before it was assigned in
        # this branch, so it always failed; split each `train` participant.
        train_parts = []
        test_parts = []
        for tdf in _load_participants(dataset, train):
            length = int(split * len(tdf))
            train_parts.append(tdf.iloc[:length, :])
            test_parts.append(tdf.iloc[length:, :])
        train_df = _stack_frames(train_parts, 'train')
        test_df = _stack_frames(test_parts, 'test')
    else:
        train_frames = _load_participants(dataset, train)
        test_frames = _load_participants(dataset, test)

        if includex > 0:
            # Only the first test participant contributes rows to training.
            # NOTE(review): these rows also remain in the test set below, so
            # they are evaluated after being trained on -- confirm intended.
            head = test_frames[0]
            length = int(includex * len(head))
            train_frames.append(head.iloc[:length, :])

        train_df = _stack_frames(train_frames, 'train')
        test_df = _stack_frames(test_frames, 'test')

    y_train = train_df['engagement']
    y_test = test_df['engagement']
    X_train = train_df.drop(['engagement'], axis=1)
    X_test = test_df.drop(['engagement'], axis=1)
    # The transformers below are fitted on the training columns, so present
    # the test columns in exactly the same order; columns missing from the
    # test data become NaN and are filled by the imputer at the end.
    X_test = X_test.reindex(columns=X_train.columns)

    if features == 'variance_threshold':
        # .8 * (1 - .8) is p * (1 - p) with p = 0.8, presumably taken from the
        # scikit-learn feature-selection guide's Bernoulli example.
        selector = VarianceThreshold(threshold=(.8 * (1 - .8)))
        selector.fit(X_train)
        X_train = selector.transform(X_train)
        X_test = selector.transform(X_test)

    scaler = None
    if preprocessing == 'standard':
        scaler = StandardScaler()
    elif preprocessing == 'minmax':
        scaler = MinMaxScaler()
    if scaler is not None:
        # Fit on training data only, then apply the same scaling to both sets.
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

    # Missing values are replaced with 0 after any scaling step.
    imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
    imputer.fit(X_train)
    X_train = imputer.transform(X_train)
    X_test = imputer.transform(X_test)

    return X_train, X_test, y_train, y_test

In [6]:
def model(X_train, X_test, y_train, y_test, algorithm):
    """Fit a classifier, print its evaluation on the test data, return metrics.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for fitting and evaluation.
    y_train, y_test
        Labels matching `X_train` and `X_test`.
    algorithm : str
        'logistic' selects ``LogisticRegression(solver='lbfgs')``,
        'naivebayes' selects ``GaussianNB``; any other value selects
        ``LinearSVC``.

    Returns
    -------
    dict
        ``{'accuracy': float, 'auc': float or None}``; 'auc' is None when
        ``roc_auc_score`` rejects the inputs with a ``ValueError``.
    """
    if algorithm == 'logistic':
        clf = LogisticRegression(solver='lbfgs')
    elif algorithm == 'naivebayes':
        clf = GaussianNB()
    else:
        clf = LinearSVC()

    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    # Use the decision function when the estimator has one, otherwise the
    # probability of the second class. Checking for the method, rather than
    # catching every exception, lets real failures surface.
    if hasattr(clf, 'decision_function'):
        scores = clf.decision_function(X_test)
    else:
        scores = clf.predict_proba(X_test)[:, 1]

    accuracy = metrics.accuracy_score(y_test, pred)
    print("Accuracy:", accuracy)
    print(metrics.classification_report(y_test, pred))
    print(metrics.confusion_matrix(y_test, pred))

    auc = None
    try:
        auc = roc_auc_score(y_test, scores)
    except ValueError:
        # roc_auc_score raises ValueError when y_test holds a single class.
        print("AUC undefinied, only 1 class in test data")
    else:
        print("AUC:", auc)

    return {'accuracy': accuracy, 'auc': auc}

### Configuration:
- dataset = ['smooth','window']

- split = 0

- train = [p5,p7]

- test = [p9]

- includex = [0,0.2]

- features = ['all']

- preprocessing = ['standard','no']

- Algorithm = Logistic Regression

In [7]:
%%time
# Train on p5 and p7, evaluate on p9, for every combination of dataset
# variant, included test fraction and preprocessing (logistic regression).
for dataset, includex, preprocessing in (
    (variant, fraction, scaling)
    for variant in ['smooth', 'window']
    for fraction in [0, 0.2]
    for scaling in ['standard', 'no']
):
    print(dataset,includex,preprocessing)
    X_train, X_test, y_train, y_test = prepare_data(
        dataset, 0, ['p5', 'p7'], ['p9'], includex, 'all', preprocessing)
    model(X_train, X_test, y_train, y_test, 'logistic')
    print("\n")

('smooth', 0, 'standard')


  return self.partial_fit(X, y)


('Accuracy:', 0.769933894854061)
              precision    recall  f1-score   support

           0       0.73      0.89      0.80    191811
           1       0.84      0.64      0.73    174575

   micro avg       0.77      0.77      0.77    366386
   macro avg       0.79      0.76      0.76    366386
weighted avg       0.78      0.77      0.77    366386

[[170880  20931]
 [ 63362 111213]]
('AUC:', 0.8804615392840552)


('smooth', 0, 'no')
('Accuracy:', 0.7099643545331973)
              precision    recall  f1-score   support

           0       0.77      0.64      0.70    191811
           1       0.67      0.79      0.72    174575

   micro avg       0.71      0.71      0.71    366386
   macro avg       0.72      0.71      0.71    366386
weighted avg       0.72      0.71      0.71    366386

[[122523  69288]
 [ 36977 137598]]
('AUC:', 0.7713753144482804)


('smooth', 0.2, 'standard')
('Accuracy:', 0.7929806269890225)
              precision    recall  f1-score   support

          

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


('Accuracy:', 0.7900814939186699)
              precision    recall  f1-score   support

         0.0       0.75      0.91      0.82     12792
         1.0       0.87      0.66      0.75     11627

   micro avg       0.79      0.79      0.79     24419
   macro avg       0.81      0.78      0.78     24419
weighted avg       0.80      0.79      0.79     24419

[[11613  1179]
 [ 3947  7680]]
('AUC:', 0.8965425760370034)


('window', 0, 'no')
('Accuracy:', 0.579671567222245)
              precision    recall  f1-score   support

         0.0       0.59      0.65      0.62     12792
         1.0       0.57      0.50      0.53     11627

   micro avg       0.58      0.58      0.58     24419
   macro avg       0.58      0.58      0.58     24419
weighted avg       0.58      0.58      0.58     24419

[[8326 4466]
 [5798 5829]]
('AUC:', 0.636388472885)


('window', 0.2, 'standard')
('Accuracy:', 0.8141611040583152)
              precision    recall  f1-score   support

         0.0       0.78   

### Configuration:
- dataset = ['smooth','window']

- split = 0

- train = [p7,p9]

- test = [p5]

- includex = [0,0.2]

- features = ['all']

- preprocessing = ['standard','no']

- Algorithm = Logistic Regression

In [8]:
%%time
# Train on p7 and p9, evaluate on p5, for every combination of dataset
# variant, included test fraction and preprocessing (logistic regression).
for dataset, includex, preprocessing in (
    (variant, fraction, scaling)
    for variant in ['smooth', 'window']
    for fraction in [0, 0.2]
    for scaling in ['standard', 'no']
):
    print(dataset,includex,preprocessing)
    X_train, X_test, y_train, y_test = prepare_data(
        dataset, 0, ['p7', 'p9'], ['p5'], includex, 'all', preprocessing)
    model(X_train, X_test, y_train, y_test, 'logistic')
    print("\n")

('smooth', 0, 'standard')




('Accuracy:', 0.8522493351163735)
              precision    recall  f1-score   support

           0       0.80      0.79      0.79     95873
           1       0.88      0.89      0.88    170339

   micro avg       0.85      0.85      0.85    266212
   macro avg       0.84      0.84      0.84    266212
weighted avg       0.85      0.85      0.85    266212

[[ 76127  19746]
 [ 19587 150752]]
('AUC:', 0.9183215137031264)


('smooth', 0, 'no')
('Accuracy:', 0.855288266494373)
              precision    recall  f1-score   support

           0       0.92      0.66      0.77     95873
           1       0.83      0.97      0.90    170339

   micro avg       0.86      0.86      0.86    266212
   macro avg       0.87      0.81      0.83    266212
weighted avg       0.86      0.86      0.85    266212

[[ 63157  32716]
 [  5808 164531]]
('AUC:', 0.8370564906246806)


('smooth', 0.2, 'standard')
('Accuracy:', 0.8524521809685514)
              precision    recall  f1-score   support

          

### Configuration:
- dataset = ['smooth','window']

- split = 0

- train = [p5,p9]

- test = [p7]

- includex = [0,0.2]

- features = ['all']

- preprocessing = ['standard','no']

- Algorithm = Logistic Regression

In [9]:
%%time
# Train on p5 and p9, evaluate on p7, for every combination of dataset
# variant, included test fraction and preprocessing (logistic regression).
for dataset, includex, preprocessing in (
    (variant, fraction, scaling)
    for variant in ['smooth', 'window']
    for fraction in [0, 0.2]
    for scaling in ['standard', 'no']
):
    print(dataset,includex,preprocessing)
    X_train, X_test, y_train, y_test = prepare_data(
        dataset, 0, ['p5', 'p9'], ['p7'], includex, 'all', preprocessing)
    model(X_train, X_test, y_train, y_test, 'logistic')
    print("\n")

('smooth', 0, 'standard')




('Accuracy:', 0.8215807351550736)
              precision    recall  f1-score   support

           0       0.87      0.76      0.81    173189
           1       0.78      0.89      0.83    169487

   micro avg       0.82      0.82      0.82    342676
   macro avg       0.83      0.82      0.82    342676
weighted avg       0.83      0.82      0.82    342676

[[131273  41916]
 [ 19224 150263]]
('AUC:', 0.9025424082767256)


('smooth', 0, 'no')
('Accuracy:', 0.7536419241499259)
              precision    recall  f1-score   support

           0       0.87      0.60      0.71    173189
           1       0.69      0.91      0.79    169487

   micro avg       0.75      0.75      0.75    342676
   macro avg       0.78      0.76      0.75    342676
weighted avg       0.78      0.75      0.75    342676

[[103710  69479]
 [ 14942 154545]]
('AUC:', 0.8300288952101154)


('smooth', 0.2, 'standard')
('Accuracy:', 0.826646745030291)
              precision    recall  f1-score   support

          