In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
#scikit imports
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import classification_report, accuracy_score
#classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import  AdaBoostClassifier
# class balancing
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings("ignore")

In [2]:
#Load train and test datasets
train_Data = pd.read_csv('training_new_data.csv')
test_Data = pd.read_csv('data/test_data.csv')

In [3]:
#Now let's use both CD4-t0 and VL-t0 to predict Resp
featureSet = ["VL.t0","CD4.t0","rtlength", "pr_A", "pr_C","pr_G", 
              "pr_R", "pr_T","pr_Y", "PR_GC","RT_A", "RT_C","RT_G","RT_R", "RT_T", "RT_Y", "RT_GC"]
# featureSet = ["VL.t0":"RT_GC"]
X = train_Data[featureSet]
y = train_Data.Resp
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Running 4 models

In [6]:
# define scoring method
scoring = 'accuracy'

# Define models 
names = ["Neural Net","SVM Linear", "AdaBoost", "Logistic Regression"]

classifiers = [
     MLPClassifier(alpha=1,batch_size=30),
     SVC(kernel = 'linear',), 
     AdaBoostClassifier(),
     LogisticRegression(C=8.0, verbose=5, solver='lbfgs')
]
seed = 1
models = zip(names, classifiers)

# evaluate each model
results = []
names = []

for name, model in models:
    kfold = KFold(n_splits=10, random_state = seed)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    print('--------------------------------------------------')
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(name)
    print(accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))
    print('--------------------------------------------------')

Neural Net: 0.708941 (0.089829)
--------------------------------------------------
Neural Net
0.8478260869565217
              precision    recall  f1-score   support

           0       0.89      0.93      0.91       151
           1       0.59      0.48      0.53        33

   micro avg       0.85      0.85      0.85       184
   macro avg       0.74      0.71      0.72       184
weighted avg       0.84      0.85      0.84       184

--------------------------------------------------
SVM Linear: 0.799056 (0.052924)
--------------------------------------------------
SVM Linear
0.8369565217391305
              precision    recall  f1-score   support

           0       0.84      0.99      0.91       151
           1       0.71      0.15      0.25        33

   micro avg       0.84      0.84      0.84       184
   macro avg       0.78      0.57      0.58       184
weighted avg       0.82      0.84      0.79       184

--------------------------------------------------
AdaBoost: 0.786690

[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 co

Logistic Regression: 0.793521 (0.049384)
--------------------------------------------------
Logistic Regression
0.8532608695652174
              precision    recall  f1-score   support

           0       0.87      0.97      0.92       151
           1       0.69      0.33      0.45        33

   micro avg       0.85      0.85      0.85       184
   macro avg       0.78      0.65      0.68       184
weighted avg       0.84      0.85      0.83       184

--------------------------------------------------


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s rema