In [1]:
###Hide
import numpy as np
import pandas as pd
import scipy as sp
from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DecisionTree
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.ensemble import AdaBoostClassifier as AdaBoost
from sklearn.svm import SVC
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn import grid_search
from sklearn.decomposition import PCA
from sklearn import feature_selection as fs
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

## Baseline Models

#### Step 1.1: Load the cleaned data

In [3]:
# Read the cleaned NCDS table from disk (low_memory=False is passed through to
# pandas unchanged).
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column that looks
# like a row index saved by an earlier to_csv call -- confirm whether it should
# be read with index_col=0 instead of being kept as a data column.
ncds_data_no_indicators = pd.read_csv(
    'datasets/ncds_data_no_indicators.csv',
    sep=',',
    low_memory=False,
)
# Report the table dimensions, then preview the first rows as the cell output
print('{} {}'.format("Shape of data:", ncds_data_no_indicators.shape))
ncds_data_no_indicators.head()

Shape of data: (18558, 1803)


Unnamed: 0,n622,n0region,n1region,n2region,n3region,n553,n545,n520,n490,n492,...,OUTCME02,OUTCME03,OUTCME04,OUTCME05,OUTCME06,OUTCMEBM,OUTCME07,OUTCME08,OUTCME09,epileptic
0,2.0,9.0,9.0,9.0,9.0,23.0,4.0,2.0,12.0,5.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0
1,1.0,9.0,8.0,8.0,8.0,34.0,4.0,5.0,1.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0
2,2.0,1.0,1.0,1.0,1.0,26.0,4.0,11.0,1.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,3.0,0
3,2.0,10.0,10.0,10.0,10.0,25.0,4.0,1.0,3.0,6.0,...,1.0,1.0,2.0,2.0,2.0,6.0,6.0,6.0,6.0,0
4,2.0,7.0,7.0,7.0,7.0,26.0,4.0,1.0,1.0,4.0,...,1.0,1.0,3.0,2.0,3.0,6.0,6.0,4.0,6.0,1


#### Step 1.2: Split data into train and test
Split the dataset into train and test sets and analyze the splits. We check the class counts in each split to verify whether the data is balanced. If the classes are imbalanced, we will need to apply one of the following:
1. Over sample
2. Under sample
3. Over weight
4. Adjust class weights in model

In [4]:
# Materialise the frame once: every column except the last is used as a
# predictor, the last column is the label.
# NOTE(review): column 0 is the 'Unnamed: 0' counter shown in the data preview,
# so it is currently part of the predictors -- confirm this is intended.
ncds_matrix = ncds_data_no_indicators.values
x, y = ncds_matrix[:, :-1], ncds_matrix[:, -1]

# 60/40 train/test partition with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=42)

# Summarise the partition sizes and the per-class counts in each partition
print('{} {}'.format('Train data: ', x_train.shape))
print('{} {}'.format('Test data: ', x_test.shape))
print('Train class 0: {}, train class 1: {}'.format((y_train == 0).sum(), (y_train == 1).sum()))
print('Test class 0: {}, test class 1: {}'.format((y_test == 0).sum(), (y_test == 1).sum()))

Train data:  (11134, 1802)
Test data:  (7424, 1802)
Train class 0: 10113, train class 1: 1021
Test class 0: 6735, test class 1: 689


#### Step 1.3: Feature Selection

From the merged datasets we can see we have over 1800 features. Going through all 1800 by hand would be very time-consuming, so let us apply an algorithm to find the best features to build the model with. In our exploration phase we used PCA to find a subset of components but chose not to use those components in our base models. The exploration phase can be seen [here](20_exploratory_data_analysis_02.ipynb). However, we may choose to use PCA during the model tuning and performance evaluation phase.

In [5]:
# Univariate feature selection: keep the k columns of x_train with the highest
# F-score against y_train. The selector is fitted on the training split only.
num_of_features = 15
features = fs.SelectKBest(fs.f_regression, k=num_of_features)  # k is number of features
features.fit(x_train, y_train)

# Boolean mask with one entry per column of x_train (True = selected)
selected_features = features.get_support()

# x_train was built from every column except the last (label) one, so the mask
# is shorter than ncds_data_no_indicators.columns. Indexing with a boolean mask
# of the wrong length only worked through deprecated NumPy behaviour (it emitted
# a warning) and raises IndexError on current NumPy, so trim the column index to
# the mask length before applying the mask.
feature_columns = ncds_data_no_indicators.columns[:len(selected_features)]
selected_features_columns = feature_columns[selected_features].values

print("Selected Features:")
print(selected_features_columns)

Selected Features:
['n400' 'n1827' 'n604' 'n35' 'n39' 'n1400' 'n825' 'n2598' 'n1896' 'n1897'
 'n1898' 'n2009' 'n2010' 'OUTCME01' 'OUTCME02']


  result = getitem(key)


#### Step 1.4: Build various models

In [6]:
def score(model, x_test, y_test):
    """Return the model's score overall and per class as a labelled Series.

    Calls ``model.score`` three times: on the full test set, on the rows whose
    label equals 0, and on the rows whose label equals 1.

    Parameters
    ----------
    model : fitted estimator exposing ``score(X, y)``
    x_test : array indexable by a boolean row mask
    y_test : array of labels, compared against 0 and 1

    Returns
    -------
    pandas.Series indexed by 'overall accuracy', 'accuracy on class 0' and
    'accuracy on class 1'.
    """
    is_class_0 = y_test == 0
    is_class_1 = y_test == 1
    accuracies = [
        model.score(x_test, y_test),
        model.score(x_test[is_class_0], y_test[is_class_0]),
        model.score(x_test[is_class_1], y_test[is_class_1]),
    ]
    labels = ['overall accuracy', 'accuracy on class 0', 'accuracy on class 1']
    return pd.Series(accuracies, index=labels)


In [7]:
# Rebuild the predictor matrix from the selected columns only; the label is
# still the last column of the full table.
x = ncds_data_no_indicators[selected_features_columns].values
y = ncds_data_no_indicators.values[:, -1]

# Same test_size and random_state as the split on the full feature set
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=42)

# Summarise the partition sizes and the per-class counts in each partition
print('{} {}'.format('Train data: ', x_train.shape))
print('{} {}'.format('Test data: ', x_test.shape))
print('Train class 0: {}, train class 1: {}'.format((y_train == 0).sum(), (y_train == 1).sum()))
print('Test class 0: {}, test class 1: {}'.format((y_test == 0).sum(), (y_test == 1).sum()))

Train data:  (11134, 15)
Test data:  (7424, 15)
Train class 0: 10113, train class 1: 1021
Test class 0: 6735, test class 1: 689


#### Logistic Regression:

In [8]:
# Baseline without any class re-weighting
unweighted_logistic = LogisticRegression().fit(x_train, y_train)
unweighted_log_scores = score(unweighted_logistic, x_test, y_test)

# Same estimator, fitted with class_weight='balanced'
weighted_logistic = LogisticRegression(class_weight='balanced').fit(x_train, y_train)
weighted_log_scores = score(weighted_logistic, x_test, y_test)

# Show the two score tables one after the other
print("Logistic regression (Unweighted):")
print(unweighted_log_scores)
print("Logistic regression (Weighted):")
print(weighted_log_scores)

Logistic regression (Unweighted):
overall accuracy       0.908675
accuracy on class 0    0.997327
accuracy on class 1    0.042090
dtype: float64
Logistic regression (Weighted):
overall accuracy       0.648303
accuracy on class 0    0.657313
accuracy on class 1    0.560232
dtype: float64


#### Linear Discriminant Analysis:

In [9]:
# Linear discriminant analysis with default settings, fitted on the training split
lda = LDA().fit(x_train, y_train)
# Overall and per-class accuracy on the held-out split
lda_scores = score(lda, x_test, y_test)

print("LDA:")
print(lda_scores)

LDA:
overall accuracy       0.906115
accuracy on class 0    0.990943
accuracy on class 1    0.076923
dtype: float64


#### Quadratic Discriminant Analysis

In [10]:
# Quadratic discriminant analysis with default settings, fitted on the training split
qda = QDA().fit(x_train, y_train)
# Overall and per-class accuracy on the held-out split
qda_scores = score(qda, x_test, y_test)

print("QDA:")
print(qda_scores)

QDA:
overall accuracy       0.847791
accuracy on class 0    0.911210
accuracy on class 1    0.227866
dtype: float64


#### Decision Trees

In [11]:
# Decision tree baseline.
# The tree builder draws on a random number generator while searching for
# splits, so an unseeded tree can score differently on every run. Seed it with
# the same value used for the train/test split so the baseline is reproducible.
tree = DecisionTree(random_state=42)
tree.fit(x_train, y_train)
# Overall and per-class accuracy on the held-out split
tree_scores = score(tree, x_test, y_test)

print("Decision Trees:")
print(tree_scores)

Decision Trees:
overall accuracy       0.896013
accuracy on class 0    0.974759
accuracy on class 1    0.126270
dtype: float64


#### Random Forest

In [12]:
# Random forest baseline with class_weight='balanced'.
# Bootstrap sampling and per-split feature sampling are random, so an unseeded
# forest scores differently on every run. Seed it with the same value used for
# the train/test split so the baseline is reproducible.
rf = RandomForest(class_weight='balanced', random_state=42)
rf.fit(x_train, y_train)
# Overall and per-class accuracy on the held-out split
rf_scores = score(rf, x_test, y_test)

print("Random Forest:")
print(rf_scores)

Random Forest:
overall accuracy       0.791891
accuracy on class 0    0.843207
accuracy on class 1    0.290276
dtype: float64


#### Support Vector Classification

In [13]:
# Support vector classifier with class_weight='balanced'.
# NOTE(review): probability=True is not used by the accuracy scores computed in
# this cell; presumably it is kept for predict_proba calls elsewhere -- confirm.
svc = SVC(probability=True, class_weight='balanced').fit(x_train, y_train)
# Overall and per-class accuracy on the held-out split
svc_scores = score(svc, x_test, y_test)

print("SVC:")
print(svc_scores)

SVC:
overall accuracy       0.760776
accuracy on class 0    0.794655
accuracy on class 1    0.429608
dtype: float64


#### K-Nearest Neighbors

In [14]:
# K-nearest-neighbours classifier with default settings, fitted on the training split
knn = KNN().fit(x_train, y_train)
# Overall and per-class accuracy on the held-out split
knn_scores = score(knn, x_test, y_test)

print("KNN:")
print(knn_scores)

KNN:
overall accuracy       0.905711
accuracy on class 0    0.990794
accuracy on class 1    0.074020
dtype: float64


In [21]:
# Collect every baseline's score Series under its model name; each Series
# becomes one column and the shared Series index becomes the row labels.
baseline_scores_by_model = {
    'Unweighted Logistic': unweighted_log_scores,
    'Weighted Logistic': weighted_log_scores,
    'LDA': lda_scores,
    'QDA': qda_scores,
    'KNN': knn_scores,
    'Tree': tree_scores,
    'RF': rf_scores,
    'SVC': svc_scores,
}
performance_metric = pd.DataFrame(baseline_scores_by_model)


#### Step 1.5: Define Performance Metric

Our performance metric will be to build a model whose results are better than those of the base models we have built so far.

In [22]:
# Announce the baseline table, then leave the frame as the last expression so
# the notebook renders it as the cell output.
print("Our Performance Metric is to get a better score the following:")
performance_metric

Our Performance Metric is to get a better score the following:


Unnamed: 0,KNN,LDA,QDA,RF,SVC,Tree,Unweighted Logistic,Weighted Logistic
overall accuracy,0.905711,0.906115,0.847791,0.791891,0.760776,0.896013,0.908675,0.648303
accuracy on class 0,0.990794,0.990943,0.91121,0.843207,0.794655,0.974759,0.997327,0.657313
accuracy on class 1,0.07402,0.076923,0.227866,0.290276,0.429608,0.12627,0.04209,0.560232


In [23]:
# Persist the baseline score table as a comma-separated file.
# NOTE(review): index=False leaves out the row labels ('overall accuracy',
# 'accuracy on class 0', 'accuracy on class 1'), so the file holds three
# unlabeled rows in that order -- confirm that readers of this file expect it.
performance_metric.to_csv('datasets/performance_metric.csv', index=False)