In [2]:
###Hide
import numpy as np
import pandas as pd
import scipy as sp
from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DecisionTree
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.ensemble import AdaBoostClassifier as AdaBoost
from sklearn.svm import SVC
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn import grid_search
from sklearn.decomposition import PCA
from sklearn import feature_selection as fs
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

## Tuning to Maximize Model Performance

#### Step 2.1: Get Performance Metric

In [3]:
# Load the per-model performance metric table (one column per model).
# The first CSV column is an unnamed row index (0, 1, 2) that was written out
# together with the table; reading it back as the index keeps it from showing
# up as a spurious "Unnamed: 0" data column.
performance_metric = pd.read_csv('datasets/performance_metric.csv', delimiter=',', index_col=0)

print("Our Performance Metric is to get a better score the following:")
performance_metric

Our Performance Metric is to get a better score the following:


Unnamed: 0,KNN,LDA,QDA,RF,SVC,Tree,Unweighted Logistic,Weighted Logistic
0,0.905711,0.906115,0.847791,0.791891,0.760776,0.896013,0.908675,0.648303
1,0.990794,0.990943,0.91121,0.843207,0.794655,0.974759,0.997327,0.657313
2,0.07402,0.076923,0.227866,0.290276,0.429608,0.12627,0.04209,0.560232


#### Step 2.2: Select Top Performing Models

Pick the three top-performing models whose accuracy is high in both classes. From the metric table we can see that Weighted Logistic, SVC, and Random Forest perform the best by that criterion. So let's take them as the base models and work towards tuning them to get better results.

#### Step 2.3: Tune Selected Model

In [4]:
# Read the cleaned dataset from disk
ncds_data_no_indicators = pd.read_csv(
    'datasets/ncds_data_no_indicators.csv',
    delimiter=',',
    low_memory=False
)

# Every column except the last is used as a predictor; the last column is the response
x = ncds_data_no_indicators.values[:, :-1]
y = ncds_data_no_indicators.values[:, -1]

# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=42)

# Report the shape of each split (label and shape are separated by one extra space)
print('{} {}'.format('Train data: ', x_train.shape))
print('{} {}'.format('Test data: ', x_test.shape))

# Report how many observations of each class landed in each split
print('Train class 0: {}, train class 1: {}'.format(np.count_nonzero(y_train == 0),
                                                    np.count_nonzero(y_train == 1)))
print('Test class 0: {}, test class 1: {}'.format(np.count_nonzero(y_test == 0),
                                                  np.count_nonzero(y_test == 1)))

Train data:  (11134, 1802)
Test data:  (7424, 1802)
Train class 0: 10113, train class 1: 1021
Test class 0: 6735, test class 1: 689


In [6]:
def score(model, x_test, y_test):
    """Score a fitted model on a whole evaluation set and on each class separately.

    Parameters
    ----------
    model : object
        Anything exposing ``score(X, y)``; for scikit-learn classifiers that
        method returns the mean accuracy.
    x_test : array-like
        Predictors; must support boolean-mask indexing along the first axis.
    y_test : array-like
        Labels aligned with ``x_test``; the rows equal to 0 and equal to 1
        are scored separately.

    Returns
    -------
    pandas.Series
        Three values indexed by 'overall accuracy', 'accuracy on class 0'
        and 'accuracy on class 1'.
    """
    # Build each class mask once and reuse it for both predictors and labels
    is_class_0 = y_test == 0
    is_class_1 = y_test == 1

    return pd.Series(
        [
            model.score(x_test, y_test),
            model.score(x_test[is_class_0], y_test[is_class_0]),
            model.score(x_test[is_class_1], y_test[is_class_1]),
        ],
        index=['overall accuracy', 'accuracy on class 0', 'accuracy on class 1'],
    )


##### Weighted Logistic Regression Model Tuning

In [None]:
### Logistic Model Tuning

# C values swept when the caller does not supply its own grid
_DEFAULT_C_GRID = (0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001,
                   0.01, 0.015, 0.02, 0.03, 0.04, 0.08)

# (key in the Series returned by `score`, line colour, legend label)
_ACCURACY_CURVES = (('overall accuracy', 'r', 'Overall'),
                    ('accuracy on class 0', 'b', 'Class 0'),
                    ('accuracy on class 1', 'g', 'Class 1'))


def _sweep_balanced_logistic(penalty, c_list, x_train, y_train, x_test, y_test):
    """Fit one class-balanced logistic regression per C; return {score key: [value per C]}."""
    curves = dict((key, []) for key, _, _ in _ACCURACY_CURVES)
    for c in c_list:
        # liblinear is named explicitly because it supports both the 'l1' and 'l2'
        # penalties; the fixed seed makes its coordinate-descent fits repeatable.
        model = LogisticRegression(class_weight='balanced', C=c, penalty=penalty,
                                   solver='liblinear', random_state=42)
        model.fit(x_train, y_train)
        model_scores = score(model, x_test, y_test)
        for key in curves:
            curves[key].append(model_scores[key])
    return curves


def _plot_accuracy_curves(ax, c_list, curves, penalty):
    """Draw the overall / class 0 / class 1 accuracy curves for one penalty on `ax`."""
    for key, color, label in _ACCURACY_CURVES:
        ax.plot(c_list, curves[key], color=color, label=label)
    # The grid spans several orders of magnitude; a linear axis would squash
    # almost every point against zero.
    ax.set_xscale('log')
    ax.set_title("Log Reg, Balanced with penalty=" + penalty)
    ax.set_xlabel('C')
    ax.set_ylabel('Accuracy')
    ax.legend(loc='best')


def tune_logistic_reg_model(x_train, y_train, x_test, y_test, c_list=None):
    """Sweep the regularization strength C for class-balanced logistic regression.

    For each penalty ('l1', then 'l2') and each C, a model is fitted on the
    training arrays and evaluated with `score` on the evaluation arrays. The C
    giving the highest class 1 accuracy is printed per penalty (ties resolve to
    the first such C in the grid), and the three accuracy curves are plotted
    against C in side-by-side panels.

    NOTE(review): C is chosen from scores on `x_test`/`y_test`. If those are the
    final hold-out data, the reported accuracies are optimistic; consider
    selecting C on a validation split or with cross-validation instead.

    Parameters
    ----------
    x_train, y_train : array-like
        Data used to fit every model.
    x_test, y_test : array-like
        Data every fitted model is scored on.
    c_list : sequence of float, optional
        Positive C values to try (the x-axis is logarithmic). Defaults to the
        built-in grid from 1e-8 to 0.08.

    Returns
    -------
    None
    """
    c_list = list(_DEFAULT_C_GRID) if c_list is None else list(c_list)
    penalties = ('l1', 'l2')

    results = [
        (penalty, _sweep_balanced_logistic(penalty, c_list, x_train, y_train, x_test, y_test))
        for penalty in penalties
    ]

    for penalty, curves in results:
        class_0 = curves['accuracy on class 0']
        class_1 = curves['accuracy on class 1']
        best = np.argmax(class_1)
        # The doubled spaces reproduce the spacing of the original comma-separated print
        print("Best Class 1 accuracy is for {} and C:  {}  Class 1:  {}  Class 0:  {}".format(
            penalty.upper(), c_list[best], round(class_1[best], 5), round(class_0[best], 5)))

    # Plot the results, one panel per penalty
    fig, axes = plt.subplots(1, len(penalties), figsize=(20, 7))
    for ax, (penalty, curves) in zip(axes, results):
        _plot_accuracy_curves(ax, c_list, curves, penalty)

    plt.tight_layout()
    plt.show()

# Baseline: tune on the raw (unstandardized) predictors
print("Tuning params without standardization of predictors:")
tune_logistic_reg_model(x_train, y_train, x_test, y_test)

# Fit the scaler on the training predictors only, then apply that same
# transformation to both the training and the test predictors
sc = preprocessing.StandardScaler().fit(x_train)
x_train_std = sc.transform(x_train)
x_test_std = sc.transform(x_test)

# Repeat the tuning on the standardized predictors
print("Tuning params with standardization of predictors:")
tune_logistic_reg_model(x_train_std, y_train, x_test_std, y_test)

Tuning params without standardization of predictors:
