In [12]:
import sys
from __future__ import division
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn import svm
from sklearn.preprocessing import scale
from StringIO import StringIO
from collections import Counter
from sklearn.metrics import accuracy_score
from inspect import getmembers
from IPython.core.display import Image
from numpy.random import randn
import itertools, operator, sys
from sklearn.linear_model import SGDClassifier
import sklearn.preprocessing

In [13]:
train_data = pd.read_csv('train_data_with_time_series_split_by_judge.csv')
test_data = pd.read_csv('test_data_with_time_series_split_by_judge.csv')

In [14]:
bool_cols = train_data.dtypes[train_data.dtypes == "bool"]
train_data[bool_cols.index] = train_data[bool_cols.index].astype(np.float)
test_data[bool_cols.index] = test_data[bool_cols.index].astype(np.float)

In [15]:
train_data.replace(-1*np.inf, -1000., inplace=True)
test_data.replace(-1*np.inf, -1000., inplace=True)

In [16]:
#Split the 'grantraw' label from train and test records
#collist = [x for x in train_data.columns if x not in ('grantraw')]

X_train = train_data.drop("grantraw", axis=1)
y_train = train_data["grantraw"]

X_test  = test_data.drop("grantraw", axis=1)
y_test  = test_data["grantraw"]

In [17]:
identical = []
for col in X_train.columns:
    if len(X_train[col].unique()) == 1:
        identical = identical + [col]

In [18]:
binary = X_train.columns[X_train.max(axis=0) - X_train.min(axis=0) == 1]
nonbinary = [col for col in X_train.columns if col not in (list(binary)+identical)]

In [19]:
#Scale all the values between [0,1]
X_train[nonbinary] = sklearn.preprocessing.normalize(X_train[nonbinary], axis=0) 
X_test[nonbinary] = sklearn.preprocessing.normalize(X_test[nonbinary], axis=0) 

In [20]:
#X_train_complete = X_train.copy()
#y_train_complete = y_train.copy()
#X_train = X_train[0:100000]
#y_train = y_train[0:100000]

In [21]:
%%timeit
#SIMPLE DECISION TREE MODEL 
clf    = DecisionTreeClassifier()
clf.fit(X_train,y_train)

#Calculate percent Train error
y_tr       = clf.predict(X_train)
train_err  = 100 - (accuracy_score(y_train, y_tr, normalize = True) * 100)
print " Train percent error: ", train_err
        
#Calculate percent Test error
y_te      = clf.predict(X_test)
test_err  = 100 - (accuracy_score(y_test, y_te, normalize = True) * 100)
print " Test percent error: ", test_err

 Train percent error:  0.0
 Test percent error:  45.4483204259
 Train percent error:  0.0
 Test percent error:  45.2911165334
 Train percent error:  0.0
 Test percent error:  44.3951507904
 Train percent error:  0.0
 Test percent error:  45.7675504162
1 loops, best of 3: 59.3 s per loop


In [61]:
#DECISION TREE MODELS FOR DIFFERENT PARAMETER VALUES
min_percnt_test_err  = 101
min_percnt_train_err = 101

min_percnt_test_err_parameters = " "
min_percnt_train_err_parameters = " "

#count = 1
for criterion_type in ['entropy', 'gini']:
    for split_type in ['best', 'random']:        
        for min_leaf in range(1,4):    
            for min_split in range(2,4):
                for depth in range(1,20): 
                    
                    #print "\nModel-%d" %count 
                    #count = count + 1                    
                    
                    string = "Criterion: "+criterion_type +", splitter: "+split_type+", min_num_leaf: "+str(min_leaf) 
                    string = string+", min_samples_split: "+str(min_split)+", tree_depth: "+str(depth)
                    #print string
                   
                    #Build decision tree
                    clf = DecisionTreeClassifier(max_depth=depth, criterion=criterion_type, splitter=split_type, min_samples_leaf=min_leaf, min_samples_split=min_split)
                    clf.fit(X_train,y_train)

                    #Calculate percent train error
                    y_tr       = clf.predict(X_train)
                    train_err  = 100 - (accuracy_score(y_train, y_tr, normalize = True) * 100)
                    #print "Train Error: ", train_err
        
                    #Check and save the min train error                    
                    if train_err < min_percnt_train_err:
                        min_percnt_train_err            = train_err 
                        min_percnt_train_err_parameters = string
               
                    #Calculate percent test error
                    y_te      = clf.predict(X_test)
                    test_err  = 100 - (accuracy_score(y_test, y_te, normalize = True) * 100)
                    #print "Test Error: ", test_err
                    
                    #Check and save the min test error
                    if test_err < min_percnt_test_err:
                        min_percnt_test_err            = test_err 
                        min_percnt_test_err_parameters = string

#Print the minimum percent test and train errors along with their model parameter values
print "\nMinimum Train Error: ", min_percnt_train_err
print "Model parameters-> ", min_percnt_train_err_parameters

print "\nMinimum Test Error: ", min_percnt_test_err
print "Model parameters-> ", min_percnt_test_err_parameters


Minimum Train Error:  12.6571286801
Model parameters->  Criterion: gini, splitter: random, min_num_leaf: 1, min_samples_split: 2, tree_depth: 19

Minimum Test Error:  28.2472548466
Model parameters->  Criterion: entropy, splitter: random, min_num_leaf: 2, min_samples_split: 3, tree_depth: 6


In [23]:
%%timeit
#SIMPLE RANDOM FOREST MODEL 
clf    = RandomForestClassifier()
clf.fit(X_train,y_train)

#Calculate percent Train error
y_tr       = clf.predict(X_train)
train_err  = 100 - (accuracy_score(y_train, y_tr, normalize = True) * 100)
print " Train Error: ", train_err
        
#Calculate percent Test error
y_te      = clf.predict(X_test)
test_err  = 100 - (accuracy_score(y_test, y_te, normalize = True) * 100)
print " Test Error: ", test_err



 Train Error:  0.282862650554
 Test Error:  38.0645596844
 Train Error:  0.290412365426
 Test Error:  36.9236259126
 Train Error:  0.291167336914
 Test Error:  34.9089085421
 Train Error:  0.288399108127
 Test Error:  36.5928226296
1 loops, best of 3: 25.3 s per loop


In [None]:
#RANDOM FOREST MODEL FOR DIFFERENT PARAMETER VALUES

num_features = len(train_data.columns)

min_percnt_test_err  = 101
min_percnt_train_err = 101

min_percnt_test_err_parameters = " "
min_percnt_train_err_parameters = " "

#count = 1
for criterion_type in ['entropy', 'gini']:
    for max_feature in ['sqrt', 'log2', 0.3]: 
        for min_leaf in range(1,4):    
            for min_split in range(2,4):
                for depth in range(8,17): 
                    for n_est in range (7, 14 ):
                        for bootstr in [True, False]:
                            for warm_start_b in [True, False]:
                  
                                #print "\nModel-%d" %count 
                                #count = count + 1                    
                    
                                string = "Criterion: " +criterion_type+ ", max_feature: "+ str(max_feature) 
                                string = string+", min_num_leaf: " +str(min_leaf)+ ", min_samples_split: "+str(min_split)
                                string = string+", tree_depth: "+str(depth)+ ", n_estimators: "+str(n_est)
                                string = string+", bootstr: "+str(bootstr)+ ", warm_start_bool: "+str(warm_start_b)
                                #print string
                   
                                #Build Random Forest
                                clf = RandomForestClassifier(criterion=criterion_type, max_features=max_feature, min_samples_leaf=min_leaf, min_samples_split=min_split,max_depth=depth,n_estimators=n_est,bootstrap=bootstr,warm_start=warm_start_b)
                                clf.fit(X_train,y_train)

                                #Calculate percent train error
                                y_tr       = clf.predict(X_train)
                                train_err  = 100 - (accuracy_score(y_train, y_tr, normalize = True) * 100)
                                #print "Train Error: ", train_err
        
                                #Check and save the min train error                    
                                if train_err < min_percnt_train_err:
                                    min_percnt_train_err            = train_err 
                                    min_percnt_train_err_parameters = string
               
                                #Calculate percent test error
                                y_te      = clf.predict(X_test)
                                test_err  = 100 - (accuracy_score(y_test, y_te, normalize = True) * 100)
                                #print "Test Error: ", test_err
                    
                                #Check and save the min test error
                                if test_err < min_percnt_test_err:
                                    min_percnt_test_err            = test_err 
                                    min_percnt_test_err_parameters = string

#Print the minimum percent test and train errors along with their model parameter values
print "\nMinimum Train Error: ", min_percnt_train_err
print "Model parameters-> ", min_percnt_train_err_parameters

print "\nMinimum Test Error: ", min_percnt_test_err
print "Model parameters-> ", min_percnt_test_err_parameters

In [58]:
#DEFINE perform_adaboost FUNCTION
def perform_adaboost(base_estimator, X_train, y_train, X_test, y_test, num_steps, depth, algorithm_t):
        
    weight = 1/len(X_train)
    weight = [weight]*len(X_train)

    train_err_list = list()
    test_err_list  = list()    
    
    if (base_estimator == "Decision_Tree"):
        classifier = DecisionTreeClassifier(max_depth=depth)
    elif (base_estimator == "Random_Forest"):
        classifier = RandomForestClassifier(max_depth=depth)
        
    for step in range(1,num_steps+1):       
        
        bdt = AdaBoostClassifier(classifier,algorithm=algorithm_t,n_estimators=step)     
        bdt.fit(X_train, y_train)         
        
        #Calculate train and test misclassification percent
        y_tr       = bdt.predict(X_train)
        train_err  = 100 - (accuracy_score(y_train, y_tr, normalize = True) * 100)    
        train_err_list.append(train_err)
              
        y_te      = bdt.predict(X_test)
        test_err  = 100 - (accuracy_score(y_test, y_te, normalize = True) * 100)        
        test_err_list.append(test_err) 
          
    return train_err_list, test_err_list
        

In [59]:
#BUILD ADABOOST BASED MODEL WITH DECISION TREE AS A WEAK CLASSIFIER
base_estimator       = "Decision_Tree"
num_steps            = 10
depth                = 5
algorithm_t          = "SAMME"

train_err_list, test_err_list = perform_adaboost(base_estimator, X_train, y_train, X_test, y_test, num_steps, depth, algorithm_t)
print "minimum train error: ", min(train_err_list)
print "minimum test error: ", min(test_err_list)


minimum train error:  24.2350314257
minimum test error:  29.8710899214


In [60]:
#BUILD ADABOOST BASED MODEL WITH RANDOM FOREST AS A WEAK CLASSIFIER
base_estimator       = "Random_Forest"
num_steps            = 10
depth                = 5
algorithm_t          = "SAMME"

train_err_list, test_err_list = perform_adaboost(base_estimator, X_train, y_train, X_test, y_test, num_steps, depth, algorithm_t)
print "minimum train error: ", min(train_err_list)
print "minimum test error: ", min(test_err_list)


minimum train error:  24.1344139376
minimum test error:  28.7489409771


In [6]:
#DEFINE FUNCTION TO PERFORM STOCHASTIC GRADIENT DESCENT
def perform_SGD(X_train, y_train, X_test, y_test, loss_type, penalty_type, shuffle_bool):
    
    clf = SGDClassifier(loss=loss_type, penalty=penalty_type, shuffle=shuffle_bool)
    clf.fit(X_train, y_train)
    
    #Calculate train percent error
    y_tr       = clf.predict(X_train)
    train_err  = 100 - (accuracy_score(y_train, y_tr, normalize = True) * 100)    
    #print "train_err: ", train_err
    
    #Calculate test percent error
    y_te      = clf.predict(X_test)
    test_err  = 100 - (accuracy_score(y_test, y_te, normalize = True) * 100)        
    #print "test_err: ", test_err
    
    return train_err, test_err

In [9]:
#BUILD MODEL WITH STOCHASTIC GRADIENT DESCENT FOR DIFFERENT LOSS FUNCTIONS AND PENALTY TYPES
#LOST TYPE OF 'HINGE' WILL GIVE SOFT-MARGIN LINEAR SVM MODEL

min_percnt_test_err  = 101
min_percnt_train_err = 101

min_percnt_test_err_parameters = " "
min_percnt_train_err_parameters = " "

for loss in ["hinge", "modified_huber", "squared_hinge", "log"]:
    for penalty in ["l1", "l2", "elasticnet"]:
        for shuffle in [True, False]:
            string = "Loss: "+loss+", Penalty: "+penalty+", shuffle:"+str(shuffle)
            train_error, test_error = perform_SGD(X_train, y_train, X_test, y_test, loss, penalty, shuffle)
           
            #Check and save the min errors 
            if train_error < min_percnt_train_err:
                min_percnt_train_err            = train_error
                min_percnt_train_err_parameters = string 
            
            if test_error < min_percnt_test_err:
                min_percnt_test_err            = test_error
                min_percnt_test_err_parameters = string 

#Print the minimum percent test and train errors along with their model parameter values
print "\nMinimum Train Error: ", min_percnt_train_err
print "Model parameters-> ", min_percnt_train_err_parameters

print "\nMinimum Test Error: ", min_percnt_test_err
print "Model parameters-> ", min_percnt_test_err_parameters            



Minimum Train Error:  25.7897783659
Model parameters->  Loss: hinge, Penalty: l1, shuffle:True

Minimum Test Error:  41.9888034287
Model parameters->  Loss: squared_hinge, Penalty: l1, shuffle:False
