In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import minimize
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
import os
import sys



### Read csv file using pandas

In [4]:
# Load the labelled training set and report its size before any column is removed.
train = pd.read_csv("Data_Files/train.csv")
print("Training set has {0[0]} rows and {0[1]} columns".format(train.shape))

# Split the class label off the feature table, then discard `id`
# (presumably a plain row identifier rather than a feature -- TODO confirm).
labels = train.pop('target')
train.drop('id', axis=1, inplace=True)

print(train.head())

Training set has 61878 rows and 95 columns
   feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  feat_8  feat_9  \
0       1       0       0       0       0       0       0       0       0   
1       0       0       0       0       0       0       0       1       0   
2       0       0       0       0       0       0       0       1       0   
3       1       0       0       1       6       1       5       0       0   
4       0       0       0       0       0       0       0       0       0   

   feat_10   ...     feat_84  feat_85  feat_86  feat_87  feat_88  feat_89  \
0        0   ...           0        1        0        0        0        0   
1        0   ...           0        0        0        0        0        0   
2        0   ...           0        0        0        0        0        0   
3        1   ...          22        0        1        2        0        0   
4        0   ...           0        1        0        0        0        0   

   feat_90  feat_91  feat_92  f

In [5]:
# Carve out a stratified 5% hold-out set. The splitter can yield several
# shuffles; only the first one is used here.
sss = StratifiedShuffleSplit(labels, test_size=0.05, random_state=1234)
train_index, test_index = next(iter(sss))

# Index the underlying NumPy arrays with the row positions of that first split.
feature_matrix, label_vector = train.values, labels.values
train_x, test_x = feature_matrix[train_index], feature_matrix[test_index]
train_y, test_y = label_vector[train_index], label_vector[test_index]

In [7]:
# Load the unlabelled test set and report its size before `id` is removed.
test = pd.read_csv("Data_Files/test.csv")
print("Test set has {0[0]} rows and {0[1]} columns".format(test.shape))

# Remove the `id` column in place so only the feature columns remain.
del test['id']

print(test.head())

Test set has 144368 rows and 94 columns
   feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  feat_8  feat_9  \
0       0       0       0       0       0       0       0       0       0   
1       2       2      14      16       0       0       0       0       0   
2       0       1      12       1       0       0       0       0       0   
3       0       0       0       1       0       0       0       0       0   
4       1       0       0       1       0       0       1       2       0   

   feat_10   ...     feat_84  feat_85  feat_86  feat_87  feat_88  feat_89  \
0        3   ...           0        0       11        1       20        0   
1        0   ...           0        0        0        0        0        4   
2        0   ...           0        0        0        0        2        0   
3        0   ...           0        3        1        0        0        0   
4        3   ...           0        0        0        0        0        0   

   feat_90  feat_91  feat_92  feat

### Tune hyperparameters of the logistic regression classifier using GridSearchCV with 3-fold cross-validation

In [None]:
# Coarse search over the logistic-regression hyperparameters.
# Both sub-grids share the same regularisation strengths, penalty and class
# weighting; they differ only in the multi-class strategy and in which solvers
# are tried with it.
shared_grid = {'C': np.logspace(-4, 4, 6),
               'penalty': ['l2'],
               'class_weight': [None, 'balanced']}
ovr_grid = dict(shared_grid,
                solver=['newton-cg', 'lbfgs', 'liblinear'],
                multi_class=['ovr'])
multinomial_grid = dict(shared_grid,
                        solver=['lbfgs'],
                        multi_class=['multinomial'])
param_grid = [ovr_grid, multinomial_grid]

# 3-fold cross-validated search, scored with the 'neg_log_loss' scorer, fitted on
# the complete training table (not on the train_x/train_y hold-out split).
logreg = LogisticRegression()
gs_cv = GridSearchCV(logreg, param_grid, cv=3, scoring='neg_log_loss')
gs_cv.fit(train, labels)

In [13]:
# NOTE(review): the output recorded for this cell is the string 'raise', not a
# parameter dict -- it looks stale; re-run the cell to confirm.
gs_cv.best_params_ # hyperparameters which yield the best cv result

'raise'

In [42]:
# Score the fitted search on `train`/`labels` -- the same data it was fitted on,
# so this is an in-sample figure, not a held-out estimate.
gs_cv.score(train, labels)

-0.66052988885894437

In [41]:
# Keep the per-combination results; the cells below read the mean score, the
# per-fold scores and the parameters of each entry.
# NOTE(review): the recorded output lists penalty 'l1' and class_weight 'auto',
# neither of which is in the grid defined above -- it appears to predate the
# current grid; re-run to refresh.
grid_scores = gs_cv.grid_scores_      # scores of all the combinations of hyperparameters tested
grid_scores

[mean: -0.90393, std: 0.00031, params: {'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.0001, 'solver': 'newton-cg', 'class_weight': None},
 mean: -0.90393, std: 0.00031, params: {'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.0001, 'solver': 'lbfgs', 'class_weight': None},
 mean: -0.97029, std: 0.00213, params: {'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.0001, 'solver': 'liblinear', 'class_weight': None},
 mean: -0.90393, std: 0.00031, params: {'penalty': 'l1', 'multi_class': 'ovr', 'C': 0.0001, 'solver': 'newton-cg', 'class_weight': None},
 mean: -0.90393, std: 0.00031, params: {'penalty': 'l1', 'multi_class': 'ovr', 'C': 0.0001, 'solver': 'lbfgs', 'class_weight': None},
 mean: -1.77075, std: 0.00292, params: {'penalty': 'l1', 'multi_class': 'ovr', 'C': 0.0001, 'solver': 'liblinear', 'class_weight': None},
 mean: -1.33726, std: 0.00081, params: {'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.0001, 'solver': 'newton-cg', 'class_weight': 'auto'},
 mean: -1.33726, std: 0.00081, params: {

### Rearrange data to properly save it into a .csv file

In [43]:
# Flatten the grid-search results into one list per future column.
# `mean` is the mean validation score of each combination; `std` is the standard
# deviation of that combination's per-fold validation scores.
mean = [entry.mean_validation_score for entry in grid_scores]
std = [np.std(entry.cv_validation_scores) for entry in grid_scores]

# One list per tuned hyperparameter, in the same order as `grid_scores`.
penalty, multi_class, C, solver, class_weight = [
    [entry.parameters[key] for entry in grid_scores]
    for key in ('penalty', 'multi_class', 'C', 'solver', 'class_weight')
]
    

In [44]:
# Assemble the flattened results into a table with one row per hyperparameter
# combination, and display it.
result_columns = dict(mean=mean,
                      std=std,
                      penalty=penalty,
                      multi_class=multi_class,
                      C=C,
                      solver=solver,
                      class_weight=class_weight)
df = pd.DataFrame(result_columns)
df

Unnamed: 0,C,class_weight,mean,multi_class,penalty,solver,std
0,0.000100,,-0.903932,ovr,l2,newton-cg,0.000309
1,0.000100,,-0.903931,ovr,l2,lbfgs,0.000309
2,0.000100,,-0.970287,ovr,l2,liblinear,0.002134
3,0.000100,,-0.903932,ovr,l1,newton-cg,0.000309
4,0.000100,,-0.903931,ovr,l1,lbfgs,0.000309
5,0.000100,,-1.770745,ovr,l1,liblinear,0.002923
6,0.000100,auto,-1.337263,ovr,l2,newton-cg,0.000807
7,0.000100,auto,-1.337264,ovr,l2,lbfgs,0.000807
8,0.000100,auto,-1.038647,ovr,l2,liblinear,0.001175
9,0.000100,auto,-1.337263,ovr,l1,newton-cg,0.000807


In [45]:
# Persist the first grid-search summary table to the working directory.
df.to_csv("grid_search_1.csv")

### Narrow down search of optimal hyperparameters and increase cross-validation folds

In [7]:
# Finer second pass: 15 log-spaced values of C between 1e-2 and 1e1, one-vs-rest
# with an L2 penalty, comparing three solvers under 5-fold cross-validation.
# NOTE(review): this cell passes scoring='log_loss' while the first search uses
# 'neg_log_loss'; which name is accepted depends on the installed scikit-learn
# version -- confirm against the environment.
logreg = LogisticRegression()
param_grid = dict(C=np.logspace(-2, 1, 15),
                  penalty=['l2'],
                  solver=['newton-cg', 'lbfgs', 'liblinear'],
                  multi_class=['ovr'])

gs_cv = GridSearchCV(logreg, param_grid, cv=5, scoring='log_loss')
gs_cv.fit(train, labels)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'penalty': ['l2'], 'multi_class': ['ovr'], 'C': array([  0.01   ,   0.01638,   0.02683,   0.04394,   0.07197,   0.11788,
         0.19307,   0.31623,   0.51795,   0.84834,   1.3895 ,   2.27585,
         3.72759,   6.1054 ,  10.     ]), 'solver': ['newton-cg', 'lbfgs', 'liblinear']},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='log_loss', verbose=0)

In [8]:
# Winning combination of the narrowed search; these values are hard-coded into
# the base estimator used in the bagging section below.
gs_cv.best_params_                     # hyperparameters which yield the best cv result

{'C': 1.3894954943731375,
 'multi_class': 'ovr',
 'penalty': 'l2',
 'solver': 'newton-cg'}

In [10]:
# Score the fitted search on `train`/`labels` -- the same data it was fitted on,
# so this is an in-sample figure, not a held-out estimate.
gs_cv.score(train, labels)

-0.66063389869520694

In [11]:
# Keep the per-combination results of the narrowed search; the cells below read
# the mean score, the per-fold scores and the parameters of each entry.
grid_scores = gs_cv.grid_scores_      # scores of all the combinations of hyperparameters tested
grid_scores

[mean: -0.68143, std: 0.00581, params: {'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.01, 'solver': 'newton-cg'},
 mean: -0.68144, std: 0.00581, params: {'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.01, 'solver': 'lbfgs'},
 mean: -0.68400, std: 0.00585, params: {'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.01, 'solver': 'liblinear'},
 mean: -0.67810, std: 0.00575, params: {'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.016378937069540637, 'solver': 'newton-cg'},
 mean: -0.67812, std: 0.00575, params: {'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.016378937069540637, 'solver': 'lbfgs'},
 mean: -0.67963, std: 0.00577, params: {'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.016378937069540637, 'solver': 'liblinear'},
 mean: -0.67584, std: 0.00570, params: {'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.02682695795279726, 'solver': 'newton-cg'},
 mean: -0.67587, std: 0.00569, params: {'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.02682695795279726, 'solver': 'lbfgs'},
 mean: -0.67674, std: 

In [35]:
# Flatten the second search's results into one list per future column.
# `mean` is the mean validation score of each combination; `std` is the standard
# deviation of that combination's per-fold validation scores.
mean = [entry.mean_validation_score for entry in grid_scores]
std = [np.std(entry.cv_validation_scores) for entry in grid_scores]

# One list per tuned hyperparameter, in the same order as `grid_scores`.
penalty, multi_class, C, solver = [
    [entry.parameters[key] for entry in grid_scores]
    for key in ('penalty', 'multi_class', 'C', 'solver')
]

In [36]:
# Assemble the flattened results of the second search into a table with one row
# per hyperparameter combination, and display it.
result_columns = dict(mean=mean,
                      std=std,
                      penalty=penalty,
                      multi_class=multi_class,
                      C=C,
                      solver=solver)
df = pd.DataFrame(result_columns)
df

Unnamed: 0,C,mean,multi_class,penalty,solver,std
0,0.01,-0.681434,ovr,l2,newton-cg,0.005811
1,0.01,-0.681437,ovr,l2,lbfgs,0.005806
2,0.01,-0.684001,ovr,l2,liblinear,0.005849
3,0.016379,-0.678101,ovr,l2,newton-cg,0.005752
4,0.016379,-0.678118,ovr,l2,lbfgs,0.00575
5,0.016379,-0.679628,ovr,l2,liblinear,0.005767
6,0.026827,-0.675841,ovr,l2,newton-cg,0.005699
7,0.026827,-0.675866,ovr,l2,lbfgs,0.005692
8,0.026827,-0.676744,ovr,l2,liblinear,0.005708
9,0.04394,-0.674316,ovr,l2,newton-cg,0.005652


In [37]:
# Persist the second grid-search summary table to the working directory.
df.to_csv("grid_search_2.csv")

### Using LogisticRegressionCV does not lead to any significant improvement in performance

In [3]:
# Ten log-spaced candidate values of C between 0.1 and 10, shared by all models.
cs = np.logspace(-1, 1, 10)

# One cross-validated logistic regression per solver; every other setting is the
# same (5 folds, 'log_loss' scoring).
log_reg1, log_reg2, log_reg3 = (
    LogisticRegressionCV(Cs=cs, cv=5, scoring='log_loss', solver=solver_name)
    for solver_name in ('lbfgs', 'newton-cg', 'liblinear')
)

In [4]:
# Fit the three solver variants on the full training table.
for cv_model in (log_reg1, log_reg2):
    cv_model.fit(train, labels)
# Kept as the cell's final expression so the fitted liblinear model is still
# echoed as the cell output.
log_reg3.fit(train, labels)

LogisticRegressionCV(Cs=array([  0.1    ,   0.16681,   0.27826,   0.46416,   0.77426,   1.29155,
         2.15443,   3.59381,   5.99484,  10.     ]),
           class_weight=None, cv=5, dual=False, fit_intercept=True,
           intercept_scaling=1.0, max_iter=100, multi_class='ovr',
           n_jobs=1, penalty='l2', refit=True, scoring='log_loss',
           solver='liblinear', tol=0.0001, verbose=0)

In [32]:
# In-sample confusion matrix for each of the three fitted models
# (lbfgs, newton-cg and liblinear, in that order).
cf1, cf2, cf3 = (
    confusion_matrix(labels, fitted_model.predict(train))
    for fitted_model in (log_reg1, log_reg2, log_reg3)
)

In [35]:
# Confusion matrix of the liblinear-based model. In the recorded output nearly
# every sample falls into just two columns (the second and the sixth), so this
# model's predictions are unusable.
cf3          # predictions wrong!

array([[    0,  1810,     0,     0,     0,   119,     0,     0,     0],
       [    0, 14184,     0,     0,     0,  1938,     0,     0,     0],
       [    0,  6376,     0,     0,     0,  1628,     0,     0,     0],
       [    0,  2591,     0,     0,     0,   100,     0,     0,     0],
       [    0,  2721,     0,     0,     0,    18,     0,     0,     0],
       [    0, 14016,     0,     0,     0,   119,     0,     0,     0],
       [    0,  2705,     0,     0,     0,   134,     0,     0,     0],
       [    0,  7762,     0,     0,    24,   653,     0,    25,     0],
       [    0,  4872,     0,     0,     0,    83,     0,     0,     0]])

## Use of bagging

In [9]:
from sklearn.ensemble import BaggingClassifier

In [10]:
# Base estimator for the bagging experiments. The settings are copied from the
# best_params_ reported by the second grid search above
# (C ~= 1.389, one-vs-rest, L2 penalty, newton-cg solver).
log_reg = LogisticRegression(C=1.3894954943731375, multi_class='ovr', penalty='l2', solver='newton-cg')

In [11]:
# First bagging trial: a small ensemble of three copies of the tuned logistic
# regression, each fitted with max_samples=0.6, with a fixed seed for
# repeatability.
bagging_options = dict(base_estimator=log_reg,
                       n_estimators=3,
                       max_samples=0.6,
                       random_state=9876)
bag_c = BaggingClassifier(**bagging_options)
bag_c.fit(train, labels)

BaggingClassifier(base_estimator=LogisticRegression(C=1.38949549437, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.6, n_estimators=3, n_jobs=1, oob_score=False,
         random_state=9876, verbose=0)

In [13]:
# In-sample evaluation of the bagged model: multi-class log loss on the
# predicted probabilities and a confusion matrix on the hard predictions.
# Both are computed on the same `train`/`labels` the ensemble was fitted on.
score_bag = log_loss(labels, bag_c.predict_proba(train))
cf4 = confusion_matrix(labels, bag_c.predict(train))
# Use the print() call form, as the data-loading cells do: with a single
# argument it prints the same text on Python 2 and is also valid on Python 3.
print(score_bag)
print(cf4)

0.659694000021
[[  642   231     3     0     2   176    45   379   451]
 [    8 14500  1320    49    47    43    88    49    18]
 [    0  5703  2081    30     0    16   129    35    10]
 [    0  1744   261   469    19   150    43     4     1]
 [    0   125     2     0  2608     1     0     3     0]
 [   56   231    22    11     1 13152   155   304   203]
 [   38   593   167     9     9   250  1578   183    12]
 [  114   186    39     0     2   255    74  7688   106]
 [   79   239     6     3     4   188    21   248  4167]]


In [16]:
# Candidate settings for the bagging sweep.
# Ensemble sizes: the multiples of 10 up to 100, with 80 absent from the grid.
n_est = [size for size in range(10, 101, 10) if size != 80]       # number of estimators
# Values tried for max_samples: 0.3, 0.4, ..., 0.9.
max_s = [tenths / 10.0 for tenths in range(3, 10)]

In [None]:
# Sweep the bagging settings: for every (n_estimators, max_samples) pair taken
# from `n_est` and `max_s`, fit an ensemble on the training split and record the
# log loss on both the training split and the 5% hold-out split.
# The cell previously used `n`, `m` and `res` without defining them; the loops
# and the result list are made explicit so it runs on a fresh kernel.
res = []          # rows of [n_estimators, max_samples, train log loss, hold-out log loss]
for n in n_est:
    for m in max_s:
        bag_c = BaggingClassifier(base_estimator=log_reg, n_estimators=n, max_samples=m)
        bag_c.fit(train_x, train_y)
        train_score = log_loss(train_y, bag_c.predict_proba(train_x))
        test_score = log_loss(test_y, bag_c.predict_proba(test_x))
        res.append([n, m, train_score, test_score])
        # Formatted through str.format so the line is valid on Python 2 and 3
        # and still prints "train_score test_score" separated by one space.
        print("{0} {1}".format(train_score, test_score))

0.654936900271 0.668139462191
0.655017078165 0.669857704246
0.655875945141 0.672478952523
0.656360762279 0.669820284849
0.657121205607 0.670441685149
0.656800677343 0.670773306021
0.657676004928 0.669524605602
0.654552579659 0.669458531732
0.655129451528 0.668836293226
0.655410502777 0.668418321814
0.655979799127 0.670010415059
0.656088615685 0.670183117844
0.65603354775 0.669185342342
0.657094276036 0.671359991742
0.653538340599 0.669039778708
0.655076574529 0.66803208383
0.654894539176 0.667946146569
0.655811997011 0.669578551486
0.656530186429 0.670220589293
0.656460584618

In [74]:
# Small cross-validated search over the bagging settings (2 folds, 'log_loss'
# scoring), starting from the current `bag_c` estimator.
# max_samples is built with np.linspace so the grid is exactly [0.7, 0.8].
# np.arange(0.7, 0.8, 0.1) only included the second value because of
# floating-point rounding, and produced 0.79999999999999993 rather than 0.8.
param_grid = {'n_estimators': np.arange(1, 3, 1),
              'max_samples': np.linspace(0.7, 0.8, 2)}
gs_bag = GridSearchCV(bag_c, param_grid, cv=2, scoring='log_loss')
gs_bag.fit(train, labels)

GridSearchCV(cv=2, error_score='raise',
       estimator=BaggingClassifier(base_estimator=LogisticRegression(C=1.38949549437, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.6, n_estimators=3, n_jobs=1, oob_score=False,
         random_state=9876, verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'n_estimators': array([1, 2]), 'max_samples': array([ 0.7,  0.8])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='log_loss', verbose=0)

In [77]:
# Mean and standard deviation of the cross-validated score for every
# (n_estimators, max_samples) combination tried by the bagging search.
gs_bag.grid_scores_

[mean: -0.69087, std: 0.00371, params: {'n_estimators': 1, 'max_samples': 0.69999999999999996},
 mean: -0.67974, std: 0.00196, params: {'n_estimators': 2, 'max_samples': 0.69999999999999996},
 mean: -0.68851, std: 0.00304, params: {'n_estimators': 1, 'max_samples': 0.79999999999999993},
 mean: -0.67916, std: 0.00126, params: {'n_estimators': 2, 'max_samples': 0.79999999999999993}]

In [8]:
# Interactive introspection of the sklearn.cross_validation module.
# NOTE(review): leftover exploration -- the trailing `??` is IPython-only syntax
# (not valid plain Python) and the two lines before it are commented-out
# alternatives; consider removing this cell from the finished notebook.
import sklearn.cross_validation
# print(dir(sklearn.cross_validation))
# help(sklearn.cross_validation)
sklearn.cross_validation??