## Decision Tree 

In [34]:
adult['relationship'].unique()

array(['Not-in-family', 'Husband', 'Wife', 'Own-child', 'Unmarried',
       'Other-relative'], dtype=object)

In [35]:
adult['marital_status'].unique()

array(['Never-married', 'Married', 'Not-married', 'Widowed'], dtype=object)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn as sk
import sklearn.model_selection as model_selection

adult = pd.read_csv('clean_census.csv', index_col = 0)
adult.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,over_50k
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-Employed,Bachelors,Married,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,Not-married,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,Below-HS,Married,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,Married,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
y = adult['over_50k']== '>50K'
y.head()

0    False
1    False
2    False
3    False
4    False
Name: over_50k, dtype: bool

In [3]:
adultX = adult.iloc[:,:12]
X = pd.get_dummies(adultX)
X.head()

Unnamed: 0,age,capital_gain,capital_loss,hours_per_week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-Employed,workclass_State-gov,workclass_Without-pay,...,native_country_Portugal,native_country_Puerto-Rico,native_country_Scotland,native_country_South,native_country_Taiwan,native_country_Thailand,native_country_Trinadad&Tobago,native_country_United-States,native_country_Vietnam,native_country_Yugoslavia
0,39,2174,0,40,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,50,0,0,13,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,0,0,40,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,0,0,40,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,0,0,40,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = .2, random_state = 3)

In [12]:
import sklearn.tree as tree
clf = tree.DecisionTreeClassifier()
clf.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [13]:
sk.metrics.accuracy_score(y_test, clf.predict(X_test))

0.81700646444554947

## Logistic Regression

### Default Settings

In [5]:
from sklearn import linear_model
logRegModel = linear_model.LogisticRegression()
logRegModel.fit(X_train,y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [87]:
sk.metrics.accuracy_score(y_test, logRegModel.predict(X_test))

0.84319575667163926

In [36]:
pd.DataFrame([X_train.columns,np.transpose(logRegModel.coef_)]).transpose()

Unnamed: 0,0,1
0,age,[0.0237305504387]
1,capital_gain,[0.000322944633378]
2,capital_loss,[0.00066041881271]
3,hours_per_week,[0.0287133679705]
4,workclass_Federal-gov,[0.241213600272]
5,workclass_Local-gov,[-0.460780750659]
6,workclass_Private,[-0.313320468364]
7,workclass_Self-Employed,[-0.560623770847]
8,workclass_State-gov,[-0.585106352572]
9,workclass_Without-pay,[-0.104970744817]


### Penalty adjustment


In [21]:
logRegModelL1 = linear_model.LogisticRegression(penalty = 'l1')
logRegModelL1 = logRegModelL1.fit(X_train,y_train)
sk.metrics.accuracy_score(y_test, logRegModelL1.predict(X_test))

0.84269849162937183

L1 vs. L2 has minimal performance difference, so we will stick with the more standard squared penalty L2.

### Regularization parameter optimization with K-Fold CV

##### Manual Parameter Search

In [6]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

def cv_score(clf, x, y, score_func=accuracy_score):
    result = 0
    nfold = 5
    for train, cv in KFold(nfold).split(x): # split data into train/test groups, 5 times
        clf.fit(x.iloc[train], y.iloc[train]) # fit
        result += score_func(clf.predict(x.iloc[cv]), y.iloc[cv]) # evaluate score function on held-out data
    return result / nfold # average

In [23]:
Cs = [0.001, .05, 0.1, .5, 1, 5, 10]
for c in Cs:
    clf = linear_model.LogisticRegression(C = c)
    print((c,cv_score(clf,X_train,y_train)))

(0.001, 0.80898515596649401)
(0.05, 0.84740353310758432)
(0.1, 0.84756925891490176)
(0.5, 0.84736200502889147)
(1, 0.84740343862798451)
(5, 0.84682322222675521)
(10, 0.84698902533556364)


In [24]:
logRegModel = linear_model.LogisticRegression(C = .1)
logRegModel.fit(X_train,y_train)
accuracy_score(logRegModel.predict(X_test), y_test)

0.84186971655892595

##### GridSearchCV

In [10]:
from sklearn.grid_search import GridSearchCV
clf2 = linear_model.LogisticRegression()
grid = {"C": [0.001, .05, 0.1, .5, 1, 5, 10]}
gridSearch = GridSearchCV(clf2, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_train,y_train)
clf2 = linear_model.LogisticRegression(C = gridSearch.best_params_['C'])
clf2.fit(X_train,y_train)
print((gridSearch.best_params_['C'],accuracy_score(clf2.predict(X_test),y_test)))



(0.1, 0.84186971655892595)


regularization, combinations of variables

### Feature Selection 

##### By importance coefficient 

In [77]:
coeff = pd.DataFrame([X_train.columns,np.transpose(logRegModel.coef_)]).transpose()
coeff.columns = ['label','value']
coeff = coeff.set_index('label')
coeff = coeff.drop(['capital_gain','capital_loss','age','hours_per_week'])
coeff = coeff.reset_index()
coeff['abs'] = np.abs(coeff['value'])
coeff = coeff.sort_values('abs')
coeff.head(25)

Unnamed: 0,label,value,abs
58,native_country_Holand-Netherlands,[-0.00261264423675],[0.00261264423675]
60,native_country_Hong,[0.00325161682289],[0.00325161682289]
64,native_country_Ireland,[0.00614815670064],[0.00614815670064]
59,native_country_Honduras,[-0.00803719355832],[0.00803719355832]
80,native_country_Thailand,[0.010270293489],[0.010270293489]
57,native_country_Haiti,[-0.010598363792],[0.010598363792]
79,native_country_Taiwan,[0.0118123048848],[0.0118123048848]
74,native_country_Poland,[0.0148479192916],[0.0148479192916]
77,native_country_Scotland,[-0.0155164395032],[0.0155164395032]
45,native_country_Canada,[0.0180026723966],[0.0180026723966]


In [95]:
threshold = .2
weakCoeff = coeff[coeff['abs'] < threshold]['label'].tolist()

In [114]:
thresholds = [.16, .17, .18,.19,.20,.25,.3,.35,.4]
for t in thresholds:
    weakCoeff = coeff[coeff['abs'] < t]['label'].tolist()
    clfFS = linear_model.LogisticRegression()
    X_FS = X.drop(weakCoeff, axis = 1)
    X_FStrain, X_FStest, y_FStrain, y_FStest = model_selection.train_test_split(X_FS, y, test_size = .2, random_state = 3)
    print(t, cv_score(clfFS,X_FStrain,y_FStrain))
    

0.16 0.847942204252
0.17 0.847942204252
0.18 0.848149415193
0.19 0.848149415193
0.2 0.847734950366
0.25 0.84665748783
0.3 0.847113326133
0.35 0.846698887073
0.4 0.846491624598


In [115]:
threshold = .19
weakCoeff = coeff[coeff['abs'] < threshold]['label'].tolist()
X_FS = X.drop(weakCoeff, axis = 1)
X_FStrain, X_FStest, y_FStrain, y_FStest = model_selection.train_test_split(X_FS, y, test_size = .2, random_state = 3)
grid = {"C": [0.001, .05, 0.1, .5, 1, 5, 10]}
gridSearch = GridSearchCV(clfFS, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_FStrain,y_FStrain)
clf2 = linear_model.LogisticRegression(C = gridSearch.best_params_['C'])
clf2.fit(X_FStrain,y_FStrain)
print((gridSearch.best_params_['C'],accuracy_score(clf2.predict(X_FStest),y_FStest)))

(0.5, 0.84220122658710428)


### By category

##### No Native Country 

In [30]:
adultNoCountry = adult.drop('native_country', axis = 1)
XnoCountry = pd.get_dummies(adultNoCountry.iloc[:,:11])
YnoCountry = adultNoCountry.iloc[:,11]
XnoCountry.head()

Unnamed: 0,age,capital_gain,capital_loss,hours_per_week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-Employed,workclass_State-gov,workclass_Without-pay,...,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male
0,39,2174,0,40,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
1,50,0,0,13,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,1
2,38,0,0,40,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,53,0,0,40,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,28,0,0,40,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,1,0


In [31]:
X_nctrain, X_nctest, y_nctrain, y_nctest = model_selection.train_test_split(XnoCountry, YnoCountry, test_size = .2, random_state = 3)

In [32]:
clfNC = linear_model.LogisticRegression()
grid = {"C": [0.001, .05, 0.1, .5, 1, 5, 10]}
gridSearch = GridSearchCV(clfNC, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nctrain,y_nctrain)
clf2 = linear_model.LogisticRegression(C = gridSearch.best_params_['C'])
clf2.fit(X_nctrain,y_nctrain)
print((gridSearch.best_params_['C'],accuracy_score(clf2.predict(X_nctest),y_nctest)))

(0.1, 0.84303000165755015)


##### No Native Country, Race 

In [133]:
adultncnr = adult.drop(['native_country','race'], axis = 1)
Xncnr = pd.get_dummies(adultncnr.iloc[:,:10])
Yncnr = adultncnr['over_50k'] == '>50K'
Xncnr.head()

Unnamed: 0,age,capital_gain,capital_loss,hours_per_week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-Employed,workclass_State-gov,workclass_Without-pay,...,occupation_Tech-support,occupation_Transport-moving,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,sex_Female,sex_Male
0,39,2174,0,40,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
1,50,0,0,13,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
2,38,0,0,40,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,53,0,0,40,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,28,0,0,40,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [134]:
X_ncnrtrain, X_ncnrtest, y_ncnrtrain, y_ncnrtest = model_selection.train_test_split(Xncnr, Yncnr, test_size = .2, random_state = 3)

In [135]:
clfncnr = linear_model.LogisticRegression()
grid = {"C": [0.001, .05, 0.1, .5, 1, 5, 10]}
gridSearch = GridSearchCV(clfncnr, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_ncnrtrain,y_ncnrtrain)
clf2 = linear_model.LogisticRegression(C = gridSearch.best_params_['C'])
clf2.fit(X_ncnrtrain,y_ncnrtrain)
print((gridSearch.best_params_['C'],accuracy_score(clf2.predict(X_ncnrtest),y_ncnrtest)))

(0.5, 0.84402453174208525)


###### No Native Country, Relationship

In [33]:
adultNoCNoRel = adult.drop(['native_country','relationship'], axis = 1)
XnCnRel = pd.get_dummies(adultNoCNoRel.iloc[:,:10])
YnCnRel = adultNoCNoRel['over_50k'] == '>50K'

In [34]:
X_nCnReltrain, X_nCnReltest, y_nCnReltrain, y_nCnReltest = model_selection.train_test_split(XnCnRel, YnCnRel, test_size = .2, random_state = 3)

In [35]:
clfnCnRel = linear_model.LogisticRegression()
grid = {"C": [0.001, .05, 0.1, .5, 1, 5, 10]}
gridSearch = GridSearchCV(clfnCnRel, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nCnReltrain,y_nCnReltrain)
clf2 = linear_model.LogisticRegression(C = gridSearch.best_params_['C'])
clf2.fit(X_nCnReltrain,y_nCnReltrain)
print((gridSearch.best_params_['C'],accuracy_score(clf2.predict(X_nCnReltest),y_nCnReltest)))

(10, 0.84220122658710428)


##### No Native Country, Race, Workclass

In [136]:
adultncnrnwc = adult.drop(['native_country','race','workclass'], axis = 1)
Xncnrnwc = pd.get_dummies(adultncnrnwc.iloc[:,:9])
Yncnrnwc = adultncnrnwc['over_50k'] == '>50K'
Xncnrnwc.head()

Unnamed: 0,age,capital_gain,capital_loss,hours_per_week,education_Associates,education_Bachelors,education_Below-HS,education_Doctorate,education_HS-grad,education_Masters,...,occupation_Tech-support,occupation_Transport-moving,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,sex_Female,sex_Male
0,39,2174,0,40,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,50,0,0,13,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,38,0,0,40,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
3,53,0,0,40,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,28,0,0,40,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [137]:
X_ncnrnwctrain, X_ncnrnwctest, y_ncnrnwctrain, y_ncnrnwctest = model_selection.train_test_split(Xncnrnwc, Yncnrnwc, test_size = .2, random_state = 3)

In [138]:
clfncnrnwc = linear_model.LogisticRegression()
grid = {"C": [0.001, .05, 0.1, .5, 1, 5, 10]}
gridSearch = GridSearchCV(clfncnrnwc, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_ncnrnwctrain,y_ncnrnwctrain)
clf2 = linear_model.LogisticRegression(C = gridSearch.best_params_['C'])
clf2.fit(X_ncnrnwctrain,y_ncnrnwctrain)
print((gridSearch.best_params_['C'],accuracy_score(clf2.predict(X_ncnrnwctest),y_ncnrnwctest)))

(5, 0.84203547157301506)


#####  No Native Country, Relationship, Race

In [36]:
adultNoCNoRelnRa = adult.drop(['native_country','relationship','race'], axis = 1)
XnCnRelnRa = pd.get_dummies(adultNoCNoRelnRa.iloc[:,:9])
YnCnRelnRa = adultNoCNoRelnRa['over_50k'] == '>50K'
XnCnRelnRa.head()

Unnamed: 0,age,capital_gain,capital_loss,hours_per_week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-Employed,workclass_State-gov,workclass_Without-pay,...,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,sex_Female,sex_Male
0,39,2174,0,40,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,50,0,0,13,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,38,0,0,40,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,53,0,0,40,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,28,0,0,40,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [37]:
X_nCnRelnRatrain, X_nCnRelnRatest, y_nCnRelnRatrain, y_nCnRelnRatest = model_selection.train_test_split(XnCnRelnRa, YnCnRelnRa, test_size = .2, random_state = 3)

In [41]:
clfnCnRelnRa = linear_model.LogisticRegression()
grid = {"C": [0.001, .05, 0.1, .5, 1, 5, 10]}
gridSearch = GridSearchCV(clfnCnRelnRa, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nCnRelnRatrain,y_nCnRelnRatrain)
clf3 = linear_model.LogisticRegression(C = gridSearch.best_params_['C'])
clf3.fit(X_nCnRelnRatrain,y_nCnRelnRatrain)
print((gridSearch.best_params_['C'],accuracy_score(clf3.predict(X_nCnRelnRatest),y_nCnRelnRatest)))

(0.1, 0.84220122658710428)


##### No Native Country, Relationship, Race, Occupation

In [48]:
adultnOcc= adult.drop(['native_country','relationship','race','occupation'], axis = 1)
XnOcc = pd.get_dummies(adultnOcc.iloc[:,:8])
YnOcc = adultnOcc['over_50k'] == '>50K'
XnOcc.head()

Unnamed: 0,age,capital_gain,capital_loss,hours_per_week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-Employed,workclass_State-gov,workclass_Without-pay,...,education_Doctorate,education_HS-grad,education_Masters,education_Prof-school,marital_status_Married,marital_status_Never-married,marital_status_Not-married,marital_status_Widowed,sex_Female,sex_Male
0,39,2174,0,40,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1
1,50,0,0,13,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
2,38,0,0,40,0,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,1
3,53,0,0,40,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,28,0,0,40,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [50]:
X_nOcctrain, X_nOcctest, y_nOcctrain, y_nOcctest = model_selection.train_test_split(XnOcc, YnOcc, test_size = .2, random_state = 3)

In [52]:
clfnOcc = linear_model.LogisticRegression()
grid = {"C": [0.001, .05, 0.1, .5, 1, 5, 10]}
gridSearch = GridSearchCV(clfnOcc, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nOcctrain,y_nOcctrain)
clf3 = linear_model.LogisticRegression(C = gridSearch.best_params_['C'])
clf3.fit(X_nOcctrain,y_nOcctrain)
print((gridSearch.best_params_['C'],accuracy_score(clf3.predict(X_nOcctest),y_nOcctest)))

(0.1, 0.83689706613625059)


###### No Native Country, Relationship, Race, Age

In [54]:
adultnAge= adult.drop(['native_country','relationship','race','age'], axis = 1)
XnAge = pd.get_dummies(adultnAge.iloc[:,:8])
YnAge = adultnAge['over_50k'] == '>50K'
XnAge.head()

Unnamed: 0,capital_gain,capital_loss,hours_per_week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-Employed,workclass_State-gov,workclass_Without-pay,education_Associates,...,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,sex_Female,sex_Male
0,2174,0,40,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,13,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,40,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,40,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,40,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [55]:
X_nAgetrain, X_nAgetest, y_nAgetrain, y_nAgetest = model_selection.train_test_split(XnAge, YnAge, test_size = .2, random_state = 3)

In [56]:
clfnAge = linear_model.LogisticRegression()
grid = {"C": [0.001, .05, 0.1, .5, 1, 5, 10]}
gridSearch = GridSearchCV(clfnAge, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nAgetrain,y_nAgetrain)
clf3 = linear_model.LogisticRegression(C = gridSearch.best_params_['C'])
clf3.fit(X_nAgetrain,y_nAgetrain)
print((gridSearch.best_params_['C'],accuracy_score(clf3.predict(X_nAgetest),y_nAgetest)))

(10, 0.83888612630532078)


##### No Native Country, Relationship, Race, Hours

In [58]:
adultnHours= adult.drop(['native_country','relationship','race','hours_per_week'], axis = 1)
XnHours = pd.get_dummies(adultnHours.iloc[:,:8])
YnHours = adultnHours['over_50k'] == '>50K'
XnHours.head()

Unnamed: 0,age,capital_gain,capital_loss,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-Employed,workclass_State-gov,workclass_Without-pay,education_Associates,...,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,sex_Female,sex_Male
0,39,2174,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,50,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,38,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,53,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,28,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [60]:
X_nHourstrain, X_nHourstest, y_nHourstrain, y_nHourstest = model_selection.train_test_split(XnHours, YnHours, test_size = .2, random_state = 3)

In [61]:
clfnHours = linear_model.LogisticRegression()
grid = {"C": [0.001, .05, 0.1, .5, 1, 5, 10]}
gridSearch = GridSearchCV(clfnHours, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nHourstrain,y_nHourstrain)
clf3 = linear_model.LogisticRegression(C = gridSearch.best_params_['C'])
clf3.fit(X_nHourstrain,y_nHourstrain)
print((gridSearch.best_params_['C'],accuracy_score(clf3.predict(X_nHourstest),y_nHourstest)))

(0.5, 0.83954914636167743)


##### No Native Country, Relationship, Race, Workclass

In [7]:
adultnWC= adult.drop(['native_country','relationship','race','workclass'], axis = 1)
XnWC = pd.get_dummies(adultnWC.iloc[:,:8])
YnWC = adultnWC['over_50k'] == '>50K'
XnWC.head()

Unnamed: 0,age,capital_gain,capital_loss,hours_per_week,education_Associates,education_Bachelors,education_Below-HS,education_Doctorate,education_HS-grad,education_Masters,...,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,sex_Female,sex_Male
0,39,2174,0,40,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,50,0,0,13,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,38,0,0,40,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,53,0,0,40,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,28,0,0,40,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [8]:
X_nWCtrain, X_nWCtest, y_nWCtrain, y_nWCtest = model_selection.train_test_split(XnWC, YnWC, test_size = .2, random_state = 3)

In [11]:
clfnWC = linear_model.LogisticRegression()
grid = {"C": [0.001, .05, 0.1, .5, 1, 5, 10]}
gridSearch = GridSearchCV(clfnWC, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nWCtrain,y_nWCtrain)
clf3 = linear_model.LogisticRegression(C = gridSearch.best_params_['C'])
clf3.fit(X_nWCtrain,y_nWCtrain)
print((gridSearch.best_params_['C'],accuracy_score(clf3.predict(X_nWCtest),y_nWCtest)))

(0.1, 0.84286424664346093)


In [68]:
adultnWC.columns

Index(['age', 'education', 'marital_status', 'occupation', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'over_50k'],
      dtype='object')

##### No Native Country, Relationship, Race, Workclass, Education

In [82]:
adultnEd= adult.drop(['native_country','relationship','race','workclass','education'], axis = 1)
XnEd = pd.get_dummies(adultnEd.iloc[:,:7])
YnEd = adultnEd['over_50k'] == '>50K'
XnEd.head()

Unnamed: 0,age,capital_gain,capital_loss,hours_per_week,marital_status_Married,marital_status_Never-married,marital_status_Not-married,marital_status_Widowed,occupation_Adm-clerical,occupation_Armed-Forces,...,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,sex_Female,sex_Male
0,39,2174,0,40,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,50,0,0,13,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,38,0,0,40,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,53,0,0,40,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,28,0,0,40,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [83]:
X_nEdtrain, X_nEdtest, y_nEdtrain, y_nEdtest = model_selection.train_test_split(XnEd, YnEd, test_size = .2, random_state = 3)

In [84]:
clfnEd = linear_model.LogisticRegression()
grid = {"C": [0.001, .05, 0.1, .5, 1, 5, 10]}
gridSearch = GridSearchCV(clfnEd, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nEdtrain,y_nEdtrain)
clf3 = linear_model.LogisticRegression(C = gridSearch.best_params_['C'])
clf3.fit(X_nEdtrain,y_nEdtrain)
print((gridSearch.best_params_['C'],accuracy_score(clf3.predict(X_nEdtest),y_nEdtest)))

(5, 0.84087518647439086)


#####  No Native Country, Relationship, Race, Workclass, Marital Status

In [88]:
adultnMS= adult.drop(['native_country','relationship','race','workclass','marital_status'], axis = 1)
XnMS = pd.get_dummies(adultnMS.iloc[:,:7])
YnMS = adultnMS['over_50k'] == '>50K'
XnMS.head()

Unnamed: 0,age,capital_gain,capital_loss,hours_per_week,education_Associates,education_Bachelors,education_Below-HS,education_Doctorate,education_HS-grad,education_Masters,...,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,sex_Female,sex_Male
0,39,2174,0,40,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,50,0,0,13,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,38,0,0,40,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,53,0,0,40,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,28,0,0,40,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [89]:
X_nMStrain, X_nMStest, y_nMStrain, y_nMStest = model_selection.train_test_split(XnMS, YnMS, test_size = .2, random_state = 3)

In [90]:
clfnMS = linear_model.LogisticRegression()
grid = {"C": [0.001, .05, 0.1, .5, 1, 5, 10]}
gridSearch = GridSearchCV(clfnMS, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nMStrain,y_nMStrain)
clf3 = linear_model.LogisticRegression(C = gridSearch.best_params_['C'])
clf3.fit(X_nMStrain,y_nMStrain)
print((gridSearch.best_params_['C'],accuracy_score(clf3.predict(X_nMStest),y_nMStest)))

(0.05, 0.8228078899386706)


#####  No Native Country, Relationship, Race, Workclass, Sex

In [93]:
adultnSx= adult.drop(['native_country','relationship','race','workclass','sex'], axis = 1)
XnSx = pd.get_dummies(adultnSx.iloc[:,:7])
YnSx = adultnSx['over_50k'] == '>50K'
XnSx.head()

Unnamed: 0,age,capital_gain,capital_loss,hours_per_week,education_Associates,education_Bachelors,education_Below-HS,education_Doctorate,education_HS-grad,education_Masters,...,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving
0,39,2174,0,40,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,50,0,0,13,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,38,0,0,40,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
3,53,0,0,40,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,28,0,0,40,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [94]:
X_nSxtrain, X_nSxtest, y_nSxtrain, y_nSxtest = model_selection.train_test_split(XnSx, YnSx, test_size = .2, random_state = 3)

In [95]:
clfnSx = linear_model.LogisticRegression()
grid = {"C": [0.001, .05, 0.1, .5, 1, 5, 10]}
gridSearch = GridSearchCV(clfnSx, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nSxtrain,y_nSxtrain)
clf3 = linear_model.LogisticRegression(C = gridSearch.best_params_['C'])
clf3.fit(X_nSxtrain,y_nSxtrain)
print((gridSearch.best_params_['C'],accuracy_score(clf3.predict(X_nSxtest),y_nSxtest)))

(0.5, 0.84153820653074751)


#####  No Native Country, Relationship, Race, Workclass, Combined Capital Gains

In [121]:
cols = ['age','education','marital_status','occupation','sex','hours_per_week','total_capital','capital_gain','capital_loss','over_50k']
adultCCG= adult.drop(['native_country','relationship','race','workclass'], axis = 1)
adultCCG['total_capital'] = adultCCG['capital_gain'] - adultCCG['capital_loss']
adultCCG = adultCCG[cols]
XCCG = pd.get_dummies(adultCCG.iloc[:,:7])
YCCG = adultCCG['over_50k'] == '>50K'
adultCCG.head()

Unnamed: 0,age,education,marital_status,occupation,sex,hours_per_week,total_capital,capital_gain,capital_loss,over_50k
0,39,Bachelors,Never-married,Adm-clerical,Male,40,2174,2174,0,<=50K
1,50,Bachelors,Married,Exec-managerial,Male,13,0,0,0,<=50K
2,38,HS-grad,Not-married,Handlers-cleaners,Male,40,0,0,0,<=50K
3,53,Below-HS,Married,Handlers-cleaners,Male,40,0,0,0,<=50K
4,28,Bachelors,Married,Prof-specialty,Female,40,0,0,0,<=50K


In [122]:
XCCG.columns

Index(['age', 'hours_per_week', 'total_capital', 'education_Associates',
       'education_Bachelors', 'education_Below-HS', 'education_Doctorate',
       'education_HS-grad', 'education_Masters', 'education_Prof-school',
       'marital_status_Married', 'marital_status_Never-married',
       'marital_status_Not-married', 'marital_status_Widowed',
       'occupation_Adm-clerical', 'occupation_Armed-Forces',
       'occupation_Craft-repair', 'occupation_Exec-managerial',
       'occupation_Farming-fishing', 'occupation_Handlers-cleaners',
       'occupation_Machine-op-inspct', 'occupation_Other-service',
       'occupation_Priv-house-serv', 'occupation_Prof-specialty',
       'occupation_Protective-serv', 'occupation_Sales',
       'occupation_Tech-support', 'occupation_Transport-moving', 'sex_Female',
       'sex_Male'],
      dtype='object')

In [123]:
X_CCGtrain, X_CCGtest, y_CCGtrain, y_CCGtest = model_selection.train_test_split(XCCG, YCCG, test_size = .2, random_state = 3)

In [124]:
clfCCG = linear_model.LogisticRegression()
grid = {"C": [0.001, .05, 0.1, .5, 1, 5, 10]}
gridSearch = GridSearchCV(clfCCG, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_CCGtrain,y_CCGtrain)
clf3 = linear_model.LogisticRegression(C = gridSearch.best_params_['C'])
clf3.fit(X_CCGtrain,y_CCGtrain)
print((gridSearch.best_params_['C'],accuracy_score(clf3.predict(X_CCGtest),y_CCGtest)))

(1, 0.83772584120669646)


### Best performance: 
No Native Country, Race: C = 0.5, accuracy =  0.844


## k-NN 

In [140]:
import sklearn.neighbors as neighbors
neigh = neighbors.KNeighborsClassifier(n_neighbors = 5)
neigh.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [141]:
accuracy_score(neigh.predict(X_test),y_test)

0.83888612630532078

#### Optimizing K

In [144]:
clfN = neighbors.KNeighborsClassifier()
grid = {"n_neighbors": [3,4,5,6,7,8,9,10,11,12]}
gridSearch = GridSearchCV(clfN, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_train,y_train)
clfN2 = neighbors.KNeighborsClassifier(n_neighbors = gridSearch.best_params_['n_neighbors'])
clfN2.fit(X_train,y_train)
print((gridSearch.best_params_['n_neighbors'],accuracy_score(clfN2.predict(X_test),y_test)))

(10, 0.84054367644621253)


#### Optimizing K with Feature Selection (No Native Country, Race)

In [146]:
clfN = neighbors.KNeighborsClassifier()
grid = {"n_neighbors": [3,4,5,6,7,8,9,10,11,12]}
gridSearch = GridSearchCV(clfN, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_ncnrtrain,y_ncnrtrain)
clfN2 = neighbors.KNeighborsClassifier(n_neighbors = gridSearch.best_params_['n_neighbors'])
clfN2.fit(X_ncnrtrain,y_ncnrtrain)
print((gridSearch.best_params_['n_neighbors'],accuracy_score(clfN2.predict(X_ncnrtest),y_ncnrtest)))

(10, 0.8413724515166584)


### Best performance:
No Native Country, Race: K = 10, accuracy = .841

## Random Forest

In [147]:
import sklearn.ensemble as ensemble
randF = ensemble.RandomForestClassifier()
randF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [148]:
accuracy_score(randF.predict(X_test),y_test)

0.83971490137576665

#### Optimizing Number of Trees

In [150]:
clfRF = ensemble.RandomForestClassifier()
grid = {"n_estimators": [5,10,20,40,50,60,70,80,90,100]}
gridSearch = GridSearchCV(clfRF, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_train,y_train)
clfRF2 = ensemble.RandomForestClassifier(n_estimators = gridSearch.best_params_['n_estimators'])
clfRF2.fit(X_train,y_train)
print((gridSearch.best_params_['n_estimators'],accuracy_score(clfRF2.predict(X_test),y_test)))

(80, 0.84286424664346093)


#### Optimizing Tree Numbers with Feature Selection (No Native Country, Race)

In [151]:
clfRF = ensemble.RandomForestClassifier()
grid = {"n_estimators": [5,10,20,40,50,60,70,80,90,100]}
gridSearch = GridSearchCV(clfRF, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_ncnrtrain,y_ncnrtrain)
clfRF2 = ensemble.RandomForestClassifier(n_estimators = gridSearch.best_params_['n_estimators'])
clfRF2.fit(X_ncnrtrain,y_ncnrtrain)
print((gridSearch.best_params_['n_estimators'],accuracy_score(clfRF2.predict(X_ncnrtest),y_ncnrtest)))

(50, 0.83971490137576665)


### Best Performance:
Full Feature Set: number of trees: 80, accuracy: .843

## Support Vector Machine 

In [152]:
from sklearn import svm
clfSVM = svm.SVC()
clfSVM.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [14]:
accuracy_score(clfSVM.predict(X_test), y_test)

0.85463285264379252

## Gradient Boosting 

In [153]:
import sklearn.ensemble as ensemble
clfBoost = ensemble.GradientBoostingClassifier()
clfBoost.fit(X_train,y_train)
accuracy_score(clfBoost.predict(X_test),y_test)

0.86010276810873532

#### Optimizing min_impurity_split 

In [17]:
clfGB = ensemble.GradientBoostingClassifier()
grid = {"min_impurity_split":[.005,.0075,.01,.0125,.015]}
gridSearch = GridSearchCV(clfGB, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_train,y_train)
clfGB2 = ensemble.GradientBoostingClassifier(min_impurity_split =  gridSearch.best_params_['min_impurity_split'])
clfGB2.fit(X_train,y_train)
print((gridSearch.best_params_['min_impurity_split'],accuracy_score(clfGB2.predict(X_test),y_test)))

(0.01, 0.86010276810873532)


#### Using Feature Selection (No Native Country, Relationship, Race, Workclass)

In [154]:
clfGB = ensemble.GradientBoostingClassifier()
grid = {"min_impurity_split":[.0001,.0025,.005]}
gridSearch = GridSearchCV(clfGB, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nWCtrain,y_nWCtrain)
clfGB2 = ensemble.GradientBoostingClassifier(min_impurity_split =  gridSearch.best_params_['min_impurity_split'])
clfGB2.fit(X_nWCtrain,y_nWCtrain)
print((gridSearch.best_params_['min_impurity_split'],accuracy_score(clfGB2.predict(X_nWCtest),y_nWCtest)))

(0.0001, 0.8610972981932703)


#### Using Feature Selection (No Native Country, Race)

In [157]:
clfGB = ensemble.GradientBoostingClassifier()
grid = {"min_impurity_split":[.0001,.0025,.005]}
gridSearch = GridSearchCV(clfGB, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_ncnrtrain,y_ncnrtrain)
clfGB2 = ensemble.GradientBoostingClassifier(min_impurity_split =  gridSearch.best_params_['min_impurity_split'])
clfGB2.fit(X_ncnrtrain,y_ncnrtrain)
print((gridSearch.best_params_['min_impurity_split'],accuracy_score(clfGB2.predict(X_ncnrtest),y_ncnrtest)))

(0.0025, 0.85977125808055699)


### Best performance:
No Native Country, Relationship, Race, Workclass: min_impurity_split: .0025, accuracy: .860

## AdaBoost 

In [20]:
clfAda = ensemble.AdaBoostClassifier()
clfAda.fit(X_train,y_train)
accuracy_score(clfAda.predict(X_test),y_test)

0.85645615779877338

#### Optimizing Estimators

In [38]:
clfAB = ensemble.AdaBoostClassifier()
grid = {"n_estimators":[250,260,270,280,290]}
gridSearch = GridSearchCV(clfAB, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_train,y_train)
clfAB2 = ensemble.AdaBoostClassifier(n_estimators =  gridSearch.best_params_['n_estimators'])
clfAB2.fit(X_train,y_train)
print((gridSearch.best_params_['n_estimators'],accuracy_score(clfAB2.predict(X_test),y_test)))

(270, 0.86441239847505391)


#####  Optimizing Estimators with Feature Selection

###### No Native Country, Race

In [158]:
clfAB = ensemble.AdaBoostClassifier()
grid = {"n_estimators":[250,260,270,280,290]}
gridSearch = GridSearchCV(clfAB, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_ncnrtrain,y_ncnrtrain)
clfAB2 = ensemble.AdaBoostClassifier(n_estimators =  gridSearch.best_params_['n_estimators'])
clfAB2.fit(X_ncnrtrain,y_ncnrtrain)
print((gridSearch.best_params_['n_estimators'],accuracy_score(clfAB2.predict(X_ncnrtest),y_ncnrtest)))

(290, 0.86374937841869714)


###### Using Feature Selection (No Native Country, Relationship, Race, Workclass)

In [160]:
clfAB = ensemble.AdaBoostClassifier()
grid = {"n_estimators":[250,260,270,280,290]}
gridSearch = GridSearchCV(clfAB, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nWCtrain,y_nWCtrain)
clfAB2 = ensemble.AdaBoostClassifier(n_estimators =  gridSearch.best_params_['n_estimators'])
clfAB2.fit(X_nWCtrain,y_nWCtrain)
print((gridSearch.best_params_['n_estimators'],accuracy_score(clfAB2.predict(X_nWCtest),y_nWCtest)))

(270, 0.86391513343278636)


#### Best Performance:
No Native Country, Relationship, Race, Workclass: number of estimators: 270, accuracy: .864

## Ensemble Methods

#### Logistic Regression with AdaBoost (isotonic regression)

In [161]:
clfncnr = linear_model.LogisticRegression()
grid = {"C": [0.001, .05, 0.1, .5, 1, 5, 10]}
gridSearch = GridSearchCV(clfncnr, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_ncnrtrain,y_ncnrtrain)
clfncnr2 = linear_model.LogisticRegression(C = gridSearch.best_params_['C'])
clfncnr2.fit(X_ncnrtrain,y_ncnrtrain)
print((gridSearch.best_params_['C'],accuracy_score(clfncnr2.predict(X_ncnrtest),y_ncnrtest)))

(0.5, 0.84402453174208525)


In [162]:
clfAB = ensemble.AdaBoostClassifier()
grid = {"n_estimators":[250,260,270,280,290]}
gridSearch = GridSearchCV(clfAB, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nWCtrain,y_nWCtrain)
clfAB2 = ensemble.AdaBoostClassifier(n_estimators =  gridSearch.best_params_['n_estimators'])
clfAB2.fit(X_nWCtrain,y_nWCtrain)
print((gridSearch.best_params_['n_estimators'],accuracy_score(clfAB2.predict(X_nWCtest),y_nWCtest)))

(270, 0.86391513343278636)


In [176]:
clfncnr2.predict_proba(X_ncnrtest)[:,0]

array([ 0.96657841,  0.99698336,  0.53770326, ...,  0.49966787,
        0.96344782,  0.93369909])

In [181]:
from sklearn import calibration
clfAB = ensemble.AdaBoostClassifier()
grid = {"n_estimators":[250,260,270,280,290]}
gridSearch = GridSearchCV(clfAB, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nWCtrain,y_nWCtrain)
clfAB2 = ensemble.AdaBoostClassifier(n_estimators =  gridSearch.best_params_['n_estimators'])
clfAB3 = calibration.CalibratedClassifierCV(clfAB2, method = 'isotonic', cv = 5)
clfAB3.fit(X_nWCtrain,y_nWCtrain)
clfAB3.predict_proba(X_nWCtest)

array([[ 0.97208232,  0.02791768],
       [ 0.99769176,  0.00230824],
       [ 0.45451097,  0.54548903],
       ..., 
       [ 0.57869991,  0.42130009],
       [ 0.99301341,  0.00698659],
       [ 0.99330307,  0.00669693]])

In [218]:
logProbs = clfncnr2.predict_proba(X_ncnrtest)[:,0]
adaProbs = clfAB3.predict_proba(X_nWCtest)[:,0]
probas = pd.DataFrame([logProbs,adaProbs]).transpose()
probas.columns = ['logProbs', 'adaProbs']
probas['avgProbs'] = (logProbs + adaProbs)/2
probas['prediction'] =  np.logical_not(probas['avgProbs'] > .5)
probas.head()

Unnamed: 0,logProbs,adaProbs,avgProbs,prediction
0,0.966578,0.972082,0.96933,False
1,0.996983,0.997692,0.997338,False
2,0.537703,0.454511,0.496107,True
3,0.707859,0.618518,0.663189,False
4,0.974429,0.997625,0.986027,False


In [220]:
accuracy_score(probas['prediction'],y_nWCtest)

0.85943974805237855

Conclusion: Worse performance than with AdaBoost alone 

#### Logistic Regression with AdaBoost (sigmoid regression)

In [221]:
clfABs = ensemble.AdaBoostClassifier()
grid = {"n_estimators":[250,260,270,280,290]}
gridSearch = GridSearchCV(clfAB, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nWCtrain,y_nWCtrain)
clfABs2 = ensemble.AdaBoostClassifier(n_estimators =  gridSearch.best_params_['n_estimators'])
clfABs3 = calibration.CalibratedClassifierCV(clfAB2, method = 'sigmoid', cv = 5)
clfABs3.fit(X_nWCtrain,y_nWCtrain)
clfABs3.predict_proba(X_nWCtest)

array([[ 0.96573377,  0.03426623],
       [ 0.99812939,  0.00187061],
       [ 0.46272974,  0.53727026],
       ..., 
       [ 0.59423566,  0.40576434],
       [ 0.9917445 ,  0.0082555 ],
       [ 0.99128693,  0.00871307]])

In [222]:
logProbs = clfncnr2.predict_proba(X_ncnrtest)[:,0]
adaProbs = clfABs3.predict_proba(X_nWCtest)[:,0]
probas = pd.DataFrame([logProbs,adaProbs]).transpose()
probas.columns = ['logProbs', 'adaProbs']
probas['avgProbs'] = (logProbs + adaProbs)/2
probas['prediction'] =  np.logical_not(probas['avgProbs'] > .5)
accuracy_score(probas['prediction'],y_nWCtest)

0.85877672799602189

#### Gradient Boost with AdaBoost (isotonic regression)

In [223]:
clfGB = ensemble.GradientBoostingClassifier()
grid = {"min_impurity_split":[.0001,.0025,.005]}
gridSearch = GridSearchCV(clfGB, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nWCtrain,y_nWCtrain)
clfGB2 = ensemble.GradientBoostingClassifier(min_impurity_split =  gridSearch.best_params_['min_impurity_split'])
clfGB3 = calibration.CalibratedClassifierCV(clfGB2, method = 'isotonic', cv = 5)
clfGB3.fit(X_nWCtrain,y_nWCtrain)

CalibratedClassifierCV(base_estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=0.0025, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
            cv=5, method='isotonic')

In [224]:
clfAB = ensemble.AdaBoostClassifier()
grid = {"n_estimators":[250,260,270,280,290]}
gridSearch = GridSearchCV(clfAB, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nWCtrain,y_nWCtrain)
clfAB2 = ensemble.AdaBoostClassifier(n_estimators =  gridSearch.best_params_['n_estimators'])
clfAB3 = calibration.CalibratedClassifierCV(clfAB2, method = 'isotonic', cv = 5)
clfAB3.fit(X_nWCtrain,y_nWCtrain)

CalibratedClassifierCV(base_estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=270, random_state=None),
            cv=5, method='isotonic')

In [227]:
gbProbs = clfGB3.predict_proba(X_nWCtest)[:,0]
adaProbs = clfAB3.predict_proba(X_nWCtest)[:,0]
probas = pd.DataFrame([gbProbs,adaProbs]).transpose()
probas.columns = ['gbProbs', 'adaProbs']
probas['avgProbs'] = (gbProbs + adaProbs)/2
probas['prediction'] =  np.logical_not(probas['avgProbs'] > .5)
accuracy_score(probas['prediction'],y_nWCtest)

0.86441239847505391

#### Gradient Boost with AdaBoost (sigmoid regression)

In [228]:
clfGB = ensemble.GradientBoostingClassifier()
grid = {"min_impurity_split":[.0001,.0025,.005]}
gridSearch = GridSearchCV(clfGB, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nWCtrain,y_nWCtrain)
clfGB2 = ensemble.GradientBoostingClassifier(min_impurity_split =  gridSearch.best_params_['min_impurity_split'])
clfGB3 = calibration.CalibratedClassifierCV(clfGB2, method = 'sigmoid', cv = 5)
clfGB3.fit(X_nWCtrain,y_nWCtrain)

CalibratedClassifierCV(base_estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=0.0025, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
            cv=5, method='sigmoid')

In [229]:
clfAB = ensemble.AdaBoostClassifier()
grid = {"n_estimators":[250,260,270,280,290]}
gridSearch = GridSearchCV(clfAB, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nWCtrain,y_nWCtrain)
clfAB2 = ensemble.AdaBoostClassifier(n_estimators =  gridSearch.best_params_['n_estimators'])
clfAB3 = calibration.CalibratedClassifierCV(clfAB2, method = 'sigmoid', cv = 5)
clfAB3.fit(X_nWCtrain,y_nWCtrain)

CalibratedClassifierCV(base_estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=270, random_state=None),
            cv=5, method='sigmoid')

In [230]:
gbProbs = clfGB3.predict_proba(X_nWCtest)[:,0]
adaProbs = clfAB3.predict_proba(X_nWCtest)[:,0]
probas = pd.DataFrame([gbProbs,adaProbs]).transpose()
probas.columns = ['gbProbs', 'adaProbs']
probas['avgProbs'] = (gbProbs + adaProbs)/2
probas['prediction'] =  np.logical_not(probas['avgProbs'] > .5)
accuracy_score(probas['prediction'],y_nWCtest)

0.86292060334825127

#### Logistic Regression with k-NN

In [233]:
clfN = neighbors.KNeighborsClassifier()
grid = {"n_neighbors": [8,9,10,11]}
gridSearch = GridSearchCV(clfN, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_ncnrtrain,y_ncnrtrain)
clfN2 = neighbors.KNeighborsClassifier(n_neighbors = gridSearch.best_params_['n_neighbors'])
clfN2.fit(X_ncnrtrain,y_ncnrtrain)
print((gridSearch.best_params_['n_neighbors'],accuracy_score(clfN2.predict(X_ncnrtest),y_ncnrtest)))

(10, 0.8413724515166584)


In [234]:
logProbs = clfncnr2.predict_proba(X_ncnrtest)[:,0]
kNNProbs = clfN2.predict_proba(X_ncnrtest)[:,0]
probas = pd.DataFrame([logProbs,kNNProbs]).transpose()
probas.columns = ['logProbs', 'kNNProbs']
probas['avgProbs'] = (logProbs + kNNProbs)/2
probas['prediction'] =  np.logical_not(probas['avgProbs'] > .5)
accuracy_score(probas['prediction'],y_ncnrtest)

0.86010276810873532

#### Logistic Regression with Random Forest

In [238]:
clfRF = ensemble.RandomForestClassifier()
grid = {"n_estimators": [50,60,70,80,90,100]}
gridSearch = GridSearchCV(clfRF, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_train,y_train)
clfRF2 = ensemble.RandomForestClassifier(n_estimators = gridSearch.best_params_['n_estimators'])
clfRF2.fit(X_train,y_train)
print((gridSearch.best_params_['n_estimators'],accuracy_score(clfRF2.predict(X_test),y_test)))

(80, 0.84319575667163926)


In [240]:
logProbs = clfncnr2.predict_proba(X_ncnrtest)[:,0]
RFProbs = clfRF2.predict_proba(X_test)[:,0]
probas = pd.DataFrame([logProbs,RFProbs]).transpose()
probas.columns = ['logProbs', 'RFProbs']
probas['avgProbs'] = (logProbs + RFProbs)/2
probas['prediction'] =  np.logical_not(probas['avgProbs'] > .5)
accuracy_score(probas['prediction'],y_ncnrtest)

0.8489971821647605

#### k-NN with Random Forest

In [243]:
kNNProbs = clfN2.predict_proba(X_ncnrtest)[:,0]
RFProbs = clfRF2.predict_proba(X_test)[:,0]
probas = pd.DataFrame([kNNProbs,RFProbs]).transpose()
probas.columns = ['kNNProbs', 'RFProbs']
probas['avgProbs'] = (kNNProbs + RFProbs)/2
probas['prediction'] =  np.logical_not(probas['avgProbs'] > .5)
accuracy_score(probas['prediction'],y_ncnrtest)

0.85413558760152497

#### Logisitic, k-NN, Random Forest, Gradient Boosting, AdaBoost 

In [260]:
clfGB = ensemble.GradientBoostingClassifier()
grid = {"min_impurity_split":[.0001,.0025,.005]}
gridSearch = GridSearchCV(clfGB, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nWCtrain,y_nWCtrain)
clfGB2 = ensemble.GradientBoostingClassifier(min_impurity_split =  gridSearch.best_params_['min_impurity_split'])
clfGB3 = calibration.CalibratedClassifierCV(clfGB2, method = 'isotonic', cv = 5)
clfGB3.fit(X_nWCtrain,y_nWCtrain)

CalibratedClassifierCV(base_estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=0.0001, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
            cv=5, method='isotonic')

In [261]:
clfAB = ensemble.AdaBoostClassifier()
grid = {"n_estimators":[250,260,270,280,290]}
gridSearch = GridSearchCV(clfAB, param_grid=grid, cv=5, scoring="accuracy")
gridSearch.fit(X_nWCtrain,y_nWCtrain)
clfAB2 = ensemble.AdaBoostClassifier(n_estimators =  gridSearch.best_params_['n_estimators'])
clfAB3 = calibration.CalibratedClassifierCV(clfAB2, method = 'isotonic', cv = 5)
clfAB3.fit(X_nWCtrain,y_nWCtrain)

CalibratedClassifierCV(base_estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=270, random_state=None),
            cv=5, method='isotonic')

In [264]:
gbProbs = clfGB3.predict_proba(X_nWCtest)[:,0]
adaProbs = clfAB3.predict_proba(X_nWCtest)[:,0]
logProbs = clfncnr2.predict_proba(X_ncnrtest)[:,0]
kNNProbs = clfN2.predict_proba(X_ncnrtest)[:,0]
RFProbs = clfRF2.predict_proba(X_test)[:,0]
probas = pd.DataFrame([gbProbs,adaProbs, logProbs, kNNProbs, RFProbs]).transpose()
probas.columns = ['gbProbs', 'adaProbs','logProbs','kNNProbs', 'RFProbs']
probas['avgProbs'] = probas.iloc[:,:].mean(axis = 1)
probas['prediction'] =  np.logical_not(probas['avgProbs'] > .5)
accuracy_score(probas['prediction'],y_nWCtest)

0.86242333830598372

In [285]:
total_results = probas
total_results['actual'] = y_nWCtest
total_results = total_results.dropna(axis = 0, how = 'any')
total_results[total_results['prediction'] != total_results['actual']].head()

Unnamed: 0,gbProbs,adaProbs,logProbs,kNNProbs,RFProbs,avgProbs,prediction,actual
11,0.522653,0.56514,0.411821,1.0,0.702143,0.640351,False,True
26,0.100343,0.153939,0.2334,0.1,0.0,0.117536,True,False
55,0.0,0.002953,0.035336,0.0,0.015625,0.010783,True,False
99,0.311834,0.328312,0.381655,0.3,0.627917,0.389943,True,False
127,0.471937,0.411748,0.377536,0.4,0.375,0.407244,True,False
