In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.loc[df['Income.Group']=='<=50K','Income.Group'] = 0
df.loc[df['Income.Group']=='>50K','Income.Group'] = 1
df['Income.Group'] = df['Income.Group'].astype(int)

In [4]:
df1 = pd.get_dummies(df)

In [8]:
df1.columns

Index([u'ID', u'Age', u'Hours.Per.Week', u'Income.Group',
       u'Workclass_Federal-gov', u'Workclass_Local-gov',
       u'Workclass_Never-worked', u'Workclass_Private',
       u'Workclass_Self-emp-inc', u'Workclass_Self-emp-not-inc',
       ...
       u'Native.Country_Portugal', u'Native.Country_Puerto-Rico',
       u'Native.Country_Scotland', u'Native.Country_South',
       u'Native.Country_Taiwan', u'Native.Country_Thailand',
       u'Native.Country_Trinadad&Tobago', u'Native.Country_United-States',
       u'Native.Country_Vietnam', u'Native.Country_Yugoslavia'],
      dtype='object', length=103)

In [6]:
#from sklearn.model_selection import train_test_split
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [9]:
features = list(set(df1.columns) - set(['ID','Income.Group']))

In [10]:
X = df1[features]
Y = df1['Income.Group']

In [11]:
trainX, testX, trainY, testY =  train_test_split(X, Y, test_size = .3, random_state = 166)

In [12]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

In [13]:
rf = RandomForestClassifier(class_weight='balanced',random_state = 1,n_estimators=10)
model = rf.fit(trainX, trainY)

In [14]:
preds = model.predict(testX)

In [15]:
accuracy = accuracy_score(testY, preds)
precision = precision_score(testY, preds)
recall = recall_score(testY, preds)
print accuracy,precision,recall

0.80608365019 0.624824684432 0.529726516052


In [16]:
gbm = GradientBoostingClassifier(random_state =1)

In [17]:
model = gbm.fit(trainX, trainY)
preds = model.predict(testX)

In [18]:
accuracy = accuracy_score(testY, preds)
precision = precision_score(testY, preds)
recall = recall_score(testY, preds)
print accuracy,precision,recall

0.83109096227 0.711307137129 0.527348394768


### Hyper parameter Tuning

In [19]:
from sklearn.grid_search import GridSearchCV

In [20]:
parameters = {
    'criterion': ['gini', 'entropy'],
    'max_features': [None, 'auto', 'sqrt', 'log2'],
    'max_depth': [None, 1, 2, 5, 10],
    'min_samples_split': [1.0, 2, 5, 10]
}

In [21]:
model = RandomForestClassifier(random_state=1)

In [22]:
clf = GridSearchCV(model, parameters, cv=5, scoring='recall', verbose=True)

In [23]:
clf.fit(trainX,trainY)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:   18.9s
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:   30.8s
[Parallel(n_jobs=1)]: Done 449 tasks       | elapsed:  1.3min
[Parallel(n_jobs=1)]: Done 799 tasks       | elapsed:  2.0min
[Parallel(n_jobs=1)]: Done 800 out of 800 | elapsed:  2.0min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_features': [None, 'auto', 'sqrt', 'log2'], 'min_samples_split': [1.0, 2, 5, 10], 'criterion': ['gini', 'entropy'], 'max_depth': [None, 1, 2, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring='recall', verbose=True)

In [24]:
print('Best score for data1:', clf.best_score_) 

('Best score for data1:', 0.57555189961777298)


In [25]:
clf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)