## Implementaion of Use Case using RF ScikitLearn. 

The use case is about predicting the income (binary class >50K/<=50k) based on census data. Sample Adult data i have taken from UCI machine Learning Repository.


In [None]:
# Import all the required libraries
import numpy as np
import pandas as pd
from sklearn import cross_validation, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_auc_score


In [203]:
# Reading the train data 
adultTrain = pd.read_table('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', sep=',',header=None)


In [205]:
# Atteibutes names
colNames = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation','relationship', 'race',
            'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country','wage_class']


In [207]:
# overwritnig the column names for train data and test data
adultTrain.columns = colNames


In [241]:
adultTrain.head()


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [8]:
adultTrain.shape


(32561, 15)

In [177]:
adultTrain.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [209]:
# Data summary
adultTrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education_num     32561 non-null int64
marital_status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital_gain      32561 non-null int64
capital_loss      32561 non-null int64
hours_per_week    32561 non-null int64
native_country    32561 non-null object
wage_class        32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [18]:
# Read the test data 
adultTest = pd.read_table('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',sep=',',skiprows=1,header=None)
adultTest.columns = colNames


In [19]:
adultTest.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [20]:
adultTest.shape

(16281, 15)

In [27]:
# Remove the unknown data points
adultTrainNoMiss = adultTrain.replace(' ?', np.nan).dropna()
adultTestNoMiss = adultTest.replace(' ?', np.nan).dropna()



In [223]:
adultTrainNoMiss.wage_class.unique()


array([' <=50K', ' >50K'], dtype=object)

In [31]:
adultTestNoMiss.wage_class.unique()

array([' <=50K.', ' >50K.'], dtype=object)

In [224]:
# replace the test data wage class values same as train data
adultTestNoMiss['wage_class'] = adultTestNoMiss.wage_class.replace({' <=50K.' : ' <=50K', ' >50K.' : ' >50K'})

In [225]:
# check if got replaced correctly
adultTestNoMiss.wage_class.unique()

array([' <=50K', ' >50K'], dtype=object)

In [188]:
# Ordinal Encoding to Categoricals
# Next step is coverting all the categorical variables into dummy variables in the form of numbers only. as in python every feature
# vectors require to be in digit only
# first we will combine both the data sets and then apply the transformation so that same categories present in both the data sets.
#combinedSet = pd.concat([adultTrainNoMiss, adultTestNoMiss], axis=0)
combinedSet = pd.concat([adultTrain, adultTest], axis=0)

In [226]:
# using pandas categorical codes to encode all the features those are not numerical
for feature in combinedSet.columns:
    if combinedSet[feature].dtype == 'object':
        combinedSet[feature] = pd.Categorical(combinedSet[feature]).codes  # replace the srings with codes
    

In [184]:
combinedSet.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0


In [48]:
adultTrainNoMiss[:2]

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [49]:
# need to split back this combined data into training and testing data sets
finalTrainData = combinedSet[:adultTrainNoMiss.shape[0]]  # get the data upto last training records
finalTestData = combinedSet[adultTrainNoMiss.shape[0]:]   # start from the end of training records

In [50]:
finalTrainData.shape

(30162, 15)

In [52]:
finalTestData.shape

(15060, 15)

In [55]:
# Seperate the target variable from train and test data
yTrain = finalTrainData.pop('wage_class')
yTest  = finalTestData.pop('wage_class')


In [70]:
xTrain = finalTrainData
xTest  = finalTestData

In [237]:
# instantiate the Randome Forest Estimator with defined paramters values, the parameters not defined will get default values.
# modelRF = RandomForestClassifier(n_estimators = 200, oob_score = True, n_jobs = -1,random_state =50, max_features = 12,
#                                  min_samples_leaf = 5)
modelRF = RandomForestClassifier(n_estimators = 200, min_samples_leaf = 5)

In [238]:
# fit the model with the training data
modelRF.fit(xTrain, yTrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [239]:
# predict the model for the xTrain and store in yPred
yPredTrain = modelRF.predict(xTrain)

# Classification Accuracy fo training data prediction vs actual
print(metrics.accuracy_score(yPredTrain, yTrain))


0.893044227836


In [230]:
# predict the model for the xTrain and store in yPred
yPredTest = modelRF.predict(xTest)
# Calssification Accuracy of testing data prediction vs actual
print(metrics.accuracy_score(yPredTest, yTest))

0.850863213811


### Tuning Parameters for Randome Forest  model Performance

- **1) n_estimators :**
- This is the number of trees you want to build before taking the maximum voting or averages of predictions.
- here not much for optimization, the more number trees we give, the better it will perform but for higher number of trees code performance get slower. Between 500-1000 would be sufficient.

- **2) max_features:  (default is n/3 for regression and squrt(n) for classification)**
-   These are the maximum number of features Random Forest is allowed to try in individual tree during node slit.
-   There is lot much tuning scope for this max_features and it may has larger impact on the behavior of Random Forest as this value decide the number of predictors will be used during the decision node identification.
-   Sqrt(n) is the default and it is good starting point but to find the actual sweet spot need to try with different values as it significantly impact on the tree behaviors.

- ** 3) min_sample_leaf : (defaulut =1 )**

-   The minimum number of samples in newly created leaves. Leaf is the end node of a decision tree. A smaller leaf makes the model more prone to capturing noise in train data. 






In [231]:
# we can start building Random forest model
param_grid = { "n_estimators"      : [50],             # I have just selected very less number of tree to run quickly, default=500
               "max_features"      : [3, 4, 5, 7],     # default is squrt(n) for classificatoin, n/3 for regression
               "min_samples_leaf"  : [40, 50]
                }



In [232]:
# GridSearch to find the optimial tuning RF parameters
optimizedRF = GridSearchCV(RandomForestClassifier(), param_grid, scoring = 'accuracy', cv = 5, n_jobs = -1)

In [233]:
optimizedRF.fit(xTrain, yTrain)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [50], 'max_features': [3, 4, 5, 7], 'min_samples_leaf': [40, 50]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [219]:
optimizedRF.grid_scores_

[mean: 0.85442, std: 0.00418, params: {'n_estimators': 50, 'max_features': 3, 'min_samples_leaf': 40},
 mean: 0.85329, std: 0.00566, params: {'n_estimators': 50, 'max_features': 3, 'min_samples_leaf': 50},
 mean: 0.85531, std: 0.00545, params: {'n_estimators': 50, 'max_features': 4, 'min_samples_leaf': 40},
 mean: 0.85459, std: 0.00471, params: {'n_estimators': 50, 'max_features': 4, 'min_samples_leaf': 50},
 mean: 0.85478, std: 0.00501, params: {'n_estimators': 50, 'max_features': 5, 'min_samples_leaf': 40},
 mean: 0.85508, std: 0.00433, params: {'n_estimators': 50, 'max_features': 5, 'min_samples_leaf': 50},
 mean: 0.85571, std: 0.00491, params: {'n_estimators': 50, 'max_features': 7, 'min_samples_leaf': 40},
 mean: 0.85372, std: 0.00422, params: {'n_estimators': 50, 'max_features': 7, 'min_samples_leaf': 50}]

In [234]:
# using the best model with 86% Training accuracy
finalRF = RandomForestClassifier(n_estimators = 500, max_features = 3,min_samples_leaf = 40, oob_score=True)
finalRF.fit(xTrain, yTrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=3, max_leaf_nodes=None,
            min_samples_leaf=40, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [235]:
# Analyzing performance on the test data
yPredTest = finalRF.predict(xTest)

In [236]:
# Calssification Accuracy of testing data prediction vs actual
print(metrics.accuracy_score(yPredTest, yTest))



0.854847277556


**So final accuracy is 86%, 14% error. there is probable we could imporve the accuracy bit more by tuning other hyperparameter.**