# Random Forest example

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.precision',3)

## Problem: Adult Data set

In [2]:
# for Jupyter-book, we copy data from GitHub, locally, to save Internet traffic,
# you can specify the data/ folder from the root of your cloned
# https://github.com/Yorko/mlcourse.ai repo, to save Internet traffic
DATA_PATH = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/main/data/"

In [3]:
df = pd.read_csv(DATA_PATH + "adult_train.csv", sep=";")
df.tail()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education_Num,Martial_Status,Occupation,Relationship,Race,Sex,Capital_Gain,Capital_Loss,Hours_per_week,Country,Target
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [4]:
df.drop(columns=['Workclass', 'Occupation', 'Country'],inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Age             32561 non-null  int64 
 1   fnlwgt          32561 non-null  int64 
 2   Education       32561 non-null  object
 3   Education_Num   32561 non-null  int64 
 4   Martial_Status  32561 non-null  object
 5   Relationship    32561 non-null  object
 6   Race            32561 non-null  object
 7   Sex             32561 non-null  object
 8   Capital_Gain    32561 non-null  int64 
 9   Capital_Loss    32561 non-null  int64 
 10  Hours_per_week  32561 non-null  int64 
 11  Target          32561 non-null  object
dtypes: int64(6), object(6)
memory usage: 3.0+ MB


In [40]:
df['TargetNew'] = df['Target'].map({' <=50K':0, ' >50K':1 })
df

Unnamed: 0,Age,fnlwgt,Education,Education_Num,Martial_Status,Relationship,Race,Sex,Capital_Gain,Capital_Loss,Hours_per_week,Target,TargetNew
0,39,77516,Bachelors,13,Never-married,Not-in-family,White,Male,2174,0,40,<=50K,0
1,50,83311,Bachelors,13,Married-civ-spouse,Husband,White,Male,0,0,13,<=50K,0
2,38,215646,HS-grad,9,Divorced,Not-in-family,White,Male,0,0,40,<=50K,0
3,53,234721,11th,7,Married-civ-spouse,Husband,Black,Male,0,0,40,<=50K,0
4,28,338409,Bachelors,13,Married-civ-spouse,Wife,Black,Female,0,0,40,<=50K,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,Assoc-acdm,12,Married-civ-spouse,Wife,White,Female,0,0,38,<=50K,0
32557,40,154374,HS-grad,9,Married-civ-spouse,Husband,White,Male,0,0,40,>50K,1
32558,58,151910,HS-grad,9,Widowed,Unmarried,White,Female,0,0,40,<=50K,0
32559,22,201490,HS-grad,9,Never-married,Own-child,White,Male,0,0,20,<=50K,0


In [5]:
df['Target'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [8]:
import sklearn.tree as tree
from sklearn.ensemble import RandomForestClassifier

featureNames = ['Age', 'Education_Num','Hours_per_week']

x = np.array(df[featureNames])
y = np.array(df['TargetNew'])


In [9]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)



In [10]:
forest = RandomForestClassifier()
forest.fit(x_train,y_train)

In [11]:
y_train_predict = forest.predict(x_train)

In [12]:
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [13]:
confusion_matrix(y_train,y_train_predict)

array([[18721,  1053],
       [ 2885,  3389]], dtype=int64)

In [14]:
accuracy_score(y_train,y_train_predict)

0.8488175675675675

In [15]:
# validation data
y_test_predict = forest.predict(x_test)
accuracy_score(y_test,y_test_predict)


0.7710732381391064

In [29]:
RandomForestClassifier??

[1;31mInit signature:[0m
[0mRandomForestClassifier[0m[1;33m([0m[1;33m
[0m    [0mn_estimators[0m[1;33m=[0m[1;36m100[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mcriterion[0m[1;33m=[0m[1;34m'gini'[0m[1;33m,[0m[1;33m
[0m    [0mmax_depth[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_split[0m[1;33m=[0m[1;36m2[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_leaf[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0mmin_weight_fraction_leaf[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mmax_features[0m[1;33m=[0m[1;34m'sqrt'[0m[1;33m,[0m[1;33m
[0m    [0mmax_leaf_nodes[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_impurity_decrease[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mbootstrap[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0moob_score[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mn_jobs[0m[1;33m=[0m[1;32mN

# Parameter tuning with Sklearn

In [31]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators':[5,10,50,100,200],
    'criterion' : ["gini", "entropy", "log_loss"],
    # 'min_samples_split':min_samples_split, 
    # 'min_samples_leaf':min_samples_leaf 
}

rf = RandomForestClassifier()
gs = GridSearchCV(estimator=rf, param_grid=param_grid)
result = gs.fit(x_train,y_train)

In [32]:
#see results details
gs.cv_results_

{'mean_fit_time': array([0.07633266, 0.15182424, 0.78528633, 1.47329297, 2.9786952 ,
        0.07560058, 0.15418944, 0.77341084, 1.59628253, 3.47864227,
        0.1115344 , 0.18841457, 0.89413562, 1.80858274, 3.37278485]),
 'std_fit_time': array([0.0042286 , 0.01407287, 0.06836372, 0.07270395, 0.19032673,
        0.00932301, 0.0166088 , 0.03176207, 0.07752819, 0.51035843,
        0.01832211, 0.01035224, 0.06624824, 0.14162974, 0.1739003 ]),
 'mean_score_time': array([0.00519872, 0.01100416, 0.04704418, 0.08293176, 0.17543483,
        0.00590277, 0.00903978, 0.04332089, 0.09937906, 0.17553225,
        0.00809422, 0.0153193 , 0.05018439, 0.10437598, 0.18401799]),
 'std_score_time': array([0.00074527, 0.00167398, 0.0063608 , 0.00593118, 0.0102283 ,
        0.00111169, 0.00109887, 0.00201897, 0.02375991, 0.02448022,
        0.00344755, 0.0023882 , 0.01059645, 0.02162398, 0.03151856]),
 'param_criterion': masked_array(data=['gini', 'gini', 'gini', 'gini', 'gini', 'entropy',
                

In [33]:
gs.best_estimator_

In [34]:
y_train_predict = gs.predict(x_train)
accuracy_score(y_train,y_train_predict)

0.8488175675675675

In [35]:
y_test_predict = gs.predict(x_test)
accuracy_score(y_test,y_test_predict)

0.7730692461231383

In [39]:
for i, j in enumerate(gs.cv_results_['rank_test_score']):
    print(i,j, gs.cv_results_['params'][i])                       

0 13 {'criterion': 'gini', 'n_estimators': 5}
1 9 {'criterion': 'gini', 'n_estimators': 10}
2 5 {'criterion': 'gini', 'n_estimators': 50}
3 1 {'criterion': 'gini', 'n_estimators': 100}
4 3 {'criterion': 'gini', 'n_estimators': 200}
5 15 {'criterion': 'entropy', 'n_estimators': 5}
6 12 {'criterion': 'entropy', 'n_estimators': 10}
7 11 {'criterion': 'entropy', 'n_estimators': 50}
8 2 {'criterion': 'entropy', 'n_estimators': 100}
9 8 {'criterion': 'entropy', 'n_estimators': 200}
10 14 {'criterion': 'log_loss', 'n_estimators': 5}
11 10 {'criterion': 'log_loss', 'n_estimators': 10}
12 4 {'criterion': 'log_loss', 'n_estimators': 50}
13 7 {'criterion': 'log_loss', 'n_estimators': 100}
14 6 {'criterion': 'log_loss', 'n_estimators': 200}


# Question: How do we tell the gridSearch method to check for the validation data ??