## Random_forest_classifier with GridSearchCV

In [1]:
# read the dataset
import pandas as pd 
dataset = pd.read_csv("Social_Network_Ads.csv")
dataset.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [2]:
# convert categorical into numerical 
dataset = pd.get_dummies(dataset, drop_first=True, dtype=int)
dataset = dataset.drop(["User ID"],axis=1)
dataset.head()

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,1
1,35,20000,0,1
2,26,43000,0,0
3,27,57000,0,0
4,19,76000,0,1


In [3]:
# split input and output 
independent = dataset[[ "Age",	"EstimatedSalary", "Gender_Male"]]
dependent = dataset[[ "Purchased" ]]

In [7]:
# Check dataset is balanced or Imbalanced
dataset["Purchased"].value_counts()    # imbalanced 

Purchased
0    257
1    143
Name: count, dtype: int64

In [4]:
# split train and test
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(independent,dependent, test_size=0.20,random_state=0)
print(x_train,x_test,y_train,y_test )

     Age  EstimatedSalary  Gender_Male
336   58           144000            1
64    59            83000            0
55    24            55000            0
106   26            35000            0
300   58            38000            0
..   ...              ...          ...
323   48            30000            0
192   29            43000            1
117   36            52000            1
47    27            54000            0
172   26           118000            0

[320 rows x 3 columns]      Age  EstimatedSalary  Gender_Male
132   30            87000            1
309   38            50000            0
341   35            75000            1
196   30            79000            0
246   35            50000            0
..   ...              ...          ...
14    18            82000            1
363   42            79000            0
304   40            60000            0
361   53            34000            0
329   47           107000            0

[80 rows x 3 columns]      Purchased
33

In [9]:
y_train = y_train.values.ravel()

In [10]:
# model creation + grid cv
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier 

param_grid = {
"criterion": ["gini", "entropy", "log_loss"],
"max_features": ["sqrt", "log2", None],
"class_weight": ["balanced", "balanced_subsample"]
}
model = RandomForestClassifier()

grid_model = GridSearchCV(model, param_grid, refit=True, verbose=3  , n_jobs =-1, scoring='f1_weighted') 
grid_model.fit(x_train,y_train)


Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [11]:
# model prediction + grid cv
grid_prediction = grid_model.predict(x_test)
grid_prediction

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1])

In [12]:
# model evaluation --- Performance on the unseen TEST dataset
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test,grid_prediction)
clf_report = classification_report(y_test, grid_prediction)
print(cm)
print(clf_report)

[[54  4]
 [ 2 20]]
              precision    recall  f1-score   support

           0       0.96      0.93      0.95        58
           1       0.83      0.91      0.87        22

    accuracy                           0.93        80
   macro avg       0.90      0.92      0.91        80
weighted avg       0.93      0.93      0.93        80



## ROC AUC 

In [13]:
print(grid_model.predict_proba(x_test)[:,1])

[0.1  0.01 0.   0.08 0.01 0.03 0.   0.95 0.01 0.78 0.01 0.   0.06 0.02
 0.01 0.9  0.02 0.   0.93 0.   0.02 0.97 0.02 0.98 0.   0.93 0.1  0.
 0.01 0.03 0.03 0.02 0.87 0.06 0.   0.   0.   0.   0.   0.99 0.06 0.
 0.04 0.03 0.37 0.01 0.16 0.61 0.   0.79 0.96 0.01 0.   0.83 0.53 0.85
 0.   0.01 0.97 0.03 0.01 0.92 0.   0.97 0.01 0.92 0.1  0.   0.   0.2
 1.   0.31 0.   0.8  0.06 0.   0.64 0.   1.   0.86]


In [14]:
from sklearn.metrics import roc_auc_score     # key metric for binary classification 
roc_auc_score = roc_auc_score(y_test,grid_model.predict_proba(x_test)[:,1] )
roc_auc_score

np.float64(0.9670846394984326)

## f1 score 

In [15]:
from sklearn.metrics import f1_score
f1_score = f1_score(y_test,grid_prediction, average='weighted' )
f1_score

0.9259725400457665

## GridSearchCV results 

In [16]:
grid_results = grid_model.cv_results_
print(grid_results)

{'mean_fit_time': array([0.77919483, 0.61081758, 0.78915806, 0.51387787, 0.63935366,
       0.53621302, 0.51772337, 0.49996901, 0.52110066, 0.6657835 ,
       0.65691342, 0.66399922, 0.6932344 , 0.6721036 , 0.6864953 ,
       0.66912665, 0.67267685, 0.6945231 ]), 'std_fit_time': array([0.11780362, 0.12093186, 0.16623303, 0.02244916, 0.13587361,
       0.01369136, 0.0204689 , 0.00719486, 0.01496253, 0.01513931,
       0.01309684, 0.01066998, 0.01518473, 0.00592108, 0.01232786,
       0.0165996 , 0.01340971, 0.02921627]), 'mean_score_time': array([0.06466899, 0.05600953, 0.07434359, 0.03930478, 0.04379821,
       0.03738894, 0.03650804, 0.03589945, 0.03407655, 0.03397231,
       0.03290138, 0.03254147, 0.03308835, 0.03315144, 0.03373675,
       0.03684931, 0.03314118, 0.03871989]), 'std_score_time': array([0.02276089, 0.03340419, 0.02597184, 0.00658171, 0.01193713,
       0.00512753, 0.00297525, 0.00288437, 0.00268958, 0.00325256,
       0.00161872, 0.00061842, 0.00047403, 0.00089652, 0.

In [17]:
# table
table = pd.DataFrame.from_dict(grid_results)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_features,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.779195,0.117804,0.064669,0.022761,balanced,gini,sqrt,"{'class_weight': 'balanced', 'criterion': 'gin...",0.890987,0.842259,0.891711,0.907389,0.921526,0.890774,0.026745,8
1,0.610818,0.120932,0.05601,0.033404,balanced,gini,log2,"{'class_weight': 'balanced', 'criterion': 'gin...",0.890987,0.858747,0.891711,0.922651,0.905355,0.89389,0.021009,3
2,0.789158,0.166233,0.074344,0.025972,balanced,gini,,"{'class_weight': 'balanced', 'criterion': 'gin...",0.875759,0.825502,0.876518,0.861321,0.920683,0.871957,0.030596,17
3,0.513878,0.022449,0.039305,0.006582,balanced,entropy,sqrt,"{'class_weight': 'balanced', 'criterion': 'ent...",0.875759,0.842259,0.891711,0.907389,0.921526,0.887729,0.027406,11
4,0.639354,0.135874,0.043798,0.011937,balanced,entropy,log2,"{'class_weight': 'balanced', 'criterion': 'ent...",0.875,0.858747,0.891711,0.907389,0.921526,0.890875,0.022348,5
5,0.536213,0.013691,0.037389,0.005128,balanced,entropy,,"{'class_weight': 'balanced', 'criterion': 'ent...",0.875759,0.858747,0.876518,0.906923,0.905355,0.884661,0.01866,12
6,0.517723,0.020469,0.036508,0.002975,balanced,log_loss,sqrt,"{'class_weight': 'balanced', 'criterion': 'log...",0.890987,0.875,0.891711,0.907389,0.888956,0.890809,0.010287,7
7,0.499969,0.007195,0.035899,0.002884,balanced,log_loss,log2,"{'class_weight': 'balanced', 'criterion': 'log...",0.875,0.842259,0.891711,0.907389,0.953307,0.893933,0.036715,2
8,0.521101,0.014963,0.034077,0.00269,balanced,log_loss,,"{'class_weight': 'balanced', 'criterion': 'log...",0.875759,0.825502,0.876518,0.906923,0.888956,0.874732,0.027079,16
9,0.665784,0.015139,0.033972,0.003253,balanced_subsample,gini,sqrt,"{'class_weight': 'balanced_subsample', 'criter...",0.890987,0.842259,0.876518,0.907389,0.921526,0.887736,0.027322,10


In [18]:
grid_model.best_params_

{'class_weight': 'balanced_subsample',
 'criterion': 'log_loss',
 'max_features': 'sqrt'}

In [19]:
grid_model.best_estimator_

In [20]:
""" Mean cross-validation score on the TRAINING data """
grid_model.best_score_

np.float64(0.8940884439535285)

## model prediction with real time data

In [30]:
## user input 
Gender=input("Enter your gender (0-female/1-male): ")
Age	= int(input("Enter your age: "))
EstimatedSalary = int(input("Enter your estimated salary: "))


""" test input data's
Male	19	19000	- 0
Female	46	41000	- 1
Female	50	20000	- 1
Male	36	33000	- 0
"""

Enter your gender (0-female/1-male):  1
Enter your age:  36
Enter your estimated salary:  33000


" test input data's\nMale\t19\t19000\t- 0\nFemale\t46\t41000\t- 1\nFemale\t50\t20000\t- 1\nMale\t36\t33000\t- 0\n"

In [32]:
## user input predction 
user_input_prediction = grid_model.predict([[Gender,Age,EstimatedSalary ]])
if user_input_prediction==1:
    print("user will purchase")
else:
    print("user will not purchase")

print(user_input_prediction)

user will purchase
[1]
