In [1]:
# Import basic library every time in ml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
#To load dataset bank.csv
df=pd.read_csv("telecom_churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# to check how many rows and columns in our dataset
df.shape

(7043, 21)

In [4]:
# to check null values in dataset
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [5]:
#To check datatype
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [6]:
df["TotalCharges"].value_counts()

          11
20.2      11
19.75      9
20.05      8
19.9       8
          ..
6849.4     1
692.35     1
130.15     1
3211.9     1
6844.5     1
Name: TotalCharges, Length: 6531, dtype: int64

In [7]:
# since there is unwanted column in total charges we are replacing it will null
df["TotalCharges"].replace(" ",np.nan,inplace=True)

In [8]:
df.isnull().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [9]:
# change datatype of tatalcharges from object to float
df["TotalCharges"]=df["TotalCharges"].astype("float")

In [10]:
# find the mean of TotalCharges
m=df["TotalCharges"].mean()
# fill null values of TotalCharges from mean of totalcharges
df["TotalCharges"].fillna(m,inplace=True)
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [11]:
# to remove unwanted columns customerId permanently
df.drop("customerID",axis=1,inplace=True)

In [12]:
# sepearte all object type data and hold it in df_cat
df_cat = df.select_dtypes("object")
#separate all numeric datatype type data and hold in df_num
df_num=df.select_dtypes(["float64","int64"])

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
# to convert object type data into number using LabelEncoder
for col in df_cat:
    lr=LabelEncoder()
    df_cat[col]=lr.fit_transform(df_cat[[col]])

In [15]:
df_cat

Unnamed: 0,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
0,0,1,0,0,1,0,0,2,0,0,0,0,0,1,2,0
1,1,0,0,1,0,0,2,0,2,0,0,0,1,0,3,0
2,1,0,0,1,0,0,2,2,0,0,0,0,0,1,3,1
3,1,0,0,0,1,0,2,0,2,2,0,0,1,0,0,0
4,0,0,0,1,0,1,0,0,0,0,0,0,0,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,1,1,1,2,0,2,0,2,2,2,2,1,1,3,0
7039,0,1,1,1,2,1,0,2,2,0,2,2,1,1,1,0
7040,0,1,1,0,1,0,2,0,0,0,0,0,0,1,2,0
7041,1,1,0,1,2,1,0,0,0,0,0,0,0,1,3,1


In [16]:
# to concatenate df_num and df_cat and hold in new data df_new
df_new=pd.concat([df_num,df_cat],axis=1)
df_new.head()


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
0,0,1,29.85,29.85,0,1,0,0,1,0,0,2,0,0,0,0,0,1,2,0
1,0,34,56.95,1889.5,1,0,0,1,0,0,2,0,2,0,0,0,1,0,3,0
2,0,2,53.85,108.15,1,0,0,1,0,0,2,2,0,0,0,0,0,1,3,1
3,0,45,42.3,1840.75,1,0,0,0,1,0,2,0,2,2,0,0,1,0,0,0
4,0,2,70.7,151.65,0,0,0,1,0,1,0,0,0,0,0,0,0,1,2,1


In [17]:
df_new["Churn"].value_counts()

0    5174
1    1869
Name: Churn, dtype: int64

In [18]:
# select Input and output from dataset df_new
X=df_new.drop("Churn",axis=1)
Y=df_new["Churn"]

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Create object of train test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=1)

In [21]:
# Standardization
from sklearn.preprocessing import StandardScaler
#fit_transform on training data X_train and 
#transform on testing_data means X_test :
#create object of StandardScaler class
ss=StandardScaler()
X_train=ss.fit_transform(X_train)
X_test=ss.transform(X_test)

In [22]:
#Dataset is a classification dataset , we predict customer will churn or not
#First Baseline model : 
#create a function 
def create_model(model):
    model.fit(X_train,Y_train)#train the model
    Y_pred=model.predict(X_test) #test
    print(classification_report(Y_test,Y_pred))
    print("Confusion Matrix : ")
    #Confusion matrix
    print(confusion_matrix(Y_test,Y_pred))
    return model

In [23]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [24]:
#Base Line Model means use Logistic Regression(we predict yes/no values then 
#use classification algorithm)
from sklearn.linear_model import LogisticRegression

In [25]:
# Create the object of Logistic Regression
lr=LogisticRegression()

In [26]:
#call function for train and test model
lr=create_model(lr)

              precision    recall  f1-score   support

           0       0.86      0.88      0.87      1585
           1       0.63      0.59      0.61       528

    accuracy                           0.81      2113
   macro avg       0.75      0.73      0.74      2113
weighted avg       0.81      0.81      0.81      2113

Confusion Matrix : 
[[1401  184]
 [ 219  309]]


In [27]:
#if data is imbalance then use sampling Technique
#We handle Imbalance data ,We take some Samling Technique.
''' There are 2 types of Sampling Technique : -
1. Random Over Sampling TEchnique  2.Random Under Sampling TEchnique'''

' There are 2 types of Sampling Technique : -\n1. Random Over Sampling TEchnique  2.Random Under Sampling TEchnique'

In [28]:
#If you are basically reducing the majority class 
#that means removing record randomly from majority class that is known as 
#Random Undersampling ,
#and if you are increasing the minority class means creating a duplicate 
#row randomly that is known as Random over sampling

In [29]:
#install package imblearn only one time
#!pip install imblearn

In [30]:
# random over sampling
from imblearn.over_sampling import RandomOverSampler

In [31]:
#RandomOversampler
#create object of  RandomOverSampler() class
ros = RandomOverSampler(random_state=1)

In [32]:
X_train_ros,Y_train_ros=ros.fit_resample(X_train,Y_train)
#fit_resample() inbuilt method of RandomOverSampler class

In [33]:
pd.Series(Y_train).value_counts() # check record before balance

0    3589
1    1341
Name: Churn, dtype: int64

In [34]:
pd.Series(Y_train_ros).value_counts() # check record after balance

0    3589
1    3589
Name: Churn, dtype: int64

In [35]:
# apply randomoversampling on 30% testing data
X_test_ros,Y_test_ros=ros.fit_resample(X_test,Y_test)
#fit_resample() inbuilt method of RandomOverSampler class

In [36]:
pd.Series(Y_test).value_counts() # check test record before balance

0    1585
1     528
Name: Churn, dtype: int64

In [37]:
pd.Series(Y_test_ros).value_counts() # check test record after balance

0    1585
1    1585
Name: Churn, dtype: int64

In [38]:
#Dataset is a classification dataset , we predict customer will churn or not
#First Baseline model : 
#create a function 
def create_model1(model):
    model.fit(X_train_ros,Y_train_ros)#train the model
    Y_pred=model.predict(X_test_ros) #test
    print(classification_report(Y_test_ros,Y_pred))
    print("Confusion Matrix : ")
    #Confusion matrix
    print(confusion_matrix(Y_test_ros,Y_pred))
    return model

In [39]:
# base line model : Logistic Regression
# create a model of Logistic Regression class
lr=LogisticRegression()

In [40]:
# call function
lr=create_model1(lr)

              precision    recall  f1-score   support

           0       0.80      0.73      0.77      1585
           1       0.75      0.82      0.79      1585

    accuracy                           0.78      3170
   macro avg       0.78      0.78      0.78      3170
weighted avg       0.78      0.78      0.78      3170

Confusion Matrix : 
[[1162  423]
 [ 284 1301]]


observation : the accuracy of linear regression algorithm is pretty good but lets have a look on other algorithm accuracy score

# Decision tree classifier

In [41]:
from sklearn.tree import DecisionTreeClassifier

In [42]:
# create the object of DecisionTreeClassifier class
dt=DecisionTreeClassifier(random_state=1)

In [43]:
# call function
dt=create_model1(dt)

              precision    recall  f1-score   support

           0       0.63      0.81      0.71      1585
           1       0.74      0.53      0.62      1585

    accuracy                           0.67      3170
   macro avg       0.69      0.67      0.67      3170
weighted avg       0.69      0.67      0.67      3170

Confusion Matrix : 
[[1289  296]
 [ 741  844]]


In [44]:
# checking the importannt features
IG=dt.feature_importances_

In [45]:
col=X.columns

In [46]:
dic={"columns":col,"information_gain":IG}
df=pd.DataFrame(dic)
df.sort_values("information_gain",ascending=False)

Unnamed: 0,columns,information_gain
16,Contract,0.229891
2,MonthlyCharges,0.209369
3,TotalCharges,0.179011
1,tenure,0.111326
10,OnlineSecurity,0.043394
18,PaymentMethod,0.043262
4,gender,0.021186
0,SeniorCitizen,0.021176
17,PaperlessBilling,0.020682
11,OnlineBackup,0.019066


In [47]:
from sklearn import tree
col=X.columns
size=plt.figure(figsize=(20,20))
#_=tree.plot_tree(dt,feature_names=col,filled=1)

<Figure size 1440x1440 with 0 Axes>

observation : the accuracy score is not good as compared logistic regression
so lets apply prurning technique

# prurning with max_depth and gini index

In [48]:
#apply purning tech to reduce overfitting
# max_depth=3
#create the object of decision tree classifier with respect to max depth
dt1=DecisionTreeClassifier(max_depth=2,random_state=1)

In [49]:
# call function
dt1=create_model1(dt1)

              precision    recall  f1-score   support

           0       0.76      0.75      0.76      1585
           1       0.76      0.77      0.76      1585

    accuracy                           0.76      3170
   macro avg       0.76      0.76      0.76      3170
weighted avg       0.76      0.76      0.76      3170

Confusion Matrix : 
[[1194  391]
 [ 368 1217]]


In [50]:
IG=dt.feature_importances_

In [51]:
col=X.columns

In [52]:
dic={"columns":col,"information_gain":IG}
df=pd.DataFrame(dic)
df.sort_values("information_gain",ascending=False)

Unnamed: 0,columns,information_gain
16,Contract,0.229891
2,MonthlyCharges,0.209369
3,TotalCharges,0.179011
1,tenure,0.111326
10,OnlineSecurity,0.043394
18,PaymentMethod,0.043262
4,gender,0.021186
0,SeniorCitizen,0.021176
17,PaperlessBilling,0.020682
11,OnlineBackup,0.019066


observation : accuracy is good but not as good as linear regression so we will appy next technique i.e min samplles leaf

# prurning with min_samples_leaf and gini index

In [53]:
# create the object of decision tree classifier with respect to min samples leaf
dt2=DecisionTreeClassifier(min_samples_leaf=15,random_state=1)

In [54]:
#call function
dt2=create_model(dt2)

              precision    recall  f1-score   support

           0       0.85      0.88      0.86      1585
           1       0.59      0.54      0.57       528

    accuracy                           0.79      2113
   macro avg       0.72      0.71      0.72      2113
weighted avg       0.79      0.79      0.79      2113

Confusion Matrix : 
[[1389  196]
 [ 241  287]]


In [55]:
IG=dt.feature_importances_

In [56]:
col=X.columns

In [57]:
dic={"columns":col,"information_gain":IG}
df=pd.DataFrame(dic)
df.sort_values("information_gain",ascending=False)

Unnamed: 0,columns,information_gain
16,Contract,0.229891
2,MonthlyCharges,0.209369
3,TotalCharges,0.179011
1,tenure,0.111326
10,OnlineSecurity,0.043394
18,PaymentMethod,0.043262
4,gender,0.021186
0,SeniorCitizen,0.021176
17,PaperlessBilling,0.020682
11,OnlineBackup,0.019066


In [58]:
# in gini index plurning technique max depth method is more best then min_sample_leaf

# plurning with min_samples_leaf and entrophy

In [59]:
# use entrophy method of decision yree
# means -p(yes)*log2p(Yes)-p(No)*Lof2(p(no))
#Here log2 means base of log=2
#create object of decision Tree classifier class
dt3=DecisionTreeClassifier(criterion="entropy",min_samples_leaf=50)
# by default criterion "gini" means 1-p(yes)^2-Q(no)^2

In [60]:
#call functions
dt3=create_model1(dt3)

              precision    recall  f1-score   support

           0       0.77      0.76      0.77      1585
           1       0.76      0.77      0.77      1585

    accuracy                           0.77      3170
   macro avg       0.77      0.77      0.77      3170
weighted avg       0.77      0.77      0.77      3170

Confusion Matrix : 
[[1211  374]
 [ 369 1216]]


In [61]:
IG=dt.feature_importances_

In [62]:
col=X.columns

In [63]:
dic={"columns":col,"information_gain":IG}
df=pd.DataFrame(dic)
df.sort_values("information_gain",ascending=False)

Unnamed: 0,columns,information_gain
16,Contract,0.229891
2,MonthlyCharges,0.209369
3,TotalCharges,0.179011
1,tenure,0.111326
10,OnlineSecurity,0.043394
18,PaymentMethod,0.043262
4,gender,0.021186
0,SeniorCitizen,0.021176
17,PaperlessBilling,0.020682
11,OnlineBackup,0.019066


# plurning with Max_depth and entrophy

In [64]:
# use entrophy method of decision yree
# means -p(yes)*log2p(Yes)-p(No)*Lof2(p(no))
#Here log2 means base of log=2
#create object of decision Tree classifier class
dt4=DecisionTreeClassifier(criterion="entropy",max_depth=2)
# by default criterion "gini" means 1-p(yes)^2-Q(no)^2

In [65]:
#call functions
dt4=create_model1(dt4)

              precision    recall  f1-score   support

           0       0.76      0.75      0.76      1585
           1       0.76      0.77      0.76      1585

    accuracy                           0.76      3170
   macro avg       0.76      0.76      0.76      3170
weighted avg       0.76      0.76      0.76      3170

Confusion Matrix : 
[[1194  391]
 [ 368 1217]]


In [66]:
IG=dt.feature_importances_

In [67]:
col=X.columns

In [68]:
dic={"columns":col,"information_gain":IG}
df=pd.DataFrame(dic)
df.sort_values("information_gain",ascending=False)

Unnamed: 0,columns,information_gain
16,Contract,0.229891
2,MonthlyCharges,0.209369
3,TotalCharges,0.179011
1,tenure,0.111326
10,OnlineSecurity,0.043394
18,PaymentMethod,0.043262
4,gender,0.021186
0,SeniorCitizen,0.021176
17,PaperlessBilling,0.020682
11,OnlineBackup,0.019066


In [69]:
# science the accuracy is not good as expected so we will give this dataset to randomforestclassifier

# Random forest classifier

In [70]:
from sklearn.ensemble import RandomForestClassifier

In [71]:
# create the object of AdaBoostClassifier
rfc=RandomForestClassifier(n_estimators=50,max_features=8,random_state=1)

In [72]:
# call the function
rfc=create_model1(rfc)

              precision    recall  f1-score   support

           0       0.69      0.85      0.76      1585
           1       0.80      0.62      0.70      1585

    accuracy                           0.73      3170
   macro avg       0.75      0.73      0.73      3170
weighted avg       0.75      0.73      0.73      3170

Confusion Matrix : 
[[1345  240]
 [ 608  977]]


In [73]:
# still the accuracy is as same as decision tree so now we move further and apply boosting technique

# applying boosting technique 

In [74]:
# create the object of Adaboost classifier
from sklearn.ensemble import AdaBoostClassifier
# create the object of ada boost classifier
abc=AdaBoostClassifier(n_estimators=20)

In [75]:
# call function
dbc=create_model1(abc)

              precision    recall  f1-score   support

           0       0.80      0.73      0.76      1585
           1       0.75      0.82      0.78      1585

    accuracy                           0.77      3170
   macro avg       0.78      0.77      0.77      3170
weighted avg       0.78      0.77      0.77      3170

Confusion Matrix : 
[[1157  428]
 [ 291 1294]]


In [76]:
# applying adaboost classifier is no good so we will apply gradientboosting classifier

In [77]:
from sklearn.ensemble import GradientBoostingClassifier

In [78]:
# create the object of gradient boosting classifier as gbc
gbc=GradientBoostingClassifier(n_estimators=40,random_state=1)

In [79]:
#call function
gbc=create_model1(gbc)

              precision    recall  f1-score   support

           0       0.80      0.73      0.77      1585
           1       0.75      0.82      0.79      1585

    accuracy                           0.78      3170
   macro avg       0.78      0.78      0.78      3170
weighted avg       0.78      0.78      0.78      3170

Confusion Matrix : 
[[1159  426]
 [ 282 1303]]


In [80]:
# applying extreme gradient boosting classifier
from xgboost import XGBClassifier

In [81]:
# create the object of GradientBoostClassifier class
xgb=XGBClassifier(n_estimators=15,reg_alpha=1,random_state=1)

In [82]:
# call function
xgb=create_model1(xgb)

              precision    recall  f1-score   support

           0       0.78      0.76      0.77      1585
           1       0.77      0.79      0.78      1585

    accuracy                           0.78      3170
   macro avg       0.78      0.78      0.78      3170
weighted avg       0.78      0.78      0.78      3170

Confusion Matrix : 
[[1209  376]
 [ 332 1253]]


observations : till now xgboost has perfomed very well will accuracy of 79 and recall of 73/84

# K-NN algorithm

In [83]:
from sklearn.neighbors import KNeighborsClassifier

In [84]:
# create the object of KNeighborsClassifier as knn with respect to max_depth
knn=KNeighborsClassifier(n_neighbors=5,metric="minkowski",p=2)

In [85]:
# call user defined function
knn=create_model1(knn)

              precision    recall  f1-score   support

           0       0.73      0.66      0.69      1585
           1       0.69      0.76      0.72      1585

    accuracy                           0.71      3170
   macro avg       0.71      0.71      0.71      3170
weighted avg       0.71      0.71      0.71      3170

Confusion Matrix : 
[[1045  540]
 [ 384 1201]]


observation : knn is not giving good accuracy so further moving on with the 
next algorithm i.e support vector machine
    

# support vector machine

In [86]:
from sklearn.svm import LinearSVC

In [87]:
# create the object of support vecyot machine as svm
svm=LinearSVC(random_state=1)

In [88]:
# call user defined function
svm=create_model1(svm)

              precision    recall  f1-score   support

           0       0.82      0.72      0.77      1585
           1       0.75      0.84      0.79      1585

    accuracy                           0.78      3170
   macro avg       0.78      0.78      0.78      3170
weighted avg       0.78      0.78      0.78      3170

Confusion Matrix : 
[[1143  442]
 [ 259 1326]]


In [89]:
#here recall is good but we can more better
#means can be possible ,model is overfit 
#Soft margin means to reduced overfitting situation means some error add on
#training time 
#create object of LinearSVC class
svc1=LinearSVC(random_state=1,C=0.5)

In [90]:
# call user defined function
svc1=create_model1(svc1)

              precision    recall  f1-score   support

           0       0.82      0.72      0.77      1585
           1       0.75      0.84      0.79      1585

    accuracy                           0.78      3170
   macro avg       0.78      0.78      0.78      3170
weighted avg       0.78      0.78      0.78      3170

Confusion Matrix : 
[[1143  442]
 [ 259 1326]]


In [91]:
# still there is no changes because there  might be chances that our data is not linear so we will give to kernal function in svm

In [92]:
from sklearn.svm import SVC

In [93]:
#create object of LinearSVC class
svc2=SVC(random_state=1,kernel="poly")

In [94]:
# call user defined function
svc2=create_model1(svc2)

              precision    recall  f1-score   support

           0       0.78      0.74      0.76      1585
           1       0.75      0.79      0.77      1585

    accuracy                           0.77      3170
   macro avg       0.77      0.77      0.77      3170
weighted avg       0.77      0.77      0.77      3170

Confusion Matrix : 
[[1173  412]
 [ 331 1254]]


In [95]:
radial_svc=SVC(random_state=1,kernel="rbf") # rbf means radial basis function

In [96]:
# call user defined function
radial_svc=create_model1(radial_svc)

              precision    recall  f1-score   support

           0       0.78      0.76      0.77      1585
           1       0.76      0.78      0.77      1585

    accuracy                           0.77      3170
   macro avg       0.77      0.77      0.77      3170
weighted avg       0.77      0.77      0.77      3170

Confusion Matrix : 
[[1197  388]
 [ 345 1240]]


# conclusion Xgboost has performed extremely well among all the classification algorithm with accuracy of .78% and good recall score

# applying optimization function in random forest

In [97]:
rf=RandomForestClassifier()

In [98]:
from sklearn.model_selection import RandomizedSearchCV
# number of decision tree in model
n_estimators=[x for x in range(1,100,10)]
# no of features to be considered at every split 
max_features=['auto', 'sqrt','log2']
# minimum no of level in a tree
max_depth=[X for X in np.linspace(1,8,8)]
# minimum sample required to split a node
min_samples_split=[1,3,4,5,7,8,9]
# minimum no of leaf node
min_samples_leaf=[2,3,4,6,7,8,9]

# create a space random grid
random_grid={"n_estimators":n_estimators,"max_features":max_features,"max_depth":max_depth,"min_samples_split":min_samples_split,
            "min_samples_leaf":min_samples_leaf,'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [1, 11, 21, 31, 41, 51, 61, 71, 81, 91], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], 'min_samples_split': [1, 3, 4, 5, 7, 8, 9], 'min_samples_leaf': [2, 3, 4, 6, 7, 8, 9], 'criterion': ['entropy', 'gini']}


In [99]:
# create the pbject of RandomizedSearchCV
rcv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,n_jobs=-1,verbose=2,random_state=1)
### fit the randomized model
rcv.fit(X_train,Y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [1.0, 2.0, 3.0, 4.0, 5.0,
                                                      6.0, 7.0, 8.0],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [2, 3, 4, 6, 7, 8,
                                                             9],
                                        'min_samples_split': [1, 3, 4, 5, 7, 8,
                                                              9],
                                        'n_estimators': [1, 11, 21, 31, 41, 51,
                                                         61, 71, 81, 91]},
                   random_state=1, verbose=2)

In [100]:
best_param=rcv.best_estimator_

In [101]:
best_param

RandomForestClassifier(max_depth=7.0, min_samples_leaf=2, min_samples_split=4,
                       n_estimators=91)

In [102]:
from sklearn.metrics import accuracy_score
Y_pred=best_param.predict(X_test)
print(confusion_matrix(Y_test,Y_pred))
print("Accuracy Score {}".format(accuracy_score(Y_test,Y_pred)))
print("Classification report: {}".format(classification_report(Y_test,Y_pred)))

[[1432  153]
 [ 239  289]]
Accuracy Score 0.8144817794604827
Classification report:               precision    recall  f1-score   support

           0       0.86      0.90      0.88      1585
           1       0.65      0.55      0.60       528

    accuracy                           0.81      2113
   macro avg       0.76      0.73      0.74      2113
weighted avg       0.81      0.81      0.81      2113



# applying grid search cv

In [103]:
rcv.best_estimator_

RandomForestClassifier(max_depth=7.0, min_samples_leaf=2, min_samples_split=4,
                       n_estimators=91)

In [105]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': [rcv.best_params_['criterion']],
    'max_depth': [rcv.best_params_['max_depth']],
    'max_features': [rcv.best_params_['max_features']],
    'min_samples_leaf': [rcv.best_params_['min_samples_leaf'], 
                         rcv.best_params_['min_samples_leaf']+2, 
                         rcv.best_params_['min_samples_leaf'] + 4],
    'min_samples_split': [rcv.best_params_['min_samples_split'] - 2,
                          rcv.best_params_['min_samples_split'] - 1,
                          rcv.best_params_['min_samples_split'], 
                          rcv.best_params_['min_samples_split'] +1,
                          rcv.best_params_['min_samples_split'] + 2],
    'n_estimators': [rcv.best_params_['n_estimators'] - 200, rcv.best_params_['n_estimators'] - 100, 
                     rcv.best_params_['n_estimators'], 
                     rcv.best_params_['n_estimators'] + 100, rcv.best_params_['n_estimators'] + 200]
}

print(param_grid)

{'criterion': ['gini'], 'max_depth': [7.0], 'max_features': ['auto'], 'min_samples_leaf': [2, 4, 6], 'min_samples_split': [2, 3, 4, 5, 6], 'n_estimators': [-109, -9, 91, 191, 291]}


In [106]:
# Fit the grid_search to the data
rf=RandomForestClassifier()
grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=10,n_jobs=-1,verbose=2)
grid_search.fit(X_train,Y_train)

Fitting 10 folds for each of 75 candidates, totalling 750 fits


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': [7.0],
                         'max_features': ['auto'],
                         'min_samples_leaf': [2, 4, 6],
                         'min_samples_split': [2, 3, 4, 5, 6],
                         'n_estimators': [-109, -9, 91, 191, 291]},
             verbose=2)

In [108]:
best_grid=grid_search.best_estimator_

In [110]:
Y_pred=best_grid.predict(X_test)
print(confusion_matrix(Y_test,Y_pred))
print("Accuracy Score {}".format(accuracy_score(Y_test,Y_pred)))
print("Classification report: {}".format(classification_report(Y_test,Y_pred)))

[[1439  146]
 [ 251  277]]
Accuracy Score 0.8121154756270705
Classification report:               precision    recall  f1-score   support

           0       0.85      0.91      0.88      1585
           1       0.65      0.52      0.58       528

    accuracy                           0.81      2113
   macro avg       0.75      0.72      0.73      2113
weighted avg       0.80      0.81      0.80      2113

