#TARGET MARKETING

In [66]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

In [67]:
#Read training data file
trainfile = '/content/drive/MyDrive/Portugese Bank Data - TRAIN.csv'
trainData = pd.read_csv(trainfile)

#Read test data file
testfile = '/content/drive/MyDrive/Portugese Bank Data - TEST.csv'
testData = pd.read_csv(testfile)

#EXPLORATORY DATA ANALYSIS & PRE-PROCESSING

In [68]:
print(trainData.shape)
print(testData.shape)

(4521, 17)
(45211, 17)


In [69]:
print(trainData.isnull().sum())

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [70]:
trainData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [71]:
categorical_columns = trainData.select_dtypes(include=['object', 'category']).columns
TrainCols = list(categorical_columns.values)
TrainCols.remove("y")
print(TrainCols)

['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']


In [72]:

combinedData = pd.concat([trainData,testData], keys=[0,1])
combinedData["y"]=combinedData["y"].map({"yes":1,"no":0})

combinedData = pd.get_dummies(combinedData,columns=TrainCols)

trainData = combinedData.xs(0)
testData = combinedData.xs(1)


In [73]:
trainData.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,30,1787,19,79,1,-1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,33,4789,11,220,1,339,4,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,35,1350,16,185,1,330,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,30,1476,3,199,4,-1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,59,0,5,226,1,-1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1


In [74]:
testData.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143,5,261,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,44,29,5,151,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,33,2,5,76,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,47,1506,5,92,1,-1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
4,33,1,5,198,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [75]:
Xtrain = trainData.drop("y", axis=1)
Ytrain = trainData["y"]
Xtest = testData.drop("y", axis=1)
Ytest = testData["y"]
print(Ytrain.value_counts())
print(Ytest.value_counts())


0    4000
1     521
Name: y, dtype: int64
0    39922
1     5289
Name: y, dtype: int64


#BASIC DECISION TREE

In [76]:
dt = DecisionTreeClassifier(random_state = 7)
dt.fit(Xtrain, Ytrain)
dt_pred = dt.predict(Xtest)
#Model Accuracy
print("Accuracy:", accuracy_score(Ytest, dt_pred))
print("Precision:", precision_score(Ytest, dt_pred))
print("Recall:", recall_score(Ytest, dt_pred))
print("F1 Score:", f1_score(Ytest, dt_pred))
print("ROC-AUC Score:", roc_auc_score(Ytest, dt_pred))
print("Confusion Matrix:\n", confusion_matrix(Ytest, dt_pred))

Accuracy: 0.882263166043662
Precision: 0.4968910021945867
Recall: 0.5137076952164871
F1 Score: 0.5051594310681418
ROC-AUC Score: 0.7223991609692976
Confusion Matrix:
 [[37171  2751]
 [ 2572  2717]]


In [77]:
parameters={'min_samples_split' : range(150,250,20),
            'max_depth': range(10,30,5),
            'criterion':['gini','entropy'],
            #'max_leaf_nodes' : range(5,30,5),
            'max_features' : range(5,10,1)
            }

#RANDOM SEARCH FOR DECISION TREE

In [78]:
#Hyperparameter tuning done for decision tree classifier
clf_random = RandomizedSearchCV(dt,parameters,n_iter=25, cv = 5, scoring = "roc_auc", verbose = 1)
clf_random.fit(Xtrain, Ytrain)
random_parm = clf_random.best_params_
best_score = clf_random.best_score_
print(f"Best Parameters: ",random_parm)
print(f"Best Score: ",best_score)
print('\n')


#Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier
dt = DecisionTreeClassifier(**random_parm, random_state = 7, class_weight = "balanced")
dt.fit(Xtrain, Ytrain)
rs_pred = dt.predict(Xtest)

#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("Accuracy:", accuracy_score(Ytest, rs_pred))
print("Precision:", precision_score(Ytest, rs_pred))
print("Recall:", recall_score(Ytest, rs_pred))
print("F1 Score:", f1_score(Ytest, rs_pred))
print("ROC-AUC Score:", roc_auc_score(Ytest, rs_pred))
print("Confusion Matrix:\n", confusion_matrix(Ytest, rs_pred))
print('\n')

#get cross-validation report
dt_cv_score = cross_val_score(dt, Xtrain, Ytrain, cv=10, scoring="roc_auc")
print("=== All ROC-AUC Scores ===")
print(dt_cv_score)
print('\n')
print("Mean ROC-AUC Score - Decision Tree: ",dt_cv_score.mean())


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best Parameters:  {'min_samples_split': 210, 'max_features': 8, 'max_depth': 20, 'criterion': 'gini'}
Best Score:  0.848266723901099


Accuracy: 0.7056910928756276
Precision: 0.2614981852799429
Recall: 0.8309699376063528
F1 Score: 0.39780955829109343
ROC-AUC Score: 0.7600318351926358
Confusion Matrix:
 [[27510 12412]
 [  894  4395]]


=== All ROC-AUC Scores ===
[0.84443396 0.78639423 0.77581731 0.80451923 0.87074519 0.793125
 0.87379808 0.9140625  0.80975962 0.8040625 ]


Mean ROC-AUC Score - Decision Tree:  0.8276717616110305


#GRID SEARCH FOR DECISION TREE

In [79]:
#Hyperparameter tuning done for decision tree classifier
clf_grid = GridSearchCV(dt,parameters, cv = 5, scoring = "roc_auc", verbose = 1, n_jobs = -1)
clf_grid.fit(Xtrain, Ytrain)
grid_parm = clf_grid.best_params_
best_score = clf_grid.best_score_
print(f"Best Parameters: ",grid_parm)
print(f"Best Score: ",best_score)
print('\n')

#Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier
dt = DecisionTreeClassifier(**grid_parm, random_state = 7, class_weight = "balanced")
dt.fit(Xtrain, Ytrain)
gs_pred = dt.predict(Xtest)

#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("Accuracy:", accuracy_score(Ytest, gs_pred))
print("Precision:", precision_score(Ytest, gs_pred))
print("Recall:", recall_score(Ytest, gs_pred))
print("F1 Score:", f1_score(Ytest, gs_pred))
print("ROC-AUC Score:", roc_auc_score(Ytest, gs_pred))
print("Confusion Matrix:\n", confusion_matrix(Ytest, gs_pred))
print('\n')

#get cross-validation report
dt_cv_score = cross_val_score(dt, Xtrain, Ytrain, cv=10, scoring="roc_auc")
print("=== All ROC-AUC Scores ===")
print(dt_cv_score)
print('\n')
print("Mean ROC-AUC Score - Decision Tree: ",dt_cv_score.mean())


Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best Parameters:  {'criterion': 'entropy', 'max_depth': 15, 'max_features': 9, 'min_samples_split': 170}
Best Score:  0.8585211881868131


Accuracy: 0.7288049368516512
Precision: 0.27550231839258116
Recall: 0.8088485536018151
F1 Score: 0.4110102320219052
ROC-AUC Score: 0.763524522279591
Confusion Matrix:
 [[28672 11250]
 [ 1011  4278]]


=== All ROC-AUC Scores ===
[0.82337264 0.87076923 0.870625   0.85129808 0.90117788 0.88612981
 0.81439904 0.88       0.80639423 0.85028846]


Mean ROC-AUC Score - Decision Tree:  0.8554454372278665


#BASIC RANDOM FOREST

In [80]:
rf = RandomForestClassifier(random_state = 10, n_jobs = -1)
rf.fit(Xtrain, Ytrain)
rf_pred = dt.predict(Xtest)
#Model Accuracy
print("Accuracy:", accuracy_score(Ytest, rf_pred))
print("Precision:", precision_score(Ytest, rf_pred))
print("Recall:", recall_score(Ytest, rf_pred))
print("F1 Score:", f1_score(Ytest, rf_pred))
print("ROC-AUC Score:", roc_auc_score(Ytest, rf_pred))
print("Confusion Matrix:\n", confusion_matrix(Ytest, rf_pred))

Accuracy: 0.7288049368516512
Precision: 0.27550231839258116
Recall: 0.8088485536018151
F1 Score: 0.4110102320219052
ROC-AUC Score: 0.763524522279591
Confusion Matrix:
 [[28672 11250]
 [ 1011  4278]]


In [81]:
parameters={'min_samples_split' : range(100,150,10),
            'max_depth': range(10,20,2),
            'criterion':['gini','entropy'],
            'max_leaf_nodes' : range(30,40,2),
           # 'n_estimators' : [25,50, 75, 100]
            }

#RANDOM SEARCH FOR RANDOM FOREST

In [82]:
#Hyperparameter tuning done for random forest classifier
clf_random_rf = RandomizedSearchCV(rf,parameters,n_iter=25, cv = 5, scoring = "roc_auc", verbose = 1)
clf_random_rf.fit(Xtrain, Ytrain)
random_rf_parm = clf_random_rf.best_params_
best_score = clf_random_rf.best_score_
print(f"Best Parameters: ",random_rf_parm)
print(f"Best Score: ",best_score)
print('\n')


#Using the parameters obtained from HyperParameterTuning in the RandomForestClassifier
rfrs = RandomForestClassifier(**random_rf_parm, random_state = 7, class_weight = "balanced", n_jobs = -1 )
rfrs.fit(Xtrain, Ytrain)
rs_pred = rfrs.predict(Xtest)

#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("Accuracy:", accuracy_score(Ytest, rs_pred))
print("Precision:", precision_score(Ytest, rs_pred))
print("Recall:", recall_score(Ytest, rs_pred))
print("F1 Score:", f1_score(Ytest, rs_pred))
print("ROC-AUC Score:", roc_auc_score(Ytest, rs_pred))
print("Confusion Matrix:\n", confusion_matrix(Ytest, rs_pred))
print('\n')

#get cross-validation report
rf_cv_score = cross_val_score(rfrs, Xtrain, Ytrain, cv=10, scoring="roc_auc")
print("=== All ROC-AUC Scores ===")
print(rf_cv_score)
print('\n')
print("Mean ROC-AUC Score - Random Forest: ",rf_cv_score.mean())


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best Parameters:  {'min_samples_split': 100, 'max_leaf_nodes': 36, 'max_depth': 12, 'criterion': 'entropy'}
Best Score:  0.9031486263736264


Accuracy: 0.8310366946097189
Precision: 0.39467551093581926
Recall: 0.8324825108716204
F1 Score: 0.5354819093949529
ROC-AUC Score: 0.8316638294551478
Confusion Matrix:
 [[33169  6753]
 [  886  4403]]


=== All ROC-AUC Scores ===
[0.89669811 0.90177885 0.87125    0.86745192 0.92788462 0.92870192
 0.90028846 0.92908654 0.87524038 0.881875  ]


Mean ROC-AUC Score - Random Forest:  0.8980255805515238


#GRID SEARCH FOR RANDOM FOREST

In [83]:
#Hyperparameter tuning done for random forest classifier
clf_grid_rf = GridSearchCV(rf,parameters, cv = 5, scoring = "roc_auc", verbose = 1, n_jobs = -1)
clf_grid_rf.fit(Xtrain, Ytrain)
grid_rf_parm = clf_grid_rf.best_params_
best_score = clf_grid_rf.best_score_
print(f"Best Parameters: ",grid_rf_parm)
print(f"Best Score: ",best_score)
print('\n')

#Using the parameters obtained from HyperParameterTuning in the RandomForestClassifier
rf = RandomForestClassifier(**grid_parm, random_state = 7, class_weight = "balanced")
rf.fit(Xtrain, Ytrain)
gs_pred = rf.predict(Xtest)

#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("Accuracy:", accuracy_score(Ytest, gs_pred))
print("Precision:", precision_score(Ytest, gs_pred))
print("Recall:", recall_score(Ytest, gs_pred))
print("F1 Score:", f1_score(Ytest, gs_pred))
print("ROC-AUC Score:", roc_auc_score(Ytest, gs_pred))
print("Confusion Matrix:\n", confusion_matrix(Ytest, gs_pred))
print('\n')

#get cross-validation report
rf_cv_score = cross_val_score(rf, Xtrain, Ytrain, cv=10, scoring="roc_auc")
print("=== All ROC-AUC Scores ===")
print(rf_cv_score)
print('\n')
print("Mean ROC-AUC Score - Random Forest: ",rf_cv_score.mean())


Fitting 5 folds for each of 250 candidates, totalling 1250 fits
Best Parameters:  {'criterion': 'entropy', 'max_depth': 12, 'max_leaf_nodes': 38, 'min_samples_split': 100}
Best Score:  0.8958999084249084


Accuracy: 0.819358120811307
Precision: 0.3778230599422652
Recall: 0.8413688788050672
F1 Score: 0.5214741899572274
ROC-AUC Score: 0.8289054704130039
Confusion Matrix:
 [[32594  7328]
 [  839  4450]]


=== All ROC-AUC Scores ===
[0.9020283  0.90413462 0.87658654 0.85879808 0.93019231 0.93048077
 0.90456731 0.92942308 0.88216346 0.88730769]


Mean ROC-AUC Score - Random Forest:  0.9005682148040638
