## **MLT FAT Assessment**

Name : Khemraj Gupta

Reg. No : 20MAI0079

In [3]:
# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

## Dataset

The dataset is from UCI repository about bank marketing and it contains 45221 rows and 17 columns. In this we have used 10% of the original dataset which contain 4521 rows and 17 columns. It's older version of dataset is used

https://archive.ics.uci.edu/ml/datasets/Bank+Marketing


### **Attribute Information:**

Input variables:
**bank client data:**
* 1 - age (numeric)
* 2 - job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-* * employed','services','student','technician','unemployed','unknown')
* 3 - marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)
* 4 - education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')
* 5 - default: has credit in default? (categorical: 'no','yes','unknown')
* 6 - housing: has housing loan? (categorical: 'no','yes','unknown')
* 7 - loan: has personal loan? (categorical: 'no','yes','unknown')
**related with the last contact of the current campaign:**
* 8 - contact: contact communication type (categorical: 'cellular','telephone')
* 9 - month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')
* 10 - day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')
* 11 - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
**other attributes:**
* 12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
* 13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
* 14 - previous: number of contacts performed before this campaign and for this client (numeric)
* 15 - poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')
**social and economic context attributes**
* 16 - emp.var.rate: employment variation rate - quarterly indicator (numeric)
* 17 - cons.price.idx: consumer price index - monthly indicator (numeric)
* 18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric)
* 19 - euribor3m: euribor 3 month rate - daily indicator (numeric)
* 20 - nr.employed: number of employees - quarterly indicator (numeric)

**Output variable (desired target):**
* 21 - y - has the client subscribed a term deposit? (binary: 'yes','no')

In [11]:
# Import the data into dataframe
bank_data = pd.read_csv(r"data/bank.csv" , delimiter = ';')
bank_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [12]:
bank_data.shape

(4521, 17)

### Preprocessing of dataset

In [13]:
bank_data.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,1422.657819,15.915284,263.961292,2.79363,39.766645,0.542579
std,10.576211,3009.638142,8.247667,259.856633,3.109807,100.121124,1.693562
min,19.0,-3313.0,1.0,4.0,1.0,-1.0,0.0
25%,33.0,69.0,9.0,104.0,1.0,-1.0,0.0
50%,39.0,444.0,16.0,185.0,2.0,-1.0,0.0
75%,49.0,1480.0,21.0,329.0,3.0,-1.0,0.0
max,87.0,71188.0,31.0,3025.0,50.0,871.0,25.0


In [14]:
bank_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [16]:
# find categorical columns
cat_columns  = [c for c in bank_data.columns if bank_data[c].dtypes == "O"]
bank_data[cat_columns].head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,unemployed,married,primary,no,no,no,cellular,oct,unknown,no
1,services,married,secondary,no,yes,yes,cellular,may,failure,no
2,management,single,tertiary,no,yes,no,cellular,apr,failure,no
3,management,married,tertiary,no,yes,yes,unknown,jun,unknown,no
4,blue-collar,married,secondary,no,yes,no,unknown,may,unknown,no


In [17]:
# find numerical columns
num_columns  = [c for c in bank_data.columns if bank_data[c].dtypes != "O"]
bank_data[num_columns].head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,30,1787,19,79,1,-1,0
1,33,4789,11,220,1,339,4
2,35,1350,16,185,1,330,1
3,30,1476,3,199,4,-1,0
4,59,0,5,226,1,-1,0


In [32]:
# Change yes to 1 and no to 0 - Encoding 
bank_data["y"] = bank_data["y"].apply(lambda i:1 if i=="yes" else 0)

In [42]:
bank_data["poutcome"].unique()

array(['unknown', 'failure', 'other', 'success'], dtype=object)

In [36]:
# Change yes to 1 and no to 0 - Encoding 
bank_data["default"] = bank_data["default"].apply(lambda i:1 if i=="yes" else 0)

In [38]:
# Change yes to 1 and no to 0 - Encoding 
bank_data["housing"] = bank_data["housing"].apply(lambda i:1 if i=="yes" else 0)

In [40]:
# Change yes to 1 and no to 0 - Encoding 
bank_data["loan"] = bank_data["loan"].apply(lambda i:1 if i=="yes" else 0)

In [18]:
X =  bank_data.drop("y", axis=1)
y = bank_data["y"]

In [24]:
# find categoorical features
cat_features = [c for c in X.columns if X[c].dtypes == "O"]
cat_features

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [43]:
categorical_features = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']

In [44]:
# Import Onehotencoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                                 remainder="passthrough")
X_transformed = transformer.fit_transform(X)
X_transformed

array([[0.0, 0.0, 0.0, ..., 1, -1, 0],
       [0.0, 0.0, 0.0, ..., 1, 339, 4],
       [0.0, 0.0, 0.0, ..., 1, 330, 1],
       ...,
       [0.0, 0.0, 0.0, ..., 11, -1, 0],
       [0.0, 1.0, 0.0, ..., 4, 211, 3],
       [0.0, 0.0, 1.0, ..., 2, 249, 7]], dtype=object)

In [49]:
X_new = pd.DataFrame(X_transformed)
X_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38,39,40,41,42,43,44,45,46,47
0,0,0,0,0,0,0,0,0,0,0,...,30,no,1787,no,no,19,79,1,-1,0
1,0,0,0,0,0,0,0,1,0,0,...,33,no,4789,yes,yes,11,220,1,339,4
2,0,0,0,0,1,0,0,0,0,0,...,35,no,1350,yes,no,16,185,1,330,1
3,0,0,0,0,1,0,0,0,0,0,...,30,no,1476,yes,yes,3,199,4,-1,0
4,0,1,0,0,0,0,0,0,0,0,...,59,no,0,yes,no,5,226,1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,0,0,0,0,0,0,0,1,0,0,...,33,no,-333,yes,no,30,329,5,-1,0
4517,0,0,0,0,0,0,1,0,0,0,...,57,yes,-3313,yes,yes,9,153,1,-1,0
4518,0,0,0,0,0,0,0,0,0,1,...,57,no,295,no,no,19,151,11,-1,0
4519,0,1,0,0,0,0,0,0,0,0,...,28,no,1137,no,no,6,129,4,211,3


In [50]:
# Another way of getting dummies by pd.dummies
dummies = pd.get_dummies(cat_features)
dummies

Unnamed: 0,contact,default,education,housing,job,loan,marital,month,poutcome
0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0
6,1,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,1,0
8,0,0,0,0,0,0,0,0,1


In [51]:
# Split the dataset

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)