In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



# import modelling libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score

### 1. Load data

In [2]:
# read the labelled training file and the test file from the working directory
train_path, test_path = 'data.csv', 'test_data.csv'
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [3]:
# preview the first rows of the training data (includes the `deposit` label)
df_train.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,38,technician,married,tertiary,no,127,yes,no,cellular,14,oct,113,1,50,2,success,no
1,41,housemaid,married,primary,no,365,no,no,cellular,8,aug,203,5,-1,0,unknown,no
2,39,management,single,tertiary,no,2454,yes,no,cellular,4,may,716,3,263,2,failure,yes
3,49,blue-collar,married,primary,no,6215,yes,no,cellular,11,may,549,1,-1,0,unknown,no
4,37,services,married,secondary,no,1694,yes,yes,cellular,29,jan,404,2,251,6,failure,no


In [4]:
# preview the first rows of the test data
df_test.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,31,blue-collar,single,secondary,yes,477,no,no,cellular,20,nov,426,2,189,6,failure
1,49,blue-collar,married,primary,no,599,no,no,cellular,23,jul,464,1,-1,0,unknown
2,51,self-employed,single,tertiary,no,400,no,yes,cellular,27,may,200,1,-1,0,unknown
3,33,technician,married,secondary,no,488,yes,no,unknown,8,may,703,1,-1,0,unknown
4,34,admin.,married,secondary,no,40,yes,no,telephone,5,may,125,2,-1,0,unknown


In [5]:
# combine train and test data into a single frame.
# ignore_index=True gives every row a unique label; without it the test rows
# would reuse the training labels (0, 1, 2, ...), which makes label-based
# selection and alignment ambiguous.
# Columns missing from one of the frames are filled with NaN in the result.
df = pd.concat([df_train, df_test], ignore_index=True)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,38,technician,married,tertiary,no,127,yes,no,cellular,14,oct,113,1,50,2,success,no
1,41,housemaid,married,primary,no,365,no,no,cellular,8,aug,203,5,-1,0,unknown,no
2,39,management,single,tertiary,no,2454,yes,no,cellular,4,may,716,3,263,2,failure,yes
3,49,blue-collar,married,primary,no,6215,yes,no,cellular,11,may,549,1,-1,0,unknown,no
4,37,services,married,secondary,no,1694,yes,yes,cellular,29,jan,404,2,251,6,failure,no


In [6]:
# print(df_train.columns)
# print()
# print(df_test.columns)
# print()
# print(df.columns)

### 2. Data wrangling

In [7]:
# list the column names of the combined frame
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'deposit'],
      dtype='object')

In [8]:
# (number of rows, number of columns) of the combined frame
df.shape

(11162, 17)

In [9]:
# count missing values per column
df.isnull().sum()

age             0
job             0
marital         0
education       0
default         0
balance         0
housing         0
loan            0
contact         0
day             0
month           0
duration        0
campaign        0
pdays           0
previous        0
poutcome        0
deposit      2791
dtype: int64

In [10]:
# inspect the target column: how many distinct labels, then how often each occurs
target = df['deposit']
n_labels = target.nunique()
label_counts = target.value_counts()

print(n_labels)
print("--------------------")
print(label_counts)

2
--------------------
no     4428
yes    3943
Name: deposit, dtype: int64


In [11]:
# keep only the rows that have a `deposit` label
df = df.dropna(subset=['deposit'])

In [12]:
# recount missing values per column after removing the unlabelled rows
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
deposit      0
dtype: int64

In [13]:
# count rows that are exact duplicates of an earlier row
df.duplicated().sum()

0

In [14]:
# discard `day` and `month`, which this analysis treats as unnecessary
unused_columns = ['day', 'month']
df = df.drop(columns=unused_columns)

In [15]:
# list the columns currently in the frame
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'duration', 'campaign', 'pdays', 'previous',
       'poutcome', 'deposit'],
      dtype='object')

In [16]:
# show the distinct values found in every column, one line per column
for col in df.columns:
    unique_values = df[col].unique()
    print(f'{col} : {unique_values}')

age : [38 41 39 49 37 40 54 48 29 52 44 30 36 26 56 46 60 45 50 58 33 32 31 34
 55 24 47 21 43 27 25 61 57 35 42 28 70 51 79 66 69 71 59 23 72 22 53 80
 74 84 20 77 82 75 67 63 62 73 90 78 88 81 87 76 19 95 86 83 65 18 64 68
 85 92 93]
job : ['technician' 'housemaid' 'management' 'blue-collar' 'services' 'admin.'
 'unemployed' 'student' 'entrepreneur' 'retired' 'self-employed' 'unknown']
marital : ['married' 'single' 'divorced']
education : ['tertiary' 'primary' 'secondary' 'unknown']
default : ['no' 'yes']
balance : [ 127  365 2454 ... 2016 2758  -49]
housing : ['yes' 'no']
loan : ['no' 'yes']
contact : ['cellular' 'unknown' 'telephone']
duration : [ 113  203  716 ... 1262 1817 1150]
campaign : [ 1  5  3  2 17  6  4  8  7 11 24 10 15 13 21 14 32  9 16 25 18 26 12 41
 63 20 23 22 30 43 29 27 33 19]
pdays : [ 50  -1 263 251  89 307  92 227 101  99 122 181 330  94  97 341 778  85
 149 412 347 370 174 343 193 179 177 182 126 339 328 223  84 267 104 301
 368  93 230 298 313  64 100 336 334

In [17]:
# summarise the frame: per-column dtype and non-null count, plus memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8371 entries, 0 to 8370
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        8371 non-null   int64 
 1   job        8371 non-null   object
 2   marital    8371 non-null   object
 3   education  8371 non-null   object
 4   default    8371 non-null   object
 5   balance    8371 non-null   int64 
 6   housing    8371 non-null   object
 7   loan       8371 non-null   object
 8   contact    8371 non-null   object
 9   duration   8371 non-null   int64 
 10  campaign   8371 non-null   int64 
 11  pdays      8371 non-null   int64 
 12  previous   8371 non-null   int64 
 13  poutcome   8371 non-null   object
 14  deposit    8371 non-null   object
dtypes: int64(6), object(9)
memory usage: 1.0+ MB


In [18]:
# list the columns currently in the frame
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'duration', 'campaign', 'pdays', 'previous',
       'poutcome', 'deposit'],
      dtype='object')

In [19]:
# collect the names of the object-dtype (string) columns, in frame order.
# NOTE: despite its name, `df_cat` is a plain list of column names, not a DataFrame.
object_columns = df.select_dtypes(include=['object']).columns
df_cat = list(object_columns)
df_cat

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'poutcome',
 'deposit']

In [20]:
# encoders of categorical variables: label encoding, ordinal encoding, one-hot-encoding/pandas dummies
# Here every column listed in `df_cat` is label-encoded: each distinct value is
# replaced by an integer code.
# instantiate
encoder = LabelEncoder()

# The loop variable must be the same name used in the body; otherwise a stale
# variable left over from another cell is encoded over and over and the
# columns listed in `df_cat` are never touched.
# `fit_transform` refits the encoder on every column, so after the loop
# `encoder` only remembers the mapping of the last column in `df_cat`.
for col in df_cat:
    df[col] = encoder.fit_transform(df[col])

In [23]:
# df_cat.head()