In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

## Read data into a dataframe

In [5]:
df = pd.read_csv('bank.csv', sep = ';', index_col=0)
df.head(10)

Unnamed: 0_level_0,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
45,services,married,basic.9y,unknown,no,no,telephone,may,mon,198,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
59,admin.,married,professional.course,no,no,no,telephone,may,mon,139,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
41,blue-collar,married,unknown,unknown,no,no,telephone,may,mon,217,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
24,technician,single,professional.course,no,yes,no,telephone,may,mon,380,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
25,services,single,high.school,no,yes,no,telephone,may,mon,50,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [6]:
df.info

<bound method DataFrame.info of              job  marital            education  default housing loan  \
age                                                                    
56     housemaid  married             basic.4y       no      no   no   
57      services  married          high.school  unknown      no   no   
37      services  married          high.school       no     yes   no   
40        admin.  married             basic.6y       no      no   no   
56      services  married          high.school       no      no  yes   
..           ...      ...                  ...      ...     ...  ...   
73       retired  married  professional.course       no     yes   no   
46   blue-collar  married  professional.course       no      no   no   
56       retired  married    university.degree       no     yes   no   
44    technician  married  professional.course       no      no   no   
74       retired  married  professional.course       no     yes   no   

       contact month day_of_wee

In [7]:
print(df.shape)

(41188, 20)


# What does the primary analysis of several categorical features reveal?

The bank client data that you have provided can be used for a variety of machine learning tasks, such as:

Classification: Classifying clients as high-risk or low-risk for credit default, or classifying clients as likely or unlikely to churn (cancel their account).

Regression: Predicting the amount of money that a client is likely to borrow, or predicting the number of days that a client is likely to take to repay a loan.

Clustering: Grouping similar clients together, such as grouping clients who are likely to be interested in the same products or services.

Predict whether a client is likely to default on a loan. This could be done using a classification algorithm such as logistic regression or support vector machines. You would train the algorithm on a historical dataset of clients, where each client is labeled as either having defaulted on a loan or not. Once the algorithm is trained, you can use it to predict whether new clients are likely to default.

Predict the amount of money that a client is likely to borrow. This could be done using a regression algorithm such as linear regression or random forests. You would train the algorithm on a historical dataset of clients, where each client is labeled with the amount of money that they borrowed. Once the algorithm is trained, you can use it to predict the amount of money that new clients are likely to borrow.

Segment clients into different groups based on their risk profile. This could be done using a clustering algorithm such as k-means clustering. You would train the algorithm on the client data, and it would automatically group the clients into different clusters based on their similarities. You could then use these clusters to segment your clients and target them with different marketing campaigns.

It is important to note that the duration attribute should only be included for benchmark purposes, as it is not realistic to have this information before a call is performed.

# Exploratory Data Analysis tasks:
## Check missing data

In [8]:
def check_missing_value(data):
    """
    Objective: Check missing value count and percentage in all columns
    data: input data frame
    return: Missing value data frame
    """
    
    total = data.isnull().sum().sort_values(ascending=False)
    percent = (data.isnull().sum()/data.isnull().count()*100).sort_values(ascending=False)
    missing_data = pd.concat([total,percent],axis=1,keys=['Total','Percent'])
    return missing_data

In [9]:
miss = check_missing_value(df)
miss

Unnamed: 0,Total,Percent
y,0,0.0
nr.employed,0,0.0
marital,0,0.0
education,0,0.0
default,0,0.0
housing,0,0.0
loan,0,0.0
contact,0,0.0
month,0,0.0
day_of_week,0,0.0


### No missing values

In [10]:
df.columns

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous',
       'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
       'euribor3m', 'nr.employed', 'y'],
      dtype='object')

# Using Label Encoder

In [11]:
colname_cat = []
for x in df.columns:
    if df[x].dtypes =='object':
        colname_cat.append(x)
colname_cat

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'day_of_week',
 'poutcome',
 'y']

In [12]:
# for preprocessing the data
from sklearn import preprocessing

le = preprocessing.LabelEncoder()

for x in colname_cat:
    df[x]=le.fit_transform(df[x])
    
    print()
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    print('Feature', x)
    print('mapping', le_name_mapping)


Feature job
mapping {'admin.': 0, 'blue-collar': 1, 'entrepreneur': 2, 'housemaid': 3, 'management': 4, 'retired': 5, 'self-employed': 6, 'services': 7, 'student': 8, 'technician': 9, 'unemployed': 10, 'unknown': 11}

Feature marital
mapping {'divorced': 0, 'married': 1, 'single': 2, 'unknown': 3}

Feature education
mapping {'basic.4y': 0, 'basic.6y': 1, 'basic.9y': 2, 'high.school': 3, 'illiterate': 4, 'professional.course': 5, 'university.degree': 6, 'unknown': 7}

Feature default
mapping {'no': 0, 'unknown': 1, 'yes': 2}

Feature housing
mapping {'no': 0, 'unknown': 1, 'yes': 2}

Feature loan
mapping {'no': 0, 'unknown': 1, 'yes': 2}

Feature contact
mapping {'cellular': 0, 'telephone': 1}

Feature month
mapping {'apr': 0, 'aug': 1, 'dec': 2, 'jul': 3, 'jun': 4, 'mar': 5, 'may': 6, 'nov': 7, 'oct': 8, 'sep': 9}

Feature day_of_week
mapping {'fri': 0, 'mon': 1, 'thu': 2, 'tue': 3, 'wed': 4}

Feature poutcome
mapping {'failure': 0, 'nonexistent': 1, 'success': 2}

Feature y
mapping {

In [13]:
df.head()

Unnamed: 0_level_0,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
56,3,1,0,0,0,0,1,6,1,261,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
57,7,1,3,1,0,0,1,6,1,149,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
37,7,1,3,0,2,0,1,6,1,226,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
40,0,1,1,0,0,0,1,6,1,151,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
56,7,1,3,0,0,2,1,6,1,307,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0


In [14]:
df.shape

(41188, 20)

### Spliting data into dependent and independent variable

In [15]:
X = df.values[:, 0:-1]
y = df.values[:, -1]

X.shape, y.shape

((41188, 19), (41188,))

In [22]:
X

array([[ 3.0000e+00,  1.0000e+00,  0.0000e+00, ..., -3.6400e+01,
         4.8570e+00,  5.1910e+03],
       [ 7.0000e+00,  1.0000e+00,  3.0000e+00, ..., -3.6400e+01,
         4.8570e+00,  5.1910e+03],
       [ 7.0000e+00,  1.0000e+00,  3.0000e+00, ..., -3.6400e+01,
         4.8570e+00,  5.1910e+03],
       ...,
       [ 5.0000e+00,  1.0000e+00,  6.0000e+00, ..., -5.0800e+01,
         1.0280e+00,  4.9636e+03],
       [ 9.0000e+00,  1.0000e+00,  5.0000e+00, ..., -5.0800e+01,
         1.0280e+00,  4.9636e+03],
       [ 5.0000e+00,  1.0000e+00,  5.0000e+00, ..., -5.0800e+01,
         1.0280e+00,  4.9636e+03]])

In [16]:
from sklearn.model_selection import train_test_split

# split the data into test and train
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                   test_size=0.3,
                                                   random_state=10)
print("Train Size: ", X_train.shape, y_train.shape)
print("Test Size: ", X_test.shape, y_test.shape)

Train Size:  (28831, 19) (28831,)
Test Size:  (12357, 19) (12357,)


In [17]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# mean = 0 and std = 1

scaler = MinMaxScaler()
#sacler = StandardScaler()

scaler.fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

#convert y into int
y = y.astype(int)

In [18]:
X_train_scale

array([[0.09090909, 0.33333333, 0.42857143, ..., 0.61506276, 0.98254364,
        1.        ],
       [0.81818182, 0.66666667, 0.71428571, ..., 0.61506276, 0.98163682,
        1.        ],
       [0.09090909, 0.66666667, 0.        , ..., 0.19246862, 0.14667876,
        0.51228733],
       ...,
       [0.90909091, 0.33333333, 0.28571429, ..., 0.15481172, 0.1747903 ,
        0.51228733],
       [0.        , 0.66666667, 0.42857143, ..., 0.43933054, 0.03990025,
        0.10586011],
       [0.45454545, 0.33333333, 0.        , ..., 0.33891213, 0.98118341,
        1.        ]])

## Running a basic Logistic Regression model

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
%%time
#create a model
classifier = LogisticRegression()

#create a model
classifier.fit(X_train_scale, y_train)

# predict using the model
y_pred = classifier.predict(X_test_scale)

print(list(zip(y_test,y_pred)))

[(0.0, 0.0), (0.0, 0.0), (1.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (1.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (1.0, 1.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (1.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (1.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (1.0, 1.0), (0.0, 0.0), (0.0, 0.0), (0.0, 1.0), (0.0, 0.0), (0.0, 0.0), (0.0, 1.0), (1.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (1.0, 1.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (1.0, 1.0), (0.0, 0.0), (1.0, 1.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.

In [21]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

cfm =confusion_matrix(y_test,y_pred)

print(cfm)

print("Classification report: ")

print(classification_report(y_test,y_pred))

acc=accuracy_score(y_test, y_pred)
print("Accuracy of the model: ",acc)

[[10674   261]
 [  885   537]]
Classification report: 
              precision    recall  f1-score   support

         0.0       0.92      0.98      0.95     10935
         1.0       0.67      0.38      0.48      1422

    accuracy                           0.91     12357
   macro avg       0.80      0.68      0.72     12357
weighted avg       0.89      0.91      0.90     12357

Accuracy of the model:  0.9072590434571498
