## The classification goal is to predict if the client will subscribe a term deposit (variable y)

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

#### Reading in the csv file while specifying the separator

In [2]:
full_df = pd.read_csv('bank-full.csv', sep=';')
full_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


#### Subsetting for needed columns

In [3]:
df = full_df[['age','job',
                'marital',
                'education',
                'balance',
                'housing',
                'contact',
                'day',
                'month',
                'duration',
                'campaign',
                'pdays',
                'previous',
                'poutcome',
                'y'
]].copy()
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
# checking for null values in the selected columns
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

#### 1. Mode of the education column

In [5]:
df['education'].mode()

0    secondary
Name: education, dtype: object

#### 2. Correlation matix for numerical features

In [6]:
corr_matrix = df.corr()
corr_matrix

  corr_matrix = df.corr()


Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [7]:
corr_unstack = corr_matrix.unstack()
corr_unstack = corr_unstack[corr_unstack != 1] #removes self correlation which is 1 always
highest_corr = corr_unstack.abs().sort_values(ascending=False).head(1)
print("Highest correlation is between the following features:")
print(highest_corr)

Highest correlation is between the following features:
previous  pdays    0.45482
dtype: float64


In [8]:
# Target encoding
df['y'] = df['y'].replace({'yes': 1, 'no': 0})
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,0


In [9]:
#splitting the data
from sklearn.model_selection import train_test_split

In [10]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.2, random_state=42)

In [11]:
len(df_train), len(df_val), len(df_test)

(28934, 7234, 9043)

In [12]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [13]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

del df_train['y']
del df_val['y']
del df_test['y']

#### 3. Mutual information score

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   balance    45211 non-null  int64 
 5   housing    45211 non-null  object
 6   contact    45211 non-null  object
 7   day        45211 non-null  int64 
 8   month      45211 non-null  object
 9   duration   45211 non-null  int64 
 10  campaign   45211 non-null  int64 
 11  pdays      45211 non-null  int64 
 12  previous   45211 non-null  int64 
 13  poutcome   45211 non-null  object
 14  y          45211 non-null  int64 
dtypes: int64(8), object(7)
memory usage: 5.2+ MB


In [15]:
df.columns

Index(['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
       'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
       'y'],
      dtype='object')

In [16]:
numerical = ['age','balance','day','duration','campaign','pdays','previous']
categorical = ['job', 'marital', 'education','housing', 'contact','month','poutcome']

In [17]:
from sklearn.metrics import mutual_info_score 
#this score gives information of how much an effect a variable has on the target variable

In [18]:
def mutual_info_churn_score(series):
    return round(mutual_info_score(series, df_full_train.y),2) 
#this function computes the mutual info score between the target variable and all categorical variables in the data frame 

In [19]:
mi = df_full_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)
#the mutual info when applied on the categorical columns gives information on how influential each column is on the target variable

poutcome     0.03
month        0.02
job          0.01
housing      0.01
contact      0.01
marital      0.00
education    0.00
dtype: float64

In [20]:
mi_ = df_full_train[numerical].apply(mutual_info_churn_score)
mi_.sort_values(ascending=False)

balance     0.10
duration    0.10
pdays       0.03
age         0.01
day         0.01
previous    0.01
campaign    0.00
dtype: float64

#### 4. Logistic Regression

In [21]:
from sklearn.feature_extraction import DictVectorizer

In [22]:
#one-hot encoding for all categorical variables
#this is to allow categrical variables to be included in the model training

dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [23]:
dv.get_feature_names_out()

array(['age', 'balance', 'campaign', 'contact=cellular',
       'contact=telephone', 'contact=unknown', 'day', 'duration',
       'education=primary', 'education=secondary', 'education=tertiary',
       'education=unknown', 'housing=no', 'housing=yes', 'job=admin.',
       'job=blue-collar', 'job=entrepreneur', 'job=housemaid',
       'job=management', 'job=retired', 'job=self-employed',
       'job=services', 'job=student', 'job=technician', 'job=unemployed',
       'job=unknown', 'marital=divorced', 'marital=married',
       'marital=single', 'month=apr', 'month=aug', 'month=dec',
       'month=feb', 'month=jan', 'month=jul', 'month=jun', 'month=mar',
       'month=may', 'month=nov', 'month=oct', 'month=sep', 'pdays',
       'poutcome=failure', 'poutcome=other', 'poutcome=success',
       'poutcome=unknown', 'previous'], dtype=object)

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [32]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [33]:
model.intercept_[0]

-1.0020552150077868

In [34]:
y_pred = model.predict(X_val)
y_pred

array([0, 0, 0, ..., 1, 0, 1], dtype=int64)

In [35]:
from sklearn.metrics import classification_report

In [36]:
report = classification_report(y_val, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.92      0.98      0.95      6382
           1       0.66      0.34      0.45       852

    accuracy                           0.90      7234
   macro avg       0.79      0.66      0.70      7234
weighted avg       0.89      0.90      0.89      7234



#### Regularized Logistic Regression

In [37]:
C_values = [0.01, 0.1, 1, 10, 100]


for C in C_values:
    # Create a logistic regression model with the current C value
    model = LogisticRegression(C=C, max_iter=1000, random_state=42)
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on the validation set
    y_pred = model.predict(X_val)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_val, y_pred)
    
    # Print the accuracy, rounded to 3 decimal digits
    print(f'Accuracy with C={C}: {round(accuracy, 3)}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy with C=0.01: 0.898


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy with C=0.1: 0.9


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy with C=1: 0.9


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy with C=10: 0.901
Accuracy with C=100: 0.9


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [38]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib

# Load your data
# Assuming X and y are your features and target variable respectively
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train the model
r_model = RandomForestClassifier()
r_model.fit(X_train, y_train)

# Save the trained model using joblib
joblib.dump(r_model, 'subscription_model.pkl')


['subscription_model.pkl']

In [34]:
r_y_pred = model.predict(X_val)
r_y_pred

array([0, 0, 0, ..., 1, 0, 1], dtype=int64)

In [35]:
report = classification_report(y_val, r_y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.92      0.98      0.95      6382
           1       0.68      0.35      0.46       852

    accuracy                           0.90      7234
   macro avg       0.80      0.66      0.70      7234
weighted avg       0.89      0.90      0.89      7234



In [31]:
#pip install streamlit


Note: you may need to restart the kernel to use updated packages.


In [42]:
#pip install joblib

Note: you may need to restart the kernel to use updated packages.


In [41]:
import streamlit as st
import joblib
import numpy as np

# Load the model
model = joblib.load('subscription_model.pkl')

# Title
st.title("Subscription Prediction App")

# Input form for customer details
st.header("Enter Customer Details")
balance = st.number_input("Balance", min_value=10, max_value=100000000, value=1000)
duration = st.number_input("Duration", min_value=1000, value=50000)
poutcome = st.text_input("Previous outcome", help="failure, succes, other, unkown", value='')
pdays = st.number_input("P Days", min_value=1, max_value=850, value=100)

# Prediction
if st.button("Predict"):
    features = np.array([[balance,duration,poutcome,pdays]])  # Adjust to your feature set
    prediction = model.predict(features)
    if prediction == 1:
        st.write("The customer is likely to subscribe.")
    else:
        st.write("The customer is not likely to subscribe.")
