In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv(r"C:\Users\USER\Documents\Dataset Exercise for Data Science\bank.csv")

In [3]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [4]:
print(df.shape)

(11162, 17)


In [5]:
#Cleaning the data by removing the outlier
q1 = df['balance'].quantile(0.9)
df = df[df['balance'] < q1]

q2 = df['duration'].quantile(0.99)
df = df[df['duration'] < q2]

q3 = df['previous'].quantile(0.99)
df = df[df['previous'] < q3]

q4 = df['campaign'].quantile(0.99)
df = df[df['campaign'] < q4]

In [6]:
# Separate features (X) and target variable (y)
X = df.drop('deposit', axis=1)
y = df['deposit']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Categorize the 'job' column in the training set
X_train.loc[~X_train['job'].isin(['unemployed']), 'job'] = 'employed'

# Categorize the 'job' column in the test set
X_test.loc[~X_test['job'].isin(['unemployed']), 'job'] = 'employed'

In [8]:
# Define the binning 'balance' categories
balance_bins = [-float('inf'), 0, 200, 401, 601, 801, 1201, 1601, 2401, 3201, float('inf')]
balance_labels = ['dormant', '0-200', '201-400', '401-600', '601-800', '801-1200', '1201-1600', '1601-2400', '2401-3200', '>3200']

# Apply binning to the 'balance' column in the training set
X_train['balance_category'] = pd.cut(X_train['balance'], bins=balance_bins, labels=balance_labels)

# Apply binning to the 'balance' column in the test set
X_test['balance_category'] = pd.cut(X_test['balance'], bins=balance_bins, labels=balance_labels)

In [9]:
# Categorize the 'contact' column in the training set
X_train.loc[~X_train['contact'].isin(['unknown']), 'contact'] = 'media'

# Categorize the 'contact' column in the test set
X_test.loc[~X_test['contact'].isin(['unknown']), 'contact'] = 'media'

In [10]:
# Define the binning 'duration' categories
duration_bins = [0, 60, 120, 180, 240, 300, 360, 420, 480, 540, 600, float('inf')]
duration_labels = ['0-60', '61-120', '121-180', '181-240', '241-300', '301-360', '361-420', '421-480', '481-540', '541-600', '>600']

# Apply binning to the 'duration' column in the training set
X_train['duration_category'] = pd.cut(X_train['duration'], bins=duration_bins, labels=duration_labels)

# Apply binning to the 'duration' column in the test set
X_test['duration_category'] = pd.cut(X_test['duration'], bins=duration_bins, labels=duration_labels)

In [11]:
# Define the binning 'campaign' categories
campaign_bins = [0, 1, 2, float('inf')]
campaign_labels = ['1', '2', 'More than 2']

# Apply binning to the 'campaign' column in the training set
X_train['campaign_category'] = pd.cut(X_train['campaign'], bins=campaign_bins, labels=campaign_labels)

# Apply binning to the 'campaign' column in the test set
X_test['campaign_category'] = pd.cut(X_test['campaign'], bins=campaign_bins, labels=campaign_labels)

In [12]:
# Define the binning 'pdays' categories
pdays_bins = [-np.inf, -0.5, 85.5, 100.5, 175.5, 190.5, np.inf]
pdays_labels = [99, 10, 1, 11, 2, 12]

# Apply binning to the 'pdays' column in the training set
X_train['pdays_category'] = pd.cut(X_train['pdays'], bins=pdays_bins, labels=pdays_labels)

# Apply binning to the 'pdays' column in the test set
X_test['pdays_category'] = pd.cut(X_test['pdays'], bins=pdays_bins, labels=pdays_labels)

In [13]:
#Change the value of [10, 11, 12] to [98] as others category
X_train['pdays_category'] = X_train['pdays_category'].replace([10, 11, 12], [98, 98, 98])
X_test['pdays_category'] = X_test['pdays_category'].replace([10, 11, 12], [98, 98, 98])

In [14]:
# Define the binning 'previous' categories
previous_bins = [-np.inf, 0.5, np.inf]
previous_labels = [0, 1]

# Apply binning to the 'previous' column in the training set
X_train['previous_category'] = pd.cut(X_train['previous'], bins=previous_bins, labels=previous_labels)

# Apply binning to the 'pdays' column in the test set
X_test['previous_category'] = pd.cut(X_test['previous'], bins=previous_bins, labels=previous_labels)

In [15]:
X_train.drop('balance', axis=1, inplace=True)
X_test.drop('balance', axis=1, inplace=True)

In [16]:
X_train.drop('duration', axis=1, inplace=True)
X_test.drop('duration', axis=1, inplace=True)

In [17]:
X_train.drop('campaign', axis=1, inplace=True)
X_test.drop('campaign', axis=1, inplace=True)

In [18]:
X_train.drop('pdays', axis=1, inplace=True)
X_test.drop('pdays', axis=1, inplace=True)

In [19]:
X_train.drop('previous', axis=1, inplace=True)
X_test.drop('previous', axis=1, inplace=True)

In [20]:
X_train.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'day', 'month', 'poutcome', 'balance_category',
       'duration_category', 'campaign_category', 'pdays_category',
       'previous_category'],
      dtype='object')

In [21]:
#Feature engineering
X_train = pd.get_dummies(X_train, drop_first = True)
X_test = pd.get_dummies(X_test, drop_first = True)

In [22]:
X_train.columns

Index(['age', 'day', 'job_unemployed', 'marital_married', 'marital_single',
       'education_secondary', 'education_tertiary', 'education_unknown',
       'default_yes', 'housing_yes', 'loan_yes', 'contact_unknown',
       'month_aug', 'month_dec', 'month_feb', 'month_jan', 'month_jul',
       'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct',
       'month_sep', 'poutcome_other', 'poutcome_success', 'poutcome_unknown',
       'balance_category_0-200', 'balance_category_201-400',
       'balance_category_401-600', 'balance_category_601-800',
       'balance_category_801-1200', 'balance_category_1201-1600',
       'balance_category_1601-2400', 'balance_category_2401-3200',
       'balance_category_>3200', 'duration_category_61-120',
       'duration_category_121-180', 'duration_category_181-240',
       'duration_category_241-300', 'duration_category_301-360',
       'duration_category_361-420', 'duration_category_421-480',
       'duration_category_481-540', 'duration_ca

In [23]:
#Train the model using the training data
LogReg = LogisticRegression(solver = 'lbfgs')
LogReg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [24]:
#Predicting if a housing_yes subscribed to the term deposit
LogReg.score(X_test, y_test)

0.810477657935285