In [1]:
%autosave 10

Autosaving every 10 seconds


In [68]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from IPython.display import display

In [3]:
# Columns kept for the analysis; 'y' is the prediction target.
cols_to_use = [
    'age', 'job', 'marital', 'education', 'balance',
    'housing', 'contact', 'day', 'month', 'duration',
    'campaign', 'pdays', 'previous', 'poutcome',
    'y',
]

In [4]:
# The file is semicolon-delimited, so the separator has to be given explicitly.
df = pd.read_csv(
    "../data/bank-full.csv",
    sep=';',
)
df.shape

(45211, 17)

In [5]:
# Keep only the selected columns; .copy() detaches the result from the loaded frame.
df = df.loc[:, cols_to_use].copy()

# Check Missing Values

In [6]:
# Per-column count of missing entries.
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

# Question 1

In [65]:
# Most frequent value(s) of the education column.
df['education'].mode()

0    secondary
Name: education, dtype: object

# Question 2

In [8]:
# Names of the numeric-dtype columns. This has to run before 'y' is converted
# to 0/1, otherwise 'y' would be picked up as a numeric feature as well.
numerical = list(df.select_dtypes(include='number').columns)

In [66]:
# Pairwise correlation between the numeric columns.
corr_matrix = df.loc[:, numerical].corr()
corr_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [73]:
# Show the correlation for each pair of interest, one label + value per pair.
for left, right in (
    ('age', 'balance'),
    ('day', 'campaign'),
    ('day', 'pdays'),
    ('pdays', 'previous'),
):
    print(f"Correlation {left} vs. {right}")
    display(corr_matrix.loc[left, right])

Correlation age vs. balance


0.09778273937134807

Correlation day vs. campaign


0.1624902163261922

Correlation day vs. pdays


-0.09304407377294048

Correlation pdays vs. previous


0.4548196354805043

In [10]:
# Encode the target as 1 for 'yes' and 0 for anything else.
# The guard makes the cell safe to re-run: once 'y' is numeric, comparing it
# with 'yes' again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = np.where(df['y'] == 'yes', 1, 0)

In [11]:
# Hold out 20% for test, then take 25% of the remaining 80% for validation,
# which yields a 60/20/20 train/validation/test split.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

tuple(len(part) for part in (df_train, df_val, df_test))

(27126, 9042, 9043)

In [12]:
# Replace the shuffled row labels left by the split with a fresh 0..n-1 index.
df_train, df_val, df_test = (
    frame.reset_index(drop=True) for frame in (df_train, df_val, df_test)
)

In [13]:
# Move the target out of each frame: pop() returns the 'y' column and removes
# it, so the feature frames can no longer leak the label.
y_train = df_train.pop('y').values
y_val = df_val.pop('y').values
y_test = df_test.pop('y').values

# Question 3

In [14]:
def mutual_info_score_fn(series, target=None):
    """Return the mutual information between ``series`` and ``target``, rounded to 2 decimals.

    Parameters
    ----------
    series : array-like
        Values of one feature column.
    target : array-like, optional
        Labels to score against. When omitted, the notebook-level ``y_train``
        is used.

    Notes
    -----
    ``y_train`` is looked up when the function is called. A ``target=y_train``
    default would be evaluated once, when the ``def`` cell runs, and would keep
    pointing at the old array if the split cells were re-executed afterwards.
    """
    if target is None:
        target = y_train
    return np.round(mutual_info_score(series, target), 2)

In [15]:
# Every column that is neither numeric nor the target counts as categorical.
categorical = [col for col in df.columns if col != 'y' and col not in numerical]

In [16]:
# Score every categorical column against the training labels, highest first.
minfo = df_train.loc[:, categorical].apply(mutual_info_score_fn)
minfo.sort_values(ascending=False)

month        0.03
poutcome     0.03
job          0.01
housing      0.01
contact      0.01
marital      0.00
education    0.00
dtype: float64

# Question 4

In [22]:
dv = DictVectorizer(sparse=False)

# One dict per row, restricted to the feature columns.
train_dict = df_train[categorical + numerical].to_dict('records')
val_dict = df_val[categorical + numerical].to_dict('records')

# The vectorizer is fitted on the training rows only and then reused as-is
# for the validation rows.
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)


In [18]:
# Logistic regression with a fixed seed so the fit is repeatable.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)

In [23]:
# Train on the vectorized training rows.
model.fit(X=X_train, y=y_train)

In [36]:
# Probability of the positive class (column 1), thresholded at 0.5.
y_pred = model.predict_proba(X_val)[:, 1]
decision = y_pred >= 0.5
# Share of validation rows whose thresholded prediction matches the label.
accuracy = np.round(np.mean(y_val == decision), 2)
accuracy

0.9

# Question 5: Feature Elimination

In [30]:
# Features to drop one at a time in the elimination experiment.
exlude_feats = [
    'age',
    'balance',
    'marital',
    'previous',
]

In [52]:

def process_data(df_train, df_val, feat_to_exclude=None):
    """Vectorize the train/validation frames, optionally leaving features out.

    The full feature set is the notebook-level ``categorical + numerical``
    column list. A new ``DictVectorizer(sparse=False)`` is fitted on the
    training rows only and then applied to the validation rows.

    Parameters
    ----------
    df_train, df_val : DataFrame
        Frames that contain the feature columns.
    feat_to_exclude : str or iterable of str, optional
        Column name, or several names, to leave out. ``None`` or an empty
        value keeps every feature (the base model).

    Returns
    -------
    tuple
        ``(X_train, X_val)`` as produced by the vectorizer.

    Raises
    ------
    ValueError
        If a requested name is not one of the available features. Without
        this check a misspelled name would be ignored and a full-feature
        model would be reported as if the feature had been removed.
    """
    features = categorical + numerical
    if not feat_to_exclude:
        print("Features for Base Model")
    else:
        # A bare string is a single column name, not an iterable of characters.
        if isinstance(feat_to_exclude, str):
            to_drop = [feat_to_exclude]
        else:
            to_drop = list(feat_to_exclude)
        unknown = [name for name in to_drop if name not in features]
        if unknown:
            raise ValueError(
                f"Cannot exclude unknown feature(s) {unknown}; available: {features}"
            )
        print(f"Removing {feat_to_exclude} from Data")
        features = [x for x in features if x not in to_drop]
    print(f" Excluded {feat_to_exclude}. Number of Features {len(features)}")
    dv = DictVectorizer(sparse=False)
    train_dict = df_train[features].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)
    val_dict = df_val[features].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    return X_train, X_val
    

In [57]:
def calculate_accuracy(model_obj, X_val, label):
    """Return the share of rows where the thresholded prediction equals ``label``.

    ``model_obj`` must expose ``predict_proba``; column 1 of its result is
    taken as the positive-class probability and is turned into a decision
    with a 0.5 cut-off (values equal to 0.5 count as positive).
    """
    positive_proba = model_obj.predict_proba(X_val)[:, 1]
    is_hit = label == (positive_proba >= 0.5)
    return is_hit.mean()

In [58]:
# Reference model trained on every feature; the elimination runs are compared to it.
X_train_base, X_val_base = process_data(df_train, df_val)
model_base = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model_base.fit(X_train_base, y_train)
accuracy_base = calculate_accuracy(model_obj=model_base, X_val=X_val_base, label=y_val)
accuracy_base

Features for Base Model
 Excluded None. Number of Features 14


0.9011280690112807

In [61]:
# For each candidate feature: retrain without it and record how much the
# validation accuracy differs from the base model (base minus ablated).
elimination = {}
for feat in exlude_feats:
    X_train_ex, X_val_ex = process_data(df_train, df_val, feat_to_exclude=feat)
    model_ex = LogisticRegression(
        solver='liblinear',
        C=1.0,
        max_iter=1000,
        random_state=42,
    )
    model_ex.fit(X_train_ex, y_train)
    accuracy_ex = calculate_accuracy(model_obj=model_ex, X_val=X_val_ex, label=y_val)
    elimination[feat] = accuracy_base - accuracy_ex

Removing age from Data
 Excluded age. Number of Features 13
Removing balance from Data
 Excluded balance. Number of Features 13
Removing marital from Data
 Excluded marital. Number of Features 13
Removing previous from Data
 Excluded previous. Number of Features 13


In [62]:
# Per removed feature: accuracy_base minus the accuracy without that feature.
# A positive value means accuracy dropped when the feature was removed.
elimination

{'age': -0.00011059500110588427,
 'balance': 0.00044238000442387015,
 'marital': -0.00022119000221187957,
 'previous': 0.0}

# Question 6: Regularization

In [63]:
# Candidate values for C, tried on the full feature set.
reg_params = [0.01, 0.1, 1, 10, 100]
X_train, X_val = process_data(df_train, df_val)
best_acc = {}
for param in reg_params:
    # fit() returns the estimator itself, so construction and training chain.
    model = LogisticRegression(
        solver='liblinear',
        C=param,
        max_iter=1000,
        random_state=42,
    ).fit(X_train, y_train)
    accuracy_param = calculate_accuracy(model_obj=model, X_val=X_val, label=y_val)
    best_acc[param] = np.round(accuracy_param, 3)

Features for Base Model
 Excluded None. Number of Features 14


In [64]:
# Validation accuracy (rounded to 3 decimals) keyed by the C value that produced it.
best_acc

{0.01: 0.898, 0.1: 0.901, 1: 0.901, 10: 0.9, 100: 0.901}