## Imports

In [757]:
from IPython.display import display

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression

# import matplotlib.pyplot as plt
# import seaborn as sns
# %matplotlib inline

## Utilities

In [758]:
def numerical_features(df: pd.DataFrame):
    cols = df.columns[ (df.dtypes != 'object') ]
    return list(cols)

def categorical_features(df: pd.DataFrame):
    cols = df.columns[ (df.dtypes == 'object') ]
    return list(cols)

In [759]:
def validation_testing_training_full_split(dataframe: pd.DataFrame, seed: int = 42, validation: float = 0.2, testing: float = 0.2):
    assert 0 < validation and 0 < testing and 1 > (validation + testing)

    validation_of_full = validation / (1 - testing)
    if validation_of_full == 0:
        validation_of_full = None
        
    df_full,     df_testing    = train_test_split(dataframe, test_size=testing,            random_state=seed, shuffle=True)
    df_training, df_validation = train_test_split(df_full,   test_size=validation_of_full, random_state=seed, shuffle=True)
    
    df_validation = df_validation.reset_index(drop=True)
    df_testing = df_testing.reset_index(drop=True)
    df_training = df_training.reset_index(drop=True)
    df_full = df_full.reset_index(drop=True)
    
    return df_validation, df_testing, df_training, df_full

In [760]:
def y_split(dataframe: pd.DataFrame, yColumn: str, drop: list[str] = []):
    columns = set(dataframe.columns)   
    assert columns.issuperset([yColumn]), f'{yColumn} not found in dataframe'
    assert columns.issuperset(drop), f'At least one of {drop} not found in dataframe'
    
    df = dataframe.copy()
    y = df[yColumn]
    for col in drop + [yColumn]:
        del df[col]
        
    return df, y

In [761]:
def regularize(X, r=0.000000001):
    return X + np.eye(X.shape[0]) * r

In [762]:
def display_predictive_features_for_target(df: pd.DataFrame, target: str, categorical = []):
    global_target = df[target].mean()
    for c in categorical:
        df_group = df.groupby(c)[target].agg('mean','count')
        df_group['diff'] = df_group.mean - global_target
        df_group['risk'] = df_group.mean / global_target
        display(df_group)

In [763]:
# def regularize(X, r: float = 0.00000001):
#     return X if r == 1 else X + np.eye(X.shape[0]) * r
#     
# X = [
#     [1, 2, 2],
#     [2, 1, 1], # r
#     [2, 1, 1], # r
#     [0, 9, 9],
#     [1, 0, 0],
# ] #     c  c
# X = np.array(X)
# XTX = X.T.dot(X)
# print(XTX) # duplicate columns is an issue for linear / logistic regression
# np.linalg.inv(XTX)
# 
# regularize(X, r=0.01)

In [764]:
def sigmoid(score):
    return 1 / (1 + np.exp(-score))

# z = np.linspace(-5, 5, 51)
# plt.plot(z, sigmoid(z))

In [765]:
def one_hot_encode(df: pd.DataFrame, drop=[]):
    assert set(df.columns).issuperset(drop), f'At least one of {drop} is not found in the DataFrame `df`'
    
    df_encode = df.copy()
    for feature in drop:
        del df_encode[feature]
        
    dv = DictVectorizer(sparse=False)
    X = dv.fit_transform( df_encode.to_dict(orient='records') )
    
    assert len(dv.feature_names_) == X.shape[1]
    return X, dv.feature_names_

In [766]:
def model_accuracy(model: LinearRegression | LogisticRegression, df: pd.DataFrame, y: pd.Series, drop = [], threshold: float = 0.5) -> float:
    assert df.shape[0] == y.shape[0], '`df` and `y` mismatch'
    assert 0 < threshold and threshold < 1, 'Invalid threshold'
    
    X, encoded_features = one_hot_encode(df, drop)
    
    model.fit(X, y)
    # w0 = model.intercept_[0]                            # bias term / baseline / intercept
    # w = dict( zip( encoded_features, model.coef_[0]) )  # feature weights / coeficients
    # display(w0, w)
    
    y_pred = model.predict_proba(X)[:, 1]
    y_pred_decision = (y_pred >= threshold).astype(int)
    accuracy = (y_pred_decision == y).mean()

    return accuracy

## Data Preparation

In [767]:
df = pd.read_csv('./bank-full.csv', sep=';')
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


### Drop excluded features (see homework instructions)

In [768]:
for col in df.columns:
    if col not in ['age','job','marital','education','balance','housing','contact','day','month','duration','campaign','pdays','previous','poutcome','y']:
        del df[col]
        
df.head().T

Unnamed: 0,0,1,2,3,4
age,58,44,33,47,33
job,management,technician,entrepreneur,blue-collar,unknown
marital,married,single,married,married,single
education,tertiary,secondary,secondary,unknown,unknown
balance,2143,29,2,1506,1
housing,yes,yes,yes,yes,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5
month,may,may,may,may,may
duration,261,151,76,92,198


### Fill with zeros (0) (see homework instructions)

In [769]:
#df.fillna(0, inplace=True)
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

### Question 1

In [770]:
df.education.mode()

0    secondary
Name: education, dtype: object

What is the most frequent observation (mode) for the column `education`?

- `secondary`

## Data Exploration

### Question 2

In [771]:
feature_combinations = [
    ('age', 'balance'),
    ('day', 'campaign'),
    ('day', 'pdays'),
    ('pdays', 'previous'),
]

for k,v in feature_combinations:
    print(k, 'correlates to', v, ': ')
    display( df[k].corr( df[v] ) )


age correlates to balance : 


0.09778273937134752

day correlates to campaign : 


0.16249021632619282

day correlates to pdays : 


-0.09304407377294048

pdays correlates to previous : 


0.45481963548050097

What are the two features that have the biggest correlation?

- `pdays` and `previous`

### Make `y` binary

In [772]:
df['encoded_y'] = df.y.str.lower() == 'yes' 
df['y'] = df['encoded_y'].astype(int)
del df['encoded_y']
df.tail()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
45206,51,technician,married,tertiary,825,no,cellular,17,nov,977,3,-1,0,unknown,1
45207,71,retired,divorced,primary,1729,no,cellular,17,nov,456,2,-1,0,unknown,1
45208,72,retired,married,secondary,5715,no,cellular,17,nov,1127,5,184,3,success,1
45209,57,blue-collar,married,secondary,668,no,telephone,17,nov,508,4,-1,0,unknown,0
45210,37,entrepreneur,married,secondary,2971,no,cellular,17,nov,361,2,188,11,other,0


### Split the data

#### Training, Testing, Validation, & Full (Training + Validation)

In [773]:
df_val, df_test, df_train, df_full = validation_testing_training_full_split(df)

nTotal = len(df)
nVal = len(df_val)
nTest = len(df_test)
nTrain = len(df_train)
nFull = len(df_full)

round(nVal/nTotal, 1), round(nTest/nTotal, 1), round(nTrain/nTotal, 1), round(nFull/nTotal,1), round(nTotal/nTotal)

(0.2, 0.2, 0.6, 0.8, 1)

#### Split out `y` (target feature) from all datasets 

In [774]:
df_val, y_val = y_split(df_val, 'y')
df_test, y_test = y_split(df_test, 'y')
df_train, y_train = y_split(df_train, 'y')
df_full, y_full = y_split(df_full, 'y')

assert df_val.shape[1] == df_test.shape[1] and df_test.shape[1] == df_train.shape[1] and df_train.shape[1] == df_full.shape[1]
assert len(y_val) == df_val.shape[0] and len(y_test) == df_test.shape[0] and len(y_train) == df_train.shape[0] and len(y_full) == df_full.shape[0]

In [775]:
df_val.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome
0,38,services,divorced,secondary,-10,yes,unknown,17,jun,61,2,-1,0,unknown
1,42,management,single,tertiary,1146,yes,unknown,15,may,98,2,-1,0,unknown
2,43,management,married,tertiary,149,yes,unknown,23,jun,662,2,-1,0,unknown
3,50,management,married,tertiary,8205,yes,telephone,25,oct,293,3,508,1,other
4,43,management,married,tertiary,79,no,cellular,26,may,640,1,-1,0,unknown


### Question 3

In [776]:
categorical_features(df_train)

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [777]:
for feat in categorical_features(df_train):
    display(f'{feat} correlates with y_train: { round( mutual_info_score(df_train[feat], y_train) , 2) }')

'job correlates with y_train: 0.01'

'marital correlates with y_train: 0.0'

'education correlates with y_train: 0.0'

'housing correlates with y_train: 0.01'

'contact correlates with y_train: 0.01'

'month correlates with y_train: 0.03'

'poutcome correlates with y_train: 0.03'

Which of these variables has the biggest mutual information score?
  
- `poutcome`

### One-hot encode training and validation datasets

In [778]:
dv = DictVectorizer(sparse=False)

X_train = dv.fit_transform( df_train.to_dict(orient='records') )
X_val = dv.transform( df_val.to_dict(orient='records') )
assert len(dv.feature_names_) == X_train.shape[1] and X_train.shape[1] == X_val.shape[1]
assert len(y_train) == X_train.shape[0] and len(y_val) == X_val.shape[0]

dv.feature_names_

['age',
 'balance',
 'campaign',
 'contact=cellular',
 'contact=telephone',
 'contact=unknown',
 'day',
 'duration',
 'education=primary',
 'education=secondary',
 'education=tertiary',
 'education=unknown',
 'housing=no',
 'housing=yes',
 'job=admin.',
 'job=blue-collar',
 'job=entrepreneur',
 'job=housemaid',
 'job=management',
 'job=retired',
 'job=self-employed',
 'job=services',
 'job=student',
 'job=technician',
 'job=unemployed',
 'job=unknown',
 'marital=divorced',
 'marital=married',
 'marital=single',
 'month=apr',
 'month=aug',
 'month=dec',
 'month=feb',
 'month=jan',
 'month=jul',
 'month=jun',
 'month=mar',
 'month=may',
 'month=nov',
 'month=oct',
 'month=sep',
 'pdays',
 'poutcome=failure',
 'poutcome=other',
 'poutcome=success',
 'poutcome=unknown',
 'previous']

### Logistic Regression

In [779]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

model.fit(X_train, y_train)

model.coef_[0].round(3)

array([-2.000e-03,  0.000e+00, -8.800e-02,  2.550e-01,  7.100e-02,
       -1.231e+00,  6.000e-03,  4.000e-03, -4.190e-01, -2.450e-01,
       -5.400e-02, -1.870e-01, -8.600e-02, -8.190e-01,  7.300e-02,
       -2.460e-01, -2.400e-01, -2.810e-01, -8.100e-02,  3.330e-01,
       -2.720e-01, -1.540e-01,  2.450e-01, -1.630e-01,  1.700e-02,
       -1.350e-01, -2.970e-01, -4.530e-01, -1.550e-01,  7.300e-02,
       -6.690e-01,  3.200e-01, -3.130e-01, -9.500e-01, -9.500e-01,
        2.630e-01,  1.249e+00, -4.800e-01, -9.010e-01,  7.370e-01,
        7.140e-01, -1.000e-03, -7.620e-01, -5.410e-01,  1.492e+00,
       -1.094e+00,  6.000e-03])

In [780]:
model.intercept_[0]

-0.9051666407258808

In [781]:
#  Hard predictions
model.predict(X_train)

array([0, 0, 0, ..., 0, 1, 0])

In [782]:
# Soft predictions: 
# probability of NOT vs probability of FOR
model.predict_proba(X_train)

array([[0.96745333, 0.03254667],
       [0.96733526, 0.03266474],
       [0.93602695, 0.06397305],
       ...,
       [0.99057867, 0.00942133],
       [0.24016857, 0.75983143],
       [0.86004349, 0.13995651]])

### Question 4

In [783]:
y_train_pred = model.predict_proba(X_train)[:, 1]
y_val_pred   = model.predict_proba(X_val)[:, 1]

In [784]:
model.predict(X_val).mean()

0.06126963061269631

In [785]:
y_val_pred_decision = y_val_pred >= 0.5
y_val_pred_decision.mean()

0.06126963061269631

In [786]:
(y_val == y_val_pred_decision).mean()

0.9015704490157045

* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

- 0.9

### Question 5

In [787]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

df_train_accuracy = model_accuracy(model, df_train, y_train)
print(f'Against an accuracy of { round(df_train_accuracy, 4) }:')

for feature in ['age', 'balance', 'marital', 'previous']:
    dropped_feature_accuracy = model_accuracy(model, df_train, y_train, drop=[feature])
    display(f' - dropping {feature} -> { round(dropped_feature_accuracy, 4) } accuracy, a change of { dropped_feature_accuracy - df_train_accuracy }')

Against an accuracy of 0.9029:


' - dropping age -> 0.9027 accuracy, a change of -0.00025805500258058167'

' - dropping balance -> 0.9032 accuracy, a change of 0.00025805500258047065'

' - dropping marital -> 0.9025 accuracy, a change of -0.00040551500405516805'

' - dropping previous -> 0.9035 accuracy, a change of 0.0005161100051610523'

Which of following feature has the smallest difference?

- `balance`

### Question 6

In [788]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
df_train_accuracy = model_accuracy(model, df_train, y_train)
print(f'Against an accuracy of { round(df_train_accuracy, 4) }:')

for r in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=r, max_iter=1000, random_state=42)
    regularized_val_accuracy = model_accuracy(model, df_val, y_val)
    display(f' - regularized with {r} -> { round(regularized_val_accuracy, 4) } accuracy, a change of { regularized_val_accuracy - df_train_accuracy }')

Against an accuracy of 0.9029:


' - regularized with 0.01 -> 0.8949 accuracy, a change of -0.007999705079997144'

' - regularized with 0.1 -> 0.901 accuracy, a change of -0.001916980019169845'

' - regularized with 1 -> 0.9015 accuracy, a change of -0.0014746000147460858'

' - regularized with 10 -> 0.9018 accuracy, a change of -0.001142815011428211'

' - regularized with 100 -> 0.9017 accuracy, a change of -0.0012534100125340952'

Which of these `C` leads to the best accuracy on the validation set?

- 10