# Homework 3

### Notebook by: Christian Cabral

In [514]:
import numpy as np
import pandas as pd

In [515]:
# pandas splits fields on ',' unless told otherwise; the fields of this CSV
# are delimited by ';', so the delimiter has to be passed explicitly.
df = pd.read_csv("../../Datasets/bank/bank-full.csv", delimiter=';')

In [516]:
# Preview the first five rows. The target column y records whether the
# client subscribed to a term deposit.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


### Data Preparation

In [517]:
# Every column name except 'default', kept in the original order.
cols = df.columns[df.columns != 'default'].to_list()

In [518]:
# Restrict the frame to the column names collected in cols.
df = df[cols]

In [519]:
# Flag, per column, whether any value is missing (NaN).
df.isna().any()

age          False
job          False
marital      False
education    False
balance      False
housing      False
loan         False
contact      False
day          False
month        False
duration     False
campaign     False
pdays        False
previous     False
poutcome     False
y            False
dtype: bool

In [520]:
# Frequency of each distinct combination of values across all columns.
df.value_counts()

age  job         marital   education  balance  housing  loan  contact    day  month  duration  campaign  pdays  previous  poutcome  y  
18   student     single    primary    608      no       no    cellular   12   aug    267       1         -1     0         unknown   yes    1
45   management  married   tertiary   0        no       no    cellular   11   aug    102       4         -1     0         unknown   no     1
                 divorced  tertiary   1        no       no    cellular   6    aug    490       2         -1     0         unknown   yes    1
                                      54       no       yes   cellular   21   jul    34        3         -1     0         unknown   no     1
                                      220      yes      no    unknown    18   jun    89        4         -1     0         unknown   no     1
                                                                                                                                          ..
35   admin.      m

Notice the *unknown* values

In [521]:
# Numeric columns are detected from their dtype; the categorical feature
# columns are listed by hand.
numerical_variables = list(df.select_dtypes(include='number').columns)
categorical_variables = [
    'job', 'marital', 'education', 'housing',
    'loan', 'contact', 'month', 'poutcome',
]

In [522]:
# Display both column lists as a tuple.
numerical_variables, categorical_variables

(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'],
 ['job',
  'marital',
  'education',
  'housing',
  'loan',
  'contact',
  'month',
  'poutcome'])

Columns with "unknown" values:

In [523]:
# Print the name of every categorical column in which 'unknown' appears.
for column_name in categorical_variables:
    if df[column_name].str.contains('unknown').any():
        print(column_name)

job
education
contact
poutcome


# Question 1

In [524]:
# Encode the target as 1 for 'yes' and 0 for anything else.
# The dtype guard keeps this cell idempotent: once y is numeric, comparing it
# to 'yes' again would turn every label into 0. assign() builds a new frame
# instead of writing into one that was sliced from another frame.
if not pd.api.types.is_numeric_dtype(df['y']):
    df = df.assign(y=(df['y'] == 'yes').astype(int))

In [525]:
# Most frequent value in the education column.
df.education.mode()

0    secondary
Name: education, dtype: object

# Question 2

### Correlation Score

In [526]:
# Pairwise Pearson correlation between the numerical columns.
numeric_frame = df[numerical_variables]
corr_matrix = numeric_frame.corr(method='pearson')

In [527]:
# Zero the diagonal (each column's correlation with itself is 1) so it does
# not dominate corr_matrix.max(). Fill an explicit writable copy: writing
# through DataFrame.values is not guaranteed to reach the frame, and it raises
# when pandas hands back a read-only array (copy-on-write mode).
corr_values = corr_matrix.to_numpy(copy=True)
np.fill_diagonal(corr_values, 0)
corr_matrix = pd.DataFrame(corr_values, index=corr_matrix.index, columns=corr_matrix.columns)

In [528]:
# Display the correlation matrix.
corr_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,0.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,0.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,0.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,0.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,0.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,0.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,0.0


In [529]:
# Largest correlation value found in each column.
# NOTE(review): this is the signed maximum, so a strongly negative
# correlation would not show up here -- confirm that is intended.
corr_matrix.max(axis=0)

age         0.097783
balance     0.097783
day         0.162490
duration    0.021560
campaign    0.162490
pdays       0.454820
previous    0.454820
dtype: float64

### Mutual Information Score

In [530]:
from sklearn.metrics import mutual_info_score
# from IPython.display import display

In [531]:
# Collects one mutual information score per categorical column.
scores = {}

In [532]:
# Mutual information between the target y and each categorical column,
# rounded to two decimals.
# NOTE(review): this runs on the whole frame, before the train/validation/test
# split further down -- confirm that is intended.
scores.update(
    (name, round(mutual_info_score(df.y, df[name]), 2))
    for name in categorical_variables
)

In [533]:
# Display the rounded mutual information score of each categorical column.
scores

{'job': 0.01,
 'marital': 0.0,
 'education': 0.0,
 'housing': 0.01,
 'loan': 0.0,
 'contact': 0.01,
 'month': 0.02,
 'poutcome': 0.03}

### Data Splitting

In [534]:
from sklearn.model_selection import train_test_split

In [535]:
# 60/20/20 split: hold out 20% of the rows for testing, then take a validation
# set with the same number of rows as the test set from what remains.
df_train_full, df_test = train_test_split(df[cols], test_size=0.2, random_state=42)
n_validation_rows = df_test.shape[0]
df_train, df_val = train_test_split(
    df_train_full, test_size=n_validation_rows, random_state=42
)

In [536]:
# Target vectors for each partition. They have to be extracted while the
# frames still contain the y column.
# train_full_y previously took the values of the whole frame (a 2-D array of
# every column) instead of the y column alone.
train_full_y = df_train_full.y.values
test_y = df_test.y.values
train_y = df_train.y.values
val_y = df_val.y.values

In [537]:
# Remove the target column from every feature frame, in place.
for feature_frame in (df_test, df_train, df_val, df_train_full):
    del feature_frame['y']

In [538]:
# Row counts of the train, validation and test feature frames.
len(df_train), len(df_val), len(df_test)

(27125, 9043, 9043)

In [539]:
# Lengths of the train, validation and test target arrays; each should match
# the row count of its feature frame.
len(train_y), len(val_y), len(test_y)

(27125, 9043, 9043)

### One-hot encoding

In [540]:
from sklearn.feature_extraction import DictVectorizer

In [541]:
# Fit the vectorizer on the training rows (one dict per row) and build the
# dense training matrix.
train_records = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_records)

In [542]:
# Display the encoded training matrix.
X_train

array([[3.800e+01, 0.000e+00, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [4.900e+01, 3.309e+03, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [3.700e+01, 2.410e+03, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       ...,
       [5.400e+01, 0.000e+00, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [2.500e+01, 2.311e+03, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [3.000e+01, 1.500e+01, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00]])

In [543]:
# Encode the remaining partitions with the vectorizer fitted on the training
# rows; transform() only applies the already-fitted feature mapping.
X_val, X_test, X_full = (
    dv.transform(partition.to_dict(orient='records'))
    for partition in (df_val, df_test, df_train_full)
)

### Training the model

In [544]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline: logistic regression trained on every feature.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, train_y)
val_predictions = model.predict(X_val)

# Validation accuracy. accuracy_score expects (y_true, y_pred); passing the
# arguments in that order matches the call in the feature-elimination loop.
original_accuracy = accuracy_score(val_y, val_predictions)
np.round(original_accuracy, 2), original_accuracy

(0.9, 0.899922592060157)

### Feature Elimination

In [548]:
# Leave-one-feature-out comparison against the baseline validation accuracy.
# Each entry is (excluded_feature, original_accuracy, accuracy, difference).
accuracy = []

for excluded_feature in ['age', 'previous', 'balance', 'marital']:
    # Build the row dicts without the feature under test.
    reduced_train = df_train.drop(columns=excluded_feature).to_dict(orient='records')
    reduced_val = df_val.drop(columns=excluded_feature).to_dict(orient='records')

    # Loop-local names for the vectorizer and the model: rebinding `dv` and
    # `model` here would silently replace the baseline objects fitted on all
    # features, leaving later cells with the last reduced fit.
    dv_reduced = DictVectorizer(sparse=False)
    feature_train = dv_reduced.fit_transform(reduced_train)
    feature_val = dv_reduced.transform(reduced_val)

    model_reduced = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_reduced.fit(feature_train, train_y)

    predictions = model_reduced.predict(feature_val)
    acc = accuracy_score(val_y, predictions)
    # Signed difference: positive when accuracy dropped without the feature,
    # negative when it improved.
    accuracy.append((excluded_feature, original_accuracy, acc, original_accuracy - acc))

In [549]:
# One row per excluded feature, built from the tuples gathered in accuracy.
result_columns = ["excluded_feature", "original_accuracy", "accuracy", "difference"]
df_results = pd.DataFrame(accuracy, columns=result_columns)

In [550]:
# Rank features by the magnitude of the accuracy change, smallest first.
# Sorting the signed difference would place accuracy gains (negative values)
# ahead of features whose removal barely changes the result.
df_results.sort_values(by="difference", key=lambda diffs: diffs.abs())

Unnamed: 0,excluded_feature,original_accuracy,accuracy,difference
2,balance,0.899923,0.900254,0.000332
1,previous,0.899923,0.900365,0.000442
0,age,0.899923,0.898817,0.001106
3,marital,0.899923,0.901028,0.001106
