# Week 3 Homework

## Dataset

In [1]:
# !wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip -O bank-marketing.zip && unzip bank-marketing.zip 'bank.zip' && unzip bank.zip 'bank-full.csv' && rm bank-marketing.zip bank.zip

In [2]:
import pandas as pd


df = pd.read_csv("bank-full.csv", sep=";")
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


## Data Preparation

Select only the features from above

In [3]:
df = df[[
    "age",
    "job",
    "marital",
    "education",
    "balance",
    "housing",
    "contact",
    "day",
    "month",
    "duration",
    "campaign",
    "pdays",
    "previous",
    "poutcome",
    "y",
]]

In [4]:
numerical = df.select_dtypes("number").columns.to_list()
categorical = df.select_dtypes("object").columns.to_list()
categorical.remove("y")

In [5]:
numerical

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [6]:
categorical

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

Check if the missing values are presented in the features

In [7]:
df.isna().any()

age          False
job          False
marital      False
education    False
balance      False
housing      False
contact      False
day          False
month        False
duration     False
campaign     False
pdays        False
previous     False
poutcome     False
y            False
dtype: bool

There seem to be no missing values in the features. However, let's take a look at the unique values per feature:

In [8]:
df.apply(lambda x: x.unique(), axis=0)

age          [58, 44, 33, 47, 35, 28, 42, 43, 41, 29, 53, 5...
job          [management, technician, entrepreneur, blue-co...
marital                            [married, single, divorced]
education              [tertiary, secondary, unknown, primary]
balance      [2143, 29, 2, 1506, 1, 231, 447, 121, 593, 270...
housing                                              [yes, no]
contact                         [unknown, cellular, telephone]
day          [5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 19, 20, 21...
month        [may, jun, jul, aug, oct, nov, dec, jan, feb, ...
duration     [261, 151, 76, 92, 198, 139, 217, 380, 50, 55,...
campaign     [1, 2, 3, 5, 4, 6, 7, 8, 9, 10, 11, 12, 13, 19...
pdays        [-1, 151, 166, 91, 86, 143, 147, 89, 140, 176,...
previous     [0, 3, 1, 4, 2, 11, 16, 6, 5, 10, 12, 7, 18, 9...
poutcome                    [unknown, failure, other, success]
y                                                    [no, yes]
dtype: object

These are the columns with missing values designated as `unknown`:

In [9]:
df.select_dtypes(include="object").apply(lambda x: print(x.name, "\n", x.unique(), end="\n\n") if x.str.contains("unknown").any() else None, axis=0);

job 
 ['management' 'technician' 'entrepreneur' 'blue-collar' 'unknown'
 'retired' 'admin.' 'services' 'self-employed' 'unemployed' 'housemaid'
 'student']

education 
 ['tertiary' 'secondary' 'unknown' 'primary']

contact 
 ['unknown' 'cellular' 'telephone']

poutcome 
 ['unknown' 'failure' 'other' 'success']



## Question 1

In [10]:
df.education.mode()

0    secondary
Name: education, dtype: object

## Question 2

In [11]:
correlation_matrix = df[numerical].corr()
correlation_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [12]:
correlation_matrix.unstack().abs()[correlation_matrix.unstack().abs().lt(1)].idxmax()

('pdays', 'previous')

## Target Encoding

In [13]:
df = (
    df
    .assign(y=(df.y == "yes").astype(int))
)

In [14]:
from sklearn.model_selection import train_test_split

df_full_train, df_test, y_full_train, y_test = train_test_split(df[categorical + numerical], df.y, test_size=0.2, random_state=42)
df_train, df_val, y_train, y_val = train_test_split(df_full_train[categorical + numerical], y_full_train, test_size=0.25, random_state=42)

In [15]:
from sklearn.metrics import mutual_info_score

for col in ["contact", "education", "housing", "poutcome"]:
    print(round(mutual_info_score(y_train, df_train[col]), 2))

0.01
0.0
0.01
0.03


`poutcome` has highest MI

## Question 4

In [16]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression


dicts_train = df_train.to_dict(orient="records")
dicts_val = df_val.to_dict(orient="records")

dv = DictVectorizer(sparse=False)
dv.fit(dicts_train)

X_train = dv.transform(dicts_train)
X_val = dv.transform(dicts_val)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred_val = model.predict(X_val)

(y_pred_val == y_val).mean().round(2)

np.float64(0.9)

## Question 5

In [35]:
dicts_train = df_train.to_dict(orient="records")
dicts_val = df_val.to_dict(orient="records")
    
dv = DictVectorizer(sparse=False)
dv.fit(dicts_train)

X_train = dv.transform(dicts_train)
X_val = dv.transform(dicts_val)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred_val = model.predict(X_val)

accuracy_baseline = (y_pred_val == y_val).mean()

accuracy_baseline

np.float64(0.9009068790090687)

In [58]:
features_to_exclude = ["age", "balance", "marital", "previous"]
results = []

for feature_to_exclude in features_to_exclude:
    
    dicts_train = df_train.drop(columns=feature_to_exclude).to_dict(orient="records")
    dicts_val = df_val.drop(columns=feature_to_exclude).to_dict(orient="records")

    dv = DictVectorizer(sparse=False)
    dv.fit(dicts_train)

    X_train = dv.transform(dicts_train)
    X_val = dv.transform(dicts_val)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred_val = model.predict(X_val)
    accuracy = (y_pred_val == y_val).mean()
    difference = abs(accuracy - accuracy_baseline)
    results.append((feature_to_exclude, set(features_to_exclude) - set(feature_to_exclude), accuracy, difference))

    print(f"Excluded feature '{feature_to_exclude}', Remaining features: {', '.join(feature for feature in features_to_exclude if feature!=feature_to_exclude)}, Accuracy: {accuracy}, Accuracy difference with baseline: {difference}")

df_results = pd.DataFrame(data=results, columns=["excluded feature", "Remaining Features", "accuracy", "difference"])

Excluded feature 'age', Remaining features: balance, marital, previous, Accuracy: 0.9013492590134926, Accuracy difference with baseline: 0.00044238000442387015
Excluded feature 'balance', Remaining features: age, marital, previous, Accuracy: 0.9010174740101747, Accuracy difference with baseline: 0.0001105950011059953
Excluded feature 'marital', Remaining features: age, balance, previous, Accuracy: 0.9009068790090687, Accuracy difference with baseline: 0.0
Excluded feature 'previous', Remaining features: age, balance, marital, Accuracy: 0.9009068790090687, Accuracy difference with baseline: 0.0


In [59]:
df_results.sort_values(by="difference")

Unnamed: 0,excluded feature,Remaining Features,accuracy,difference
3,previous,"{balance, previous, age, marital}",0.900907,0.0
2,marital,"{balance, previous, age, marital}",0.900907,0.0
1,balance,"{balance, previous, age, marital}",0.901017,0.000111
0,age,"{balance, previous, age, marital}",0.901349,0.000442


## Question 6

In [29]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression


dicts_train = df_train.to_dict(orient="records")
dicts_val = df_val.to_dict(orient="records")

dv = DictVectorizer(sparse=False)
dv.fit(dicts_train)

X_train = dv.transform(dicts_train)
X_val = dv.transform(dicts_val)

results = []

for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred_val = model.predict(X_val)

    accuracy = (y_pred_val == y_val).mean().round(3)

    results.append((c, accuracy))

    df_results = pd.DataFrame(data=results, columns=["C", "accuracy"])

display(df_results.sort_values("accuracy", ascending=False))


Unnamed: 0,C,accuracy
1,0.1,0.901
3,10.0,0.901
2,1.0,0.901
4,100.0,0.901
0,0.01,0.898


In [30]:
df_results.iloc[df_results["accuracy"].idxmax()]

C           0.100
accuracy    0.901
Name: 1, dtype: float64