In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("../data/bank+marketing/bank/bank-full.csv", sep=";")
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [3]:
cols = ["age", "job", "marital", "education", "balance",
        "housing", "contact", "day", "month", "duration",
        "campaign", "pdays", "previous", "poutcome", "y"]

df = df[cols]

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   balance    45211 non-null  int64 
 5   housing    45211 non-null  object
 6   contact    45211 non-null  object
 7   day        45211 non-null  int64 
 8   month      45211 non-null  object
 9   duration   45211 non-null  int64 
 10  campaign   45211 non-null  int64 
 11  pdays      45211 non-null  int64 
 12  previous   45211 non-null  int64 
 13  poutcome   45211 non-null  object
 14  y          45211 non-null  object
dtypes: int64(7), object(8)
memory usage: 5.2+ MB


In [5]:
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [6]:
# mode of the eduction column
df.education.mode()

0    secondary
Name: education, dtype: object

In [7]:
df.select_dtypes(exclude='object').corr()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [8]:
df["y"] = (df["y"] == "yes").astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["y"] = (df["y"] == "yes").astype(int)


In [9]:
df.y.value_counts()

y
0    39922
1     5289
Name: count, dtype: int64

In [10]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

del df_train['y']
del df_val['y']
del df_test['y']

In [11]:
# mutual information score
def calculate_mi(series):
    return mutual_info_score(series, df_train_full["y"])

df_mi = df_train_full.select_dtypes("object").apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')

df_mi.round(2)


Unnamed: 0,MI
poutcome,0.03
month,0.02
contact,0.01
housing,0.01
job,0.01
education,0.0
marital,0.0


In [12]:

# apply one hot encoding
train_dict = df_train.to_dict(orient='records')

dict_vectorizer = DictVectorizer(sparse=False, separator="_")
dict_vectorizer.fit(train_dict)

In [13]:
dict_vectorizer.get_feature_names_out()

array(['age', 'balance', 'campaign', 'contact_cellular',
       'contact_telephone', 'contact_unknown', 'day', 'duration',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'housing_no', 'housing_yes', 'job_admin.',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'job_unknown', 'marital_divorced', 'marital_married',
       'marital_single', 'month_apr', 'month_aug', 'month_dec',
       'month_feb', 'month_jan', 'month_jul', 'month_jun', 'month_mar',
       'month_may', 'month_nov', 'month_oct', 'month_sep', 'pdays',
       'poutcome_failure', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown', 'previous'], dtype=object)

In [14]:
X_train = dict_vectorizer.transform(train_dict)
X_val = dict_vectorizer.transform(df_val.to_dict(orient='records'))
X_test = dict_vectorizer.transform(df_test.to_dict(orient='records'))

In [15]:
X_train.shape, X_val.shape, X_test.shape

((27126, 47), (9042, 47), (9043, 47))

In [16]:
y_train.shape, y_val.shape, y_test.shape

((27126,), (9042,), (9043,))

In [18]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [19]:
y_pred = model.predict_proba(X_val)[:, 1]

In [20]:
score = (y_val == (y_pred > 0.5)).mean()
score.round(2)


np.float64(0.9)

In [21]:
for col in df_train.columns:
    dict_vectorizer.fit(df_train.drop(columns=col).to_dict(orient='records'))
    X_train_ = dict_vectorizer.transform(df_train.to_dict(orient='records'))
    X_val_ = dict_vectorizer.transform(df_val.drop(columns=col).to_dict(orient='records'))
    model.fit(X_train_, y_train)
    y_pred = model.predict_proba(X_val_)[:, 1]
    diff = score - (y_val == (y_pred > 0.5)).mean()
    print(f"{col}: accuracy difference = {diff}")

age: accuracy difference = 0.00011059500110588427
job: accuracy difference = 0.0
marital: accuracy difference = 0.0
education: accuracy difference = 0.00044238000442375913
balance: accuracy difference = 0.00011059500110588427
housing: accuracy difference = 0.0009953550099535136
contact: accuracy difference = 0.0008847600088476293
day: accuracy difference = 0.00011059500110588427
month: accuracy difference = 0.0014377350143772727
duration: accuracy difference = 0.011723070117230727
campaign: accuracy difference = 0.0011059500110595089
pdays: accuracy difference = 0.00022119000221187957
previous: accuracy difference = 0.00033178500331787486
poutcome: accuracy difference = 0.007299270072992692


In [22]:
for C in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    score = (y_val == (y_pred > 0.5)).mean()
    print(f"{C=}, {score=}")

C=0.01, score=np.float64(0.8978102189781022)
C=0.1, score=np.float64(0.9007962840079629)
C=1, score=np.float64(0.9012386640123866)
C=10, score=np.float64(0.9009068790090687)
C=100, score=np.float64(0.900353904003539)
