In [255]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [187]:
df=pd.read_csv('../data/bank-full.csv', sep=';')

In [188]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [228]:
target='y'
# remove target
X=df.drop(columns=['y']).copy()
# get list of columns
cols=list(X.columns)
# start choices dictionary to capture categorical options for HTML
choices={col: None for col in cols}

# get all categorical columns
cat_cols=X.dtypes[X.dtypes=='object'].index
# get all continuous columns
cont_cols=X.dtypes[X.dtypes!='object'].index
# get all categorical features
unique_vals_count=df[cat_cols].nunique().sum()
print(f'There should be {len(cont_cols)-len(cat_cols)+unique_vals_count} columns')

There should be 42 columns


In [229]:
# set X as only continuous features
X=X.drop(columns=cat_cols).copy()

# OHE transform categorical features
ohe=OneHotEncoder(sparse_output=False)
cat_cols_transformed=ohe.fit_transform(df[cat_cols])
categories=ohe.categories_
cat_feature_names=ohe.get_feature_names_out()

# map options to each categorical variable for HTML
for k, v in zip(cat_cols, categories): 
    choices[k]=list(v)

In [231]:
X[cat_feature_names]=cat_cols_transformed
y=df[target].copy()

In [207]:
X_train, X_test, y_train, y_test=train_test_split(X, y)

In [232]:
scaler=StandardScaler()
scaler.fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)

In [175]:
rfc=RandomForestClassifier()
rfc.fit(X_train_scaled, y_train)
print(f'Train score: {rfc.score(X_train_scaled, y_train)}')
print(f'Test score: {rfc.score(X_test_scaled, y_test)}')

Train score: 1.0
Test score: 0.9075466690259223


In [257]:
print(classification_report(y_test, rfc.predict(X_test_scaled)))

              precision    recall  f1-score   support

          no       0.98      0.99      0.99      9953
         yes       0.95      0.83      0.88      1350

    accuracy                           0.97     11303
   macro avg       0.96      0.91      0.94     11303
weighted avg       0.97      0.97      0.97     11303



In [263]:
# sample 'yes' prediction
sample_yes=pd.concat([X_test.iloc[idx] for idx, result in enumerate(results) if result=='yes'], axis=1).T
sample_yes.iloc[0]

age                      37.0
balance                5024.0
day                      25.0
duration                661.0
campaign                  4.0
pdays                    -1.0
previous                  0.0
job_admin.                0.0
job_blue-collar           1.0
job_entrepreneur          0.0
job_housemaid             0.0
job_management            0.0
job_retired               0.0
job_self-employed         0.0
job_services              0.0
job_student               0.0
job_technician            0.0
job_unemployed            0.0
job_unknown               0.0
marital_divorced          0.0
marital_married           0.0
marital_single            1.0
education_primary         0.0
education_secondary       1.0
education_tertiary        0.0
education_unknown         0.0
default_no                1.0
default_yes               0.0
housing_no                1.0
housing_yes               0.0
loan_no                   0.0
loan_yes                  1.0
contact_cellular          1.0
contact_te

In [233]:
import pickle

with open('choices.pkl', 'wb') as f: 
    pickle.dump(choices, f)
        
with open('scaler.pkl', 'wb') as f: 
    pickle.dump(scaler, f)
    
with open('ohe.pkl', 'wb') as f: 
    pickle.dump(ohe, f)    
    
with open('model.pkl', 'wb') as f: 
    pickle.dump(rfc, f)