In [1]:
import pickle

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Parameters:

In [3]:
# Random Forest hyperparameters (listed in the order the classifier receives them)
n_estimators = 50
max_depth = 10
min_samples_leaf = 5
random_state = 1  # random seed

# Decision threshold: predicted probabilities >= t count as the positive class
t = 0.4


In [4]:
# Cross-validation: number of folds
n_fold = 5


In [5]:
# Name of the file the fitted objects are pickled to; the threshold t is encoded
# as its first decimal digit (int() truncates t*10), e.g. t = 0.4 -> 'model_rf_t=04.bin'.
output_file = f'model_rf_t=0{int(t * 10)}.bin'

In [6]:
# 1. Load the Data

In [7]:
!wget https://raw.githubusercontent.com/jcdumlao14/ML-zoomcamp-course-homework/main/Capstone%20Project-2/heart_statlog_cleveland_hungary_final.csv
     

--2023-01-20 15:26:14--  https://raw.githubusercontent.com/jcdumlao14/ML-zoomcamp-course-homework/main/Capstone%20Project-2/heart_statlog_cleveland_hungary_final.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 39689 (39K) [text/plain]
Saving to: ‘heart_statlog_cleveland_hungary_final.csv.1’


2023-01-20 15:26:16 (409 KB/s) - ‘heart_statlog_cleveland_hungary_final.csv.1’ saved [39689/39689]



In [8]:
# Read the downloaded CSV (looked up in the current working directory) into a DataFrame.
csv_path = 'heart_statlog_cleveland_hungary_final.csv'
df = pd.read_csv(csv_path)

In [9]:
# 2. Data preparation

In [10]:
# Normalise the header: lower-case names, underscores instead of spaces.
normalised_names = df.columns.str.lower()
df.columns = normalised_names.str.replace(' ', '_')

# Columns whose dtype is 'object' are treated as text columns.
string_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']

# Apply the same normalisation to the values of every text column.
for col in string_columns:
    lowered = df[col].str.lower()
    df[col] = lowered.str.replace(' ', '_')

In [11]:
# Columns passed through the DictVectorizer.
categorical = [
    'sex',
    'chest_pain_type',
    'resting_ecg',
    'st_slope',
]

# Hand-written list of numeric columns. Note: `numerical` is reassigned in the
# following cell from the DataFrame dtypes, so this list is not the one used downstream.
numerical = [
    'age',
    'resting_bp_s',
    'cholesterol',
    'fasting_blood_sugar',
    'max_heart_rate',
    'exercise_angina',
    'oldpeak',
]

In [12]:

# Rebuild `numerical` from the data: every int64/float64 column except the label.
# (`categorical` keeps its hand-written definition; deriving it from the 'object'
# columns is deliberately left disabled.)
# NOTE(review): if any column named in `categorical` is stored as int64/float64, it is
# also picked up here and would be fed to the model twice - confirm against the CSV.
numeric_frame = df.select_dtypes(include=['int64', 'float64'])
numerical = list(numeric_frame.columns)
numerical.remove('target')  # raises ValueError if 'target' is not among the numeric columns

In [13]:
# drop unrealistic null values (not implemented: this cell contains no code, so no rows are removed)


In [14]:
# train/test split: 20% of the rows are held out as the test set (fixed seed)
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

In [15]:
# Pull the values of the label column out of each split.
y_train = df_full_train['target'].values
y_test = df_test['target'].values

In [16]:
# Give both splits a fresh 0..n-1 index after the shuffled split and remove the label
# column from each feature frame (the labels were already extracted into y_train / y_test).
# The label is dropped from the training frame too, so it cannot be selected as a feature
# by accident; previously only the test frame had it removed.
df_train = df_full_train.reset_index(drop=True).drop(columns=['target'])
# errors='ignore' keeps this cell re-runnable: on a second run df_test no longer has a
# 'target' column, and the former `del df_test['target']` raised KeyError.
df_test = df_test.reset_index(drop=True).drop(columns=['target'], errors='ignore')

In [17]:
# Feature transformers; both are fitted on the training split and then reused on the test split.
scaler = StandardScaler()  # scales the numerical features
dv = DictVectorizer(sparse=False)  # encodes the categorical features


In [18]:
# full training dataset: one {column: value} dict per row, then fit the vectorizer and encode
categorical_train = df_train[categorical]
train_dict = categorical_train.to_dict(orient='records')
X_train_cat = dv.fit_transform(train_dict)

In [19]:
# Fit the scaler on the training rows and scale the numerical features in one step.
X_train_num = scaler.fit_transform(df_train[numerical].values)

In [20]:
# Training feature matrix: scaled numerical columns first, encoded categorical columns after.
X_train = np.column_stack((X_train_num, X_train_cat))

In [21]:
# test dataset: encode with the vectorizer already fitted on the training split (transform only)
categorical_test = df_test[categorical]
test_dict = categorical_test.to_dict(orient='records')
X_test_cat = dv.transform(test_dict)

In [22]:
# Scale the test rows with the scaler fitted on the training split (transform only, no refit).
X_test_num = scaler.transform(df_test[numerical].values)

In [23]:
# Test feature matrix, with the same column layout as the training matrix.
X_test = np.column_stack((X_test_num, X_test_cat))

In [24]:
# 3. Model training

In [25]:
# Fit the random forest on the training matrix and score it on the held-out test matrix.
rf = RandomForestClassifier(n_estimators=n_estimators,
                            max_depth=max_depth,
                            min_samples_leaf=min_samples_leaf,
                            random_state=random_state)  # seed from the parameters cell (was a hard-coded 1)

model = rf.fit(X_train, y_train)  # `model` is the object pickled in the save cell
y_pred = rf.predict_proba(X_test)[:, 1]  # second probability column of predict_proba
y_pred_label = y_pred >= t  # hard labels at the decision threshold t

acc = accuracy_score(y_test, y_pred_label)
f1 = f1_score(y_test, y_pred_label)
auc = roc_auc_score(y_test, y_pred)  # computed from the raw probabilities, not the thresholded labels

# Built-in round() instead of the `.round()` method: the method exists only on NumPy
# scalars, so the old call fails when a metric function returns a plain Python float.
print('For the test dataset:',
      'ACC:', round(float(acc), 3),
      'F1:', round(float(f1), 3),
      'ROC,AUC:', round(float(auc), 3))

For the test dataset: ACC: 0.878 F1: 0.892 ROC,AUC: 0.967


In [26]:
# 4. save the model

# The vectorizer, the scaler and the fitted model are stored together as one tuple,
# in that order, so whoever loads the file gets all three with a single pickle.load.
artifacts = (dv, scaler, model)
with open(output_file, 'wb') as f_out:
    pickle.dump(artifacts, f_out)


In [27]:
# Report where the pickled objects were written.
print('The model is saved to {}'.format(output_file))

The model is saved to model_rf_t=04.bin
