In [1]:
import os
from pathlib import Path

from dotenv import load_dotenv

load_dotenv()
root_data = os.getenv("KAGGLE_FILES_DIR")
dataset_path = Path(os.getcwd(), "..", root_data)
raw = Path(dataset_path, "raw")

#### Read raw data

In [2]:
import pandas as pd
df = pd.read_csv(Path(raw, "covid_data.csv"))

In [3]:
df.head()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,1,1,03/05/2020,97,1,65,2,2,...,2,2,1,2,2,2,2,2,3,97
1,2,1,2,1,03/06/2020,97,1,72,97,2,...,2,2,1,2,2,1,1,2,5,97
2,2,1,2,2,09/06/2020,1,2,55,97,1,...,2,2,2,2,2,2,2,2,3,2
3,2,1,1,1,12/06/2020,97,2,53,2,2,...,2,2,2,2,2,2,2,2,7,97
4,2,1,2,1,21/06/2020,97,2,68,97,1,...,2,2,1,2,2,2,2,2,3,97


#### Preprocess data using script

In [4]:
from scripts.data import preprocess_data

preprocess_data(dataset_path)


Reading data from dataset path: /Users/isulim/Sages/inzynier-ai/xgboost-mlflow/notebooks/../data/raw
Dropping missing values in relevant columns.
Dropping irrelevant columns.
Mapping binary values to 0-1
Renaming columns.
Saving X and y to CSV in: /Users/isulim/Sages/inzynier-ai/xgboost-mlflow/notebooks/../data/preprocessed
Preprocessing finished.


#### Read preprocessed datasets

In [5]:
X = pd.read_csv(Path(dataset_path, "preprocessed", "X_pre.csv"))
y = pd.read_csv(Path(dataset_path, "preprocessed", "y_pre.csv"))

In [6]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

#### Split data into train, validation and test (80%/10%/10%).

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

#### Train simple model with XGBClassifier

In [9]:
from sklearn.metrics import roc_auc_score, accuracy_score
params = {
    "objective": "binary:logistic",
    "n_estimators": 300,
    "max_depth": 20,
}
model_cls = xgb.XGBClassifier(**params)
model_cls.fit(X_train, y_train, eval_set=[(X_val, y_val)])
y_pred = model_cls.predict(X_test)
acc = accuracy_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_pred)
print(f"{acc=}")
print(f"{roc=}")

[0]	validation_0-logloss:0.22998
[1]	validation_0-logloss:0.19836
[2]	validation_0-logloss:0.17792
[3]	validation_0-logloss:0.16418
[4]	validation_0-logloss:0.15473
[5]	validation_0-logloss:0.14821
[6]	validation_0-logloss:0.14368
[7]	validation_0-logloss:0.14051
[8]	validation_0-logloss:0.13832
[9]	validation_0-logloss:0.13683
[10]	validation_0-logloss:0.13580
[11]	validation_0-logloss:0.13512
[12]	validation_0-logloss:0.13467
[13]	validation_0-logloss:0.13439
[14]	validation_0-logloss:0.13424
[15]	validation_0-logloss:0.13415
[16]	validation_0-logloss:0.13413
[17]	validation_0-logloss:0.13413
[18]	validation_0-logloss:0.13415
[19]	validation_0-logloss:0.13417
[20]	validation_0-logloss:0.13421
[21]	validation_0-logloss:0.13427
[22]	validation_0-logloss:0.13431
[23]	validation_0-logloss:0.13436
[24]	validation_0-logloss:0.13441
[25]	validation_0-logloss:0.13446
[26]	validation_0-logloss:0.13450
[27]	validation_0-logloss:0.13454
[28]	validation_0-logloss:0.13457
[29]	validation_0-loglos

#### Train simple model with XGB Random Forest Classifier

In [10]:
from sklearn.metrics import roc_auc_score, accuracy_score
params = {
    "objective": "binary:logistic",
    "n_estimators": 300,
    "max_depth": 20,
}
model_cls = xgb.XGBRFClassifier(**params)
model_cls.fit(X_train, y_train, eval_set=[(X_val, y_val)])
y_pred = model_cls.predict(X_test)
acc = accuracy_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_pred)
print(f"{acc=}")
print(f"{roc=}")



[0]	validation_0-logloss:0.18065
acc=0.930224192576173
roc=0.7810445637887908


Conclusion: XGB-RandomForest results in very similar accuracy and much better ROC.  
So XGB-RF is the way to go in further MLFlow experiments.