In [None]:
import os
import polars as pl
import polars.selectors as cs
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
import mlflow
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [None]:
# Workaround for a setuptools/distutils bug; see the similar issue at
# https://github.com/pypa/setuptools/issues/3297
# NOTE(review): this cell runs after the imports cell — confirm the variable is
# set early enough for the workaround to take effect.
os.environ["SETUPTOOLS_USE_DISTUTILS"] = "stdlib"

In [None]:
# MLflow tracking configuration.
# The tracking server can be overridden through the MLFLOW_TRACKING_URI
# environment variable; without it the local default server is used.
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5001")
MLFLOW_EXPERIMENT_NAME = "credit-card-fraud-detection"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

<h2 class="myText"> Notebook links </h2>
<p class="myText"> <em> (Note: these links tend not to work when the notebook is opened in an IDE such as PyCharm. Open the notebook in your browser to navigate it using the links.)</em> </p>

<p> 1. <a href="#data_preparation">Data preparation</a></p>
<p> 2. <a href="#validation_framework_setup">Setting up the validation framework</a></p>
<p> 3. <a href="#eda">Exploratory data analysis</a></p>
<ul>
    <li><a href="#feature_importance">Feature importance</a></li>
</ul>
<p> 4. <a href="#encoding">One-hot encoding</a></p>
<p> 5. <a href="#model_training">Model training</a></p>
<ul>
    <li><a href="#random_forest">Random Forest</a></li>
</ul>
<p> 6. <a href="#final_training">Train the final model</a></p>

# Data preparation <a name = "data_preparation"></a>

In [None]:
# The dataset can be downloaded from Kaggle:
# https://www.kaggle.com/datasets/nelgiriyewithana/credit-card-fraud-detection-dataset-2023?resource=download
# Lazily scan the CSV, shrink every column's dtype, lower-case the column
# names, rename the target column to "is_fraud", then materialise the frame.
df = (
    pl.scan_csv("../data/creditcard_2023.csv")
    .select(pl.all().shrink_dtype().name.to_lowercase())
    .rename({"class": "is_fraud"})
    .collect()
)

In [None]:
# Summary statistics for every column of the loaded dataset.
df.describe()

# Setting up the validation framework <a name = "validation_framework_setup"></a>

In [None]:
# 60/20/20 train/validation/test split: hold out 20% of the rows for testing,
# then take 25% of the remaining 80% for validation. Both splits use a fixed
# random_state so the partition is reproducible.
df_full_train, df_test = train_test_split(
    df, test_size=0.2, shuffle=True, random_state=11
)
df_train, df_val = train_test_split(
    df_full_train, test_size=0.25, shuffle=True, random_state=11
)

# Extract the targets as 1-D NumPy arrays. All three are built the same way so
# they share one shape; y_train no longer takes a detour through pandas.
y_train = df_train.select("is_fraud").to_numpy().flatten()
y_val = df_val.select("is_fraud").to_numpy().flatten()
y_test = df_test.select("is_fraud").to_numpy().flatten()

# Remove the target from the feature frames used for fitting and validation.
# df_full_train keeps "is_fraud" because the correlation heatmap reads it.
df_train = df_train.drop("is_fraud")
df_val = df_val.drop("is_fraud")

# Exploratory data analysis <a name = "eda"></a>

In [None]:
# Count the null values in each column of the full (unsplit) dataset.
df.null_count()

## Feature importance <a name = "feature_importance"></a>

In [None]:
# Partition the training columns by dtype using polars selectors: string
# columns are treated as categorical, numeric columns as numerical.
# NOTE(review): every numeric column is kept as a feature, including any
# row-identifier column the CSV may contain — confirm that is intended.
categorical = df_train.select(cs.string()).columns
numerical = df_train.select(cs.numeric()).columns

In [None]:
# Correlation heatmap: build the correlation matrix over every numerical
# feature plus the target and visualise the coefficients.
corr_columns = numerical + ["is_fraud"]
corr_frame = df_full_train[corr_columns]

plt.figure(figsize=(20, 15))
sns.heatmap(
    corr_frame.corr(),
    annot=True,
    linewidths=0.5,
    cmap="coolwarm",
    fmt=".2f",
    xticklabels=corr_frame.columns,
    yticklabels=corr_frame.columns,
)
plt.title("Heatmap showing correlations between numerical data")
plt.show()

In [None]:
# Preview the categorical (string) columns of the training set.
df_train.select(categorical).head()

In [None]:
# Preview the numerical columns.
# NOTE(review): this reads from the full `df`, whereas the categorical preview
# uses `df_train` — confirm which frame is intended.
df.select(numerical).head()

In [None]:
# Convert the training rows into a list of per-row dicts for DictVectorizer.
# Numerical columns are included alongside the categorical ones: DictVectorizer
# one-hot encodes string values and passes numeric values through unchanged.
dicts_train = df_train.select(categorical + numerical).to_dicts()

In [None]:
# Same per-row dict conversion for the validation set.
dicts_val = df_val.select(categorical + numerical).to_dicts()

# Model training <a name = "model_training"></a>

## Random Forest <a name = "random_forest"></a>

In [None]:
# Hyperparameter tuning: define the objective function that hyperopt minimises for each sampled set of parameters.


def objective(params):
    """Train and score one random-forest candidate inside an MLflow run.

    Fits a ``DictVectorizer`` + ``RandomForestClassifier`` pipeline on the
    notebook-level ``dicts_train`` / ``y_train``, scores it on ``dicts_val`` /
    ``y_val``, and logs the tags, parameters, metric and fitted pipeline to
    MLflow.

    Args:
        params: Keyword arguments forwarded to ``RandomForestClassifier``;
            they are also logged to MLflow as run parameters.

    Returns:
        A dict with ``"loss"`` (the validation RMSE) and ``"status"``
        (``STATUS_OK``), the result format consumed by hyperopt's ``fmin``.
    """
    with mlflow.start_run():
        mlflow.set_tag("developer", "james")
        mlflow.set_tag("model", "randomforestclassifier")

        mlflow.log_param("data-path", "data/creditcard_2023.csv")
        mlflow.log_params(params)

        pipeline = make_pipeline(
            DictVectorizer(sparse=False), RandomForestClassifier(**params)
        )

        pipeline.fit(dicts_train, y_train.ravel())
        # Column 1 of predict_proba is the probability of the second class
        # (presumably is_fraud == 1 — confirm against pipeline.classes_).
        y_pred = pipeline.predict_proba(dicts_val)[:, 1]
        # `squared=False` was deprecated in scikit-learn 1.4 and removed in
        # 1.6; taking the square root of the MSE works on every version.
        rmse = float(mean_squared_error(y_val, y_pred) ** 0.5)

        mlflow.log_metric("rmse", rmse)

        mlflow.sklearn.log_model(pipeline, artifact_path="model")

    return {"loss": rmse, "status": STATUS_OK}

In [None]:
# Hyperopt search space: the two tree hyperparameters are sampled and cast to
# integers, while random_state and n_jobs are fixed for every trial.
search_space = dict(
    n_estimators=scope.int(hp.quniform("n_estimators", 10, 21, 10)),
    max_depth=scope.int(hp.quniform("max_depth", 4, 10, 1)),
    random_state=11,
    n_jobs=-1,
)

# Minimise `objective` with the TPE algorithm; max_evals caps the number of
# evaluated parameter sets.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=1,
    trials=Trials(),
)

# Train the final model <a name = "final_training"></a>

In [None]:
# Train the model with the chosen parameters and enable autologging so that
# MLflow records more parameter values.
# NOTE(review): these values lie outside the hyperopt search space defined
# earlier in this notebook — confirm where they come from.
params = {
    "n_estimators": 170,
    "max_depth": 36,
    "random_state": 11,
}

mlflow.sklearn.autolog()

with mlflow.start_run():
    mlflow.set_tag("developer", "james")
    mlflow.set_tag("model", "randomforestclassifier")

    pipeline = make_pipeline(
        DictVectorizer(sparse=False), RandomForestClassifier(**params)
    )

    pipeline.fit(dicts_train, y_train.ravel())
    # NOTE(review): this scores hard class predictions, whereas `objective`
    # scores predicted probabilities, so the two "rmse" metrics are not
    # directly comparable — confirm which is intended.
    y_pred = pipeline.predict(dicts_val)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE works on every version.
    rmse = float(mean_squared_error(y_val, y_pred) ** 0.5)

    mlflow.log_metric("rmse", rmse)
    
# TODO: check the warnings that are being returned when this cell is executed.