# Create an S3 client and download the data from MinIO

In [None]:
import logging
import os
from pathlib import Path
import tempfile
import subprocess

import boto3
import pandas as pd
from ydata_profiling import ProfileReport

# Connection settings handed to boto3.client() in load_data.
# The "$$$" parts look like redacted placeholders to be replaced with real values.
# NOTE(review): credentials are hard-coded in the notebook; consider reading
# them from environment variables so secrets are not committed — confirm.
MLFLOW_S3_ENDPOINT_URL = "https://minio-api-$$$$-dev.apps.$$$.openshiftapps.com" # <--- put your MinIO endpoint here
AWS_ACCESS_KEY_ID = "$$$"
AWS_SECRET_ACCESS_KEY = "$$$"

def load_data(path: str, bucket: str = "kto-titanic") -> Path:
  """Download a CSV object from MinIO, profile it, and return the local path.

  The object is saved as ``data.csv`` in the working directory, loaded with
  pandas, and a ydata-profiling HTML report is written to ``profile.html``
  next to it.

  Args:
    path: Key of the object to download from the bucket.
    bucket: Name of the bucket to download from. Defaults to the bucket that
      was previously hard-coded.

  Returns:
    Path of the downloaded CSV file.
  """
  local_path = Path("./", "data.csv")
  logging.warning(f"to path : {local_path}")

  s3_client = boto3.client(
    "s3",
    endpoint_url=MLFLOW_S3_ENDPOINT_URL,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
  )

  # Pass the destination as a plain string: older boto3 releases only accept
  # a str filename and raise ValueError for path-like objects.
  s3_client.download_file(bucket, path, str(local_path))
  df = pd.read_csv(local_path)

  # The report is a side artifact; only the CSV path is returned to callers.
  profile = ProfileReport(df, title=f"Profiling Report - {local_path.stem}")
  profile_path = Path("./", "profile.html")
  profile.to_file(profile_path)

  return local_path

  from .autonotebook import tqdm as notebook_tqdm


# Random train / test split

In [2]:
import sklearn.model_selection

# Dataset columns selected as model inputs by split_train_test.
FEATURES = ["Pclass", "Sex", "SibSp", "Parch"]

# Dataset column selected as the prediction target by split_train_test.
TARGET = "Survived"


def split_train_test(
  data_path: str | Path,
  test_size: float = 0.3,
  random_state: int = 42,
) -> tuple[Path, Path, Path, Path]:
  """Split the dataset into train/test features and targets, saved as CSV.

  Reads the CSV at ``data_path``, keeps the ``FEATURES`` columns as inputs and
  the ``TARGET`` column as the label, performs a random split, and writes the
  four parts to ``xtrain.csv``, ``xtest.csv``, ``ytrain.csv`` and
  ``ytest.csv`` in the working directory (without the index).

  Args:
    data_path: Location of the full dataset CSV.
    test_size: Fraction of rows assigned to the test set.
    random_state: Seed forwarded to ``train_test_split`` for reproducibility.

  Returns:
    Paths of the written files, in the order
    ``(x_train, x_test, y_train, y_test)``.
  """
  logging.warning(f"split on {data_path}")

  df = pd.read_csv(data_path, index_col=False)

  x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    df[FEATURES],
    df[TARGET],
    test_size=test_size,
    random_state=random_state,
  )

  # Insertion order is the return order, which callers unpack positionally.
  outputs = {
    "xtrain.csv": x_train,
    "xtest.csv": x_test,
    "ytrain.csv": y_train,
    "ytest.csv": y_test,
  }

  written_paths = []
  for filename, data in outputs.items():
    file_path = Path("./", filename)
    data.to_csv(file_path, index=False)
    written_paths.append(file_path)

  return tuple(written_paths)

# Train ML model

In [3]:
import joblib
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): not referenced by any of the visible cells; presumably the
# artifact name under which the trained model is meant to be logged — confirm
# or remove.
ARTIFACT_PATH = "model_trained"


def train(
  x_train_path: str | Path,
  y_train_path: str | Path,
  n_estimators: int,
  max_depth: int,
  random_state: int,
) -> Path:
  """Fit a random forest classifier and persist it with joblib.

  The feature CSV is one-hot encoded with ``pd.get_dummies`` before fitting.
  The fitted model is written to ``model.joblib`` in the working directory.

  Args:
    x_train_path: CSV file holding the training features.
    y_train_path: CSV file holding the training target.
    n_estimators: Number of trees in the forest.
    max_depth: Maximum depth of each tree.
    random_state: Seed for the classifier.

  Returns:
    Path of the saved model file.
  """
  logging.warning(f"train {x_train_path} {y_train_path}")
  x_train = pd.read_csv(x_train_path, index_col=False)
  y_train = pd.read_csv(y_train_path, index_col=False)

  # A single-column target loads as an (n, 1) DataFrame; scikit-learn expects
  # a 1-D y and otherwise warns about a column vector. Same squeeze as in
  # validate.
  if y_train.shape[1] == 1:
    y_train = y_train.iloc[:, 0]

  x_train = pd.get_dummies(x_train)

  model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
  model.fit(x_train, y_train)

  model_path = Path("./", "model.joblib")
  joblib.dump(model, model_path)

  return model_path

# Evaluate ML model

In [4]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error

def _feature_importance(model, feature_names: list[str]) -> dict[str, float]:
  """Map each feature name to the model's importance (or coefficient) for it."""
  if hasattr(model, "feature_importances_"):
    importances = model.feature_importances_
    return {name: float(importance) for name, importance in zip(feature_names, importances, strict=False)}

  if hasattr(model, "coef_"):
    coefs = model.coef_
    # Multi-dimensional coefficient arrays: only the first row is reported.
    if hasattr(coefs, "shape") and len(coefs.shape) > 1:
      coefs = coefs[0]
    return {name: float(coef) for name, coef in zip(feature_names, coefs, strict=False)}

  logging.warning("Model does not have feature importance attributes")
  return {name: 0.0 for name in feature_names}


def validate(model_path: str | Path, x_test_path: str | Path, y_test_path: str | Path) -> None:
  """Evaluate a saved model on the test set and log the resulting metrics.

  Loads the model with joblib, one-hot encodes the test features, predicts,
  and logs MSE, MAE, R2, median absolute error and per-feature importance.

  Args:
    model_path: File produced by ``joblib.dump``.
    x_test_path: CSV file holding the test features.
    y_test_path: CSV file holding the test target.
  """
  logging.warning(f"validate {model_path}")
  model = joblib.load(model_path)

  x_test = pd.read_csv(x_test_path, index_col=False)
  y_test = pd.read_csv(y_test_path, index_col=False)

  x_test = pd.get_dummies(x_test)

  # get_dummies is applied to the test set on its own, so a category missing
  # from (or only present in) the test rows would yield columns that differ
  # from those seen at fit time. Align to the fitted columns when the model
  # recorded them; absent dummies are filled with 0.
  trained_columns = getattr(model, "feature_names_in_", None)
  if trained_columns is not None:
    x_test = x_test.reindex(columns=list(trained_columns), fill_value=0)

  # Squeeze a single-column target to 1-D for the metric functions.
  if y_test.shape[1] == 1:
    y_test = y_test.iloc[:, 0]

  y_pred = model.predict(x_test)

  # NOTE(review): these are regression metrics applied to predicted class
  # labels; a classification metric may be more appropriate — confirm.
  mse = mean_squared_error(y_test, y_pred)
  mae = mean_absolute_error(y_test, y_pred)
  r2 = r2_score(y_test, y_pred)
  medae = median_absolute_error(y_test, y_pred)

  feature_importance = _feature_importance(model, x_test.columns.tolist())

  logging.warning(f"mse : {mse}")
  logging.warning(f"mae : {mae}")
  logging.warning(f"r2 : {r2}")
  logging.warning(f"medae : {medae}")
  logging.warning(f"feature importance : {feature_importance}")

# Training Pipeline

In [5]:
# End-to-end run: download the dataset, split it, fit the model, evaluate it.
local_path = load_data("all_titanic.csv")
xtrain_path, xtest_path, ytrain_path, ytest_path = split_train_test(local_path)
model_path = train(
  xtrain_path,
  ytrain_path,
  n_estimators=100,
  max_depth=10,
  random_state=42,
)
validate(model_path, xtest_path, ytest_path)

100%|██████████| 12/12 [00:00<00:00, 105.76it/s]<00:00, 23.85it/s, Describe variable: Embarked]
Summarize dataset: 100%|██████████| 47/47 [00:03<00:00, 13.75it/s, Completed]                       
Generate report structure: 100%|██████████| 1/1 [00:04<00:00,  4.72s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  3.49it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 148.67it/s]
  return fit_method(estimator, *args, **kwargs)
