In [1]:
import mlflow
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,10)
plt.rcParams.update({'font.size': 15})

Configuramos el almacenamiento de tracking de MLflow (SQLite local) y creamos el experimento

In [3]:
# Store run metadata in a SQLite database file (mlflow.db) instead of the
# default tracking location.
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")

# Select the experiment that groups this notebook's runs. Kept as the last
# expression so the resulting Experiment object is displayed as cell output.
mlflow.set_experiment(experiment_name="proyecto-champions")

2024/02/20 00:22:31 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2024/02/20 00:22:31 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='file:///c:/Users/velir/2C-INFORMATICA-UAX/2C-3_Inteligencia-Artificial/TAREAS/Tarea-CHAMPIONS-LEAGUE/IA_Champions/mlruns/1', creation_time=1708384953267, experiment_id='1', last_update_time=1708384953267, lifecycle_stage='active', name='proyecto-champions', tags={}>

## Cargar datos y dividir en train y test

In [4]:
# Columns kept for modelling; every other column of the CSV is dropped.
columnas = ['Rk', 'Team_ID', 'W', 'D', 'L', 'GD', 'Pts', 'Is_Winner']

# df_0 holds the preprocessed file as read from disk; df_1 is an independent
# copy restricted to the selected columns, so later edits never touch df_0.
df_0 = pd.read_csv("data/UCL_CF_preprocesado.csv")
df_1 = df_0.loc[:, columnas].copy()

df_1.head()

Unnamed: 0,Rk,Team_ID,W,D,L,GD,Pts,Is_Winner
0,1,22,7.0,6.0,1.0,8.0,27.0,1
1,2,9,6.0,4.0,4.0,8.0,22.0,0
2,3,25,8.0,4.0,2.0,11.0,28.0,0
3,4,10,7.0,3.0,4.0,3.0,24.0,0
4,5,21,6.0,3.0,1.0,7.0,21.0,0


In [5]:
# Hold out 20% of the rows as the final test set.
# A fixed random_state makes the partition identical on every
# "Restart Kernel -> Run All", so the metrics logged to MLflow are
# comparable between runs instead of depending on a fresh random shuffle.
train_data, test_data = train_test_split(df_1, test_size=0.2, random_state=42)
print(f"Train dataset shape: {train_data.shape}")
print(f"Test dataset shape: {test_data.shape}")

Train dataset shape: (512, 8)
Test dataset shape: (128, 8)


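Separamos un conjunto de validación y estandarizamos las variables numéricas (el escalador se ajusta solo con los datos de entrenamiento)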

In [6]:
VAR_NUMERICAS = ['W', 'D', 'L', 'GD', 'Pts']

# Split a validation set off the training data. The fixed seed keeps the
# partition (and therefore the statistics learned by the scaler) reproducible.
X_train, X_val = train_test_split(train_data, test_size=0.2, random_state=42)

# Work on explicit copies: assigning into the frames returned by
# train_test_split can raise SettingWithCopyWarning.
X_train = X_train.copy()
X_val = X_val.copy()

# Rebuild the test frame from the untouched rows of df_1 (same index labels).
# Scaling test_data in place would make this cell non-idempotent: running it a
# second time would scale values that were already scaled.
test_data = df_1.loc[test_data.index].copy()

# Fit the scaler on the training rows only, then apply the same transformation
# to validation and test so no statistics leak from held-out data.
sc = StandardScaler()
X_train[VAR_NUMERICAS] = sc.fit_transform(X_train[VAR_NUMERICAS])
X_val[VAR_NUMERICAS] = sc.transform(X_val[VAR_NUMERICAS])
test_data[VAR_NUMERICAS] = sc.transform(test_data[VAR_NUMERICAS])

## Entrenamiento de modelo RF

In [7]:
# Name of the label column.
VAR_OBJETIVO = 'Is_Winner'

# Every remaining column of df_1, in its original order. Despite the name,
# this list is what gets passed to the model as input features.
VAR_DEP = [columna for columna in df_1.columns if columna != VAR_OBJETIVO]

In [8]:
# Autologging is switched off: parameters, metrics and the model are logged
# explicitly below.
mlflow.sklearn.autolog(disable=True)

with mlflow.start_run(run_name='rf_baseline'):
    # Single source of truth for the hyper-parameters: the same dict is logged
    # to MLflow and used to build the estimator, so they cannot drift apart.
    # random_state makes the fitted forest reproducible.
    params = {
        "n_estimators": 100,
        "max_depth": 20,
        "random_state": 42,
    }

    mlflow.set_tag("model_name", "RF")
    mlflow.log_params(params)

    rf = RandomForestClassifier(**params)
    rf.fit(X_train[VAR_DEP], X_train[VAR_OBJETIVO])

    y_test = test_data[VAR_OBJETIVO]
    rf_preds = rf.predict(test_data[VAR_DEP])

    # RMSE is kept under its original metric name. It is computed as the square
    # root of the MSE because the `squared=False` argument is deprecated in
    # recent scikit-learn releases.
    rf_rms = mean_squared_error(y_test, rf_preds) ** 0.5
    mlflow.log_metric("test_rmse", rf_rms)

    # Classification metrics for the classifier. zero_division=0 avoids
    # warnings/undefined values when the model predicts no positive samples.
    mlflow.log_metrics({
        "test_accuracy": accuracy_score(y_test, rf_preds),
        "test_precision": precision_score(y_test, rf_preds, zero_division=0),
        "test_recall": recall_score(y_test, rf_preds, zero_division=0),
        "test_f1": f1_score(y_test, rf_preds, zero_division=0),
    })

    mlflow.sklearn.log_model(rf, "sk_models")

