In [4]:
# se importa la librería de Pandas
import pandas as pd
import mlflow

In [5]:
# Point the MLflow client at the tracking server at 127.0.0.1, port 8080.
mlflow.set_tracking_uri("http://127.0.0.1:8080")

In [6]:
# Read the match results CSV into a DataFrame and display it.
RESULTS_CSV = 'libertadores-results-ds.csv'
df = pd.read_csv(RESULTS_CSV)
df

Unnamed: 0,Edition,Round,Date,Home Club,Away Club,Home Score,AwayScore
0,2023,Final,4/11/2023,Fluminense FC,Boca Juniors,2,1
1,2023,Semifinal,6/10/2023,Palmeiras,Boca Juniors,1,1
2,2023,Semifinal,5/10/2023,Internacional,Fluminense FC,1,2
3,2023,Semifinal,29/09/2023,Boca Juniors,Palmeiras,0,0
4,2023,Semifinal,28/09/2023,Fluminense FC,Internacional,2,2
...,...,...,...,...,...,...,...
1949,2013,Qualifying Match,24/01/2013,São Paulo,Bolívar,5,0
1950,2013,Qualifying Match,24/01/2013,LDU de Quito,Grêmio,1,0
1951,2013,Qualifying Match,24/01/2013,Defensor Sporting,Olimpia,0,0
1952,2013,Qualifying Match,23/01/2013,Tigre,Deportivo Anzoátegui,2,1


In [7]:
# Count the missing values in each column.
df.isna().sum()

Edition       0
Round         0
Date          0
Home Club     0
Away Club     0
Home Score    0
AwayScore     0
dtype: int64

In [8]:
# Keep only the rows whose Round is not 'Qualifying Match'.
not_qualifying = df['Round'] != 'Qualifying Match'
df = df.loc[not_qualifying]

In [9]:
# Drop the Edition and Date columns.
df = df.drop(columns=['Edition', 'Date'])

In [10]:
# Display the current state of the DataFrame.
df

Unnamed: 0,Round,Home Club,Away Club,Home Score,AwayScore
0,Final,Fluminense FC,Boca Juniors,2,1
1,Semifinal,Palmeiras,Boca Juniors,1,1
2,Semifinal,Internacional,Fluminense FC,1,2
3,Semifinal,Boca Juniors,Palmeiras,0,0
4,Semifinal,Fluminense FC,Internacional,2,2
...,...,...,...,...,...
1937,Group H,Caracas,Fluminense FC,0,1
1938,Group D,Deportes Iquique,Peñarol,1,2
1939,Group D,Emelec,Vélez Sarsfield,1,0
1940,Group G,Universidad de Chile,Deportivo Lara,2,0


In [11]:
# Every row whose Round value contains 'Group' gets the single label 'Groups'.
is_group_stage = df['Round'].str.contains('Group')
df.loc[is_group_stage, 'Round'] = 'Groups'
df

Unnamed: 0,Round,Home Club,Away Club,Home Score,AwayScore
0,Final,Fluminense FC,Boca Juniors,2,1
1,Semifinal,Palmeiras,Boca Juniors,1,1
2,Semifinal,Internacional,Fluminense FC,1,2
3,Semifinal,Boca Juniors,Palmeiras,0,0
4,Semifinal,Fluminense FC,Internacional,2,2
...,...,...,...,...,...
1937,Groups,Caracas,Fluminense FC,0,1
1938,Groups,Deportes Iquique,Peñarol,1,2
1939,Groups,Emelec,Vélez Sarsfield,1,0
1940,Groups,Universidad de Chile,Deportivo Lara,2,0


In [12]:
# Build a 'Score' column encoding the match outcome from the home side:
#   1 when Home Score is greater, -1 when AwayScore is greater, 0 when equal.
home_win = df['Home Score'] > df['AwayScore']
away_win = df['Home Score'] < df['AwayScore']

# Start from 0 everywhere, then overwrite the rows selected by each mask.
df['Score'] = 0
df.loc[away_win, 'Score'] = -1
df.loc[home_win, 'Score'] = 1
df


Unnamed: 0,Round,Home Club,Away Club,Home Score,AwayScore,Score
0,Final,Fluminense FC,Boca Juniors,2,1,1
1,Semifinal,Palmeiras,Boca Juniors,1,1,0
2,Semifinal,Internacional,Fluminense FC,1,2,-1
3,Semifinal,Boca Juniors,Palmeiras,0,0,0
4,Semifinal,Fluminense FC,Internacional,2,2,0
...,...,...,...,...,...,...
1937,Groups,Caracas,Fluminense FC,0,1,-1
1938,Groups,Deportes Iquique,Peñarol,1,2,-1
1939,Groups,Emelec,Vélez Sarsfield,1,0,1
1940,Groups,Universidad de Chile,Deportivo Lara,2,0,1


In [13]:
# Remove the Home Score and AwayScore columns, then display the result.
df = df.drop(columns=['Home Score', 'AwayScore'])
df

Unnamed: 0,Round,Home Club,Away Club,Score
0,Final,Fluminense FC,Boca Juniors,1
1,Semifinal,Palmeiras,Boca Juniors,0
2,Semifinal,Internacional,Fluminense FC,-1
3,Semifinal,Boca Juniors,Palmeiras,0
4,Semifinal,Fluminense FC,Internacional,0
...,...,...,...,...
1937,Groups,Caracas,Fluminense FC,-1
1938,Groups,Deportes Iquique,Peñarol,-1
1939,Groups,Emelec,Vélez Sarsfield,1
1940,Groups,Universidad de Chile,Deportivo Lara,1


In [15]:
# se importan las librerías para definir el modelo
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from mlflow.models import infer_signature


In [22]:
import pandas as pd


# Columns used as model inputs; the same list feeds the one-hot encoder below.
feature_columns = ['Round', 'Home Club', 'Away Club']

# Split the frame into predictors and target.
X = df[feature_columns]
y = df['Score']

# Classifier hyperparameters, kept in a dict so they can be unpacked into
# LogisticRegression and reused as-is elsewhere in the notebook.
# NOTE(review): `multi_class` is reported as deprecated in recent scikit-learn
# releases — confirm against the installed version.
params = dict(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=500,
    random_state=42,
)

# One-hot encode the feature columns (handle_unknown='ignore').
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), feature_columns),
    ]
)

# Chain the preprocessing step and the logistic-regression classifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(**params)),
])

# Hold out 20% of the rows as the test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the training split, then predict on the test split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)



In [21]:
# Predicted class probabilities on the test split; used for the ROC AUC below.
y_proba = pipeline.predict_proba(X_test)

# Accuracy of the predictions on the test split.
accuracy = accuracy_score(y_test, y_pred)

# Macro-averaged precision, recall and F1, computed in that order.
precision, recall, f1 = (
    scorer(y_test, y_pred, average="macro")
    for scorer in (precision_score, recall_score, f1_score)
)

# One-vs-rest ROC AUC from the predicted probabilities.
AUC_ROC = roc_auc_score(y_test, y_proba, multi_class="ovr")

In [19]:
# Set the MLflow tracking server URI used for logging.
mlflow.set_tracking_uri("http://127.0.0.1:8080")

In [24]:
# Set the active experiment by name.
mlflow.set_experiment("Predicciones Copa Libertadores")

# Start an MLflow run; the logging calls inside the block are made within it.
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(params)

    # Tag the run with a short description.
    mlflow.set_tag("Training Info", "Predicciones Copa Libertadores")

    # Log the evaluation metrics.
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1", f1)
    mlflow.log_metric("AUC_ROC", AUC_ROC)

    # Infer the model signature from the training inputs and the pipeline's
    # predictions on them.
    signature = infer_signature(X_train, pipeline.predict(X_train))

    # Log the model and register it under "copa-libertadores".
    # Only a few rows are passed as the input example: it is meant to be a
    # small sample of valid input, not the whole training frame.
    # NOTE(review): "libetadores" looks like a typo of "libertadores"; it is
    # left unchanged because existing runs or consumers may reference this path.
    model_info = mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="libetadores",
        signature=signature,
        input_example=X_train.head(5),
        registered_model_name="copa-libertadores",
    )

Successfully registered model 'copa-libertadores'.
2025/04/12 11:35:12 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: copa-libertadores, version 1


🏃 View run bustling-hare-171 at: http://127.0.0.1:8080/#/experiments/844903279001092614/runs/b238ce9cff7d49eca182f55a36450be5
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/844903279001092614


Created version '1' of model 'copa-libertadores'.


In [None]:
# Evaluación del modelo
#score = pipeline.score(X_test, y_test)
#print("Puntaje de prueba:", score)

In [83]:
# Save the model
import joblib

# Write the pipeline to a .pkl file
joblib.dump(pipeline, 'mejor_modelo.pkl')

# Write the same pipeline object again under a second file name.
joblib.dump(pipeline, 'pipeline_total.pkl')

['pipeline_total.pkl']