In [7]:
import pandas as pd
import numpy as np
import pycaret.classification as pc

import matplotlib.pyplot as plt

import mlflow
from mlflow.models.signature import infer_signature
from mlflow.client import MlflowClient


# Configurar MLFlow

In [8]:
# Para usar o sqlite como repositorio
mlflow.set_tracking_uri("sqlite:///mlruns.db")

experiment_name = 'Projeto Kobe Prod'
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
    experiment = mlflow.get_experiment(experiment_id)
experiment_id = experiment.experiment_id

In [9]:
from sklearn.model_selection import train_test_split

data_cols = ['lat','lon','minutes_remaining','period','playoffs','shot_distance','shot_made_flag']
train_perc = 0.8

with mlflow.start_run(experiment_id=experiment_id, run_name = 'PreparacaoDados'):

  df_prod = pd.read_parquet('../data/raw/dataset_kobe_prod.parquet')
  df_prod = df_prod[data_cols].copy()

  df_prod.to_parquet("../data/processed/data_filtered.parquet")
  df_prod = df_prod.dropna(subset=['shot_made_flag'])
  xtrain, xtest, ytrain, ytest = train_test_split(df_prod[['lat','lon','minutes_remaining','period','playoffs','shot_distance']],
                                                  df_prod['shot_made_flag'],
                                                  train_size=train_perc,
                                                  stratify=df_prod['shot_made_flag'])

  xtrain['shot_made_flag'] = ytrain
  xtest['shot_made_flag'] = ytest
  
  xtrain.to_parquet('../data/processed/base_train.parquet')
  xtest.to_parquet('../data/processed/base_test.parquet')

  mlflow.log_params({
      'perc_test': 1-train_perc,
      'colunas-selecionadas': data_cols
      })
  mlflow.log_metrics({
      'qtd_linhas_treino': xtrain.shape[0],
      'qtd_linhas_teste': xtest.shape[0],
  })

# Treinamento do Modelo

In [10]:
exp = pc.setup(
    data=xtrain,
    target = 'shot_made_flag',
    test_data = xtest,
    normalize=True,
    log_experiment = False
)
list_models = exp.compare_models(['lr','dt'], n_select=2, sort='f1')
list_models
exp.plot_model(list_models[0], plot='vc', save = True)

Unnamed: 0,Description,Value
0,Session id,1217
1,Target,shot_made_flag
2,Target type,Binary
3,Original data shape,"(5412, 7)"
4,Transformed data shape,"(5412, 7)"
5,Transformed train set shape,"(4329, 7)"
6,Transformed test set shape,"(1083, 7)"
7,Numeric features,6
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.5546,0.499,0.3348,0.3267,0.3303,-0.003,-0.003,0.008
lr,Logistic Regression,0.6704,0.5576,0.0,0.0,0.0,-0.0009,-0.0067,0.014


'Validation Curve.png'

In [11]:
from sklearn.model_selection import validation_curve

def plot_parameter_validation_curve(X, Y, param_name, grid_search,
                                    model, model_name, scoring,
                                    logx):
    print('Parameter:', param_name)
    print('GridSearch:', grid_search[param_name])
    print('Scoring:', scoring)
    plt.figure(figsize=(6,4))
    train_scores, test_scores = validation_curve(model,
                                                 X = X,
                                                 y = Y,
                                                 param_name=param_name,
                                                 param_range= grid_search[param_name],
                                                 scoring=scoring,
                                                 cv=10,
                                                 n_jobs=-1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.title("Curva Validação Modelo " + model_name)
    plt.xlabel(param_name)
    plt.ylabel("Score ("+scoring+")")
    if logx:
        plt.semilogx(grid_search[param_name], train_scores_mean,'-o', label="Treino",
                     color="darkorange", lw=2)
        plt.semilogx(grid_search[param_name], test_scores_mean,'-o', label="Validação-Cruzada",
                     color="navy", lw=2)
    else:
        plt.plot(grid_search[param_name], train_scores_mean,'-o', label="Treino",
                     color="darkorange", lw=2)
        plt.plot(grid_search[param_name], test_scores_mean,'-o', label="Validação-Cruzada",
                 color="navy", lw=2)
    plt.fill_between(grid_search[param_name], train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="darkorange", lw=2)
    plt.fill_between(grid_search[param_name], test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2,
                     color="navy", lw=2)
    plt.legend(loc='best')
    plt.grid(True)
    return plt.gcf()

In [12]:
mlflow.end_run()

In [13]:
import os
from sklearn.metrics import log_loss, f1_score

registered_model_name = 'modelo_kobe'
nexamples = 5
model_version = -1

with mlflow.start_run(experiment_id=experiment_id, run_name = 'Treinamento'):

    exp = pc.setup(
        data=xtrain,
        target = 'shot_made_flag',
        test_data = xtest,
        normalize=True,
        log_experiment = False
    )
    list_models = exp.compare_models(['lr','dt'], n_select=2, sort='f1')
    
    # REGRESSÃO LOGÍSTICA
    yhat_test = exp.predict_model(list_models[1])
    plot_parameter_validation_curve(xtrain.drop('shot_made_flag', axis=1), ytrain, 'C', {'C': [0.001, 0.01, 0.1, 1, 10]}, list_models[1], 'Regressão Logística', 'f1', logx=True)
    
    mlflow.log_metrics({
        'lr_log_loss': log_loss(yhat_test.shot_made_flag, yhat_test.prediction_label),
        'lr_f1': f1_score(yhat_test.shot_made_flag, yhat_test.prediction_label)
    })

    plt.savefig('lr_validation_curve.png')
    mlflow.log_artifact('lr_validation_curve.png')
    # os.remove('lr_validation_curve.png')

    # ÁRVORE DECISÃO
    yhat_test = exp.predict_model(list_models[0])
    plot_parameter_validation_curve(xtrain.drop('shot_made_flag', axis=1), ytrain, 'max_depth', {'max_depth': [2, 3, 4, 5, 6, 7, 8]}, list_models[0], 'Árvore Decisão', 'f1', logx=False)

    mlflow.log_metrics({
        'dt_log_loss': log_loss(yhat_test.shot_made_flag, yhat_test.prediction_label),
        'dt_f1': f1_score(yhat_test.shot_made_flag, yhat_test.prediction_label)
    })
    plt.savefig('dt_validation_curve.png')
    mlflow.log_artifact('dt_validation_curve.png')
    # os.remove('dt_validation_curve.png')

    # FINALIZAÇÃO MELHOR MODELO
    tune_model = exp.tune_model(list_models[1],
                                optimize='f1',
                                search_library='scikit-learn',
                                search_algorithm='random',
                                n_iter = 4)

    yhat_test = exp.predict_model(tune_model)
    mlflow.log_metrics({
        'final_model_log_loss': log_loss(yhat_test.shot_made_flag, yhat_test.prediction_label),
        'final_model_f1': f1_score(yhat_test.shot_made_flag, yhat_test.prediction_label)
    })
    final_model = exp.finalize_model(tune_model)

    # EXPORTAÇÃO PARA LOG E REGISTRO DO MODELO
    exp.save_model(final_model, f'./{registered_model_name}') 
    # Carrega novamente o pipeline + bestmodel
    model_pipe = exp.load_model(f'./{registered_model_name}')
    # Assinatura do Modelo Inferida pelo MLFlow
    model_features = list(xtrain.drop('shot_made_flag', axis=1).columns)
    inf_signature = infer_signature(xtrain[model_features],
                                    model_pipe.predict(xtrain.drop('shot_made_flag', axis=1)))
    # Exemplo de entrada para o MLmodel
    input_example = {x: xtrain[x].values[:nexamples] for x in model_features}
    # Log do pipeline de modelagem do sklearn e registrar como uma nova versao
    mlflow.sklearn.log_model(
        sk_model=model_pipe,
        artifact_path="sklearn-model",
        registered_model_name=registered_model_name,
        signature = inf_signature,
        input_example = input_example
    )
    # Criacao do cliente do servico MLFlow e atualizacao versao modelo
    client = MlflowClient()
    if model_version == -1:
        model_version = client.get_latest_versions(registered_model_name)[-1].version
    # Registrar o modelo como staging
    client.set_registered_model_alias(
        name=registered_model_name,
        alias='staging',
        version=model_version
    )
    

Unnamed: 0,Description,Value
0,Session id,8033
1,Target,shot_made_flag
2,Target type,Binary
3,Original data shape,"(5412, 7)"
4,Transformed data shape,"(5412, 7)"
5,Transformed train set shape,"(4329, 7)"
6,Transformed test set shape,"(1083, 7)"
7,Numeric features,6
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.5565,0.5006,0.3341,0.3286,0.3309,-0.0004,-0.0004,0.007
lr,Logistic Regression,0.6704,0.5576,0.0,0.0,0.0,-0.0009,-0.0067,0.006


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.6704,0.5377,0.0,0.0,0.0,0.0,0.0


Parameter: C
GridSearch: [0.001, 0.01, 0.1, 1, 10]
Scoring: f1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Decision Tree Classifier,0.5716,0.508,0.3193,0.3403,0.3295,0.0152,0.0152


Parameter: max_depth
GridSearch: [2, 3, 4, 5, 6, 7, 8]
Scoring: f1


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.4942,0.5633,0.6268,0.349,0.4484,0.0468,0.0537
1,0.5058,0.5568,0.6479,0.3594,0.4623,0.0699,0.0805
2,0.4827,0.5308,0.6197,0.3411,0.44,0.0294,0.034
3,0.5427,0.5714,0.662,0.3852,0.487,0.1238,0.1387
4,0.485,0.5214,0.5664,0.3347,0.4208,0.0096,0.0107
5,0.5242,0.5616,0.6294,0.3704,0.4663,0.0865,0.0965
6,0.5635,0.6076,0.6503,0.4009,0.496,0.1477,0.1613
7,0.4734,0.5289,0.6294,0.3396,0.4412,0.0213,0.025
8,0.4988,0.5371,0.6364,0.3555,0.4561,0.0562,0.0645
9,0.5648,0.5976,0.662,0.4017,0.5,0.1538,0.169


Fitting 10 folds for each of 4 candidates, totalling 40 fits


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.4977,0.5372,0.6134,0.3504,0.446,0.0455,0.0516


Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Loaded


Registered model 'modelo_kobe' already exists. Creating a new version of this model...
Created version '12' of model 'modelo_kobe'.


In [14]:
final_model