In [2]:
import pandas as pd
import numpy as np
import pycaret.classification as pc

import matplotlib.pyplot as plt

import mlflow


# Configurar MLFlow

In [3]:
# Para usar o sqlite como repositorio
mlflow.set_tracking_uri("sqlite:///mlruns.db")

experiment_name = 'Projeto Kobe'
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
    experiment = mlflow.get_experiment(experiment_id)
experiment_id = experiment.experiment_id

In [4]:
from sklearn.model_selection import train_test_split

data_cols = ['lat','lon','minutes_remaining','period','playoffs','shot_distance','shot_made_flag']
train_perc = 0.8

with mlflow.start_run(experiment_id=experiment_id, run_name = 'PreparacaoDados'):

  #df_dev = pd.read_parquet('./drive/MyDrive/Colab Notebooks/kobe_dataset/data/raw/dataset_kobe_dev.parquet')
  df_dev = pd.read_parquet('../data/raw/dataset_kobe_dev.parquet')
  df_dev = df_dev[data_cols].copy()

  #df_dev.to_parquet("./drive/MyDrive/Colab Notebooks/kobe_dataset/data/processed/data_filtered.parquet")
  df_dev.to_parquet("../data/processed/data_filtered.parquet")
  df_dev = df_dev.dropna(subset=['shot_made_flag'])
  xtrain, xtest, ytrain, ytest = train_test_split(df_dev[['lat','lon','minutes_remaining','period','playoffs','shot_distance']],
                                                  df_dev['shot_made_flag'],
                                                  train_size=train_perc,
                                                  stratify=df_dev['shot_made_flag'])

  xtrain['shot_made_flag'] = ytrain
  xtest['shot_made_flag'] = ytest
  #xtrain.to_parquet('./drive/MyDrive/Colab Notebooks/kobe_dataset/data/processed/base_train.parquet')
  xtrain.to_parquet('../data/processed/base_train.parquet')
  xtest.to_parquet('../data/processed/base_test.parquet')

  mlflow.log_params({
      'perc_test': 1-train_perc,
      'colunas-selecionadas': data_cols
      })
  mlflow.log_metrics({
      'qtd_linhas_treino': xtrain.shape[0],
      'qtd_linhas_teste': xtest.shape[0],
  })

# Treinamento do Modelo

In [5]:
exp = pc.setup(
    data=xtrain,
    target = 'shot_made_flag',
    test_data = xtest,
    normalize=True,
    log_experiment = False
)
list_models = exp.compare_models(['lr','dt'], n_select=2, sort='f1')
list_models
exp.plot_model(list_models[0], plot='vc', save = True)

Unnamed: 0,Description,Value
0,Session id,8449
1,Target,shot_made_flag
2,Target type,Binary
3,Original data shape,"(20285, 7)"
4,Transformed data shape,"(20285, 7)"
5,Transformed train set shape,"(16228, 7)"
6,Transformed test set shape,"(4057, 7)"
7,Numeric features,6
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.5357,0.5155,0.5838,0.512,0.5455,0.0752,0.0759,0.019
lr,Logistic Regression,0.5768,0.5984,0.4826,0.5666,0.5212,0.1464,0.148,0.219


'Validation Curve.png'

In [6]:
# Funcao de teste de hiperparametros por validacao cruzada
from sklearn.model_selection import validation_curve

def plot_parameter_validation_curve(X, Y, param_name, grid_search,
                                    model, model_name, scoring,
                                    logx):
    print('Parameter:', param_name)
    print('GridSearch:', grid_search[param_name])
    print('Scoring:', scoring)
    plt.figure(figsize=(6,4))
    train_scores, test_scores = validation_curve(model,
                                                 X = X,
                                                 y = Y,
                                                 param_name=param_name,
                                                 param_range= grid_search[param_name],
                                                 scoring=scoring,
                                                 cv=10,
                                                 n_jobs=-1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.title("Curva Validação Modelo " + model_name)
    plt.xlabel(param_name)
    plt.ylabel("Score ("+scoring+")")
    if logx:
        plt.semilogx(grid_search[param_name], train_scores_mean,'-o', label="Treino",
                     color="darkorange", lw=2)
        plt.semilogx(grid_search[param_name], test_scores_mean,'-o', label="Validação-Cruzada",
                     color="navy", lw=2)
    else:
        plt.plot(grid_search[param_name], train_scores_mean,'-o', label="Treino",
                     color="darkorange", lw=2)
        plt.plot(grid_search[param_name], test_scores_mean,'-o', label="Validação-Cruzada",
                 color="navy", lw=2)
    plt.fill_between(grid_search[param_name], train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="darkorange", lw=2)
    plt.fill_between(grid_search[param_name], test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2,
                     color="navy", lw=2)
    plt.legend(loc='best')
    plt.grid(True)
    plt.show()

In [12]:
plot_parameter_validation_curve(xtrain, ytrain, 'C', {'C': [0.001, 0.01, 0.1, 1, 10]}, list_models[0], 'Regressão Logística', 'f1', logx=False)

AttributeError: 'function' object has no attribute 'get_params'