In [1]:
import os
import warnings
import sys

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from sklearn import linear_model, preprocessing, metrics, model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, f1_score
import mlflow
import mlflow.sklearn
from mlflow.models.signature import ModelSignature
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient
import streamlit as st
import joblib
import seaborn as sns

import pycaret.classification as pc
import warnings
# Install a global "ignore" filter: every Python warning raised after this
# line is suppressed for the rest of the kernel session.
# NOTE(review): this also hides warnings from the libraries used below
# (e.g. deprecations) -- consider narrowing the filter by category.
warnings.filterwarnings("ignore")

2022-04-24 13:20:34.200 INFO    numexpr.utils: Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-04-24 13:20:34.201 INFO    numexpr.utils: NumExpr defaulting to 8 threads.


### Criando experimento

In [9]:
# Point MLflow tracking at a local SQLite database and resolve the experiment
# used by every run in this notebook, creating it on first use.

mlflow.set_tracking_uri("sqlite:///runs_mlflow.db")
experiment_name = 'Projeto - Engenharia de Machine Learning'

experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    # First execution against this database: create the experiment, then
    # fetch it back so `experiment` always holds the experiment entity.
    experiment = mlflow.get_experiment(mlflow.create_experiment(experiment_name))

experiment_id = experiment.experiment_id
mlflow_client = MlflowClient()

2022/04/24 13:24:15 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2022/04/24 13:24:15 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

### Coleta e preparação de dados

In [11]:
with mlflow.start_run(experiment_id=experiment_id, run_name='PreparacaoDados', nested=True):

    # Columns used by the model
    features = ['lat', 'lon', 'minutes_remaining',
                'period', 'playoffs', 'shot_distance']

    target = 'shot_made_flag'

    # Read the raw data and discard the rows where the target is missing
    df = pd.read_csv('./Data/kobe_dataset.csv').dropna(subset=[target])

    # Fraction of the modelling base held out for testing
    test_size = 0.2
    # Fixed seed so the split -- and therefore the persisted parquet bases and
    # every metric computed from them -- is identical on each Restart & Run All
    split_seed = 123

    # 2PT shots form the modelling base (train + test); 3PT shots are set
    # aside as the operation base
    data_filtered = df.query('shot_type == "2PT Field Goal"')[features + [target]].copy()
    data_operation = df.query('shot_type == "3PT Field Goal"')[features + [target]].copy()

    # Make sure the output folders exist before writing (fresh checkouts
    # would otherwise fail inside to_parquet)
    for out_dir in ('./Data/processed', './Data/operalization'):
        os.makedirs(out_dir, exist_ok=True)

    data_filtered.to_parquet('./Data/processed/data_filtered.parquet')
    data_operation.to_parquet('./Data/processed/data_operation.parquet')

    # Stratified split keeps the made/missed proportion equal in both bases
    data_train, data_test, y_train, y_test = train_test_split(
        data_filtered[features],
        data_filtered[[target]],
        test_size=test_size,
        stratify=data_filtered[[target]],
        random_state=split_seed,
    )

    # Re-attach the target so each persisted base is self-contained
    data_train[target] = y_train
    data_test[target] = y_test

    data_train.to_parquet('./Data/operalization/base_train.parquet')
    data_test.to_parquet('./Data/operalization/base_test.parquet')

    # Log the parameters that define the split
    mlflow.log_param('Test Size', test_size)
    mlflow.log_param('Random State', split_seed)

    # Log global metrics
    mlflow.log_metric('Dataset de treino - Tamanho', data_train.shape[0])
    mlflow.log_metric('Dataset de teste - Tamanho', data_test.shape[0])


# The `with` block above already ended its own run; this call only has an
# effect if another run is still active (e.g. left over from an interrupted cell).
mlflow.end_run()

print('== Bases de Dados ==')
print(f'Dataset de treino {data_train.shape}')
print(f'Dataset de teste {data_test.shape}')
print(f'Colunas: {list(data_train.columns)}')

== Bases de Dados ==
Dataset de treino (16228, 7)
Dataset de teste (4057, 7)
Colunas: ['lat', 'lon', 'minutes_remaining', 'period', 'playoffs', 'shot_distance', 'shot_made_flag']


### Treinamento do modelo

In [14]:
registered_model_name = 'modelo_kobe'
model_test = 'modelo_kobe_reglog'
model_version = -1


def _positive_class_scores(model, X):
    """Return the scores to feed into log_loss for a binary classifier.

    Uses the probability of the positive class (second column of
    ``predict_proba``) when the estimator exposes it; otherwise falls back to
    the hard labels from ``predict``, which is what this cell used before.
    """
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(X)[:, 1]
    return model.predict(X)


with mlflow.start_run(experiment_id=experiment_id, run_name='Treinamento', nested=True):

    pc.setup(session_id=123,
             data = data_train,
             train_size=1-test_size,
             target = 'shot_made_flag',
             fold_strategy = 'stratifiedkfold',
             fold = 10,
             categorical_features = ['playoffs'],
             numeric_features = ['lat', 'lon', 'minutes_remaining', 'shot_distance', 'period'],
             experiment_name = experiment_name,
             silent=True)

    model_lr = pc.create_model(estimator='lr', probability_threshold=0.5)

    # One-hot encode `playoffs` on the test base by hand. Per the original
    # author's note, PyCaret dropped `playoffs_1` during training, so it is
    # dropped here as well to match the estimator's input columns.
    x_test_dum = pd.get_dummies(data_test, columns=['playoffs']).drop(['shot_made_flag',
                                                                       'playoffs_1'],
                                                                      axis=1)

    # Log loss of the baseline logistic regression on the test base.
    # Log loss must be computed from probabilities: with hard 0/1 labels every
    # misclassification is clipped to the maximum penalty and the metric
    # degenerates into a rescaled error rate.
    y_true = data_test.shot_made_flag.values
    y_pred_lr = model_lr.predict(x_test_dum.values)
    y_proba_lr = _positive_class_scores(model_lr, x_test_dum.values)

    lr_log_loss = log_loss(y_true, y_proba_lr)

    mlflow.log_metric('Log Loss - Logistic Regression', lr_log_loss)

    best_model = pc.compare_models(n_select = 1, sort='Accuracy', include=['lr', 'dt', 'svm'])

    tuned_model = pc.tune_model(best_model,
                                optimize = 'Accuracy',
                                search_library = 'scikit-learn',
                                search_algorithm = 'random',
                                n_iter = 4)

    y_pred_tuned_model = tuned_model.predict(x_test_dum.values)
    y_proba_tuned_model = _positive_class_scores(tuned_model, x_test_dum.values)

    # Log loss (on probabilities when available) and F1 (on hard labels)
    # of the tuned model on the test base
    tuned_log_loss = log_loss(y_true, y_proba_tuned_model)
    tuned_f1 = f1_score(y_true, y_pred_tuned_model)

    mlflow.log_metric('Log Loss - Tuned Model', tuned_log_loss)
    mlflow.log_metric('F1 Score - Tuned Model', tuned_f1)

    # Diagnostic plots saved and logged as run artifacts
    classification_plots = ['auc', 'threshold', 'pr',
                            'confusion_matrix', 'class_report', 'feature', 'learning']

    for plot_type in classification_plots:
        print('=> Aplicando plot ', plot_type)
        try:
            artifact = pc.plot_model(tuned_model, plot=plot_type, save=True, use_train_data=False)
            mlflow.log_artifact(artifact)
        except Exception as exc:
            # Best effort: not every estimator supports every plot. Catch
            # Exception (not a bare except) so Ctrl-C still interrupts the
            # cell, and show the reason instead of hiding it.
            print('=> Nao possível plotar: ', plot_type, '-', exc)

    pc.save_model(tuned_model, f'./{registered_model_name}')
    pc.save_model(model_lr, f'./{model_test}')
    # Reload the saved pipeline + tuned model
    model_pipe = pc.load_model(f'./{registered_model_name}')


    # -------------- Registry -------------------

    # Model signature inferred by MLflow. Input and output are taken from the
    # same frame; `data_test` has the same columns as `data_filtered`.
    # NOTE(review): the inferred input schema includes the target column
    # `shot_made_flag`, while `input_example` below only carries `features`
    # -- confirm whether the serving schema should require the target.
    model_features = list(data_test.columns)
    inf_signature = infer_signature(data_test, model_pipe.predict(data_test))

    # Input example stored alongside the MLmodel file
    input_example = {x: data_test[x].values[:5] for x in features}

    # Log the sklearn modelling pipeline and register it as a new version
    mlflow.sklearn.log_model(sk_model=model_pipe,
                             artifact_path="sklearn-model",
                             registered_model_name=registered_model_name,
                             signature = inf_signature,
                             input_example = input_example)

    # Resolve the version that was just registered.
    # get_latest_versions returns one entry per stage in no guaranteed order,
    # so indexing with [-1] could pick an older, already-staged version; the
    # newly created version is the one with the highest version number.
    client = MlflowClient()
    if model_version == -1:
        latest_versions = client.get_latest_versions(registered_model_name)
        model_version = max(latest_versions, key=lambda mv: int(mv.version)).version

    # Promote that version to Staging
    client.transition_model_version_stage(name=registered_model_name,
                                          version=model_version,
                                          stage="Staging")

INFO  [logs] Saving 'Learning Curve.png'
INFO  [logs] Visual Rendered Successfully
INFO  [logs] plot_model() succesfully completed......................................
INFO  [logs] Initializing save_model()
INFO  [logs] save_model(model=LogisticRegression(C=7.689, class_weight={}, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False), model_name=./modelo_kobe, prep_pipe_=Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=['playoffs'],
                                      display_types=False, features_todrop=[],
                                      id_columns=[],
                                      ml_usecase='classification',
                                      numerical_features=['lat', 'lon',
     

INFO  [logs] save_model() successfully completed......................................
INFO  [logs] Initializing load_model()


Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved


INFO  [logs] load_model(model_name=./modelo_kobe, platform=None, authentication=None, verbose=True)


Transformation Pipeline and Model Successfully Loaded


Registered model 'modelo_kobe' already exists. Creating a new version of this model...
2022/04/24 13:26:56 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: modelo_kobe, version 2
Created version '2' of model 'modelo_kobe'.


### Servindo o modelo

In [15]:
import requests
host = 'localhost'
port = '5001'
url = f'http://{host}:{port}/invocations'
headers = {'Content-Type': 'application/json',}

# Post only the feature + target columns. This cell adds `operation_label` to
# `data_test` below, so selecting the columns explicitly keeps the payload
# identical when the cell is re-run.
http_data = data_test[features + [target]].to_json(orient='split')

# Bounded wait, and fail loudly on a non-2xx answer instead of trying to parse
# an error body as predictions.
r = requests.post(url=url, headers=headers, data=http_data, timeout=60)
r.raise_for_status()

# First column of the parsed response is used as the predicted label per row
data_test.loc[:, 'operation_label'] = pd.read_json(r.text).values[:,0]

print(metrics.classification_report(data_test['shot_made_flag'], data_test['operation_label']))

              precision    recall  f1-score   support

         0.0       0.59      0.66      0.62      2120
         1.0       0.57      0.50      0.54      1937

    accuracy                           0.58      4057
   macro avg       0.58      0.58      0.58      4057
weighted avg       0.58      0.58      0.58      4057



In [45]:
import requests
host = 'localhost'
port = '5001'
url = f'http://{host}:{port}/invocations'
headers = {'Content-Type': 'application/json',}

# Operation base: 3PT shots, which were excluded from the modelling base
data_3_pts = df.query('shot_type == "3PT Field Goal"')[features + [target]].copy()

http_data = data_3_pts.to_json(orient='split')

# Bounded wait, and fail loudly on a non-2xx answer instead of trying to parse
# an error body as predictions.
r = requests.post(url=url, headers=headers, data=http_data, timeout=60)
r.raise_for_status()

# First column of the parsed response is used as the predicted label per row
data_3_pts.loc[:, 'operation_label'] = pd.read_json(r.text).values[:,0]

print(metrics.classification_report(data_3_pts['shot_made_flag'], data_3_pts['operation_label']))
# NOTE(review): `operation_label` is used as a hard label here, so this log
# loss applies the maximum penalty to every misclassified row; a probability
# output from the endpoint would be needed for a meaningful log loss.
print('Log Loss: {:.2f}' .format(log_loss(data_3_pts['shot_made_flag'], data_3_pts['operation_label'])))
print('F1-Score: {:.2f}' .format(f1_score(data_3_pts['shot_made_flag'], data_3_pts['operation_label'])))

              precision    recall  f1-score   support

         0.0       0.67      1.00      0.80      3630
         1.0       0.00      0.00      0.00      1782

    accuracy                           0.67      5412
   macro avg       0.34      0.50      0.40      5412
weighted avg       0.45      0.67      0.54      5412

Log Loss: 11.38
F1-Score: 0.00
