In [1]:
import pandas as pd
import numpy as np
import awswrangler as wr
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, f1_score, average_precision_score
from sklearn import tree
import pickle
from mlflow.tracking import MlflowClient
import mlflow
from datetime import date
import datetime


In [2]:
# Snapshot of the current date; its parts are reused to build the period tag.
today = date.today()

# Zero-padded day and month plus the four-digit year, all as strings.
date_parts = today.strftime("%d/%m/%Y").split('/')
day, month, year = date_parts

# Period identifier in YYYYMM form (year followed by month).
period = ''.join((year, month))

In [3]:
# Read the train / valid / test parquet files from S3 into pandas DataFrames.
# Each split is bound by its own statement so a failed read leaves the
# previously loaded splits available in the kernel.
train = wr.s3.read_parquet(
    path="s3://rimac-analytics-temporal/individuals/Dante/covid-vida-v2/data/prec/train/data.parquet"
)
valid = wr.s3.read_parquet(
    path="s3://rimac-analytics-temporal/individuals/Dante/covid-vida-v2/data/prec/valid/data.parquet"
)
test = wr.s3.read_parquet(
    path="s3://rimac-analytics-temporal/individuals/Dante/covid-vida-v2/data/prec/test/data.parquet"
)


In [39]:
# Hyperparameters of the decision tree; they are logged to MLflow below so the
# run can be reproduced.
params = {'max_depth': 5, 'min_samples_leaf': 50, 'random_state': 0}

# Resolve the MLflow experiment: reuse it when it already exists, create it
# otherwise. Looking it up first (rather than wrapping create_experiment in a
# bare `except:`) keeps genuine failures -- bad credentials, unreachable
# tracking store, KeyboardInterrupt -- visible instead of silently treating
# them as "experiment already exists".
experiment_name = 'exp_3'
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    id_exp = MlflowClient().create_experiment(
        experiment_name,
        's3://rimac-analytics-temporal/individuals/Dante/covid-vida-v2/mlruns',
    )
else:
    id_exp = experiment.experiment_id

# The context manager ends the run on exit (also when an exception is raised),
# so no explicit mlflow.end_run() is needed afterwards.
with mlflow.start_run(experiment_id=id_exp) as run:
    mlflow.log_params(params)

    # Split features from the label. The label is kept as a one-column
    # DataFrame (not a Series) because it is accessed as `y_val.target` below.
    X_train, y_train = train.drop(columns=['target']), train[['target']].astype(int)
    X_val, y_val = valid.drop(columns=['target']), valid[['target']].astype(int)

    # Record which feature columns were used for this run.
    # NOTE(review): this logs the string form of the pandas Index; pandas may
    # abbreviate very long Index reprs -- confirm the full list is captured.
    mlflow.log_param('columnas', X_train.columns)

    clf = tree.DecisionTreeClassifier(**params)
    clf = clf.fit(X_train, y_train)

    # predict_proba returns one column per class; column 1 is used as the
    # score (presumably P(target == 1) for a 0/1 target -- verify the classes).
    y_trn_pred = clf.predict_proba(X_train)
    y_val_pred = clf.predict_proba(X_val)

    trn_auc = roc_auc_score(y_train, y_trn_pred[:, 1])
    val_auc = roc_auc_score(y_val, y_val_pred[:, 1])

    # Gini is reported as 2*AUC - 1, expressed in percentage points.
    print(f'Train AUC: {100*trn_auc:.2f} | '
          f'Val AUC: {100*val_auc:.2f} | '
          f'Train Gini: {(100*trn_auc-50)*2:.2f} | '
          f'Val Gini: {(100*val_auc-50)*2:.2f}')

    # Metrics are stored as raw fractions (0-1), unlike the percentages printed above.
    mlflow.log_metric('train_auc', trn_auc)
    mlflow.log_metric('val_auc', val_auc)
    mlflow.sklearn.log_model(clf, 'model')

    # Validation scoring table: score, true label and the quantile bin of the score.
    df_val = pd.DataFrame()
    df_val['pred'] = y_val_pred[:, 1]
    # Reset the index so the labels line up positionally with the prediction
    # array (column assignment of a Series aligns on index).
    df_val['target'] = y_val.target.reset_index(drop=True)
    # NOTE: the column is named 'decile' but 14 quantile bins are requested;
    # duplicates='drop' merges repeated edges, so fewer bins may come back.
    df_val['decile'], cuts = pd.qcut(y_val_pred[:, 1], 14, duplicates='drop', retbins=True)

    # Persist the bin edges next to the model so the same cut points can be
    # re-applied later; the file is written to the working directory first.
    np.save('cuts.npy', cuts)
    mlflow.log_artifact('cuts.npy')

    # Tag the run with the YYYYMM period computed in an earlier cell.
    mlflow.set_tag('period_train', period)

    print()
    print(f"artifact_uri={mlflow.get_artifact_uri()}")

Train AUC: 88.88 | Val AUC: 88.72 | Train Gini: 77.75 | Val Gini: 77.43

artifact_uri=s3://rimac-analytics-temporal/individuals/Dante/covid-vida-v2/mlruns/7709e96756304c149fec68a417488c2e/artifacts
