In [1]:
import pandas as pd 
import numpy as np 
import awswrangler as wr
from datetime import date
import datetime

import mlflow
from mlflow.tracking import MlflowClient

import lightgbm as lgb
import joblib
from sklearn.metrics import roc_auc_score, confusion_matrix

import pickle 
import boto3

In [2]:
# Reference date for this run.
today = date.today()

# YYYYMM of the month containing the date 60 days before today.
lookback_date = today - datetime.timedelta(days=60)
period_before = lookback_date.strftime("%Y%m")

# YYYYMM of the current month; day/month/year keep today's components as strings.
day, month, year = today.strftime("%d/%m/%Y").split('/')
period = f"{year}{month}"

In [4]:
# Resolve the MLflow experiment by name. get_experiment_by_name returns None
# when the experiment does not exist, so fail with an explicit message instead
# of an AttributeError on `.experiment_id`.
experiment = mlflow.get_experiment_by_name('exp_6')
if experiment is None:
    raise ValueError("MLflow experiment 'exp_6' was not found")
id_exp = experiment.experiment_id

# Keep only the single best run by validation AUC, requiring val_auc > 0.70.
df_runs = mlflow.search_runs(
    experiment_ids=[id_exp],
    filter_string="metrics.val_auc > 0.70",
    order_by=["metrics.val_auc DESC"],
    max_results=1,
)

# Downstream cells index the first row; stop here if no run qualified.
if df_runs.empty:
    raise ValueError("No run in experiment 'exp_6' satisfies metrics.val_auc > 0.70")

In [5]:
s3 = boto3.resource('s3')

# Artifact location of the selected run.
remote_path = df_runs.artifact_uri.values[0]

# Split the URI on '/' once: element 2 is used as the bucket name and the
# remaining elements as the key prefix (assumes an "s3://bucket/prefix" URI
# -- TODO confirm).
uri_parts = remote_path.split('/')
bucket_name = uri_parts[2]
model_key = f'{"/".join(uri_parts[3:])}/model/model.pkl'

model_bytes = s3.Bucket(bucket_name).Object(model_key).get()['Body'].read()

# NOTE(review): pickle.loads can execute arbitrary code; only load artifacts
# from a bucket whose writers are trusted.
model = pickle.loads(model_bytes)

In [6]:
# Load the "potencial" dataset to be scored from S3.
potencial_path = "s3://rimac-analytics-temporal/individuals/Dante/cross-sell-veh/data/prec/potencial/data.parquet"
test = wr.s3.read_parquet(potencial_path)


In [14]:
# Identifier / label columns that must not be passed to the model.
non_feature_cols = ['periodo_target', 'cuc', 'placa', 'target', 'jerarquia']

# Build the feature matrix. A 'preds' column left over from a previous run of
# this cell is dropped as well, so re-running the cell does not feed earlier
# predictions back into the model as a feature. The required columns are still
# dropped strictly (KeyError if one is missing).
test_features = (
    test
    .drop(columns=non_feature_cols)
    .drop(columns=['preds'], errors='ignore')
)

# Score with the model's best iteration and attach the scores to the frame.
preds = model.predict(test_features, num_iteration=model.best_iteration)
test['preds'] = preds

In [15]:
# Median predicted score over the scored dataset.
test['preds'].median()

0.17384033150579126

In [16]:
# Rows/columns available for target period 202102.
test.loc[test['periodo_target'] == 202102].shape

(613302, 35)

In [10]:
# Rows/columns available for target period 202102.
# NOTE(review): this repeats the preceding check; its lower execution count and
# one-fewer column count suggest it was presumably run before 'preds' was
# added -- consider removing the duplicate cell.
test.loc[test['periodo_target'] == 202102].shape

(613302, 34)

In [17]:
# Write identifiers, label and score for this run's period to S3 (overwriting
# any previous content at the path) and register them as table
# crossveh_test_<period> in database coe_analytics_tmp.
output_columns = ['periodo_target', 'cuc', 'placa', 'target', 'preds']
output_path = f's3://rimac-analytics-temporal/individuals/Dante/cross_prop_veh/predictions/proba_{period}/'
output_table = f"crossveh_test_{period}"

wr.s3.to_parquet(
    df=test[output_columns],
    path=output_path,
    mode='overwrite',
    dataset=True,
    database="coe_analytics_tmp",
    table=output_table,
)

{'paths': ['s3://rimac-analytics-temporal/individuals/Dante/cross_prop_veh/predictions/proba_202104/46556f4ad9e44413b41f931685c83ec3.snappy.parquet'],
 'partitions_values': {}}

In [18]:
# Display the database-qualified name of the period-specific predictions table.
f"coe_analytics_tmp.crossveh_test_{period}"

'coe_analytics_tmp.crossveh_test_202104'

In [10]:
# Load the "all" dataset from S3 for scoring.
all_data_path = "s3://rimac-analytics-temporal/individuals/Dante/cross-sell-veh/data/prec/all/data.parquet"
all_data = wr.s3.read_parquet(all_data_path)


In [11]:
# Identifier / label columns that must not be passed to the model.
non_feature_cols = ['periodo_target', 'cuc', 'placa', 'target', 'jerarquia']

# Build the feature matrix. A 'preds' column left over from a previous run of
# this cell is dropped as well, so re-running the cell does not feed earlier
# predictions back into the model as a feature. The required columns are still
# dropped strictly (KeyError if one is missing).
all_data_features = (
    all_data
    .drop(columns=non_feature_cols)
    .drop(columns=['preds'], errors='ignore')
)

# Score with the model's best iteration and attach the scores to the frame.
preds = model.predict(all_data_features, num_iteration=model.best_iteration)
all_data['preds'] = preds

In [12]:
# Spot-check every scored row for a single `cuc` value.
# NOTE(review): the rendered output shows `cuc` and `placa` values (presumably
# customer and plate identifiers) -- clear it before sharing the notebook.
all_data.loc[all_data['cuc'] == '1-44956524']

Unnamed: 0,avg_beneficio_pps,scoreingreso,tipotrabajador,dem_edad,last_prima_tot_miscli,avg_prima_tot_miscli,PRIMA_PROM_GRUPOS,PRIMA_PROM_RIESGOS,ratio_inc_prima_tot_miscli,dem_lima_agrup,...,VEH_var,RIESGOS_GENERALES_riesgos,months_ant_miscli2,num_meses_miscli,target,periodo_target,cuc,placa,jerarquia,preds
397324,,350.0,REG. ESPECIAL D. LEG.1057,35.0,1.865592,2.921455,1.865592,1.865592,0.638583,PROVINCIA,...,0.0,1.0,36.0,11.0,1.0,202012,1-44956524,05804U,0,0.155541
1051672,,350.0,REG. ESPECIAL D. LEG.1057,35.0,1.865592,2.921455,1.865592,1.865592,0.638583,PROVINCIA,...,0.0,1.0,36.0,11.0,1.0,202012,1-44956524,BTD094,1,0.034214
1051673,,350.0,REG. ESPECIAL D. LEG.1057,35.0,1.865592,2.921455,1.865592,1.865592,0.638583,PROVINCIA,...,0.0,1.0,36.0,11.0,1.0,202012,1-44956524,BTD094,1,0.034214
1410201,,350.0,REG. ESPECIAL D. LEG.1057,35.0,1.865592,2.921455,1.865592,1.865592,0.638583,PROVINCIA,...,0.0,1.0,36.0,11.0,1.0,202012,1-44956524,55407U,0,0.021029


In [13]:
# Write identifiers, label and score for the full dataset to S3 (overwriting
# any previous content at the path) and register them as table
# crossveh_test_back in database coe_analytics_tmp.
backtest_columns = ['periodo_target', 'cuc', 'placa', 'target', 'preds']
backtest_path = 's3://rimac-analytics-temporal/individuals/Dante/cross_prop_veh/predictions/all/'
backtest_table = "crossveh_test_back"

wr.s3.to_parquet(
    df=all_data[backtest_columns],
    path=backtest_path,
    mode='overwrite',
    dataset=True,
    database="coe_analytics_tmp",
    table=backtest_table,
)

{'paths': ['s3://rimac-analytics-temporal/individuals/Dante/cross_prop_veh/predictions/all/60d0406af3c744ceb02f76d42b9a0973.snappy.parquet'],
 'partitions_values': {}}

In [32]:
# Display the period-specific predictions table name (without database prefix).
f"crossveh_test_{period}"

'crossveh_test_202104'