In [4]:
import os
import sys
import pytz
import glob
import joblib
import pandas as pd
from datetime import datetime as dt
from sklearn.pipeline import Pipeline

sys.path.append(os.path.dirname(os.getcwd()) + '/src')
import utils
import get_data
import pipeline_modules
from guara.modeling.supervised_modelz import *

In [60]:
def load_artifacts():
    """Load the staged model and its two preprocessing pipelines from disk.

    Experiment directories are looked up under
    ``<parent of the current working directory>/artifacts/staging/``.  From the
    selected experiment directory the files ``model.joblib``,
    ``fe_pipeline.joblib`` and ``fs_pipeline.joblib`` are loaded with
    ``joblib.load``.

    When more than one experiment is staged, the directory whose name sorts
    last is used, so the choice is deterministic instead of depending on the
    arbitrary order returned by ``glob``.
    NOTE(review): this assumes experiment IDs are timestamp-like
    (e.g. ``20220520-201927``) so that "sorts last" means "newest" -- confirm.

    Returns:
        tuple: ``(model, fe_pipeline, fs_pipeline)``.

    Raises:
        SystemExit: if the staging folder contains no experiment.  Errors
            raised while loading the artifacts themselves are propagated
            unchanged instead of being reported as "no experiment".
    """
    parent_dir = os.path.dirname(os.getcwd())
    print(f'parent dir: {parent_dir}/artifacts/staging/')

    # Sort so the selected experiment does not depend on filesystem ordering.
    staged_experiments = sorted(glob.glob(parent_dir + '/artifacts/staging/*'))
    if not staged_experiments:
        print('No experiment in production, exiting...')
        # Raise SystemExit directly: the interactive `exit()` helper shuts
        # down the whole kernel when called from a notebook.
        raise SystemExit(1)

    production_id = os.path.basename(staged_experiments[-1])
    print(f'Production Experiment: {production_id}')

    experiment_dir = os.path.join(parent_dir, 'artifacts', 'staging', production_id)

    # NOTE: joblib.load unpickles arbitrary Python objects; only point this at
    # artifact directories you trust.
    model = joblib.load(os.path.join(experiment_dir, 'model.joblib'))
    print('Model Loaded')

    fe_pipeline = joblib.load(os.path.join(experiment_dir, 'fe_pipeline.joblib'))
    print('Feature engineering pipeline loaded')

    fs_pipeline = joblib.load(os.path.join(experiment_dir, 'fs_pipeline.joblib'))
    print('Feature selection pipeline loaded')

    return model, fe_pipeline, fs_pipeline

In [61]:
# Load the staged model and both preprocessing pipelines into the notebook namespace.
model, fe_pipeline, fs_pipeline = load_artifacts()

parent dir: /home/jupyter/poc-gpa-churn-italo/artifacts/staging/
Production Experiment: 20220520-201927
Model Loaded
Feature engineering pipeline loaded
Feature selection pipeline loaded


In [62]:
# Display the loaded model object.
model

<lightgbm.basic.Booster at 0x7f76a5bc0e50>

In [63]:
# Display the steps of the loaded feature-engineering pipeline.
fe_pipeline

Pipeline(steps=[('drop_temporary_columns', drop_temporary_columns()),
                ('drop_with_low_variance',
                 drop_numerical_with_variance(columns=['val_venda_bruta_cupom',
                                                       'qtd_item_venda',
                                                       'flg_vend_meu_desct',
                                                       'valor_desconto',
                                                       'flag_dev',
                                                       'tipo_promo_0',
                                                       'tipo_promo_1',
                                                       'tipo_promo_2',
                                                       'tipo_promo_3',
                                                       'tipo_promo_4',
                                                       'tipo_promo_5',
                                                       'categoria_0',
                     

In [64]:
# Display the steps of the loaded feature-selection pipeline.
fs_pipeline

Pipeline(steps=[('select_with_correlation',
                 select_with_correlation(threshold=0.82))])

---

In [66]:
# Read one processed parquet file from GCS; it is used below as prediction input.
df_val = pd.read_parquet('gs://gpa-churn/data/processed/steps/after_stix_30.parquet')

In [67]:
# Preview the first rows of the loaded frame.
df_val.head()

Unnamed: 0,cod_cliente,val_venda_bruta_cupom,qtd_item_venda,val_gross_margin_cupom,val_vend_bruta_mercad,flg_vend_meu_desct,valor_desconto,flag_dev,tipo_promo_0,tipo_promo_1,...,sexo,cidade,uf,region,pib_percapita,idade,delta_de_cadastro,ind_email,cadastro_stix,delta_de_stix
0,1585777,27.84,10.0,4.514,64.010002,0.0,0.0,0.0,0.0,0.0,...,F,sao paulo,ac,n,17722.0,27944.0,3014.0,1.0,1,406.0
1,1585777,77.979996,2.0,13.234,49.970001,0.0,0.0,0.0,0.0,0.0,...,F,sao paulo,ac,n,17722.0,27975.0,3045.0,1.0,1,437.0
2,1585780,1.79,1.0,0.276,1.79,0.0,0.0,0.0,0.0,0.0,...,M,santos,sp,se,51140.0,26302.0,2925.0,,0,
3,1587327,1022.359985,147.0,243.003998,1172.390015,12.0,22.969999,0.0,12.0,0.0,...,F,brasilia,df,co,90742.0,23892.0,3883.0,1.0,1,184.0
4,1587327,988.690002,206.0,276.554993,1367.959961,15.0,30.129999,0.0,12.0,2.0,...,F,brasilia,df,co,90742.0,23923.0,3914.0,1.0,1,215.0


In [68]:
def make_predictions(
    X,
    model,
    fe_pipeline,
    fs_pipeline
    ):
    """Run the preprocessing pipelines on ``X`` and return the model's predictions.

    Args:
        X: input data; must expose ``.shape`` and be accepted by
            ``fe_pipeline.transform``.
        model: fitted estimator exposing ``predict``; it is attached to a
            ``SupervisedModelz('lgbm', 'regression')`` wrapper before use.
        fe_pipeline: fitted feature-engineering pipeline (applied first).
        fs_pipeline: fitted feature-selection pipeline (applied second).

    Returns:
        Whatever ``model.predict`` returns for the transformed data.
    """
    wrapper = SupervisedModelz('lgbm', 'regression')
    wrapper.model = model
    utils.log('Successfully loaded model')

    # Remember the incoming shape so the log shows the effect of both pipelines.
    shape_before = X.shape
    engineered = fe_pipeline.transform(X)
    selected = fs_pipeline.transform(engineered)
    utils.log('Successfully applied pipelines')
    utils.log(f'X transformed: {shape_before} -> {selected.shape}')

    return wrapper.model.predict(selected)

In [69]:
# Feed every column except 'target' (when present) to the model.
features = [col for col in df_val.columns if col != 'target']

pred = make_predictions(
    X=df_val[features],
    model=model,
    fe_pipeline=fe_pipeline,
    fs_pipeline=fs_pipeline,
)

[2022-05-27 15:33:35] Successfully loaded model
Dropped numerical variables: 
 {'flag_dev', 'agg_l3m_flag_dev'}



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[2022-05-27 15:33:37] Successfully applied pipelines
[2022-05-27 15:33:37] X transformed: (111016, 58) -> (111016, 35)


In [70]:
# Display the raw prediction array returned by make_predictions.
pred

array([0.72822965, 0.64891126, 0.87812256, ..., 0.5715512 , 0.51277254,
       0.6714902 ])

In [82]:
from datetime import datetime
import uuid
def make_predictions_table(pred, X, variant='20220527', model_status='poc'):
    """Build the output table pairing each customer with its prediction.

    Args:
        pred: one prediction per row of ``X`` (list, array, or pandas
            Series/Index).  Values are matched to rows by position; a pandas
            object is converted to an array first so that its own index cannot
            silently misalign the values into NaNs.
        X: DataFrame containing a ``cod_cliente`` column.
        variant: identifier written to the ``variant`` column.  Defaults to the
            previously hard-coded value ``'20220527'``.
        model_status: value written to the ``model_status`` column.  Defaults
            to the previously hard-coded value ``'poc'``.

    Returns:
        pandas.DataFrame indexed like ``X`` with columns ``cod_cliente``,
        ``churn_prediction``, ``prediction_time``, ``variant``, ``batch_id``
        and ``model_status``.  ``prediction_time`` and ``batch_id`` are the
        same for every row of one call.

    Raises:
        ValueError: if ``pred`` and ``X`` have different lengths.
    """
    # Strip any pandas index so assignment is purely positional.
    scores = pred.to_numpy() if isinstance(pred, (pd.Series, pd.Index)) else pred

    if len(scores) != len(X):
        raise ValueError(
            f'pred has {len(scores)} values but X has {len(X)} rows'
        )

    Xpred = X[['cod_cliente']].copy()
    Xpred['churn_prediction'] = scores
    # Local wall-clock time of the call, formatted as a string.
    Xpred['prediction_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    Xpred['variant'] = variant
    # A single random UUID tags every row produced by this call.
    Xpred['batch_id'] = str(uuid.uuid4())
    Xpred['model_status'] = model_status

    return Xpred

In [83]:
# Build and display the prediction table for the frame loaded above.
make_predictions_table(pred, df_val)

Unnamed: 0,cod_cliente,churn_prediction,prediction_time,variant,batch_id,model_status
0,1585777,0.728230,2022-05-27 18:46:31,20220527,633e2377-9683-452d-8c3d-14633947000f,poc
1,1585777,0.648911,2022-05-27 18:46:31,20220527,633e2377-9683-452d-8c3d-14633947000f,poc
2,1585780,0.878123,2022-05-27 18:46:31,20220527,633e2377-9683-452d-8c3d-14633947000f,poc
3,1587327,0.009670,2022-05-27 18:46:31,20220527,633e2377-9683-452d-8c3d-14633947000f,poc
4,1587327,0.003378,2022-05-27 18:46:31,20220527,633e2377-9683-452d-8c3d-14633947000f,poc
...,...,...,...,...,...,...
111011,53269348,0.839738,2022-05-27 18:46:31,20220527,633e2377-9683-452d-8c3d-14633947000f,poc
111012,53285868,0.638432,2022-05-27 18:46:31,20220527,633e2377-9683-452d-8c3d-14633947000f,poc
111013,53285868,0.571551,2022-05-27 18:46:31,20220527,633e2377-9683-452d-8c3d-14633947000f,poc
111014,53285876,0.512773,2022-05-27 18:46:31,20220527,633e2377-9683-452d-8c3d-14633947000f,poc


In [90]:
# Disabled one-off step, kept for reference: when uncommented it copies
# after_stix_28..31.parquet from data/processed/steps/ to data/processed/test/
# in the gpa-churn bucket using gsutil.
# for i in range(28,32):
#     os.system(f'gsutil cp gs://gpa-churn/data/processed/steps/after_stix_{i}.parquet gs://gpa-churn/data/processed/test/after_stix_{i}.parquet')

In [91]:
import get_data

In [97]:
from google.cloud import storage
def get_prediction_data(
    bucket:str='gpa-churn',
    prefix:str='data/processed/test/after_stix_'
    ):
    """Read all parquet objects under ``prefix`` in ``bucket`` into one feature frame.

    Every object whose name starts with ``prefix`` is read with
    ``pandas.read_parquet``, the frames are concatenated row-wise, exact
    duplicate rows are dropped and the index is reset.  The ``target`` column
    is removed if it is present.

    Args:
        bucket: name of the GCS bucket to list and read from.
        prefix: object-name prefix selecting the files to load.

    Returns:
        pandas.DataFrame: the combined data without the ``target`` column.

    Raises:
        ValueError: if no object in ``bucket`` matches ``prefix``.
    """
    storage_client = storage.Client()

    # Filter server-side by prefix instead of listing the whole bucket and
    # substring-matching every object name on the client.
    blob_names = [
        blob.name for blob in storage_client.list_blobs(bucket, prefix=prefix)
    ]
    if not blob_names:
        raise ValueError(
            f'No objects found in gs://{bucket}/ with prefix {prefix!r}'
        )

    frames = []
    for blob_name in blob_names:
        # Build the URI from the `bucket` argument so a non-default bucket is
        # actually read (it used to be hard-coded to gpa-churn).
        frames.append(pd.read_parquet(f'gs://{bucket}/{blob_name}'))
        # blob_name already contains the prefix; do not prepend it again.
        print(f'added {blob_name}')

    df = (
        pd.concat(frames, axis=0)
        .drop_duplicates()
        .reset_index(drop=True)
    )

    target = 'target'
    # errors='ignore': prediction data may legitimately lack the target column.
    X = df.drop(columns=[target], errors='ignore')
    print('Successfully read test data.')
    print(f'X:{X.shape}')

    return X

In [98]:
# Load the prediction input with the default bucket/prefix and display the resulting frame.
get_prediction_data()

added data/processed/test/after_stix_data/processed/test/after_stix_28.parquet
added data/processed/test/after_stix_data/processed/test/after_stix_29.parquet
added data/processed/test/after_stix_data/processed/test/after_stix_30.parquet
added data/processed/test/after_stix_data/processed/test/after_stix_31.parquet
Successfully read test data.
X:(426514, 58)


Unnamed: 0,cod_cliente,val_venda_bruta_cupom,qtd_item_venda,val_gross_margin_cupom,val_vend_bruta_mercad,flg_vend_meu_desct,valor_desconto,flag_dev,tipo_promo_0,tipo_promo_1,...,sexo,cidade,uf,region,pib_percapita,idade,delta_de_cadastro,ind_email,cadastro_stix,delta_de_stix
0,1491247,132.279999,32.0,62.290001,172.709991,0.0,0.0,0.0,0.0,0.0,...,F,santana de parnaiba,sp,se,51140.0,24208.0,7984.0,,0,
1,1506612,77.160004,6.0,19.107000,77.160004,0.0,0.0,0.0,0.0,0.0,...,M,sao paulo,sp,se,51140.0,18770.0,1605.0,,0,
2,1506658,33.369999,6.0,8.052999,50.639999,0.0,0.0,0.0,0.0,0.0,...,M,santos,sp,se,51140.0,23052.0,6423.0,,0,
3,1507390,49.990002,2.0,26.034000,99.980003,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,0,
4,1511025,173.550003,8.0,29.380001,190.529999,0.0,0.0,0.0,0.0,0.0,...,F,sao paulo,sp,se,51140.0,19485.0,3761.0,1.0,1,536.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
426509,53325944,86.129997,8.0,21.305000,72.379997,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,0,
426510,53346080,1201.329956,94.0,5.044000,14.290000,0.0,0.0,0.0,0.0,0.0,...,F,sorocaba,sp,se,51140.0,13033.0,126.0,,0,
426511,53346080,967.619995,64.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,F,sorocaba,sp,se,51140.0,13064.0,157.0,,0,
426512,53346080,777.059998,50.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,F,sorocaba,sp,se,51140.0,13095.0,188.0,,0,


---

In [105]:
# Read the stored predictions parquet from GCS and preview its first 40 rows.
df = pd.read_parquet('gs://gpa-churn/data/processed/output/predictions.parquet')
df.head(40)

Unnamed: 0,cod_cliente,churn_prediction,prediction_time,variant,batch_id,model_status
0,1491247,0.802752,2022-05-27 20:04:00,20220520-201927,c0485759-76a1-4e84-bc03-1fd2bd9a36c3,poc
1,1506612,0.851889,2022-05-27 20:04:00,20220520-201927,c0485759-76a1-4e84-bc03-1fd2bd9a36c3,poc
2,1506658,0.84503,2022-05-27 20:04:00,20220520-201927,c0485759-76a1-4e84-bc03-1fd2bd9a36c3,poc
3,1507390,0.936035,2022-05-27 20:04:00,20220520-201927,c0485759-76a1-4e84-bc03-1fd2bd9a36c3,poc
4,1511025,0.503865,2022-05-27 20:04:00,20220520-201927,c0485759-76a1-4e84-bc03-1fd2bd9a36c3,poc
5,1511025,0.638865,2022-05-27 20:04:00,20220520-201927,c0485759-76a1-4e84-bc03-1fd2bd9a36c3,poc
6,1511025,0.525433,2022-05-27 20:04:00,20220520-201927,c0485759-76a1-4e84-bc03-1fd2bd9a36c3,poc
7,1520043,0.800168,2022-05-27 20:04:00,20220520-201927,c0485759-76a1-4e84-bc03-1fd2bd9a36c3,poc
8,1520043,0.662534,2022-05-27 20:04:00,20220520-201927,c0485759-76a1-4e84-bc03-1fd2bd9a36c3,poc
9,1520043,0.774884,2022-05-27 20:04:00,20220520-201927,c0485759-76a1-4e84-bc03-1fd2bd9a36c3,poc
