-------
# Solution DEMO

In [1]:
import os
import sys
import json
import joblib
import pandas as pd
from io import BytesIO
from google.cloud import storage

---
### Reading artifacts

In [2]:
# Prefix (inside the bucket) that the artifact-reading cells below prepend to each file name.
path_ref = 'artifacts/training_pipeline/production/'

# Cloud Storage client and a handle on the 'gpa-churn' bucket.
storage_client = storage.Client()
bucket = storage_client.get_bucket('gpa-churn')

In [3]:
# Download and parse the endpoint description stored under the artifact prefix.
# `endpoint_information` is bound only once (to the parsed dict) instead of first
# holding the file name, so re-running the cell never sees a stale string.
# `download_as_bytes` replaces the deprecated `download_as_string` alias; both
# return the raw bytes of the object, which `json.loads` accepts directly.
blob = bucket.blob(f'{path_ref}endpoint_information.json')
endpoint_information = json.loads(blob.download_as_bytes())
endpoint_information

{'project_number': '437364709834', 'endpoint': '4274364647118733312'}

In [4]:
# Download and parse the model-monitor description stored under the artifact prefix.
# `model_monitor_information` is bound only once (to the parsed dict) instead of
# first holding the file name, so re-running the cell never sees a stale string.
# `download_as_bytes` replaces the deprecated `download_as_string` alias; both
# return the raw bytes of the object, which `json.loads` accepts directly.
blob = bucket.blob(f'{path_ref}model_monitor_information.json')
model_monitor_information = json.loads(blob.download_as_bytes())
# Show only the monitor id from the parsed document.
model_monitor_information['model_monitor_id']

'6708920335770583040'

In [5]:
# Read the metrics table stored under the artifact prefix and display it.
metrics = pd.read_parquet(f'gs://gpa-churn/{path_ref}metrics.parquet')
metrics

Unnamed: 0,roc,precision,recall,f1
0,0.813921,0.530399,0.738476,0.617377


---

## Endpoint (Real-time)

### Create input and upload to cloud storage

In [6]:
# Build the endpoint (real-time) input frame from a few processed parquet parts.
#
# Dedicated names are used for the bucket name and the object prefix so that this
# cell does not overwrite `bucket` (the Bucket handle) and `path_ref` (the artifact
# prefix) that the "Reading artifacts" cells rely on.
input_bucket_name = 'gpa-churn'
input_prefix = 'data/processed/input/'

# listing the objects whose name contains the input prefix
#-------------------------------------------------------
# NOTE(review): `list_blobs` also accepts a `prefix=` argument that would avoid
# listing the whole bucket; the substring filter is kept here so that exactly the
# same objects are selected as before.
storage_client = storage.Client()
obj_list = [
    blob_item.name
    for blob_item in storage_client.list_blobs(input_bucket_name)
    if input_prefix in blob_item.name
]
# Keep entries 1..4 of the listing; entry 0 is skipped.
# NOTE(review): presumably entry 0 is the folder placeholder object - confirm.
obj_list = obj_list[1:5]

# reading the selected parquet files
#-------------------------------------------------------
df_list = []
for obj in obj_list:
    # Object names already include the prefix, so the URI is just bucket + name.
    # (The previous log line prepended the folder path a second time.)
    obj_uri = f'gs://{input_bucket_name}/{obj}'
    df_list.append(pd.read_parquet(obj_uri))
    print(f'added {obj_uri}')

# concatenating df_list, dropping exact duplicate rows and renumbering the index
#-------------------------------------------------------
df_endpoint_input = (
    pd.concat(df_list, axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)

added gs://gpa-churn/data/processed/input/data/processed/input/after_stix_0.parquet
added gs://gpa-churn/data/processed/input/data/processed/input/after_stix_1.parquet
added gs://gpa-churn/data/processed/input/data/processed/input/after_stix_10.parquet
added gs://gpa-churn/data/processed/input/data/processed/input/after_stix_11.parquet


In [7]:
# Preview the first five rows of the endpoint input frame.
df_endpoint_input.head(n=5)

Unnamed: 0,cod_cliente,val_venda_bruta_cupom,qtd_item_venda,val_gross_margin_cupom,val_vend_bruta_mercad,flg_vend_meu_desct,valor_desconto,flag_dev,tipo_promo_0,tipo_promo_1,...,sexo,cidade,uf,region,pib_percapita,idade,delta_de_cadastro,ind_email,cadastro_stix,delta_de_stix
0,1,11.48,2.0,4.271,11.48,0.0,0.0,0.0,0.0,0.0,...,F,fortaleza,ce,ne,17912.0,18662.0,7845.0,,0,
1,14,583.659973,47.0,168.953995,521.73999,0.0,0.0,0.0,0.0,0.0,...,F,brasilia,df,co,90742.0,24995.0,553.0,,0,
2,14,844.429993,55.0,188.481003,648.880005,0.0,0.0,0.0,0.0,0.0,...,F,brasilia,df,co,90742.0,25026.0,584.0,,0,
3,14,434.459991,32.0,103.25,375.369995,0.0,0.0,0.0,0.0,0.0,...,F,brasilia,df,co,90742.0,25057.0,615.0,,0,
4,14,722.330017,28.0,136.904007,487.25,0.0,0.0,0.0,0.0,0.0,...,F,brasilia,df,co,90742.0,25085.0,643.0,,0,


In [8]:
# Upload a 2000-row sample of the endpoint input as the drift-validation dataset.
#
# NOTE(review): this cell used to start with
#   df_endpoint_input[df_endpoint_input['idade']>=28000]
# whose result was never assigned or displayed, so the sample has always been drawn
# from the full frame. If the dataset was meant to contain only that subset, sample
# from the filtered frame instead - confirm the intent.
#
# A fixed random_state makes the uploaded sample identical across re-runs.
df_drift_sample = df_endpoint_input.sample(n=2000, random_state=42)
df_drift_sample.to_parquet(
    'gs://gpa-churn/data/processed/drift_validation/dataset_0.parquet',
    index=False,
    compression='gzip',
)

At this point, the trigger should start the endpoint prediction pipeline.

### Reading predictions from endpoint (real-time approach)

In [10]:
# Load one prediction file from the endpoint output folder and preview it.
# NOTE(review): the file stem matches the `batch_id` column shown in the saved
# output - presumably the pipeline names each file after its batch id; confirm.
path = 'gs://gpa-churn/data/processed/output/'
filename = '51f0d22d-867d-4d3a-b1f5-243bbd20faab.parquet'
full_path = path + filename
df_endpoint_output = pd.read_parquet(full_path)
df_endpoint_output.head()

Unnamed: 0,cod_cliente,churn_prediction,reference_date,model_id,prediction_time,batch_id,model_stage
0,11143747,0.660241,2022-02-01,6842661081882558464,2022-06-10 16:46:27,51f0d22d-867d-4d3a-b1f5-243bbd20faab,poc
1,414237,0.200775,2022-03-01,6842661081882558464,2022-06-10 16:46:27,51f0d22d-867d-4d3a-b1f5-243bbd20faab,poc
2,11214410,0.351286,2022-03-01,6842661081882558464,2022-06-10 16:46:27,51f0d22d-867d-4d3a-b1f5-243bbd20faab,poc
3,10849732,0.052581,2022-02-01,6842661081882558464,2022-06-10 16:46:27,51f0d22d-867d-4d3a-b1f5-243bbd20faab,poc
4,19394792,0.523533,2022-01-01,6842661081882558464,2022-06-10 16:46:27,51f0d22d-867d-4d3a-b1f5-243bbd20faab,poc


In [12]:
# number of prediction rows in the endpoint output
# (a row count, not a count of distinct `cod_cliente` values)
df_endpoint_output.shape[0]

2000

---

## Batch

### Create input and upload to cloud storage

In [13]:
# Pick one processed parquet part to serve as the batch prediction input.
#
# Dedicated names are used for the bucket name and the object prefix so that this
# cell does not overwrite `bucket` (the Bucket handle) and `path_ref` (the artifact
# prefix) that the "Reading artifacts" cells rely on. The previously unused `path`
# variable is no longer assigned here.
batch_bucket_name = 'gpa-churn'
batch_prefix = 'data/processed/input/'

# listing the objects whose name contains the input prefix
#-------------------------------------------------------
storage_client = storage.Client()
obj_list = [
    blob_item.name
    for blob_item in storage_client.list_blobs(batch_bucket_name)
    if batch_prefix in blob_item.name
]
# Use entry 1 of the listing; entry 0 is skipped.
# NOTE(review): presumably entry 0 is the folder placeholder object - confirm.
obj = obj_list[1]

# reading the selected parquet file
#-------------------------------------------------------
batch_filepath = f'gs://{batch_bucket_name}/{obj}'
df_batch_input = pd.read_parquet(batch_filepath)
print(f'added {batch_filepath}')
df_batch_input.head()

added gs://gpa-churn/data/processed/input/after_stix_0.parquet


Unnamed: 0,cod_cliente,val_venda_bruta_cupom,qtd_item_venda,val_gross_margin_cupom,val_vend_bruta_mercad,flg_vend_meu_desct,valor_desconto,flag_dev,tipo_promo_0,tipo_promo_1,...,sexo,cidade,uf,region,pib_percapita,idade,delta_de_cadastro,ind_email,cadastro_stix,delta_de_stix
0,1,11.48,2.0,4.271,11.48,0.0,0.0,0.0,0.0,0.0,...,F,fortaleza,ce,ne,17912.0,18662.0,7845.0,,0,
1,14,583.659973,47.0,168.953995,521.73999,0.0,0.0,0.0,0.0,0.0,...,F,brasilia,df,co,90742.0,24995.0,553.0,,0,
2,14,844.429993,55.0,188.481003,648.880005,0.0,0.0,0.0,0.0,0.0,...,F,brasilia,df,co,90742.0,25026.0,584.0,,0,
3,14,434.459991,32.0,103.25,375.369995,0.0,0.0,0.0,0.0,0.0,...,F,brasilia,df,co,90742.0,25057.0,615.0,,0,
4,14,722.330017,28.0,136.904007,487.25,0.0,0.0,0.0,0.0,0.0,...,F,brasilia,df,co,90742.0,25085.0,643.0,,0,


In [14]:
# Write the frame back to `batch_filepath` - the same object it was read from -
# gzip-compressed and without the index.
df_batch_input.to_parquet(batch_filepath, compression='gzip', index=False)

At this point, the trigger should start the batch prediction pipeline.

### Reading batch prediction output

In [15]:
# Load the predictions file from the batch output folder and preview it.
path = 'gs://gpa-churn/data/processed/batch_output/'
filename = 'predictions.parquet'
full_path = path + filename
df_batch_output = pd.read_parquet(full_path)
df_batch_output.head()

Unnamed: 0,cod_cliente,churn_prediction,reference_date,prediction_time,model_stage
0,1,0.868395,2022-02-01,2022-06-10 16:54:57,poc
1,14,0.368346,2021-12-01,2022-06-10 16:54:57,poc
2,14,0.22718,2022-01-01,2022-06-10 16:54:57,poc
3,14,0.215375,2022-02-01,2022-06-10 16:54:57,poc
4,14,0.207159,2022-03-01,2022-06-10 16:54:57,poc


In [16]:
# number of prediction rows in the batch output
# (a row count, not a count of distinct `cod_cliente` values)
df_batch_output.shape[0]

1468915