In [1]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from google.cloud import storage

In [2]:
# Base GCS URI of the project bucket; object names are appended to it when reading files.
prefix = 'gs://gpa-churn/'

## Preprocessing data from STAGING

In [3]:
# Names of every object in the bucket whose name contains 'STAGING/'
storage_client = storage.Client()
obj_list = [
    blob.name
    for blob in storage_client.list_blobs('gpa-churn')
    if 'STAGING/' in blob.name
]

# Logical table name -> substring that identifies that table's files
table_patterns = {
    'registro': 'cadastro',
    'stix': 'stix_optin',
    'ativacao': 'ativacao_resgate_stix',
    'email': 'optin_email',
    'itens': 'relevanc_item',
    'loja': 'relevanc_store',
    'produto': 'relevanc_product',
    'desconto': 'ativ_meu_desconto',
    'vendas': 'mov_vend_crm',
}

# Group the staged object names by the table they belong to
dict_tables = {
    table: [name for name in obj_list if pattern in name]
    for table, pattern in table_patterns.items()
}

In [4]:
# Inspect which staged object names were matched to each table
dict_tables

{'registro': ['STAGING/tck_origemcadastro_20220425.zip',
  'STAGING/tck_origemcadastro_20220427.zip',
  'STAGING/tck_origemcadastro_20220428.zip',
  'STAGING/tck_origemcadastro_20220429.zip',
  'STAGING/tck_origemcadastro_20220430.zip',
  'STAGING/tck_origemcadastro_20220501.zip',
  'STAGING/tck_origemcadastro_20220502.zip'],
 'stix': ['STAGING/v_stix_optin_20220427.zip',
  'STAGING/v_stix_optin_20220428.zip',
  'STAGING/v_stix_optin_20220429.zip',
  'STAGING/v_stix_optin_20220430.zip',
  'STAGING/v_stix_optin_20220501.zip',
  'STAGING/v_stix_optin_20220502.zip'],
 'ativacao': ['STAGING/ativacao_resgate_stix_20220425.zip',
  'STAGING/ativacao_resgate_stix_20220427.zip',
  'STAGING/ativacao_resgate_stix_20220428.zip',
  'STAGING/ativacao_resgate_stix_20220429.zip',
  'STAGING/ativacao_resgate_stix_20220430.zip',
  'STAGING/ativacao_resgate_stix_20220501.zip'],
 'email': ['STAGING/pa_optin_email_delivery_20220427.zip',
  'STAGING/pa_optin_email_delivery_20220428.zip',
  'STAGING/pa_optin

---

### Registro

Se houver clientes com mais de uma entrada de endereço, o que fazer? 
É possível mapear o momento em que aconteceu a mudança de endereço no cadastro? Manter primeira/última entrada?

* A atualização sobrescreve os valores anteriores (keep='last')
* `datcadorigem`: data em que o cliente se cadastrou no grupo (Extra/...)

In [16]:
# Read every 'registro' export listed in dict_tables and stack them into one frame.
df_list = []
for file in dict_tables['registro']:
    try:
        local_df = pd.read_csv(prefix + file, sep=';')
    except pd.errors.ParserError:
        # Only tokenising failures trigger the fallback; interrupts and I/O
        # errors now propagate instead of being silently swallowed.
        # Fallback: read each raw line as a single field, then split on ';'
        # by hand. Every column produced this way is a string.
        # NOTE(review): newer pandas releases may reject '\n' as a separator -
        # TODO confirm against the pandas version used in this environment.
        local_df = pd.read_csv(prefix + file, sep='\n')
        column_list = local_df.columns[0].split(';')
        local_df = local_df[local_df.columns[0]].str.split(';', expand=True)
        local_df.columns = column_list
    df_list.append(local_df)
    print('added file: ', file)
print('-'*10)

# Chain the clean-up steps instead of sorting in place, so the cell is
# idempotent and never mutates a derived frame.
registro = (
    pd.concat(df_list, axis=0)
    .dropna(subset=['datcadorigem'])
    .sort_values(by=['datcadorigem'])
)
registro

added file:  STAGING/tck_origemcadastro_20220425.zip
added file:  STAGING/tck_origemcadastro_20220427.zip
added file:  STAGING/tck_origemcadastro_20220428.zip
added file:  STAGING/tck_origemcadastro_20220429.zip
added file:  STAGING/tck_origemcadastro_20220430.zip
added file:  STAGING/tck_origemcadastro_20220501.zip
added file:  STAGING/tck_origemcadastro_20220502.zip
----------


Unnamed: 0,idcliente,codorigcliente,datcadorigem,datultatual,codsexo,cidadecli,ufcli,datnasccli
1191671,810904,CRM5,1997-02-04,2020-05-17,F,SAO PAULO,SP,1961-12-20
4490759,2078616,CRM5,1997-02-24,2017-08-25,M,SAO PAULO,SP,1962-08-17
5299361,17272344,CRM5,1998-08-12,2017-08-14,M,VALINHOS,SP,1958-12-19
287945,303150,CRM5,1999-04-26,2022-02-06,F,SAO PAULO,SP,1972-01-13
982538,3631526,CRM5,1999-10-02,2015-01-01,M,CAMPINAS,SP,1935-07-05
...,...,...,...,...,...,...,...,...
0,40986465,CRM5,2040-11-04,2040-11-04,F,SAO PAULO,SP,1949-03-03
4556992,38335999,CRM5,2040-11-04,2040-11-04,F,SALTO,SP,1988-04-28
0,38335999,CRM5,2040-11-04,2040-11-04,F,SALTO,SP,1988-04-28
1,38335999,CRM5,2040-11-04,2040-11-04,F,SALTO,SP,1988-04-28


In [17]:
# Number of rows per distinct `datcadorigem` value
registro['datcadorigem'].value_counts()

2020-06-01    102817
2018-06-23     49396
2018-05-03     46559
2013-10-23     35415
2013-10-25     27888
               ...  
2008-02-16         1
2008-02-15         1
2008-02-11         1
2008-02-06         1
1997-02-04         1
Name: datcadorigem, Length: 8229, dtype: int64

In [24]:
# Rows whose `datcadorigem` lies between 2021-01-01 and 2022-05-05 (both inclusive)
mask_2021 = registro['datcadorigem'].ge('2021-01-01')
mask_2022 = registro['datcadorigem'].le('2022-05-05')
registro.loc[mask_2021 & mask_2022]

Unnamed: 0,idcliente,codorigcliente,datcadorigem,datultatual,codsexo,cidadecli,ufcli,datnasccli
9636244,52136169,CRM5,2021-01-01,2021-01-01,N,,,1965-09-25
6621822,52124577,CRM5,2021-01-01,2021-01-01,F,,,1970-02-22
3193530,52124644,CRM5,2021-01-01,2021-01-01,F,TERESINA,PI,1976-08-19
7407828,52132288,CRM5,2021-01-01,2021-01-01,N,,,2000-05-09
9732894,52132800,CRM5,2021-01-01,2021-01-01,F,,,1981-08-08
...,...,...,...,...,...,...,...,...
75,37512247,CRM5,2022-05-01,2022-05-01,F,,,1980-12-19
74,36511397,CRM5,2022-05-01,2022-05-01,F,,,1959-08-02
71,35803223,CRM5,2022-05-01,2022-05-01,F,BELO HORIZONTE,MG,1975-10-18
64,32730461,CRM5,2022-05-01,2022-05-01,M,,,1940-04-22


In [60]:
# mask_2022 = registro['datcadorigem']>='2022-05-05'
# registro[mask_2021 & mask_2022]

---

### Stix

O campo dt_cadastro_origem (stix) é, de fato, a mesma informação de datcadorigem do dataframe 'registro'?

* `dt_cadastro_origem`: data em que o cliente fez cadastro no programa Stix
    * oct 2020

In [18]:
# Read every 'stix' export listed in dict_tables and stack them into one frame.
df_list = []
for file in dict_tables['stix']:
    try:
        local_df = pd.read_csv(prefix + file, sep=';')
    except pd.errors.ParserError:
        # Only tokenising failures trigger the fallback; interrupts and I/O
        # errors now propagate instead of being silently swallowed.
        # Fallback: read each raw line as a single field, then split on ';'
        # by hand. Every column produced this way is a string.
        # NOTE(review): newer pandas releases may reject '\n' as a separator -
        # TODO confirm against the pandas version used in this environment.
        local_df = pd.read_csv(prefix + file, sep='\n')
        column_list = local_df.columns[0].split(';')
        local_df = local_df[local_df.columns[0]].str.split(';', expand=True)
        local_df.columns = column_list
    df_list.append(local_df)
    print('added file: ', file)
print('-'*10)

# Chain the clean-up steps instead of sorting in place, so the cell is
# idempotent and never mutates a derived frame.
stix = (
    pd.concat(df_list, axis=0)
    .dropna(subset=['dt_cadastro_origem'])
    .sort_values(by=['dt_cadastro_origem'])
)
stix

added file:  STAGING/v_stix_optin_20220427.zip
added file:  STAGING/v_stix_optin_20220428.zip
added file:  STAGING/v_stix_optin_20220429.zip
added file:  STAGING/v_stix_optin_20220430.zip
added file:  STAGING/v_stix_optin_20220501.zip
added file:  STAGING/v_stix_optin_20220502.zip
----------


Unnamed: 0,id_cliente,dt_cadastro_origem,dt_nascimento,ind_email
1223762,30024411,2020-05-27,1997-08-28,1
1234656,45899961,2020-05-27,1991-08-16,1
1171283,30024411,2020-05-27,1997-08-28,1
1259802,45899961,2020-05-27,1991-08-16,1
1229389,30024411,2020-05-27,1997-08-28,1
...,...,...,...,...
2144,41710394,2022-05-02,1976-08-15,1
2906,3828296,2022-05-02,1972-12-26,2
2811,35082038,2022-05-02,1990-07-20,1
2646,48387949,2022-05-02,1987-10-08,1


In [19]:
# Number of rows per distinct `dt_cadastro_origem` value
stix['dt_cadastro_origem'].value_counts()

2020-06-01    2294985
2020-10-13     139870
2020-10-05     127589
2020-10-14     111765
2020-10-15     111420
               ...   
2022-05-01        464
2022-05-02         27
2020-05-28         24
2020-05-29         12
2020-05-27         12
Name: dt_cadastro_origem, Length: 704, dtype: int64

In [25]:
# Rows whose `dt_cadastro_origem` lies between 2021-01-01 and 2022-05-05 (both inclusive)
mask_2021 = stix['dt_cadastro_origem'].ge('2021-01-01')
mask_2022 = stix['dt_cadastro_origem'].le('2022-05-05')
stix.loc[mask_2021 & mask_2022]

Unnamed: 0,id_cliente,dt_cadastro_origem,dt_nascimento,ind_email
357079,52134884,2021-01-01,1984-11-23,1
2533125,44321757,2021-01-01,2000-09-29,1
2159716,52135999,2021-01-01,1976-10-29,1
1550639,52137022,2021-01-01,1998-03-16,1
558490,34750142,2021-01-01,1968-01-21,1
...,...,...,...,...
2144,41710394,2022-05-02,1976-08-15,1
2906,3828296,2022-05-02,1972-12-26,2
2811,35082038,2022-05-02,1990-07-20,1
2646,48387949,2022-05-02,1987-10-08,1


---

### Vendas - utilização do desconto

chave = [`ID_CLIENTE`, `NUM_CUPOM_FISCAL`, `COD_PLU_ORIG`]

Sem dados históricos.

* Tabela de ativação do desconto
* FLG_VEND_MEU_DESCT - uso do desconto
* VAL_DESCT_MEU_DESCT - valor do desconto

In [27]:
# Read every 'vendas' export listed in dict_tables and stack them into one frame.
df_list = []
for file in dict_tables['vendas']:
    try:
        local_df = pd.read_csv(prefix + file, sep=';')
    except pd.errors.ParserError:
        # Only tokenising failures trigger the fallback; interrupts and I/O
        # errors now propagate instead of being silently swallowed.
        # Fallback: read each raw line as a single field, then split on ';'
        # by hand. Every column produced this way is a string.
        # NOTE(review): newer pandas releases may reject '\n' as a separator -
        # TODO confirm against the pandas version used in this environment.
        local_df = pd.read_csv(prefix + file, sep='\n')
        column_list = local_df.columns[0].split(';')
        local_df = local_df[local_df.columns[0]].str.split(';', expand=True)
        local_df.columns = column_list
    df_list.append(local_df)
    print('added file: ', file)
print('-'*10)

# Chain the clean-up steps instead of sorting in place, so the cell is
# idempotent and never mutates a derived frame.
vendas = (
    pd.concat(df_list, axis=0)
    .dropna(subset=['DAT_VENDA'])
    .sort_values(by=['DAT_VENDA'])
)
vendas

added file:  STAGING/v_dw_fm16_mov_vend_crm_20220427.zip
added file:  STAGING/v_dw_fm16_mov_vend_crm_20220428.zip
added file:  STAGING/v_dw_fm16_mov_vend_crm_20220429.zip
added file:  STAGING/v_dw_fm16_mov_vend_crm_20220430.zip
added file:  STAGING/v_dw_fm16_mov_vend_crm_20220502.zip
----------


Unnamed: 0,DAT_VENDA,NUM_CUPOM_FISCAL,COD_PLU_ORIG,COD_LOJA,ID_CLIENTE,VAL_GROSS_MARGIN_CUPOM,VAL_VEND_BRUTA_MERCAD,FLG_VEND_MEU_DESCT,VAL_DESCT_MEU_DESCT,COD_TIPO_PROMO_DESCT_FOCADO,FLG_DEVOL
0,2022-04-26,512879,248037,1696,37750261,-2.570,4.40,0,0.00,,0
870870,2022-04-26,185452,1090871,1020,19369782,3.801,10.49,0,0.00,,0
870869,2022-04-26,476617,6009229,1414,22095132,0.900,4.79,0,0.00,,0
870868,2022-04-26,599673,1089900,5431,53449005,0.506,2.49,0,0.00,,0
870867,2022-04-26,152390,247764,1018,46661027,1.216,3.77,0,0.00,,0
...,...,...,...,...,...,...,...,...,...,...,...
439904,2022-05-01,445958,1175993,2409,11507101,2.770,14.84,1,1.65,G,0
439903,2022-05-01,240710,4333364,50,3960897,1.624,6.19,0,0.00,,0
439902,2022-05-01,459356,4842262,1717,23981346,0.958,5.49,0,0.00,,0
439880,2022-05-01,17927,791540,6765,20263462,1.988,6.45,0,0.00,,0


In [43]:
# History check: rows whose `DAT_VENDA` lies between 2021-01-01 and 2022-01-01 (both inclusive)
mask_2021 = vendas['DAT_VENDA'].ge('2021-01-01')
mask_2022 = vendas['DAT_VENDA'].le('2022-01-01')
vendas.loc[mask_2021 & mask_2022]

Unnamed: 0,DAT_VENDA,NUM_CUPOM_FISCAL,COD_PLU_ORIG,COD_LOJA,ID_CLIENTE,VAL_GROSS_MARGIN_CUPOM,VAL_VEND_BRUTA_MERCAD,FLG_VEND_MEU_DESCT,VAL_DESCT_MEU_DESCT,COD_TIPO_PROMO_DESCT_FOCADO,FLG_DEVOL


In [44]:
# Rows whose `DAT_VENDA` lies between 2021-01-01 and 2022-05-05 (both inclusive)
mask_2021 = vendas['DAT_VENDA'].ge('2021-01-01')
mask_2022 = vendas['DAT_VENDA'].le('2022-05-05')
vendas.loc[mask_2021 & mask_2022]

Unnamed: 0,DAT_VENDA,NUM_CUPOM_FISCAL,COD_PLU_ORIG,COD_LOJA,ID_CLIENTE,VAL_GROSS_MARGIN_CUPOM,VAL_VEND_BRUTA_MERCAD,FLG_VEND_MEU_DESCT,VAL_DESCT_MEU_DESCT,COD_TIPO_PROMO_DESCT_FOCADO,FLG_DEVOL
0,2022-04-26,512879,248037,1696,37750261,-2.570,4.40,0,0.00,,0
870870,2022-04-26,185452,1090871,1020,19369782,3.801,10.49,0,0.00,,0
870869,2022-04-26,476617,6009229,1414,22095132,0.900,4.79,0,0.00,,0
870868,2022-04-26,599673,1089900,5431,53449005,0.506,2.49,0,0.00,,0
870867,2022-04-26,152390,247764,1018,46661027,1.216,3.77,0,0.00,,0
...,...,...,...,...,...,...,...,...,...,...,...
439904,2022-05-01,445958,1175993,2409,11507101,2.770,14.84,1,1.65,G,0
439903,2022-05-01,240710,4333364,50,3960897,1.624,6.19,0,0.00,,0
439902,2022-05-01,459356,4842262,1717,23981346,0.958,5.49,0,0.00,,0
439880,2022-05-01,17927,791540,6765,20263462,1.988,6.45,0,0.00,,0


In [30]:
# Smallest `DAT_VENDA` value present in the loaded sales data
vendas['DAT_VENDA'].min()

'2022-04-26'

---

### Itens

Compreende dados históricos. Semelhante/equivalente ao dataframe de vendas?

In [36]:
# Load only the first `chunksize` rows of the item export.
# `nrows` replaces the previous "iterate chunks and break" pattern, which left
# the chunked reader open and raised NameError if the reader yielded nothing.
chunksize = 10 ** 6
itens = (
    pd.read_csv(prefix + 'STAGING/relevanc_item_20220425.zip', sep=';', nrows=chunksize)
    # keep only rows that have both a sale date and a customer code
    .dropna(subset=['dat_venda','cod_cliente'])
    .sort_values(by=['dat_venda'])
)
itens

Unnamed: 0,cod_cliente,cod_tipo_item,cod_loja,dat_venda,num_cupom,cod_interno_prod,val_venda_bruta_cupom,qtd_item_venda
300393,35840229.0,Online,2337,2016-10-31,4696545,105002,4.49,1.0
185815,3347140.0,Online,2337,2016-10-31,4696494,1399035,8.79,1.0
300400,3347140.0,Online,2337,2016-10-31,4696494,3651568,5.55,2.0
185816,35840229.0,Online,2337,2016-10-31,4696545,1262520,3.39,15.0
300385,35840229.0,Online,2337,2016-10-31,4696545,1741209,5.69,2.0
...,...,...,...,...,...,...,...,...
823024,20691316.0,Online,2349,2022-04-23,12727900,4646266,1.79,4.0
823023,10513792.0,Online,2469,2022-04-23,12728391,1274811,5.77,3.0
823022,40361453.0,Online,2469,2022-04-23,12725160,247528,5.99,1.0
823020,6036514.0,Online,2349,2022-04-23,12726399,4218548,6.49,2.0


In [37]:
# History check: rows whose `dat_venda` lies between 2021-01-01 and 2022-01-01 (both inclusive)
mask_2021 = itens['dat_venda'].ge('2021-01-01')
mask_2022 = itens['dat_venda'].le('2022-01-01')
itens.loc[mask_2021 & mask_2022]

Unnamed: 0,cod_cliente,cod_tipo_item,cod_loja,dat_venda,num_cupom,cod_interno_prod,val_venda_bruta_cupom,qtd_item_venda
353295,22760005.0,Online,1885,2021-08-17,11180122,1228164,13.00,1.0
320815,22461574.0,Online,1885,2021-08-17,11182135,1126432,2.49,14.0
738445,24801152.0,Online,2073,2021-09-18,11380373,336796,5.58,1.0
542569,42192789.0,Online,2073,2021-09-18,11381309,5100903,14.99,1.0
542582,11012111.0,Online,2073,2021-09-22,11400344,1269092,7.79,1.0
...,...,...,...,...,...,...,...,...
234467,25082860.0,Offline,2071,2022-01-01,427532,4562375,3.29,3.0
369603,45888156.0,Offline,1774,2022-01-01,22483,2089898,11.29,1.0
234468,11456390.0,Offline,2071,2022-01-01,157534,3796832,6.29,1.0
369604,41662502.0,Offline,1774,2022-01-01,233048,8801661,6.39,3.0


In [38]:
# Rows whose `dat_venda` lies between 2021-01-01 and 2022-05-05 (both inclusive)
mask_2021 = itens['dat_venda'].ge('2021-01-01')
mask_2022 = itens['dat_venda'].le('2022-05-05')
itens.loc[mask_2021 & mask_2022]

Unnamed: 0,cod_cliente,cod_tipo_item,cod_loja,dat_venda,num_cupom,cod_interno_prod,val_venda_bruta_cupom,qtd_item_venda
353295,22760005.0,Online,1885,2021-08-17,11180122,1228164,13.00,1.0
320815,22461574.0,Online,1885,2021-08-17,11182135,1126432,2.49,14.0
738445,24801152.0,Online,2073,2021-09-18,11380373,336796,5.58,1.0
542569,42192789.0,Online,2073,2021-09-18,11381309,5100903,14.99,1.0
542582,11012111.0,Online,2073,2021-09-22,11400344,1269092,7.79,1.0
...,...,...,...,...,...,...,...,...
823024,20691316.0,Online,2349,2022-04-23,12727900,4646266,1.79,4.0
823023,10513792.0,Online,2469,2022-04-23,12728391,1274811,5.77,3.0
823022,40361453.0,Online,2469,2022-04-23,12725160,247528,5.99,1.0
823020,6036514.0,Online,2349,2022-04-23,12726399,4218548,6.49,2.0


---

### Ativação Stix

Não compreende registros históricos.
Descontinuada.

In [40]:
# Load only the first `chunksize` rows of the activation export.
# `nrows` replaces the previous "iterate chunks and break" pattern, which left
# the chunked reader open and raised NameError if the reader yielded nothing.
chunksize = 10 ** 6
ativacao = (
    pd.read_csv(prefix + 'STAGING/ativacao_resgate_stix_20220425.zip', sep=';', nrows=chunksize)
    # keep only rows that have both a month code and a customer code
    .dropna(subset=['cod_mes','cod_cliente'])
)
ativacao

Unnamed: 0,cod_cliente,val_meta_pri_desafio,val_faltante_pri_desafio,val_stix,perc_ating_pri_desafio,perc_ating_pri_desafio_m_1,perc_ating_pri_desafio_m_2,qtd_stix_recebido,qtd_stix_recebido_m_1,qtd_stix_recebido_m_2,qtd_stix_recebido_ano,ind_premio_ativado,ind_premio_ativado_m_1,ind_premio_ativado_m_2,ind_premio_resgatado,ind_premio_resgatado_m_1,ind_premio_resgatado_m_2,dat_proc,cod_mes
0,1635,239.99,239.99,1350,0,0,0,0,0,0,0,0,0,0,0,0,0,2022-04-24,202204
1,2771,1549.99,808.99,4800,48,100,94,0,0,0,5340,0,0,0,0,0,0,2022-04-24,202204
2,3644,149.99,149.99,600,0,0,0,0,0,0,0,0,0,0,0,0,0,2022-04-24,202204
3,4345,215.99,215.99,1300,0,0,0,0,0,0,0,0,0,0,0,0,0,2022-04-24,202204
4,5130,289.99,255.99,1400,12,12,80,0,0,0,0,0,0,0,0,0,0,2022-04-24,202204
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,12369555,199.99,199.99,800,0,0,70,0,0,0,0,0,0,0,0,0,0,2022-04-24,202204
999996,12369617,349.99,0.00,1800,100,100,100,1920,0,0,8920,1,0,0,1,0,0,2022-04-24,202204
999997,12369804,319.99,319.99,1500,0,0,0,0,0,0,0,0,0,0,0,0,0,2022-04-24,202204
999998,12370216,359.99,158.99,1600,56,0,13,0,0,0,0,0,0,0,0,0,0,2022-04-24,202204


In [42]:
# Number of rows per distinct `cod_mes` value
ativacao['cod_mes'].value_counts()

202204    1000000
Name: cod_mes, dtype: int64

---

### Descontos Stix

* DAT_ATIVA representa o dia em que o cliente fez uso do desconto ou recebeu uma notificação sobre o desconto?
* O dado será descontinuado.
* Em qual dataset seria possível mapear informações de uso de cupons/descontos?

In [48]:
# First 'desconto' export: keep rows that have a `DAT_ATIVA` value, ordered by it
descontos = (
    pd.read_csv(prefix + dict_tables['desconto'][0], sep=';')
    .dropna(subset=['DAT_ATIVA'])
    .sort_values(by=['DAT_ATIVA'])
)
descontos

Unnamed: 0,ID_CLIENTE,COD_PROMOCAO_PDV,DAT_PROM_INICIO,DAT_PROM_FIM,DAT_ATIVA,HOR_ATIVACAO,FLG_ATIVACAO,QTD_LIMITADOR_PRODUTO,QTD_VENDIDA_PRODUTO,NUM_CICLO_MEU_DESCONTO,COD_CLASSIFIC_PROMO_MEU_DSCT
0,48858185,922881,2022-04-21,2022-04-27,2022-04-26,155417,A,10.0,0.0,126,0
272347,17035028,930855,2022-04-14,2022-04-27,2022-04-26,171128,A,4.0,0.0,126,1
272346,37975254,924871,2022-04-14,2022-04-27,2022-04-26,83343,A,12.0,0.0,126,1
272345,3050709,920500,2022-04-14,2022-04-27,2022-04-26,151828,A,3.0,0.0,126,2
272344,22058624,920631,2022-04-14,2022-04-27,2022-04-26,100343,A,1.0,0.0,126,2
...,...,...,...,...,...,...,...,...,...,...,...
326093,46950545,923275,2022-04-14,2022-04-27,2022-04-27,1347,A,6.0,0.0,126,2
387419,6042426,920557,2022-04-14,2022-04-27,2022-04-27,5737,A,6.0,0.0,126,2
225453,25021388,923806,2022-04-14,2022-04-27,2022-04-27,5212,A,3.0,0.0,126,2
123902,4209906,922698,2022-04-21,2022-04-27,2022-04-27,22858,A,10.0,0.0,126,0


In [51]:
# History check: rows whose `DAT_ATIVA` lies between 2021-01-01 and 2022-01-01 (both inclusive)
mask_2021 = descontos['DAT_ATIVA'].ge('2021-01-01')
mask_2022 = descontos['DAT_ATIVA'].le('2022-01-01')
descontos.loc[mask_2021 & mask_2022]

Unnamed: 0,ID_CLIENTE,COD_PROMOCAO_PDV,DAT_PROM_INICIO,DAT_PROM_FIM,DAT_ATIVA,HOR_ATIVACAO,FLG_ATIVACAO,QTD_LIMITADOR_PRODUTO,QTD_VENDIDA_PRODUTO,NUM_CICLO_MEU_DESCONTO,COD_CLASSIFIC_PROMO_MEU_DSCT


---

### Email

Indica IDs válidos para ativação.

In [55]:
# Read every 'email' export listed in dict_tables and stack them into one frame.
df_list = []
for file in dict_tables['email']:
    try:
        local_df = pd.read_csv(prefix + file, sep=';', low_memory=False)
    except pd.errors.ParserError:
        # Only tokenising failures trigger the fallback; interrupts and I/O
        # errors now propagate instead of being silently swallowed.
        # Fallback: read each raw line as a single field, then split on ';'
        # by hand. Every column produced this way is a string.
        # NOTE(review): newer pandas releases may reject '\n' as a separator -
        # TODO confirm against the pandas version used in this environment.
        local_df = pd.read_csv(prefix + file, sep='\n', low_memory=False)
        column_list = local_df.columns[0].split(';')
        local_df = local_df[local_df.columns[0]].str.split(';', expand=True)
        local_df.columns = column_list
    df_list.append(local_df)
    print('added file: ', file)
print('-'*10)

# NOTE(review): files read through the fallback hold strings while the normal
# path infers dtypes, so a column such as `customer_id` can mix floats and
# zero-padded strings after concatenation - normalise before joining on it.
email = pd.concat(df_list, axis=0)
email

added file:  STAGING/pa_optin_email_delivery_20220427.zip
added file:  STAGING/pa_optin_email_delivery_20220428.zip
added file:  STAGING/pa_optin_email_delivery_20220429.zip
added file:  STAGING/pa_optin_email_delivery_20220430.zip
added file:  STAGING/pa_optin_email_delivery_20220501.zip
added file:  STAGING/pa_optin_email_delivery_20220502.zip
----------


Unnamed: 0,customer_id,email_deliverability_status,email_permission_status,dat_referencia
0,19702639.0,D,I,2022-04-26
1,37499976.0,D,I,2022-04-26
2,45418434.0,D,I,2022-04-26
3,10578739.0,D,I,2022-04-26
4,25073986.0,D,I,2022-04-26
...,...,...,...,...
6824551,0041732008,D,I,2022-05-01
6824552,0006918098,D,I,2022-05-01
6824553,0021595707,D,I,2022-05-01
6824554,0046712310,D,I,2022-05-01


In [56]:
# Number of rows per distinct `email_deliverability_status` value
email['email_deliverability_status'].value_counts()

D    35059477
Name: email_deliverability_status, dtype: int64

In [57]:
# Number of rows per distinct `email_permission_status` value
email['email_permission_status'].value_counts()

I    35059477
Name: email_permission_status, dtype: int64

In [58]:
# History check: rows whose `dat_referencia` lies between 2021-01-01 and 2022-01-01 (both inclusive)
mask_2021 = email['dat_referencia'].ge('2021-01-01')
mask_2022 = email['dat_referencia'].le('2022-01-01')
email.loc[mask_2021 & mask_2022]

Unnamed: 0,customer_id,email_deliverability_status,email_permission_status,dat_referencia


---

### Loja

In [53]:
# Store table: read the first file matched for 'loja'
loja = pd.read_csv(
    f"{prefix}{dict_tables['loja'][0]}",
    sep=';',
)
loja

Unnamed: 0,cod_loja,cod_band,desc_band,nom_band,sgl_uf,sgl_uf_regional
0,1,30400,PAO DE ACUCAR,PA,SP,SP/PR
1,3,30400,PAO DE ACUCAR,PA,SP,SP/PR
2,4,30400,PAO DE ACUCAR,PA,SP,SP/PR
3,6,30400,PAO DE ACUCAR,PA,SP,SP/PR
4,8,30400,PAO DE ACUCAR,PA,SP,SP/PR
...,...,...,...,...,...,...
565,8021,30500,E-COMMERCE..,PA,CE,NE
566,8028,30500,E-COMMERCE..,PA,MG,CO
567,8050,30500,E-COMMERCE..,PA,SP,SP/PR
568,8071,30500,E-COMMERCE..,PA,PE,NE


---

### Produto

In [52]:
# Product table: read the first file matched for 'produto'
produto = pd.read_csv(
    f"{prefix}{dict_tables['produto'][0]}",
    sep=';',
)
produto

Unnamed: 0,cod_plu,desc_plu,cod_categoria,desc_categoria,cod_subcategoria,desc_subcategoria,cod_grupo,desc_grupo,cod_subgrupo,desc_subgrupo,cod_departamento,desc_departamento,ind_ativo
0,833,MOCOTO BOV CONG KG,700,CARNES,701,BOVINOS,3,BOV-CONGELADOS,3,CONG-MIUDO BOVINO,2,PERECIVEIS,True
1,2233,AZEITONA VERDE MATONETO,900,PADARIA E ROTISSERIE,903,ROTISSERIE,10,EMPORIO E APERITIVOS PARA FESTA,10,AZEITONAS,2,PERECIVEIS,True
2,6033,STOLLEN DE NATAL KG,900,PADARIA E ROTISSERIE,902,PADARIA,11,PADARIA E CONFEITARIA SAZONAL,1,PANETONES,2,PERECIVEIS,True
3,8433,BATATA COMUM LAVADA KG,600,FLV,602,LEGUMES,18,BATATA A GRANEL,2,BATATA LAVADA A GRANEL,2,PERECIVEIS,True
4,11433,PRESUNTO FRA QUADRADO FAT,800,PEREC COMPLEMENTAR,805,FRIOS MANIPULADOS,36,FRIO DE AVE MANIPULADO,1,BLANQUET E TRANCHET MANIPULADO,2,PERECIVEIS,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
265526,8859136,VINAGRE ITALDORO BALSAMICO 500ML,300,MERC COMPLEMENTAR,330,MERCEARIA SALGADA,719,VINAGRES E AGRINS,6,VINAGRES SABORIZADOS,1,MERCEARIA,True
265527,8860736,VINHO CH.LA JOYA HAVERT 750ML,100,LIQUIDA,104,VINHOS,592,VINHOS BRANCOS,2,BRANCOS CHILENOS,1,MERCEARIA,True
265528,8862136,VINHO GRAMBELL COOLER 750ML,100,LIQUIDA,104,VINHOS,592,VINHOS BRANCOS,5,BRANCOS NACIONAIS,1,MERCEARIA,True
265529,8864536,VINHO STA.HELENA SELEC.DI SAUV BLA.,100,LIQUIDA,104,VINHOS,592,VINHOS BRANCOS,2,BRANCOS CHILENOS,1,MERCEARIA,True
