In [31]:
import os
import gc
import sys
import sqlite3
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from google.cloud import storage
from sqlalchemy import create_engine
from sklearn.pipeline import Pipeline

sys.path.append(os.path.dirname(os.getcwd())+'/src')
import pipeline_modules

In [32]:
# Base `gs://` URI of the `gpa-churn` bucket.
# NOTE(review): not referenced by any of the visible cells — confirm it is still needed.
prefix = 'gs://gpa-churn/'

In [34]:
# Collect the name of every blob in the bucket whose name contains 'STAGING/'.
storage_client = storage.Client()
obj_list = [
    blob.name
    for blob in storage_client.list_blobs('gpa-churn')
    if 'STAGING/' in blob.name
]

# Logical table name -> substring that identifies that table's staged files.
table_markers = {
    'register': 'cadastro',
    'stix': 'stix_optin',
    'email': 'optin_email',
    'items': 'relevanc_item',
    'stores': 'relevanc_store',
    'products': 'relevanc_product',
    'activations': 'mov_vend_crm',
}

# Group the staged blob names by the table they belong to.
tables_dict = {
    table: [blob_name for blob_name in obj_list if marker in blob_name]
    for table, marker in table_markers.items()
}

In [35]:
def get_register_data(
    tables_dict:dict
    ) -> pd.DataFrame:
    """Load the staged customer register files and return a cleaned table.

    Steps, in order:
      1. Read every file listed under ``tables_dict['register']``.
      2. Sort by ``idcliente`` / ``datultatual`` and keep only the last row
         of each (``idcliente``, ``datultatual``) pair.
      3. Drop the ``codorigcliente`` column.
      4. Drop rows where ``idcliente``, ``cidadecli`` or ``ufcli`` is null.
      5. Rename the columns (see ``register_cols_to_rename`` below).
      6. Lower-case ``cidade`` and ``uf``.
      7. Drop rows that repeat the same (``cod_cliente``, ``sexo``,
         ``cidade``, ``uf``, ``data_nascimento``) and reset the index.

    Parameters
    ----------
    tables_dict : dict
        Mapping whose ``'register'`` entry is passed to
        ``read_files_from_staging``.

    Returns
    -------
    pandas.DataFrame
        The cleaned register table with a fresh 0..n-1 index.
    """
    # NOTE(review): `read_files_from_staging` is neither defined nor imported
    # in the visible cells (only `pipeline_modules` is imported) — confirm
    # where it comes from so the notebook survives Restart & Run All.
    register = read_files_from_staging(
        tables_dict['register']
        )

    register_cols_to_rename = {
        'idcliente':'cod_cliente',
        'datcadorigem':'data_cadastro',
        'codsexo':'sexo',
        'cidadecli':'cidade',
        'ufcli':'uf',
        'datnasccli':'data_nascimento',
        'datultatual':'data_registro'
    }

    register = (
        register
        .sort_values(by=['idcliente','datultatual'], ascending=True)
        # The result must be kept: drop_duplicates returns a new frame, so
        # calling it without assignment (as before) removed nothing.
        .drop_duplicates(subset=['idcliente','datultatual'], keep='last')
        .drop(columns=['codorigcliente'])
        .dropna(subset=['idcliente','cidadecli','ufcli'], how='any', axis=0)
        .rename(columns=register_cols_to_rename)
    )

    # Normalise case so city/state values compare equal regardless of casing.
    for var in ['cidade', 'uf']:
        register[var] = register[var].str.lower()

    register = (
        register
        .drop_duplicates(subset=['cod_cliente','sexo','cidade','uf','data_nascimento'])
        .reset_index(drop=True)
    )

    return register

In [36]:
# Build the cleaned customer register table from the staged files and display it.
register = get_register_data(tables_dict)
register

added file:  STAGING/tck_origemcadastro_20220425.zip
added file:  STAGING/tck_origemcadastro_20220427.zip
added file:  STAGING/tck_origemcadastro_20220428.zip
added file:  STAGING/tck_origemcadastro_20220429.zip
added file:  STAGING/tck_origemcadastro_20220430.zip
added file:  STAGING/tck_origemcadastro_20220501.zip
added file:  STAGING/tck_origemcadastro_20220502.zip
added file:  STAGING/tck_origemcadastro_20220503.csv.zip
added file:  STAGING/tck_origemcadastro_20220504.csv.zip
added file:  STAGING/tck_origemcadastro_20220505.csv.zip
added file:  STAGING/tck_origemcadastro_20220506.csv.zip
added file:  STAGING/tck_origemcadastro_20220509.csv.zip
added file:  STAGING/tck_origemcadastro_20220510.csv.zip
----------


Unnamed: 0,cod_cliente,data_cadastro,data_registro,sexo,cidade,uf,data_nascimento
0,1,2000-08-10,2014-12-30,F,fortaleza,ce,1970-12-29
1,14,2020-05-27,2020-05-27,F,brasilia,df,1953-06-26
2,15,2000-08-10,2008-05-24,F,fortaleza,ce,1920-11-11
3,16,2000-08-10,2008-05-24,M,fortaleza,ce,1942-02-25
4,17,2000-08-10,2008-05-23,M,fortaleza,ce,1920-11-11
...,...,...,...,...,...,...,...
8095738,54592416,2022-05-08,2022-05-08,,sao paulo,sp,1993-02-12
8095739,54592418,2022-05-08,2022-05-08,,sao paulo,sp,1997-05-25
8095740,54594623,2022-05-08,2022-05-09,F,sao paulo,sp,1948-02-15
8095741,54595150,2022-05-09,2022-05-09,,jacare,sp,1984-04-24


In [37]:
# Number of register rows per state code (`uf`).
register.value_counts('uf')

uf
sp    4620549
rj     961433
ce     586562
df     430127
pe     248351
pr     216559
go     194686
pb     157618
pi     154228
mg     137397
ba     135068
se      72605
ms      58621
rs      20685
sc      17892
ma      11097
es      11077
rn      10386
mt       9884
ap       9747
pa       7901
al       7535
to       6630
am       4278
ro       2079
ac       2021
rr        727
dtype: int64

In [61]:
# Lower-case state code (UF) -> macro-region code, built once at definition
# time so that applying `get_region` row by row is a single dict lookup.
# Note that 'se' appears both as a state code (key, mapped to 'ne') and as a
# region code (value, Sudeste).
_UF_TO_REGION = {
    uf: region
    for region, ufs in {
        'se': ['sp', 'mg', 'rj', 'es'],  # Sudeste
        's': ['pr', 'sc', 'rs'],         # Sul
        'ne': ['ma', 'ce', 'rn', 'pb', 'pe', 'al', 'se', 'ba', 'pi'],  # Nordeste
        'n': ['rr', 'ap', 'am', 'pa', 'ac', 'ro', 'to'],  # Norte
        'co': ['mt', 'df', 'go', 'ms'],  # Centro Oeste
    }.items()
    for uf in ufs
}


def get_region(x):
    """Return the macro-region code for a lower-case state code.

    Parameters
    ----------
    x : object
        State code such as ``'sp'``. Matching is exact and case-sensitive.

    Returns
    -------
    str
        One of ``'se'``, ``'s'``, ``'ne'``, ``'n'``, ``'co'``, or the literal
        string ``'null'`` when ``x`` is not a known state code (including
        ``None``, NaN and unhashable values).
    """
    try:
        return _UF_TO_REGION.get(x, 'null')
    except TypeError:
        # Unhashable input (e.g. a list) can never be a state code. Only this
        # error is caught, so KeyboardInterrupt and real bugs still propagate.
        return 'null'

In [62]:
# Display the register frame.
register

Unnamed: 0,cod_cliente,data_cadastro,data_registro,sexo,cidade,uf,data_nascimento
0,1,2000-08-10,2014-12-30,F,fortaleza,ce,1970-12-29
1,14,2020-05-27,2020-05-27,F,brasilia,df,1953-06-26
2,15,2000-08-10,2008-05-24,F,fortaleza,ce,1920-11-11
3,16,2000-08-10,2008-05-24,M,fortaleza,ce,1942-02-25
4,17,2000-08-10,2008-05-23,M,fortaleza,ce,1920-11-11
...,...,...,...,...,...,...,...
8095738,54592416,2022-05-08,2022-05-08,,sao paulo,sp,1993-02-12
8095739,54592418,2022-05-08,2022-05-08,,sao paulo,sp,1997-05-25
8095740,54594623,2022-05-08,2022-05-09,F,sao paulo,sp,1948-02-15
8095741,54595150,2022-05-09,2022-05-09,,jacare,sp,1984-04-24


In [65]:
# Derive each row's macro-region code from its state code (`uf`).
register['region_dict'] = register['uf'].apply(get_region)

In [66]:
# Display the register frame, now including the `region_dict` column.
register

Unnamed: 0,cod_cliente,data_cadastro,data_registro,sexo,cidade,uf,data_nascimento,region_dict
0,1,2000-08-10,2014-12-30,F,fortaleza,ce,1970-12-29,ne
1,14,2020-05-27,2020-05-27,F,brasilia,df,1953-06-26,co
2,15,2000-08-10,2008-05-24,F,fortaleza,ce,1920-11-11,ne
3,16,2000-08-10,2008-05-24,M,fortaleza,ce,1942-02-25,ne
4,17,2000-08-10,2008-05-23,M,fortaleza,ce,1920-11-11,ne
...,...,...,...,...,...,...,...,...
8095738,54592416,2022-05-08,2022-05-08,,sao paulo,sp,1993-02-12,se
8095739,54592418,2022-05-08,2022-05-08,,sao paulo,sp,1997-05-25,se
8095740,54594623,2022-05-08,2022-05-09,F,sao paulo,sp,1948-02-15,se
8095741,54595150,2022-05-09,2022-05-09,,jacare,sp,1984-04-24,se


In [67]:
# Number of register rows per region code stored in `region_dict`.
register.value_counts('region_dict')

region_dict
se    5730456
ne    1383450
co     693318
s      255136
n       33383
dtype: int64

---

In [7]:
# %pip install geopy
# %pip install pycep_correios

from geopy.geocoders import Nominatim
from pycep_correios import get_address_from_cep, WebService

In [10]:
# Shared Nominatim client; each geocoding request uses timeout=1.
geolocator = Nominatim(user_agent="get_lat_long", timeout=1)
def get_lat_long(municipio):
    """Geocode a place name and return its coordinates.

    Parameters
    ----------
    municipio : str
        Free-text query passed to ``geolocator.geocode``.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the match, or ``(None, None)`` when the
        geocoder finds no result for ``municipio``.
    """
    location = geolocator.geocode(municipio)
    # geocode() yields None when nothing matches; without this guard the
    # attribute access below raised AttributeError.
    if location is None:
        return None, None
    return location.latitude, location.longitude

In [17]:
# Smoke-test the geocoder on a handful of place names.
sample_municipios = ['uberlandia', 'patrocinio', 'goias', 'sao paulo', 'porto alegre']
for municipio in sample_municipios:
    print(get_lat_long(municipio))

(-18.9188041, -48.2767837)
(-18.9408294, -46.9926624)
(-15.9323662, -50.1392928)
(-23.5506507, -46.6333824)
(-30.0324999, -51.2303767)
