# Análise e Engenharia de Dados no E-commerce Brasileiro


## Introdução


No presente projeto, utilizou-se o 'Brazilian E-Commerce Public Dataset by Olist' como base para simular o papel de engenheiros de dados em uma empresa de e-commerce. O propósito é extrair insights valiosos que não apenas melhorem as operações de negócios, mas também otimizem a logística e aprimorem a experiência do cliente. Ao mergulharmos nos dados reais do setor de e-commerce brasileiro, esta iniciativa oferece uma oportunidade prática de explorar tendências, identificar padrões e, consequentemente, orientar decisões estratégicas. Além disso, destacamos a importância de análises orientadas por dados no contexto dinâmico do comércio eletrônico, evidenciando como tais abordagens podem impulsionar melhorias tangíveis e sustentáveis no desempenho empresarial.



### Importação de bibliotecas

In [2]:
# Importando a biblioteca pandas, importante para tratamento e visualização dos dados importados
import pandas as pd
import pandas_gbq
import numpy as np
#!pip install psycopg2-binary


# Importando as bibliotecas psycopg2 e sqlalchemy, para que possamos transferir os arquivos tratados para um banco de dados.
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine

# Importando a biblioteca zipfile para descompactar os arquivos importados via API do Kaggle.
import zipfile


from google.cloud import bigquery
import os

### Importando dataset via Kaggle API

In [2]:
#!pip install kaggle --user

In [5]:
# Download do conjunto de dados via API do Kaggle.
!kaggle datasets download -d olistbr/brazilian-ecommerce

Dataset URL: https://www.kaggle.com/datasets/olistbr/brazilian-ecommerce
License(s): CC-BY-NC-SA-4.0
Downloading brazilian-ecommerce.zip to C:\Users\Léo\Documents\GitHub\Projeto-Ecossistema-de-BigData




  0%|          | 0.00/42.6M [00:00<?, ?B/s]
  2%|2         | 1.00M/42.6M [00:00<00:19, 2.20MB/s]
  5%|4         | 2.00M/42.6M [00:00<00:10, 4.03MB/s]
  9%|9         | 4.00M/42.6M [00:00<00:05, 8.09MB/s]
 16%|#6        | 7.00M/42.6M [00:00<00:02, 13.1MB/s]
 23%|##3       | 10.0M/42.6M [00:00<00:02, 16.6MB/s]
 28%|##8       | 12.0M/42.6M [00:01<00:01, 17.7MB/s]
 35%|###5      | 15.0M/42.6M [00:01<00:01, 19.9MB/s]
 42%|####2     | 18.0M/42.6M [00:01<00:01, 21.2MB/s]
 49%|####9     | 21.0M/42.6M [00:01<00:01, 22.1MB/s]
 56%|#####6    | 24.0M/42.6M [00:01<00:00, 22.7MB/s]
 63%|######3   | 27.0M/42.6M [00:01<00:00, 23.0MB/s]
 70%|#######   | 30.0M/42.6M [00:01<00:00, 23.2MB/s]
 77%|#######7  | 33.0M/42.6M [00:01<00:00, 23.4MB/s]
 84%|########4 | 36.0M/42.6M [00:02<00:00, 23.6MB/s]
 91%|#########1| 39.0M/42.6M [00:02<00:00, 23.7MB/s]
 98%|#########8| 42.0M/42.6M [00:02<00:00, 23.8MB/s]
100%|##########| 42.6M/42.6M [00:02<00:00, 18.7MB/s]


In [6]:
# Unpack the archive downloaded by the Kaggle CLI.
zip_file = 'brazilian-ecommerce.zip'
# Extract into a dedicated 'csv/' folder to keep the project directory organized.
destination_folder = 'csv/'
# The context manager closes the archive once extraction finishes.
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall(destination_folder)

### Criação de dataframes

In [7]:
# Load each extracted Olist CSV from the 'csv/' folder into its own DataFrame.
df_geolocation = pd.read_csv('csv/olist_geolocation_dataset.csv')
df_customers = pd.read_csv('csv/olist_customers_dataset.csv')
df_items = pd.read_csv('csv/olist_order_items_dataset.csv')
df_payments = pd.read_csv('csv/olist_order_payments_dataset.csv')
df_reviews = pd.read_csv('csv/olist_order_reviews_dataset.csv')
df_orders = pd.read_csv('csv/olist_orders_dataset.csv')
df_products = pd.read_csv('csv/olist_products_dataset.csv')
df_sellers = pd.read_csv('csv/olist_sellers_dataset.csv')
# Lookup table pairing each product category name with its English translation.
df_name = pd.read_csv('csv/product_category_name_translation.csv')

### Função de tratamento de Dados

In [8]:
# Define uma função para gerar um DataFrame de resumo das características dos dados
def analise(data):
    """Build a per-column data-quality summary of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        The frame to profile.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the column name, its dtype,
        the percentage of missing values, the percentage of negative and
        of zero values (numeric columns only, 0 otherwise), the number of
        fully duplicated rows in ``data`` (repeated on every row), the
        number of distinct values and the array of distinct values.
        Numeric cells are rounded to 3 decimal places.
    """
    total_rows = len(data)
    # Resolve the numeric columns once instead of once per column per metric.
    numeric_cols = set(data.select_dtypes(include=[np.number]).columns)

    def _percentage(col, matches):
        # Non-numeric columns report 0; a zero-row frame has nothing to
        # divide by, so it reports 0 instead of raising ZeroDivisionError.
        if col not in numeric_cols or total_rows == 0:
            return 0
        return int(matches(data[col]).sum()) / total_rows * 100

    resumo = pd.DataFrame({
        'característica': data.columns.values,  # column names
        'tipo_de_dados': data.dtypes.values,  # column dtypes
        'valor_nulo(%)': data.isna().mean().values * 100,  # share of missing values
        'valor_negativo(%)': [_percentage(col, lambda s: s < 0) for col in data.columns],
        'valor_zero(%)': [_percentage(col, lambda s: s == 0) for col in data.columns],
        'duplicado': data.duplicated().sum(),  # scalar, broadcast to every row
        'n_único': data.nunique().values,  # distinct values per column
        'amostra_única': [data[col].unique() for col in data.columns],  # the distinct values themselves
    })

    return resumo.round(3)



### Dataset Geo-Localização    
Geolocalização: Fornece dados de geolocalização relacionados aos clientes.


### Analisando dados de Geolocalização

![Geolocalização](img/dicionario/geolocalizacao.png)


In [9]:
print('Quantidade de Dados:',df_geolocation.shape)
df_geolocation.head(100)

Quantidade de Dados: (1000163, 5)


Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.644820,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP
...,...,...,...,...,...
95,1046,-23.546129,-46.642951,sao paulo,SP
96,1042,-23.544667,-46.640669,são paulo,SP
97,1009,-23.546341,-46.636239,sao paulo,SP
98,1050,-23.549107,-46.644105,são paulo,SP


In [10]:
analise(df_geolocation)

Unnamed: 0,característica,tipo_de_dados,valor_nulo(%),valor_negativo(%),valor_zero(%),duplicado,n_único,amostra_única
0,geolocation_zip_code_prefix,int64,0.0,0.0,0.0,261831,19015,"[1037, 1046, 1041, 1035, 1012, 1047, 1013, 102..."
1,geolocation_lat,float64,0.0,99.866,0.0,261831,717360,"[-23.54562128115268, -23.54608112703553, -23.5..."
2,geolocation_lng,float64,0.0,100.0,0.0,261831,717613,"[-46.63929204800168, -46.64482029837157, -46.6..."
3,geolocation_city,object,0.0,0.0,0.0,261831,8011,"[sao paulo, são paulo, sao bernardo do campo, ..."
4,geolocation_state,object,0.0,0.0,0.0,261831,27,"[SP, RN, AC, RJ, ES, MG, BA, SE, PE, AL, PB, C..."


Tipos de Dados: O conjunto de dados contém uma mistura de tipos de dados. geolocation_zip_code_prefix é do tipo int64, o que é apropriado para códigos postais. geolocation_lat (latitude) e geolocation_lng (longitude) são do tipo float64, que é adequado para coordenadas geográficas. geolocation_city e geolocation_state são do tipo objeto, indicando que provavelmente são valores de string representando nomes geográficos.
Valores Ausentes ou Zero: Nenhuma das colunas contém valores nulos ou zero. Isso indica boa integridade dos dados para esses campos.

Valor Negativo: As colunas geolocation_lat e geolocation_lng mostram quase todos os valores como negativos (99,866% e 100%, respectivamente). Isso é realmente esperado para coordenadas no Brasil, já que o país está localizado no Hemisfério Ocidental (longitude negativa) e principalmente no Hemisfério Sul (latitude negativa).

Duplicados: O conjunto de dados possui um número muito grande de linhas duplicadas (261.831), o que sugere um problema de entrada de dados ou que o processo de coleta de dados capturou várias entradas para os mesmos pontos de geolocalização.



### Procurando por nomes de cidades que não seguem um padrão


In [11]:
import re
def filtrar_cidade(data, col):
    """Return the rows whose ``col`` text has a character outside the accepted set.

    Accepted characters are ASCII letters, digits, whitespace, hyphen,
    apostrophe and plus sign; anything else (e.g. accented letters) makes
    the row match.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.
    col : str
        Name of the text column to inspect.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``data``, with their original index.
    """
    # Raw string: "\s" inside a normal literal is an invalid escape sequence.
    # The hyphen is escaped so it can never be read as a range operator.
    pattern = r"[^a-z\sA-Z0-9\-'+]"
    # na=False counts missing values as "no match", keeping the mask strictly
    # boolean; a NaN in the mask would make the row selection below raise.
    mask = data[col].str.contains(pattern, regex=True, na=False)
    return data[mask]



In [12]:
filtrar_cidade(df_geolocation, 'geolocation_city')


Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
5,1012,-23.547762,-46.635361,são paulo,SP
14,1037,-23.545187,-46.637855,são paulo,SP
17,1024,-23.541390,-46.629899,são paulo,SP
21,1020,-23.552235,-46.628441,são paulo,SP
22,1011,-23.546690,-46.635447,são paulo,SP
...,...,...,...,...,...
1000094,99940,-28.060955,-51.858637,ibiaçá,RS
1000096,99900,-27.884844,-52.230025,getúlio vargas,RS
1000098,99900,-27.900022,-52.237668,getúlio vargas,RS
1000143,99930,-27.913659,-52.248615,estação,RS


### Dataset Clientes
Clientes: Contém informações sobre os clientes do sistema de comércio eletrônico.


### Analisando dados de Clientes

![Clientes](img/dicionario/clientes.png)

In [13]:
print('Quantidade de Dados:',df_customers.shape)
df_customers.head(10)

Quantidade de Dados: (99441, 5)


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP
5,879864dab9bc3047522c92c82e1212b8,4c93744516667ad3b8f1fb645a3116a4,89254,jaragua do sul,SC
6,fd826e7cf63160e536e0908c76c3f441,addec96d2e059c80c30fe6871d30d177,4534,sao paulo,SP
7,5e274e7a0c3809e14aba7ad5aae0d407,57b2a98a409812fe9618067b6b8ebe4f,35182,timoteo,MG
8,5adf08e34b2e993982a47070956c5c65,1175e95fb47ddff9de6b2b06188f7e0d,81560,curitiba,PR
9,4b7139f34592b3a31687243a302fa75b,9afe194fb833f79e300e37e580171f22,30575,belo horizonte,MG


In [14]:
df_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB


In [15]:
df_customers.describe()

Unnamed: 0,customer_zip_code_prefix
count,99441.0
mean,35137.474583
std,29797.938996
min,1003.0
25%,11347.0
50%,24416.0
75%,58900.0
max,99990.0


In [16]:
analise(df_customers)

Unnamed: 0,característica,tipo_de_dados,valor_nulo(%),valor_negativo(%),valor_zero(%),duplicado,n_único,amostra_única
0,customer_id,object,0.0,0.0,0.0,0,99441,"[06b8999e2fba1a1fbc88172c00ba8bc7, 18955e83d33..."
1,customer_unique_id,object,0.0,0.0,0.0,0,96096,"[861eff4711a542e4b93843c6dd7febb0, 290c77bc529..."
2,customer_zip_code_prefix,int64,0.0,0.0,0.0,0,14994,"[14409, 9790, 1151, 8775, 13056, 89254, 4534, ..."
3,customer_city,object,0.0,0.0,0.0,0,4119,"[franca, sao bernardo do campo, sao paulo, mog..."
4,customer_state,object,0.0,0.0,0.0,0,27,"[SP, SC, MG, PR, RJ, RS, PA, GO, ES, BA, MA, M..."


Tipos de dados: os tipos de dados no conjunto de dados incluem int64 para a coluna customer_zip_code_prefix e object para todas as outras colunas, como customer_id, customer_unique_id, customer_city e customer_state.

Sem valores ausentes ou negativos: nenhuma das colunas contém valores nulos ou negativos. Isso indica boa integridade de dados para esses campos.

Sem duplicatas: não há linhas duplicadas neste segmento do conjunto de dados, o que sugere que cada entrada é única.

Diversidade de dados: Os valores de amostra única para customer_state revelam que o conjunto de dados inclui clientes de uma ampla variedade de estados brasileiros, o que pode ser valioso para segmentação de mercado e análise regional.

### Dataset Itens de Pedido

In [17]:
df_items.head(10)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.9,18.14
5,00048cc3ae777c65dbb7d2a0634bc1ea,1,ef92defde845ab8450f9d70c526ef70f,6426d21aca402a131fc0a5d0960a3c90,2017-05-23 03:55:27,21.9,12.69
6,00054e8431b9d7675808bcb819fb4a32,1,8d4f2bb7e93e6710a28f34fa83ee7d28,7040e82f899a04d1b434b795a43b4617,2017-12-14 12:10:31,19.9,11.85
7,000576fe39319847cbb9d288c5617fa6,1,557d850972a7d6f792fd18ae1400d9b6,5996cddab893a4652a15592fb58ab8db,2018-07-10 12:30:45,810.0,70.75
8,0005a1a1728c9d785b8e2b08b904576c,1,310ae3c140ff94b03219ad0adc3c778f,a416b6a846a11724393025641d4edd5e,2018-03-26 18:31:29,145.95,11.65
9,0005f50442cb953dcd1d21e1fb923495,1,4535b0e1091c278dfd193e5a1d63b39f,ba143b05f0110f0dc71ad71b4466ce92,2018-07-06 14:10:56,53.99,11.4


In [18]:
# Boolean mask: True for every item row whose order_item_id is greater than 1.
# NOTE(review): each True is one item row, so the shape below counts item rows,
# not distinct orders — an order with three items contributes two rows here.
ordens_multiplas = df_items['order_item_id'] > 1
ordens_multiplas[ordens_multiplas].shape

(13984,)

In [19]:
df_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   order_id             112650 non-null  object 
 1   order_item_id        112650 non-null  int64  
 2   product_id           112650 non-null  object 
 3   seller_id            112650 non-null  object 
 4   shipping_limit_date  112650 non-null  object 
 5   price                112650 non-null  float64
 6   freight_value        112650 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 6.0+ MB


In [20]:
# Parse shipping_limit_date in place from its object (string) dtype to datetime64.
df_items['shipping_limit_date'] = pd.to_datetime(df_items['shipping_limit_date'])

In [21]:
df_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   order_id             112650 non-null  object        
 1   order_item_id        112650 non-null  int64         
 2   product_id           112650 non-null  object        
 3   seller_id            112650 non-null  object        
 4   shipping_limit_date  112650 non-null  datetime64[ns]
 5   price                112650 non-null  float64       
 6   freight_value        112650 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 6.0+ MB


### Dataset Pagamentos

In [22]:
df_payments.head(10)

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45
5,298fcdf1f73eb413e4d26d01b25bc1cd,1,credit_card,2,96.12
6,771ee386b001f06208a7419e4fc1bbd7,1,credit_card,1,81.16
7,3d7239c394a212faae122962df514ac7,1,credit_card,3,51.84
8,1f78449c87a54faf9e96e88ba1491fa9,1,credit_card,6,341.09
9,0573b5e23cbd798006520e1d5b4c6714,1,boleto,1,51.95


### Dataset Reviews

In [23]:
df_reviews.head(10)

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53
5,15197aa66ff4d0650b5434f1b46cda19,b18dcdf73be66366873cd26c5724d1dc,1,,,2018-04-13 00:00:00,2018-04-16 00:39:37
6,07f9bee5d1b850860defd761afa7ff16,e48aa0d2dcec3a2e87348811bcfdf22b,5,,,2017-07-16 00:00:00,2017-07-18 19:30:34
7,7c6400515c67679fbee952a7525281ef,c31a859e34e3adac22f376954e19b39d,5,,,2018-08-14 00:00:00,2018-08-14 21:36:06
8,a3f6f7f6f433de0aefbb97da197c554c,9c214ac970e84273583ab523dfafd09b,5,,,2017-05-17 00:00:00,2017-05-18 12:05:37
9,8670d52e15e00043ae7de4c01cc2fe06,b9bf720beb4ab3728760088589c62129,4,recomendo,aparelho eficiente. no site a marca do aparelh...,2018-05-22 00:00:00,2018-05-23 16:45:47


In [24]:
df_reviews.head(10)

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53
5,15197aa66ff4d0650b5434f1b46cda19,b18dcdf73be66366873cd26c5724d1dc,1,,,2018-04-13 00:00:00,2018-04-16 00:39:37
6,07f9bee5d1b850860defd761afa7ff16,e48aa0d2dcec3a2e87348811bcfdf22b,5,,,2017-07-16 00:00:00,2017-07-18 19:30:34
7,7c6400515c67679fbee952a7525281ef,c31a859e34e3adac22f376954e19b39d,5,,,2018-08-14 00:00:00,2018-08-14 21:36:06
8,a3f6f7f6f433de0aefbb97da197c554c,9c214ac970e84273583ab523dfafd09b,5,,,2017-05-17 00:00:00,2017-05-18 12:05:37
9,8670d52e15e00043ae7de4c01cc2fe06,b9bf720beb4ab3728760088589c62129,4,recomendo,aparelho eficiente. no site a marca do aparelh...,2018-05-22 00:00:00,2018-05-23 16:45:47


### Dataset Pedidos

In [25]:
df_orders.head(10)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00
5,a4591c265e18cb1dcee52889e2d8acc3,503740e9ca751ccdda7ba28e9ab8f608,delivered,2017-07-09 21:57:05,2017-07-09 22:10:13,2017-07-11 14:58:04,2017-07-26 10:57:55,2017-08-01 00:00:00
6,136cce7faa42fdb2cefd53fdc79a6098,ed0271e0b7da060a393796590e7b737a,invoiced,2017-04-11 12:22:08,2017-04-13 13:25:17,,,2017-05-09 00:00:00
7,6514b8ad8028c9f2cc2374ded245783f,9bdf08b4b3b52b5526ff42d37d47f222,delivered,2017-05-16 13:10:30,2017-05-16 13:22:11,2017-05-22 10:07:46,2017-05-26 12:55:51,2017-06-07 00:00:00
8,76c6e866289321a7c93b82b54852dc33,f54a9f0e6b351c431402b8461ea51999,delivered,2017-01-23 18:29:09,2017-01-25 02:50:47,2017-01-26 14:16:31,2017-02-02 14:08:10,2017-03-06 00:00:00
9,e69bfb5eb88e0ed6a785585b27e16dbf,31ad1d1b63eb9962463f764d4e6e0c9d,delivered,2017-07-29 11:55:02,2017-07-29 12:05:32,2017-08-10 19:45:24,2017-08-16 17:14:30,2017-08-23 00:00:00


### Dataset Produtos

In [26]:
df_products.head(10)

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0
5,41d3672d4792049fa1779bb35283ed13,instrumentos_musicais,60.0,745.0,1.0,200.0,38.0,5.0,11.0
6,732bd381ad09e530fe0a5f457d81becb,cool_stuff,56.0,1272.0,4.0,18350.0,70.0,24.0,44.0
7,2548af3e6e77a690cf3eb6368e9ab61e,moveis_decoracao,56.0,184.0,2.0,900.0,40.0,8.0,40.0
8,37cc742be07708b53a98702e77a21a02,eletrodomesticos,57.0,163.0,1.0,400.0,27.0,13.0,17.0
9,8c92109888e8cdf9d66dc7e463025574,brinquedos,36.0,1156.0,1.0,600.0,17.0,10.0,12.0


### Dataset Vendedores

In [27]:
df_sellers.head(10)

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP
5,c240c4061717ac1806ae6ee72be3533b,20920,rio de janeiro,RJ
6,e49c26c3edfa46d227d5121a6b6e4d37,55325,brejao,PE
7,1b938a7ec6ac5061a66a3766e0e75f90,16304,penapolis,SP
8,768a86e36ad6aae3d03ee3c6433d61df,1529,sao paulo,SP
9,ccc4bbb5f32a6ab2b7066a4130f114e3,80310,curitiba,PR


### Dataset Nomes e Traduções

In [28]:
df_name.head(10)

Unnamed: 0,product_category_name,product_category_name_english
0,beleza_saude,health_beauty
1,informatica_acessorios,computers_accessories
2,automotivo,auto
3,cama_mesa_banho,bed_bath_table
4,moveis_decoracao,furniture_decor
5,esporte_lazer,sports_leisure
6,perfumaria,perfumery
7,utilidades_domesticas,housewares
8,telefonia,telephony
9,relogios_presentes,watches_gifts


### Carregamentos dos dados tratados para um Banco de Dados POSTGRES

In [29]:
# Criação de engine para conexão ao Banco de Dados Postgres
#engine = create_engine(os.environ['DATABASE_URL'])  # URL de conexão lida de variável de ambiente; nunca versionar usuário/senha no código (a credencial exposta anteriormente deve ser rotacionada)

# Criação de tabela a partir de um dataframe no pandas
#df_geolocation.to_sql('geolocation', engine)
#df_customers.to_sql('customers', engine)
#df_items.to_sql('items', engine)
#df_payments.to_sql('payments', engine)
#df_reviews.to_sql('reviews', engine)
#df_orders.to_sql('orders', engine)
#df_products.to_sql('products', engine)
#df_sellers.to_sql('sellers', engine)
#df_name.to_sql('name', engine)

In [30]:
def replace_char(city_name):
    """Replace lowercase accented letters in ``city_name`` with plain ASCII ones.

    Each (pattern, replacement) pair is applied in order with ``re.sub``;
    characters not listed in any pattern (uppercase accents included) are
    left untouched.

    Parameters
    ----------
    city_name : str
        Text to normalize.

    Returns
    -------
    str
        ``city_name`` with the listed accented characters replaced.
    """
    substitutions = (
        (r'[ãââàáä]', 'a'),
        (r'[íîì]', 'i'),
        (r'[úûùü]', 'u'),
        (r'[éêèë]', 'e'),
        (r'[óõôòö]', 'o'),
        (r'[ç]', 'c'),
    )
    for accented, plain in substitutions:
        city_name = re.sub(accented, plain, city_name)
    return city_name

### Carregamentos dos dados tratados para o Google Big Query

In [31]:
#os.environ['GOOGLE_APPLICATION_CREDENTIALS'] ='gbq.json'
client = bigquery.Client()

In [8]:
client = bigquery.Client.from_service_account_json('gbq.json')

In [9]:
def list_datasets():
    """Print the IDs of every dataset visible to the module-level ``client``.

    Reads the global BigQuery ``client``; prints one tab-indented dataset
    ID per line under a header naming the project, or a single message
    when the project has no datasets. Returns nothing.
    """
    found = list(client.list_datasets())
    project_name = client.project

    # Guard clause: nothing to enumerate.
    if not found:
        print("{} project does not contain any datasets.".format(project_name))
        return

    print("Datasets in project {}:".format(project_name))
    for entry in found:
        print("\t{}".format(entry.dataset_id))
list_datasets()

Datasets in project koru-dados:
	bd01
	projeto
	projeto2


In [38]:
import pandas
import pandas_gbq

# Google Cloud project that receives the upload.
project_id = "koru-dados"

# Fully-qualified destination table, including the project and
# dataset IDs.
table_id = 'koru-dados.projeto2.name'

# Upload the category-translation DataFrame to BigQuery.
# NOTE(review): pandas-gbq documents if_exists='fail' as the default, so
# re-running this cell presumably raises once the table exists — confirm.
pandas_gbq.to_gbq(df_name, table_id, project_id=project_id)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]


In [6]:
valor = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')

In [7]:
print(valor)

None
