In [0]:
#import packages
import pandas as pd
from sqlalchemy import create_engine
import numpy as np

In [0]:
#import datasets (csv files)
customers = pd.read_csv('olist_customers_dataset.csv')
geolocation = pd.read_csv('olist_geolocation_dataset.csv')
order_items = pd.read_csv('olist_order_items_dataset.csv')
payments = pd.read_csv('olist_order_payments_dataset.csv')
reviews = pd.read_csv('olist_order_reviews_dataset.csv')
orders = pd.read_csv('olist_orders_dataset.csv')
products = pd.read_csv('olist_products_dataset.csv')
sellers = pd.read_csv('olist_sellers_dataset.csv')
product_category = pd.read_csv('product_category_name_translation.csv')

# Extract, Transform and Load

## Geolocation - Address

As is seen in the design of the database schema, "address" is the first table to construct. Thus, it is necessary to first check the geolocation dataset and perform necessary transformation.

In [0]:
geolocation.head()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP


In [0]:
geolocation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
geolocation_zip_code_prefix    1000163 non-null int64
geolocation_lat                1000163 non-null float64
geolocation_lng                1000163 non-null float64
geolocation_city               1000163 non-null object
geolocation_state              1000163 non-null object
dtypes: float64(2), int64(1), object(2)
memory usage: 38.2+ MB


In [0]:
geolocation.shape

(1000163, 5)

"Geolocation" dataset does not have null values. Also, it is a well-constructed dataset with information we need including zip code, longitude, latitude, city, and state. Then we need to check the duplicates, construct the unique  "geolocation_id" as the primary key, adjust the column names and the order.

In [0]:
geo_df = geolocation.drop_duplicates() #drop duplicated rows to get the unique rows
len(geo_df)

738332

In [0]:
#rename the columns to make it consistent with the "address" table
geo_df.columns = ['zip_code_prefix', 'geolocation_lat', 'geolocation_lng', 'geolocation_city', 'geolocation_state']

In [0]:
#construct the "geolocation_id"
geo_df.insert(0, 'geolocation_id', range(1, 1+len(geo_df)))

In [0]:
#change the order of the columns to make it consistent with the "address" table
geo_df = geo_df[['geolocation_id', 'zip_code_prefix', 'geolocation_city', 'geolocation_state', 'geolocation_lat', 'geolocation_lng']]

In [0]:
geo_df.head()

Unnamed: 0,geolocation_id,zip_code_prefix,geolocation_city,geolocation_state,geolocation_lat,geolocation_lng
0,1,1037,sao paulo,SP,-23.545621,-46.639292
1,2,1046,sao paulo,SP,-23.546081,-46.64482
2,3,1046,sao paulo,SP,-23.546129,-46.642951
3,4,1041,sao paulo,SP,-23.544392,-46.639499
4,5,1035,sao paulo,SP,-23.541578,-46.641607


In [0]:
#Load the table into sql
#geo_df.to_sql(name='address', con=engine, if_exists='append', index=False)

## Customers

The second table is "customers" of which the information comes from the customers dataset. 

In [0]:
customers.head()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


In [0]:
#address and customers table should be linked to each other. 
#"Geolocation_id" will be the foreign key referencing the address table, which should be added to the dataset.
#Merge two tables together by zipcode using Outer Join.
temp_customers = customers.merge(geo_df, left_on= customers['customer_zip_code_prefix'], right_on = geo_df['zip_code_prefix'], how = 'outer')

In [0]:
temp_customers.head()

Unnamed: 0,key_0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,geolocation_id,zip_code_prefix,geolocation_city,geolocation_state,geolocation_lat,geolocation_lng
0,14409,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409.0,franca,SP,228216.0,14409.0,franca,SP,-20.509897,-47.397866
1,14409,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409.0,franca,SP,228243.0,14409.0,franca,SP,-20.497396,-47.399241
2,14409,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409.0,franca,SP,228253.0,14409.0,franca,SP,-20.510459,-47.399553
3,14409,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409.0,franca,SP,228256.0,14409.0,franca,SP,-20.48094,-47.394161
4,14409,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409.0,franca,SP,228257.0,14409.0,franca,SP,-20.515413,-47.398194


In [0]:
#Pick the 3 variables for this table
customers_df = temp_customers[['customer_id', 'customer_unique_id', 'geolocation_id']]
customers_df['geolocation_id'].unique()

array([228216., 228243., 228253., ..., 738060., 738072., 738205.])

In [0]:
customers_df.head()

Unnamed: 0,customer_id,customer_unique_id,geolocation_id
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,228216.0
1,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,228243.0
2,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,228253.0
3,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,228256.0
4,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,228257.0


In [0]:
#customers_df.to_sql(name='customers', con=engine, if_exists='append', index=False)

## Sellers

In [0]:
sellers.head()

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP


In [0]:
#address and sellers table should be linked to each other. 
#"Geolocation_id" will be the foreign key referencing the address table, which should be added to the dataset.
#Merge two tables together by zipcode using Outer Join.
temp_sellers = sellers.merge(geo_df, left_on= sellers['seller_zip_code_prefix'], 
                             right_on = geo_df['zip_code_prefix'], how = 'outer')

In [0]:
#pick two variables
sellers_df = temp_sellers[['seller_id', 'geolocation_id']]
sellers_df['geolocation_id'].unique()

array([188925., 188934., 189024., ..., 738290., 738072., 738205.])

In [0]:
sellers_df.head()

Unnamed: 0,seller_id,geolocation_id
0,3442f8959a84dea7ee197c632cb2df15,188925.0
1,3442f8959a84dea7ee197c632cb2df15,188934.0
2,3442f8959a84dea7ee197c632cb2df15,189024.0
3,3442f8959a84dea7ee197c632cb2df15,189064.0
4,3442f8959a84dea7ee197c632cb2df15,189081.0


In [0]:
#sellers_df.to_sql(name='sellers', con=engine, if_exists='append', index=False)

## Product Category

Before diving deep into the products dataset, it is critical to transform the product category dataset and add the product_category_id column.

In [0]:
product_category.head()  #each row only contains one product category (e.g. "health_beauty" means one category)

Unnamed: 0,product_category_name,product_category_name_english
0,beleza_saude,health_beauty
1,informatica_acessorios,computers_accessories
2,automotivo,auto
3,cama_mesa_banho,bed_bath_table
4,moveis_decoracao,furniture_decor


In [0]:
category_df = product_category.drop_duplicates() #drop duplicated rows
len(category_df)

71

In [0]:
category_df.insert(0, 'product_category_id', range(1, 1+len(category_df))) #construct the id column for product category

In [0]:
#pick the id and english name column and save it as a new dataframe
category = category_df[['product_category_id', 'product_category_name_english']] 
category.head() 

Unnamed: 0,product_category_id,product_category_name_english
0,1,health_beauty
1,2,computers_accessories
2,3,auto
3,4,bed_bath_table
4,5,furniture_decor


In [0]:
#category.to_sql(name='category', con=engine, if_exists='append', index=False)

## Products

In [0]:
products.head()

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0


The products dataset is supposed to include the "product_category_id". It is thus necessary to merge product category dataset with the original products dataset by the product category name. 

In [0]:
#merge the original product category dataset and products datasets by product category name
products_df = products.merge(category_df, left_on = products['product_category_name'], 
                             right_on = category_df['product_category_name'], how = 'left') 
#drop unnecessary columns
products_df.drop(['key_0', 'product_category_name_x','product_category_name_y', 'product_category_name_english'], 
                 axis = 1, inplace = True)
#rename the columns
products_df.columns = ['product_id', 'product_name_length', 'product_description_length', 'product_photos_qty',
                       'product_weight', 'product_length', 'product_height', 'product_width', 'product_category_id']
#check the dataframe
products_df.head()

Unnamed: 0,product_id,product_name_length,product_description_length,product_photos_qty,product_weight,product_length,product_height,product_width,product_category_id
0,1e9e8ef04dbcff4541ed26657ea517e5,40.0,287.0,1.0,225.0,16.0,10.0,14.0,7.0
1,3aa071139cb16b67ca9e5dea641aaa2f,44.0,276.0,1.0,1000.0,30.0,18.0,20.0,47.0
2,96bd76ec8810374ed1b65e291975717f,46.0,250.0,1.0,154.0,18.0,9.0,15.0,6.0
3,cef67bcfe19066a932b7673e239eb23d,27.0,261.0,1.0,371.0,26.0,4.0,26.0,12.0
4,9dc1a7de274444849c219cff195d0b71,37.0,402.0,4.0,625.0,20.0,17.0,13.0,8.0


In [0]:
#drop duplicates and save the results as the new dataframe
products_df_2 = products_df.drop_duplicates()
#check the number of rows
len(products_df_2)

32951

In [0]:
#load the dataframe into sql database
#products_df_2.to_sql(name='product', con=engine, if_exists='append', index=False)

## Delivery

The delivery information is all stored in the "orders" table. By selecting the columns out, we are able to construct the delivery table after removing the duplicates if necessary and adding the "delivery_id". But the "delivery" table should be loaded after we load the "orders" table and the "products_ordered" table first. 

In [0]:
orders.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00


In [0]:
delivery = orders[['order_id', 'order_status', 'order_approved_at', 'order_delivered_carrier_date', 
                   'order_estimated_delivery_date', 'order_delivered_customer_date']] #pick the variables
delivery.head()

Unnamed: 0,order_id,order_status,order_approved_at,order_delivered_carrier_date,order_estimated_delivery_date,order_delivered_customer_date
0,e481f51cbdc54678b7cc49136f2d6af7,delivered,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-18 00:00:00,2017-10-10 21:25:13
1,53cdb2fc8bc7dce0b6741e2150273451,delivered,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-13 00:00:00,2018-08-07 15:27:45
2,47770eb9100c2d0c44946d9cf07ec65d,delivered,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-09-04 00:00:00,2018-08-17 18:06:29
3,949d5b44dbf5de918fe9c16f97b45f8a,delivered,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-15 00:00:00,2017-12-02 00:28:42
4,ad21c59c0840e6cb83a9ceb5573f8159,delivered,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-26 00:00:00,2018-02-16 18:17:02


In [0]:
delivery.shape

(99441, 6)

In [0]:
delivery_df = delivery.drop_duplicates()
delivery_df.shape

(99441, 6)

The delivery dataframe does not have any duplicates. We can construct the "delivery_id" using the delivery dataframe.

In [0]:
delivery.insert(0, 'delivery_id', range(1, 1+len(delivery))) 
delivery.head()

Unnamed: 0,delivery_id,order_id,order_status,order_approved_at,order_delivered_carrier_date,order_estimated_delivery_date,order_delivered_customer_date
0,1,e481f51cbdc54678b7cc49136f2d6af7,delivered,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-18 00:00:00,2017-10-10 21:25:13
1,2,53cdb2fc8bc7dce0b6741e2150273451,delivered,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-13 00:00:00,2018-08-07 15:27:45
2,3,47770eb9100c2d0c44946d9cf07ec65d,delivered,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-09-04 00:00:00,2018-08-17 18:06:29
3,4,949d5b44dbf5de918fe9c16f97b45f8a,delivered,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-15 00:00:00,2017-12-02 00:28:42
4,5,ad21c59c0840e6cb83a9ceb5573f8159,delivered,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-26 00:00:00,2018-02-16 18:17:02


## Orders

It is critical to point out that one order might include multiple products, meaning that the customer purchased multiple products in one order. Orders and Products tables are in a many-to-many relationship. In order to reduce the database redundancy, we construct a "products_ordered" table including the order_id and product_id, which helps link orders and products table together. 

In [0]:
orders.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00


In [0]:
orders.shape

(99441, 8)

In [0]:
order = orders.drop_duplicates() #drop the duplicates and check the number of duplicates
len(order) #We find that the orders dataset does not have duplicated rows

99441

The "order_items" dataset has already linked the orders with the products.

In [0]:
order_items.head() #we need to check another dataset "order_items"

Unnamed: 0,order_id,order_item_id,product_id,seller_id,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,199.0,17.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,12.99,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,199.9,18.14


In [0]:
#Merge order dataset with order_items dataset by order_id
order = order_items.merge(orders, left_on = order_items['order_id'], right_on = orders['order_id'], how = 'outer')
#drop unnecessary columns that will be included in the delivery table
order.drop(['key_0','order_status', 'order_id_y','order_approved_at', 'order_delivered_carrier_date', 
                   'order_estimated_delivery_date', 'order_delivered_customer_date'], axis = 1, inplace = True)
#check the order dataframe
order.head()

Unnamed: 0,order_id_x,order_item_id,product_id,seller_id,price,freight_value,customer_id,order_purchase_timestamp
0,00010242fe8c5a6d1ba2dd792cb16214,1.0,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,58.9,13.29,3ce436f183e68e07877b285a838db11a,2017-09-13 08:59:02
1,00018f77f2f0320c557190d7a144bdd3,1.0,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,239.9,19.93,f6dd3ec061db4e3987629fe6b26e5cce,2017-04-26 10:53:06
2,000229ec398224ef6ca0657da4fc703e,1.0,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,199.0,17.87,6489ae5e4333f3693df5ad4372dab6d3,2018-01-14 14:33:31
3,00024acbcdf0a6daa1e931b038114c75,1.0,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,12.99,12.79,d4eb9395c8c0431ee92fce09860c5a06,2018-08-08 10:00:35
4,00042b26cf59d7ce69dfabb4e55b4fd9,1.0,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,199.9,18.14,58dbd0b2d70206bf40e62cd34e84d795,2017-02-04 13:57:51


In [0]:
#reorder the dataframe columns and save it into a new order dataframe
order_df = order[['order_id_x', 'customer_id', 'order_purchase_timestamp', 'freight_value']]
#rename the columns
order_df.columns = [['order_id', 'customer_id', 'order_purchase_timestamp', 'order_freight_value']]
order_df.shape

(113425, 4)

In [0]:
#Drop the duplicated rows in the new order dataframe
order_df_2 = order_df.drop_duplicates()
len(order_df_2) #we find that there are duplicates in the original dataframe

101632

In [0]:
#order_df_2.to_sql(name='orders', con=engine, if_exists='append', index=False)

## Products_ordered

We also need to construct the products_ordered table specifying the order and product relationship by selecting the relavant columns from the order dataframe.

In [0]:
#selecting the variables from the order dataframe and save it as the products_ordered dataframe
products_ordered = order[['order_id_x', 'product_id', 'seller_id', 'price']]
products_ordered.columns = [['order_id', 'product_id', 'seller_id', 'item_price']]
products_ordered.shape

(113425, 4)

In [0]:
products_ordered_df = products_ordered.drop_duplicates() #drop duplicated rows
len(products_ordered_df) #we find that there are duplicates 

102426

In [0]:
#products_ordered_df.to_sql(name='products_ordered', con=engine, if_exists='append', index=False)

After loading the order and products_ordered tables, we need to load the delivery table into sql database.

In [0]:
#delivery.to_sql(name='delivery', con=engine, if_exists='append', index=False)

## Reviews

In [0]:
reviews.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53


In [0]:
#pick the columns
reviews = reviews[['review_id','order_id','review_score','review_comment_title',
                   'review_comment_message','review_creation_date','review_answer_timestamp']]
reviews.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53


In [0]:
#reviews.to_sql(name='reviews', con=engine, if_exists='append', index=False)

## Payment

In [0]:
payments.head()

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45


In [0]:
payments.shape

(103886, 5)

In [0]:
payment = payments.drop_duplicates() #drop duplicates and check the dataframe
payment.shape

(103886, 5)

In [0]:
#change the order of the columns
payment = payment[['order_id', 'payment_installments', 'payment_sequential','payment_type', 'payment_value']]
#construct the payment_id
payment.insert(0, 'payment_id', range(1, 1+len(payment)))
#check the dataframe
payment.head()

Unnamed: 0,payment_id,order_id,payment_installments,payment_sequential,payment_type,payment_value
0,1,b81ef226f3fe1789b1e8b2acac839d17,8,1,credit_card,99.33
1,2,a9810da82917af2d9aefd1278f1dcfa0,1,1,credit_card,24.39
2,3,25e8ea4e93396b6fa0d3dd708e76c1bd,1,1,credit_card,65.71
3,4,ba78997921bbcdc1373bb41e913ab953,8,1,credit_card,107.78
4,5,42fdf880ba16b47b59251dd489d4441a,2,1,credit_card,128.45


In [0]:
#rename the columns
payment.columns = [['payment_id ','order_id',
                    'installments','sequential',
                    'payment_type', 'transaction_value']]
payment.head()

Unnamed: 0,payment_id,order_id,installments,sequential,payment_type,transaction_value
0,1,b81ef226f3fe1789b1e8b2acac839d17,8,1,credit_card,99.33
1,2,a9810da82917af2d9aefd1278f1dcfa0,1,1,credit_card,24.39
2,3,25e8ea4e93396b6fa0d3dd708e76c1bd,1,1,credit_card,65.71
3,4,ba78997921bbcdc1373bb41e913ab953,8,1,credit_card,107.78
4,5,42fdf880ba16b47b59251dd489d4441a,2,1,credit_card,128.45


In [0]:
#payment.to_sql(name='payment', con=engine, if_exists='append', index=False)