In [1]:
import pandas as pd

from Extract.extract_s3 import extract_s3
from dotenv import load_dotenv
import os

In [2]:
# Load settings from the project-level .env file, then read the S3 bucket name.
load_dotenv("../.env")
BUCKET = os.getenv("BUCKET_Name")
# os.getenv returns None when the variable is absent; fail here with a clear
# message instead of handing an empty bucket to every extract_s3 call below.
if not BUCKET:
    raise ValueError(
        "Environment variable 'BUCKET_Name' is not set (expected in ../.env)"
    )

## Extracting Data from S3

In [3]:
# Customers table: fetch Data/customers.parquet and stop if nothing was returned.
customers_df = extract_s3(
    key="Data/customers.parquet",
    bucket=BUCKET,
)
if customers_df is None:
    raise ValueError("No data found")

In [4]:
# Order items table: fetch Data/order_items.parquet and stop if nothing was returned.
order_items_df = extract_s3(
    bucket=BUCKET,
    key="Data/order_items.parquet",
)
if order_items_df is None:
    raise ValueError("No data found")

In [5]:
# Order payments table: fetch Data/order_payments.parquet.
order_payments_df = extract_s3(BUCKET, key="Data/order_payments.parquet")
# Same guard as every other extract in this section: without it a missing
# object only surfaces much later, as an error on a None value.
if order_payments_df is None:
    raise ValueError("No data found")

In [6]:
# Order reviews table: fetch Data/order_reviews.parquet and stop if nothing was returned.
order_reviews_df = extract_s3(
    bucket=BUCKET,
    key="Data/order_reviews.parquet",
)
if order_reviews_df is None:
    raise ValueError("No data found")

In [7]:
# Orders table: fetch Data/orders.parquet and stop if nothing was returned.
orders_df = extract_s3(
    bucket=BUCKET,
    key="Data/orders.parquet",
)
if orders_df is None:
    raise ValueError("No data found")

In [8]:
# Product category table: fetch Data/product_categories.parquet and stop if nothing was returned.
product_category_df = extract_s3(
    bucket=BUCKET,
    key="Data/product_categories.parquet",
)
if product_category_df is None:
    raise ValueError("No data found")

In [9]:
# Products table: fetch Data/products.parquet and stop if nothing was returned.
products_df = extract_s3(
    bucket=BUCKET,
    key="Data/products.parquet",
)
if products_df is None:
    raise ValueError("No data found")

In [10]:
# Sellers table: fetch Data/sellers.parquet and stop if nothing was returned.
sellers_df = extract_s3(
    bucket=BUCKET,
    key="Data/sellers.parquet",
)
if sellers_df is None:
    raise ValueError("No data found")

## Cleaning Data

### Customers

In [11]:
customers_df.head()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e-2fba-1a1f-bc88-172c00ba8bc7,861eff47-11a5-42e4-b938-43c6dd7febb0,14409,franca,SP
1,18955e83-d337-fd6b-2def-6b18a428ac77,290c77bc-529b-7ac9-35b9-3aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00-2885-86eb-d087-12fdd0374a03,060e732b-5b29-e818-1a18-229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027b-c5c5-109e-529d-4dc6358b12c3,259dac75-7896-d24d-7702-b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab1-71c8-0ec8-364f-7c12e35b23ad,345ecd01-c38d-18a9-036e-d96c73b8d066,13056,campinas,SP


In [12]:
customers_df.shape

(99441, 5)

In [13]:
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB


In [14]:
customers_df.duplicated().sum()

np.int64(0)

In [15]:
from uuid import UUID

In [16]:
# Parse both identifier columns from their string form into uuid.UUID objects.
for id_column in ("customer_id", "customer_unique_id"):
    customers_df[id_column] = customers_df[id_column].map(UUID)

In [17]:
customers_df.isnull().sum()

customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64

In [18]:
# Give every distinct customer a readable label ("Customer_1", "Customer_2", ...)
# numbered in order of first appearance, then attach it as `customer_name`.
distinct_customer_ids = customers_df["customer_unique_id"].unique()
id_to_name = {
    unique_id: f"Customer_{position}"
    for position, unique_id in enumerate(distinct_customer_ids, start=1)
}
customers_df["customer_name"] = customers_df["customer_unique_id"].map(id_to_name)

### Order Items

In [19]:
order_items_df.head()

Unnamed: 0,order_id,quantity,product_id,seller_id,shipping_limit_date,price,freight_value,order_items_id,total_price
0,00010242-fe8c-5a6d-1ba2-dd792cb16214,1,4244733e-06e7-ecb4-970a-6e2683c13e61,48436dad-e18a-c8b2-bce0-89ec2a041202,2017-09-19 09:45:35,58.9,13.29,0,72.19
1,00018f77-f2f0-320c-5571-90d7a144bdd3,1,e5f2d52b-8021-89ee-6588-65ca93d83a8f,dd7ddc04-e1b6-c2c6-1435-2b383efe2d36,2017-05-03 11:05:13,239.9,19.93,1,259.83
2,000229ec-3982-24ef-6ca0-657da4fc703e,1,c777355d-18b7-2b67-abbe-ef9df44fd0fd,5b51032e-ddd2-42ad-c84c-38acab88f23d,2018-01-18 14:48:30,199.0,17.87,2,216.87
3,00024acb-cdf0-a6da-a1e9-31b038114c75,1,7634da15-2a46-10f1-595e-fa32f14722fc,9d7a1d34-a505-2409-0064-25275ba1c2b4,2018-08-15 10:10:18,12.99,12.79,3,25.78
4,00042b26-cf59-d7ce-69df-abb4e55b4fd9,1,ac6c3623-068f-30de-0304-5865e4e10089,df560393-f3a5-1e74-553a-b94004ba5c87,2017-02-13 13:57:51,199.9,18.14,4,218.04


In [20]:
order_items_df.shape

(112650, 9)

In [21]:
order_items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   order_id             112650 non-null  object        
 1   quantity             112650 non-null  int64         
 2   product_id           112650 non-null  object        
 3   seller_id            112650 non-null  object        
 4   shipping_limit_date  112650 non-null  datetime64[ns]
 5   price                112650 non-null  float64       
 6   freight_value        112650 non-null  float64       
 7   order_items_id       112650 non-null  int64         
 8   total_price          112650 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(2), object(3)
memory usage: 7.7+ MB


In [22]:
order_items_df.sample(10)

Unnamed: 0,order_id,quantity,product_id,seller_id,shipping_limit_date,price,freight_value,order_items_id,total_price
84797,c0a8ef94-e050-fd7f-b74a-3757be5b3da0,1,53b36df6-7ebb-7c41-585e-8d54d6772e08,7d13fca1-5225-3586-21be-4086e1eb0964,2018-07-09 19:31:08,119.0,13.49,84797,132.49
36121,51e88bc3-51dd-268b-b9c9-e78d263c4a06,1,d6fcd73c-e07d-a9fa-8685-8f67f6ba7346,42ef3192-a9ff-87a2-2d18-67b74b3ee205,2018-03-13 15:55:46,58.0,9.42,36121,67.42
59949,8890828f-16fa-cb8b-8a9c-48194daccb85,1,3863a6df-816f-3b8c-e035-8a1243541fcb,1b4c3a6f-5306-8f0b-6944-d2d005c9fc89,2017-07-06 13:15:12,74.9,17.77,59949,92.67
74006,a8ad7572-8836-e5b2-1034-63b917d0e6f4,1,a1eae0ed-718c-783a-4c0d-fef382d31ee4,1025f0e2-d44d-7041-d6cf-58b6550e0bfa,2018-08-08 21:04:14,320.0,45.38,74006,365.38
29565,4334cbad-2570-eb17-ab0f-2be70bcc22b7,1,0fe922b7-f7d4-3ef4-54bb-c241416e7401,f7ba60f8-c3f9-9e7e-e404-2fdef03b70c4,2018-02-27 03:31:13,16.0,7.78,29565,23.78
46176,68d8838c-9019-1c81-a8e3-e1273e230414,1,c835fd9d-2e46-6148-ac37-55300628e33d,a3a38f4a-ffed-601e-b87a-97788c949667,2018-04-25 19:11:00,59.9,23.35,46176,83.25
95530,d8b1b374-baab-f8ba-4778-23087ddfb1a2,1,a1ebb47a-4820-0e65-6660-3de49b83f621,46dc3b2c-c098-0fb8-ec44-634e21d2718e,2017-10-20 12:56:32,269.99,26.34,95530,296.33
109361,f8898796-f33b-7a07-41dc-0cf4f54c0001,1,363218ba-55c6-10b7-5022-4f90bdd34be1,00ee6830-8b45-bc5e-2660-cd833c3f81cc,2017-10-09 08:07:19,90.0,16.88,109361,106.88
39386,59880d84-39b6-a1a6-9772-7592bdab4b96,1,a10e0fcb-1c40-9869-c3c6-da4eb13b7612,855668e0-971d-4dfd-7bef-1b6a4133b41b,2017-10-24 11:06:31,50.0,18.59,39386,68.59
59035,867b894c-187b-59ff-2425-447c7c13c898,1,cc8a6ff2-506f-53b7-3873-7ef6b4dccd17,e601b04a-cf48-b457-6e73-ddca9481f0dc,2018-06-18 02:57:19,12.49,7.39,59035,19.88


In [23]:
# Remove the surrogate column in place; the frame keeps its remaining columns.
order_items_df.drop(columns=["order_items_id"], inplace=True)

`order_items_id` is dropped because it carries no information about the data: it simply duplicates the row index.

In [24]:
# The column labelled `quantity` actually holds the item's position within its order.
order_items_df.rename(columns={"quantity": "order_item_id"}, inplace=True)

**Note:** The `quantity` column was mislabelled: it is really `order_item_id`, and it had been used as a quantity in the total price calculation, so the total price has to be recalculated.
`order_item_id` is the id of an item within a particular order.

In [25]:
# Total charged for an item = product price + freight for that item.
item_price = order_items_df["price"]
item_freight = order_items_df["freight_value"]
order_items_df["total_price"] = item_freight + item_price

`total_price` was originally calculated using `order_item_id` as if it were a quantity, so it is recalculated here as `price` (the price of the product) plus `freight_value` (the shipping cost for the order).

In [26]:
import numpy as np

In [27]:
# Round the recalculated total to the nearest whole number and store it as an integer.
# Series.round is vectorised and yields the same values as applying np.round to
# each element, without one Python-level call per row.
# NOTE(review): this discards the cents of a monetary amount - confirm that
# whole-unit precision is what downstream consumers expect.
order_items_df["total_price"] = order_items_df["total_price"].round().astype(int)

In [28]:
order_items_df.duplicated().sum()

np.int64(0)

In [29]:
# Parse the three identifier columns from strings into uuid.UUID objects.
for id_column in ("order_id", "product_id", "seller_id"):
    order_items_df[id_column] = order_items_df[id_column].map(UUID)

### Order Payments

In [30]:
order_payments_df.shape

(103886, 5)

In [31]:
order_payments_df.dtypes

order_id                 object
payment_sequential        int64
payment_type             object
payment_installments      int64
payment_value           float64
dtype: object

All `order_payments` columns already have the correct data types.

In [32]:
order_payments_df.isnull().sum()

order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64

In [33]:
order_payments_df.duplicated().sum()

np.int64(0)

In [34]:
# Parse the order identifier from its string form into a uuid.UUID object.
order_payments_df["order_id"] = order_payments_df["order_id"].map(UUID)

### Order Reviews

In [35]:
order_reviews_df.shape

(99224, 7)

In [36]:
order_reviews_df.dtypes

review_id                          object
order_id                           object
review_score                        int64
review_comment_title               object
review_comment_message             object
review_creation_date       datetime64[ns]
review_answer_timestamp            object
dtype: object

In [37]:
order_reviews_df.isnull().sum()

review_id                      0
order_id                       0
review_score                   0
review_comment_title       87656
review_comment_message     58247
review_creation_date           0
review_answer_timestamp        0
dtype: int64

`review_comment_title` and `review_comment_message` are **NaN** in more than half of the rows (about 88% and 59% respectively), so both columns are dropped.

In [38]:
order_reviews_df.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc24061-10b9-2639-3aa5-6f80a40eba40,73fc7af8-7114-b397-12e6-da79b0a377eb,4,,,2018-01-18,2018-01-18 21:46:59
1,80e641a1-1e56-f04c-1ad4-69d5645fdfde,a548910a-1c61-4779-6b98-fdf73dbeba33,5,,,2018-03-10,2018-03-11 03:05:13
2,228ce550-0dc1-d8e0-20d8-d1322874b6f0,f9e4b658-b201-a9f2-ecde-cbb34bed034b,5,,,2018-02-17,2018-02-18 14:36:24
3,e64fb393-e7b3-2834-bb78-9ff8bb30750e,658677c9-7b38-5a9b-e170-737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21,2017-04-21 22:02:06
4,f7c4243c-7fe1-938f-181b-ec41a392bdeb,8e6bfb81-e283-fa7e-4f11-123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01,2018-03-02 10:26:53


In [39]:
# Free-text columns that are mostly empty; removed in place.
sparse_text_columns = ["review_comment_title", "review_comment_message"]
order_reviews_df.drop(columns=sparse_text_columns, inplace=True)

In [40]:
# `review_answer_timestamp` arrives as text; parse it into datetime values.
parsed_answer_time = pd.to_datetime(order_reviews_df["review_answer_timestamp"])
order_reviews_df["review_answer_timestamp"] = parsed_answer_time

In [41]:
order_reviews_df.dtypes

review_id                          object
order_id                           object
review_score                        int64
review_creation_date       datetime64[ns]
review_answer_timestamp    datetime64[ns]
dtype: object

In [42]:
order_reviews_df.duplicated().sum()

np.int64(0)

In [43]:
# Parse both identifier columns from strings into uuid.UUID objects.
for id_column in ("order_id", "review_id"):
    order_reviews_df[id_column] = order_reviews_df[id_column].map(UUID)

In [44]:
# Number of reviews attached to each order, largest first.
(
    order_reviews_df.groupby("order_id")["review_id"]
    .count()
    .sort_values(ascending=False)
)

order_id
03c939fd-7fd3-b38f-8485-a0f95798f1f6    3
8e17072e-c97c-e29f-0e1f-111e598b0c85    3
c88b1d1b-157a-9999-ce36-8f218a407141    3
df56136b-8031-ecd2-8e20-0bb18e6ddb2e    3
7f13a20e-2535-0f4a-55fb-2a7c9a2e8d88    2
                                       ..
ffe73fc1-d73e-03fd-b50e-63903ddfe882    1
ffe88510-12fc-daf8-3de7-b595fd5154b3    1
ffea20c7-6303-43a6-cd9e-09858c1295cd    1
ffea406a-848c-8afe-4dec-22bf6290ba00    1
0008288a-a423-d2a3-f00f-cb17cd7d8719    1
Name: review_id, Length: 98673, dtype: int64

In [45]:
# Show every review recorded for one of the orders that has three of them.
multi_review_order = UUID("03c939fd-7fd3-b38f-8485-a0f95798f1f6")
order_reviews_df.loc[order_reviews_df["order_id"] == multi_review_order]

Unnamed: 0,review_id,order_id,review_score,review_creation_date,review_answer_timestamp
8273,b04ed893-318d-a5b8-63e8-78cd3d0511df,03c939fd-7fd3-b38f-8485-a0f95798f1f6,3,2018-03-20,2018-03-21 02:28:23
51351,f4bb9d6d-d4fb-6dcc-2298-f0e7b17b8e1e,03c939fd-7fd3-b38f-8485-a0f95798f1f6,4,2018-03-29,2018-03-30 00:29:09
69438,405eb2ea-45e1-dbe2-6625-41ae5b47e2aa,03c939fd-7fd3-b38f-8485-a0f95798f1f6,3,2018-03-06,2018-03-06 19:50:32


A single `order_id` can have up to 3 reviews, which is misleading: an order can only be bought by a single customer, so we remove the duplicates and keep only the latest review for each order.

In [46]:
# Keep exactly one review per order: the most recent one.
# `review_creation_date` has day resolution, so two reviews of the same order can
# share it; `review_answer_timestamp` is the tie-breaker, and a stable sort makes
# the surviving row deterministic (the default sort gives no ordering guarantee
# among equal keys, so "latest" was arbitrary for tied rows).
order_reviews_df = order_reviews_df.sort_values(
    ["review_creation_date", "review_answer_timestamp"], kind="stable"
).drop_duplicates("order_id", keep="last")

### Orders

In [47]:
orders_df.shape

(99441, 8)

In [48]:
orders_df.dtypes

order_id                                 object
customer_id                              object
order_status                             object
order_purchase_timestamp         datetime64[ns]
order_approved_at                datetime64[ns]
order_delivered_carrier_date     datetime64[ns]
order_delivered_customer_date    datetime64[ns]
order_estimated_delivery_date    datetime64[ns]
dtype: object

In [49]:
orders_df.isnull().sum()

order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64

`order_approved_at`, `order_delivered_carrier_date` and `order_delivered_customer_date` contain NaN values, but these are linked to `order_status`: if an order is cancelled, those columns are not populated.

In [50]:
orders_df.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51c-bdc5-4678-b7cc-49136f2d6af7,9ef432eb-6251-2973-04e7-6186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18
1,53cdb2fc-8bc7-dce0-b674-1e2150273451,b0830fb4-747a-6c6d-20de-a0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13
2,47770eb9-100c-2d0c-4494-6d9cf07ec65d,41ce2a54-c0b0-3bf3-443c-3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04
3,949d5b44-dbf5-de91-8fe9-c16f97b45f8a,f8819746-5ea7-920a-dcdb-ec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15
4,ad21c59c-0840-e6cb-83a9-ceb5573f8159,8ab97904-e6da-ea88-66db-dbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26


In [51]:
orders_df.duplicated().sum()

np.int64(0)

In [52]:
# Parse both identifier columns from strings into uuid.UUID objects.
for id_column in ("order_id", "customer_id"):
    orders_df[id_column] = orders_df[id_column].map(UUID)

### Product Categories

In [53]:
product_category_df.shape

(71, 2)

In [54]:
product_category_df.dtypes

product_category_name            object
product_category_name_english    object
dtype: object

In [55]:
product_category_df.isnull().sum()

product_category_name            0
product_category_name_english    0
dtype: int64

In [56]:
product_category_df.head()

Unnamed: 0,product_category_name,product_category_name_english
0,beleza_saude,health_beauty
1,informatica_acessorios,computers_accessories
2,automotivo,auto
3,cama_mesa_banho,bed_bath_table
4,moveis_decoracao,furniture_decor


### Products

In [57]:
products_df.shape

(32951, 9)

In [58]:
products_df.dtypes

product_id                     object
product_category_name          object
product_name_lenght           float64
product_description_lenght    float64
product_photos_qty            float64
product_weight_g              float64
product_length_cm             float64
product_height_cm             float64
product_width_cm              float64
dtype: object

In [59]:
products_df.isnull().sum()

product_id                      0
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2
dtype: int64

In [60]:
products_df.head()

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef0-4dbc-ff45-41ed-26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa07113-9cb1-6b67-ca9e-5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec-8810-374e-d1b6-5e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,cef67bcf-e190-66a9-32b7-673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,9dc1a7de-2744-4484-9c21-9cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0


In [61]:
# Correct the "lenght" misspelling in the two source column names.
products_df.rename(
    columns={
        "product_name_lenght": "product_name_length",
        "product_description_lenght": "product_description_length",
    },
    inplace=True,
)

In [62]:
products_df.duplicated().sum()

np.int64(0)

In [63]:
# Products without a category are kept and labelled "unknown".
products_df["product_category_name"] = products_df["product_category_name"].fillna(
    "unknown"
)

`product_category_name` contains NaN values, but those rows still carry important information about the products, so instead of dropping them we replace the missing category with "unknown".

In [64]:
# Parse the product identifier from its string form into a uuid.UUID object.
products_df["product_id"] = products_df["product_id"].map(UUID)

In [65]:
# Give every distinct product a readable label ("Product_1", "Product_2", ...)
# numbered in order of first appearance, then attach it as `product_name`.
distinct_product_ids = products_df["product_id"].unique()
id_to_product_name = {
    product_id: f"Product_{position}"
    for position, product_id in enumerate(distinct_product_ids, start=1)
}
products_df["product_name"] = products_df["product_id"].map(id_to_product_name)



### Sellers

In [67]:
sellers_df.shape

(3095, 4)

In [68]:
sellers_df.dtypes

seller_id                 object
seller_zip_code_prefix     int64
seller_city               object
seller_state              object
dtype: object

In [69]:
sellers_df.isnull().sum()

seller_id                 0
seller_zip_code_prefix    0
seller_city               0
seller_state              0
dtype: int64

In [70]:
sellers_df.head()

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f895-9a84-dea7-ee19-7c632cb2df15,13023,campinas,SP
1,d1b65fc7-debc-3361-ea86-b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de-9601-02d0-677a-81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2-e145-55b6-faee-a3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a-6bdc-b23d-eccc-82b0b80742cf,12914,braganca paulista,SP


In [71]:
sellers_df.duplicated().sum()

np.int64(0)

In [72]:
# Parse the seller identifier from its string form into a uuid.UUID object.
sellers_df["seller_id"] = sellers_df["seller_id"].map(UUID)

In [73]:
# Give every distinct seller a readable label ("Seller_1", "Seller_2", ...)
# numbered in order of first appearance, then attach it as `seller_name`.
distinct_seller_ids = sellers_df["seller_id"].unique()
id_to_seller_name = {
    seller_id: f"Seller_{position}"
    for position, seller_id in enumerate(distinct_seller_ids, start=1)
}
sellers_df["seller_name"] = sellers_df["seller_id"].map(id_to_seller_name)

## Dimension Modeling

In [74]:
# Lookup Series: per-order `customer_id` (index) -> `customer_unique_id` (value).
customer_map = customers_df.set_index("customer_id").loc[:, "customer_unique_id"]

In [75]:
# Replace each order's `customer_id` with the value looked up in `customer_map`.
order_customer_key = orders_df["customer_id"].map(customer_map)
orders_df["customer_id"] = order_customer_key

In [76]:
# Remove the `customer_id` column in place.
customers_df.drop(columns="customer_id", inplace=True)

In [77]:
# One row per customer: `customer_unique_id` is about to become the dimension key,
# so it must be unique. A plain drop_duplicates() only removes fully identical
# rows and keeps several rows for a customer whose zip/city/state differs
# between records, which would fan out any join on the key.
# NOTE(review): keep="first" picks one address arbitrarily - confirm which
# location should represent such customers.
customers_df.drop_duplicates(subset="customer_unique_id", keep="first", inplace=True)

In [78]:
# The unique id takes over the `customer_id` name from here on.
customers_df.rename(columns={"customer_unique_id": "customer_id"}, inplace=True)

### Fact Tables

In [139]:
# Fact table at order-item grain: every order item, left-joined to its order
# header and to the review of that order.
# validate="many_to_one" makes pandas raise MergeError if `order_id` is ever
# duplicated on the right-hand side, instead of silently multiplying item rows.
fact_orders = order_items_df.merge(
    orders_df, on="order_id", how="left", validate="many_to_one"
).merge(order_reviews_df, on="order_id", how="left", validate="many_to_one")

In [144]:
# Only the review score is kept on the fact table; the other review columns go.
unused_review_columns = ["review_id", "review_answer_timestamp", "review_creation_date"]
fact_orders.drop(columns=unused_review_columns, inplace=True)

In [145]:
# Expose the review score under the name `order_rating`.
fact_orders.rename(columns={"review_score": "order_rating"}, inplace=True)

In [81]:
# Number of payment rows recorded for each order, largest first.
(
    order_payments_df.groupby("order_id")["payment_type"]
    .count()
    .sort_values(ascending=False)
)

order_id
fa65dad1-b0e8-18e3-ccc5-cb0e39231352    29
ccf804e7-64ed-5650-cd87-59557269dc13    26
285c2e15-bebd-4ac8-3635-ccc563dc71f4    22
895ab968-e7bb-0d56-59d1-6cd74cd1650c    21
ee9ca989-fc93-ba09-a6ed-dc250ce01742    19
                                        ..
ffe73fc1-d73e-03fd-b50e-63903ddfe882     1
ffe88510-12fc-daf8-3de7-b595fd5154b3     1
ffea20c7-6303-43a6-cd9e-09858c1295cd     1
ffea406a-848c-8afe-4dec-22bf6290ba00     1
00063b38-1e24-06b5-2ad4-29470734ebd5     1
Name: payment_type, Length: 99440, dtype: int64

`order_payments_df` can contain multiple payments for a single order, so merging it into `fact_orders` would duplicate rows; it is therefore kept as a separate fact table.

In [82]:
# Payments fact table. Take a copy rather than a second name for the same
# object, so later in-place edits of `order_payments_df` cannot change the
# modelled table (and vice versa).
fact_payments = order_payments_df.copy()

### Dim Tables

In [83]:
customers_df.duplicated().sum()

np.int64(0)

In [84]:
# Customer dimension. Take a copy rather than a second name for the same
# object, so later in-place edits of `customers_df` cannot change the
# modelled table (and vice versa).
dim_customers = customers_df.copy()

In [85]:
sellers_df.duplicated().sum()

np.int64(0)

In [87]:
# Seller dimension. Take a copy rather than a second name for the same
# object, so later in-place edits of `sellers_df` cannot change the
# modelled table (and vice versa).
dim_sellers = sellers_df.copy()

In [88]:
product_category_df.head()

Unnamed: 0,product_category_name,product_category_name_english
0,beleza_saude,health_beauty
1,informatica_acessorios,computers_accessories
2,automotivo,auto
3,cama_mesa_banho,bed_bath_table
4,moveis_decoracao,furniture_decor


In [94]:
# Translation dictionary: original category name -> English category name.
product_category_map = dict(
    zip(
        product_category_df["product_category_name"],
        product_category_df["product_category_name_english"],
    )
)

In [96]:
# Translate category names to English.
# "unknown" (the placeholder used for missing categories) maps to itself.
product_category_map["unknown"] = "unknown"
# Series.map(dict) yields NaN for every value that is not a key of the dict, so a
# category absent from the translation table would silently lose its name, and
# re-running this cell would blank out names that were already translated.
# Falling back to the current value keeps such categories as they are.
current_category = products_df["product_category_name"]
products_df["product_category_name"] = current_category.map(
    product_category_map
).fillna(current_category)

In [114]:
products_df["product_category_name"].value_counts()

product_category_name
bed_bath_table               3029
sports_leisure               2867
furniture_decor              2657
health_beauty                2444
housewares                   2335
                             ... 
tablets_printing_image          9
fashion_childrens_clothes       5
home_comfort_2                  5
security_and_services           2
cds_dvds_musicals               1
Name: count, Length: 72, dtype: int64

In [115]:
# Product dimension. Take a copy rather than a second name for the same
# object, so later in-place edits of `products_df` cannot change the
# modelled table (and vice versa).
dim_products = products_df.copy()

In [119]:
# Calendar spine: one row per day from 2016-01-01 through 2018-12-31 inclusive.
calendar_days = pd.date_range(start="2016-01-01", end="2018-12-31")
dim_dates = pd.DataFrame({"date": calendar_days})

In [133]:
# Derive the calendar attributes of each day from the `date` column.
date_parts = dim_dates["date"].dt
dim_dates["quarter"] = date_parts.quarter
dim_dates["month"] = date_parts.month
dim_dates["year"] = date_parts.year
# "%W" counts weeks starting on Monday; days before the year's first Monday are week 0.
dim_dates["week_by_year"] = date_parts.strftime("%W").astype(int)
dim_dates["day"] = date_parts.day
dim_dates["weekday"] = date_parts.weekday  # Monday == 0
dim_dates["weekday_name"] = date_parts.day_name()

In [151]:
dim_dates.head()

Unnamed: 0,date,quarter,month,year,week_by_year,day,weekday,weekday_name
0,2016-01-01,1,1,2016,0,1,4,Friday
1,2016-01-02,1,1,2016,0,2,5,Saturday
2,2016-01-03,1,1,2016,0,3,6,Sunday
3,2016-01-04,1,1,2016,1,4,0,Monday
4,2016-01-05,1,1,2016,1,5,1,Tuesday
