In [None]:
import pandas as pd
import numpy as np
import os 


#* Load the raw sales report (path is relative to the working directory)
file_path = "../Data/Amazon Sale Report.csv"
# low_memory=False parses the file in a single pass, avoiding mixed-type
# inference between chunks.
df = pd.read_csv(file_path, low_memory=False)

#* Normalise column names: every space becomes an underscore
#  (a trailing space therefore becomes a trailing underscore, e.g. 'Sales_Channel_')
df.columns = [column_name.replace(' ', '_') for column_name in df.columns]

#* Replace missing values in selected columns with an explicit placeholder
placeholder_by_column = {
    'Courier_Status': 'unknown',
    'promotion-ids': 'no promotion',
}
for column_name, placeholder in placeholder_by_column.items():
    df[column_name] = df[column_name].fillna(placeholder)

#* Discard the columns that are not used anywhere below
df = df.drop(columns=['index', 'Unnamed:_22'])

# Debug aid: non-null count per column
#print(df.count())
print(df.columns)

#* Customer dimension table
# One row per distinct, fully populated shipping destination. customer_id is a
# 1-based surrogate key assigned in order of first appearance in df.
customer_columns = ['ship-city', 'ship-state', 'ship-postal-code', 'ship-country']
dim_customer = (
    df[customer_columns]
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)
dim_customer.insert(loc=0, column='customer_id', value=dim_customer.index + 1)

print(dim_customer.head())


#* Date dimension table
# Parse 'Date' as month-day-two-digit-year; values that do not match become
# NaT (errors='coerce') and are left out of the dimension by dropna().
df['Date'] = pd.to_datetime(df['Date'], format='%m-%d-%y', errors='coerce')

dim_date = (
    df[['Date']]
    .drop_duplicates()
    .dropna()
    .rename(columns={'Date': 'order_date'})
    .reset_index(drop=True)
)
# Break each date into its calendar components.
for date_part in ('year', 'month', 'day'):
    dim_date[date_part] = getattr(dim_date['order_date'].dt, date_part)
# 1-based surrogate key, placed as the first column.
dim_date.insert(loc=0, column='date_id', value=dim_date.index + 1)

print(dim_date.columns)
#* Product dimension table
# One row per distinct, fully populated SKU/Category/Style/Size combination,
# numbered from 1 through product_id.
product_columns = ['SKU', 'Category', 'Style', 'Size']
dim_product = (
    df[product_columns]
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)
dim_product.insert(loc=0, column='product_id', value=dim_product.index + 1)
print(dim_product.columns)
#* Fulfillment dimension table
# One row per distinct, fully populated combination of fulfilment type,
# sales channel and shipping service level, numbered from 1.
fulfillment_columns = ['Fulfilment', 'Sales_Channel_', 'ship-service-level']
dim_fulfillment = (
    df[fulfillment_columns]
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)
dim_fulfillment.insert(loc=0, column='fulfillment_id', value=dim_fulfillment.index + 1)
# Debug aid: show the whole table
#print(dim_fulfillment)
#* Promotion dimension table
# One row per distinct 'promotion-ids' value. Missing values were replaced
# with 'no promotion' earlier in the script, so that placeholder gets its own
# row here.
dim_promotion = (
    df[['promotion-ids']]
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)
dim_promotion.insert(loc=0, column='promotion_id', value=dim_promotion.index + 1)
print(dim_promotion)
#* Order status dimension table
# One row per distinct non-null 'Status' value, exposed as status_name and
# numbered from 1 through status_id.
dim_status = (
    df[['Status']]
    .drop_duplicates()
    .dropna()
    .rename(columns={'Status': 'status_name'})
    .reset_index(drop=True)
)
dim_status.insert(loc=0, column='status_id', value=dim_status.index + 1)
print(dim_status)

#* Courier status dimension table
# One row per distinct 'Courier_Status' value, exposed as courier_status_name.
# Missing values were replaced with 'unknown' earlier in the script, so that
# placeholder gets its own row here.
dim_courier_status = (
    df[['Courier_Status']]
    .drop_duplicates()
    .dropna()
    .rename(columns={'Courier_Status': 'courier_status_name'})
    .reset_index(drop=True)
)
dim_courier_status.insert(
    loc=0,
    column='courier_status_id',
    value=dim_courier_status.index + 1,
)
print(dim_courier_status)
 
#* Fact table: order lines with a surrogate key into every dimension table
# Columns that must all be populated for a row to enter the fact table.
fact_source_columns = [
    'Order_ID', 'Date',
    'ship-city', 'ship-state', 'ship-postal-code', 'ship-country',
    'SKU', 'Category', 'Style',
    'Fulfilment', 'Sales_Channel_', 'ship-service-level',
    'promotion-ids', 'Status', 'Courier_Status',
    'Qty', 'Amount', 'currency', 'B2B', 'fulfilled-by',
]

# 'Size' is carried along because dim_product is keyed on
# SKU/Category/Style/Size. Looking the product up on only the first three
# columns would match every size variant of a product and duplicate fact rows.
# dropna is restricted to the original column list, so adding 'Size' does not
# change which rows are kept.
# NOTE(review): this filter also discards rows that are blank in purely
# descriptive columns such as 'fulfilled-by' or 'currency' -- confirm that
# excluding those orders from the fact table is intended.
fact_sales = df[fact_source_columns + ['Size']].dropna(subset=fact_source_columns)

# 'Date' is already datetime64 here (it was parsed when the date dimension was
# built), so it is not parsed a second time.

# Look up the surrogate key of each dimension. how='left' keeps every fact
# row even when no dimension row matches (the key is then NaN), and
# validate='many_to_one' makes pandas raise a MergeError instead of silently
# multiplying fact rows if a dimension ever stops being unique on its join key.
fact_sales = fact_sales.merge(
    dim_customer,
    on=['ship-city', 'ship-state', 'ship-postal-code', 'ship-country'],
    how='left',
    validate='many_to_one',
)
fact_sales = fact_sales.merge(
    dim_product,
    on=['SKU', 'Category', 'Style', 'Size'],
    how='left',
    validate='many_to_one',
)
fact_sales = fact_sales.merge(
    dim_fulfillment,
    on=['Fulfilment', 'Sales_Channel_', 'ship-service-level'],
    how='left',
    validate='many_to_one',
)
fact_sales = fact_sales.merge(
    dim_promotion,
    on=['promotion-ids'],
    how='left',
    validate='many_to_one',
)
fact_sales = fact_sales.merge(
    dim_status,
    left_on='Status',
    right_on='status_name',
    how='left',
    validate='many_to_one',
).drop(columns=['status_name'])
fact_sales = fact_sales.merge(
    dim_courier_status,
    left_on='Courier_Status',
    right_on='courier_status_name',
    how='left',
    validate='many_to_one',
).drop(columns=['courier_status_name'])
fact_sales = fact_sales.merge(
    dim_date,
    left_on='Date',
    right_on='order_date',
    how='left',
    validate='many_to_one',
)

# Use lower-case names for the order identifier and the measures.
fact_sales = fact_sales.rename(columns={'Order_ID': 'order_id', 'Qty': 'quantity', 'Amount': 'amount'})

# Stamp every row with the time this table was built.
fact_sales['timestamp'] = pd.Timestamp.now()

# Keep only the surrogate keys, measures and order-level attributes; the
# natural-key columns used for the lookups above are dropped here.
fact_sales = fact_sales[[
    'order_id', 'date_id', 'customer_id', 'product_id', 'fulfillment_id',
    'promotion_id', 'status_id', 'courier_status_id',
    'quantity', 'amount', 'currency', 'B2B', 'fulfilled-by', 'timestamp',
]]

fact_sales.reset_index(drop=True, inplace=True)

print(fact_sales['timestamp'])
#* Write every dimension table and the fact table to its own CSV file
# index=False keeps the DataFrame index out of the files; each table already
# carries its own id column where one was created.
csv_outputs = [
    (dim_customer, '../Data/dim_customer.csv'),
    (dim_date, '../Data/dim_date.csv'),
    (dim_product, '../Data/dim_product.csv'),
    (dim_fulfillment, '../Data/dim_fulfillment.csv'),
    (dim_promotion, '../Data/dim_promotion.csv'),
    (dim_status, '../Data/dim_status.csv'),
    (dim_courier_status, '../Data/dim_courier_status.csv'),
    (fact_sales, "../Data/fact_sales.csv"),
]
for table, destination in csv_outputs:
    table.to_csv(destination, index=False)
