In [1]:
import io
import os

import pandas as pd
import psycopg2
import requests
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Download the merged product-sales CSV and parse it into a DataFrame.
url = 'https://raw.githubusercontent.com/krtmlry/product_sales_2019/main/data/productsales_merged_01.csv'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error; otherwise the error body would be parsed as CSV.
response.raise_for_status()
df = pd.read_csv(io.StringIO(response.text), sep=',')

# Convert data types

In [2]:
# Normalise column dtypes in one pass:
#   - numeric columns: unparseable values become 0 before the int/float cast
#   - order_date: parsed per element (format='mixed'); unparseable values become NaT
#   - text columns: cast to str
# The frame is then ordered by order_date, re-indexed from 0 and reduced to the
# six columns used downstream.
df = (
    df.assign(
        order_id=lambda d: pd.to_numeric(d['order_id'], errors='coerce').fillna(0).astype(int),
        product=lambda d: d['product'].astype(str),
        qty_ordered=lambda d: pd.to_numeric(d['qty_ordered'], errors='coerce').fillna(0).astype(int),
        price=lambda d: pd.to_numeric(d['price'], errors='coerce').fillna(0).astype(float),
        order_date=lambda d: pd.to_datetime(d['order_date'], format='mixed', errors='coerce'),
        shipping_address=lambda d: d['shipping_address'].astype(str),
    )
    .sort_values('order_date')
    .reset_index(drop=True)
    [['order_id', 'order_date', 'product', 'qty_ordered', 'price', 'shipping_address']]
)

In [3]:
# df.info()

# Create datetime_dim to store order_date details: year, month, day, weekday

In [4]:
# Date dimension: one row per row of df, keyed by a 1-based surrogate id, with
# the calendar parts of order_date split into their own columns
# (order_weekday follows pandas' convention: Monday=0 ... Sunday=6).
datetime_dim = (
    df[['order_date']]
    .reset_index(drop=True)
    .assign(
        datetime_dim_id=lambda d: d.index + 1,
        order_year=lambda d: d['order_date'].dt.year,
        order_month=lambda d: d['order_date'].dt.month,
        order_day=lambda d: d['order_date'].dt.day,
        order_weekday=lambda d: d['order_date'].dt.weekday,
    )
    [['datetime_dim_id', 'order_date', 'order_year', 'order_month', 'order_day', 'order_weekday']]
)
# datetime_dim.head()

# Extract product names and prices, and create a product ID for each item

In [5]:
# Distinct (product, price) pairs, ordered by product name; product_id is the
# 1-based position of each pair in that ordering.
unique_products = (
    df[['product', 'price']]
    .drop_duplicates()
    .sort_values(by='product')
    .reset_index(drop=True)
    .assign(product_id=lambda d: d.index + 1)
)
# unique_products

# Convert the product information into a dictionary to be used for mapping

In [6]:
# Lookup table keyed by product_id: {product_id: {'product': name, 'price': price}}.
product_details = {
    product_id: {'product': product_name, 'price': product_price}
    for product_name, product_price, product_id in (
        unique_products[['product', 'price', 'product_id']].itertuples(index=False, name=None)
    )
}
# product_details

In [7]:
# Reverse lookup (product name -> product_id) derived from product_details.
product_id_by_name = {info['product']: pid for pid, info in product_details.items()}

# Product dimension: one row per row of df with a 1-based surrogate id, the
# product_id resolved from the product name, and the price looked up through
# that product_id.
product_dim = (
    df[['product']]
    .reset_index(drop=True)
    .assign(
        product_dim_id=lambda d: d.index + 1,
        product_id=df['product'].map(product_id_by_name),
        price=lambda d: d['product_id'].map(lambda pid: product_details.get(pid, {}).get('price')),
    )
    [['product_dim_id', 'product_id', 'product', 'price']]
)
# product_dim.head()

# Create customer_dim

In [8]:
# Customer dimension: one row per row of df. Both ids are the 1-based row
# number (customer_id is the one labelled as the natural/main key), and city is
# the second ', '-separated component of the shipping address.
customer_dim = (
    df[['shipping_address']]
    .reset_index(drop=True)
    .assign(
        customer_dim_id=lambda d: d.index + 1,
        customer_id=lambda d: d.index + 1,
        city=lambda d: d['shipping_address'].str.split(', ').str[1],
    )
    [['customer_dim_id', 'customer_id', 'shipping_address', 'city']]
)
# customer_dim.head()

# Create order_details as the fact table

In [9]:
# Fact table: each row of df is joined by row index to the surrogate key of
# every dimension table, then extended with the line total
# (looked-up product price x quantity, rounded to 2 decimals).
# The three key columns are stored as nullable 32-bit integers.
order_details = (
    df[['order_id', 'qty_ordered']]
    .merge(datetime_dim[['datetime_dim_id']], how='left', left_index=True, right_index=True)
    .merge(customer_dim[['customer_dim_id']], how='left', left_index=True, right_index=True)
    .merge(product_dim[['product_dim_id']], how='left', left_index=True, right_index=True)
    [['order_id', 'datetime_dim_id', 'customer_dim_id', 'product_dim_id', 'qty_ordered']]
    .assign(total_price=(product_dim['price'] * df['qty_ordered']).round(2))
    .astype({
        'datetime_dim_id': pd.Int32Dtype(),
        'customer_dim_id': pd.Int32Dtype(),
        'product_dim_id': pd.Int32Dtype(),
    })
)

In [10]:
# Preview the first rows of the fact table to sanity-check the keys and totals.
order_details.head()

Unnamed: 0,order_id,datetime_dim_id,customer_dim_id,product_dim_id,qty_ordered,total_price
0,147268,1,1,1,1,11.99
1,148041,2,2,2,1,11.95
2,149343,3,3,3,1,150.0
3,149964,4,4,4,1,2.99
4,149350,5,5,5,2,23.9


# Export tables to Supabase

In [3]:
# Connection settings for the target PostgreSQL (Supabase) database.
# Each value is read from the conventional PostgreSQL environment variable so
# real credentials never have to be written into the notebook; the original
# placeholder values are kept as fallbacks when a variable is not set.
db_creds = {
    'host': os.environ.get('PGHOST', 'hostname'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD', 'password'),
    # Environment values are strings; the port is kept as an int as before.
    'port': int(os.environ.get('PGPORT', 5432)),
}

In [31]:
# Raw psycopg2 connection and cursor, used to execute the CREATE TABLE statements.
conn = psycopg2.connect(**db_creds)
cur = conn.cursor()

# SQLAlchemy engine used by DataFrame.to_sql. URL.create escapes every
# component, so a password containing characters such as '@', '/' or ':'
# cannot corrupt the connection URL the way string formatting would.
engine = create_engine(
    URL.create(
        drivername='postgresql',
        username=db_creds['user'],
        password=db_creds['password'],
        host=db_creds['host'],
        port=db_creds['port'],
        database=db_creds['database'],
    )
)

In [32]:
# DDL for the star schema: three dimension tables plus the order_details fact
# table, whose foreign keys reference the dimension tables' primary keys.
#
# NOTE(review): in all four statements the primary-key column is declared as
# `<name> PRIMARY KEY` with no data type. PostgreSQL expects a data type for
# every column, so these statements are expected to be rejected as written.
# Adding the type (e.g. INTEGER) has to be coordinated with the load step,
# which writes every table with to_sql(..., if_exists='replace').

# Date dimension.
create_datetime_dim = '''
    CREATE TABLE datetime_dim (
    datetime_dim_id PRIMARY KEY,
    order_date TIMESTAMP,
    order_year INTEGER,
    order_month INTEGER,
    order_day INTEGER,
    order_weekday INTEGER
    )
'''

# Customer dimension.
# NOTE(review): the address column is named purchase_address here, whereas the
# customer_dim DataFrame carries it as shipping_address — confirm the intended name.
create_customer_dim = '''
    CREATE TABLE customer_dim (
    customer_dim_id PRIMARY KEY,
    customer_id INTEGER,
    purchase_address VARCHAR(250),
    city VARCHAR(250)
    );
'''

# Product dimension.
create_product_dim = '''
    CREATE TABLE product_dim (
    product_dim_id PRIMARY KEY,
    product_id INTEGER,
    product VARCHAR(250),
    price FLOAT
    );
'''

# Fact table; must be created after the three dimension tables it references.
# NOTE(review): order_id is declared as the primary key, which requires it to
# be unique per row — verify against the data (unparseable ids are replaced
# with 0 during dtype conversion).
create_order_details = '''
    CREATE TABLE order_details (
    order_id PRIMARY KEY,
    datetime_dim_id INTEGER,
    customer_dim_id INTEGER,
    product_dim_id INTEGER,
    qty_ordered INTEGER,
    total_price FLOAT,
    FOREIGN KEY (datetime_dim_id) REFERENCES datetime_dim (datetime_dim_id),
    FOREIGN KEY (customer_dim_id) REFERENCES customer_dim (customer_dim_id),
    FOREIGN KEY (product_dim_id) REFERENCES product_dim (product_dim_id)
    );
'''

In [33]:
# Create the four tables inside a single transaction. The dimension tables go
# first because order_details declares foreign keys that reference them.
try:
    for ddl in (create_datetime_dim, create_customer_dim, create_product_dim, create_order_details):
        cur.execute(ddl)
    conn.commit()
except psycopg2.Error as exc:
    # Roll back so the connection stays usable, and report the database error
    # instead of discarding it. Only database errors are handled here; anything
    # else (e.g. a missing variable) is a notebook bug and is allowed to surface.
    conn.rollback()
    print(f'Table creation failed and was rolled back: {exc}')

In [34]:
# Upload every DataFrame to the table of the same name, dimensions first and
# the fact table last. if_exists='replace' drops an existing table and
# recreates it from the DataFrame, and index=False keeps the pandas index out
# of the table.
tables_to_export = {
    'datetime_dim': datetime_dim,
    'customer_dim': customer_dim,
    'product_dim': product_dim,
    'order_details': order_details,
}
try:
    for table_name, frame in tables_to_export.items():
        frame.to_sql(table_name, engine, if_exists='replace', index=False)
finally:
    # Release the cursor, the psycopg2 connection and the engine's pooled
    # connections even when an upload fails part-way through.
    cur.close()
    conn.close()
    engine.dispose()