# Retail Data Warehouse Build

Build star schema (DimDate, DimProduct, DimCustomer, DimGeography, FactSales) from `raw_data/Online Retail.xlsx` and run example queries.

If dependencies are missing install:
```
pip install pandas openpyxl
```

In [1]:
# Setup: imports, root + DB connection (change DB_PATH to persist)
import pandas as pd
import sqlite3
from pathlib import Path
from datetime import datetime

# Resolve project root as parent of this notebook file
NOTEBOOK_PATH = Path.cwd()
# If running from a different CWD, attempt to locate raw_data by walking up
potential = [NOTEBOOK_PATH] + list(NOTEBOOK_PATH.parents)
DATA_FILE = 'Online Retail.xlsx'
DATA_PATH = None
for base in potential:
    candidate = base / 'raw_data' / DATA_FILE
    if candidate.exists():
        DATA_PATH = candidate
        break
if DATA_PATH is None:
    print('Data file not found. Checked:')
    for base in potential:
        print(' -', (base / 'raw_data' / DATA_FILE))
else:
    print('Using data file:', DATA_PATH)

DB_PATH = ':memory:'  # change to 'retail_dw.sqlite' to persist
conn = sqlite3.connect(DB_PATH)
conn.execute('PRAGMA foreign_keys = ON;')
print('SQLite version:', conn.execute('select sqlite_version();').fetchone()[0])

Using data file: k:\Code Projects\DSA2040_Practical_Exam_Justice_444\raw_data\Online Retail.xlsx
SQLite version: 3.40.1


## Load Source Data
Expected columns: InvoiceNo, StockCode, Description, Quantity, InvoiceDate, UnitPrice, CustomerID, Country

In [2]:
# Load Excel source into DataFrame; show sample & row count
# Expected columns: InvoiceNo, StockCode, Description, Quantity, InvoiceDate, UnitPrice, CustomerID, Country
from pathlib import Path
if DATA_PATH is None or not Path(DATA_PATH).exists():
    raise FileNotFoundError('Cannot locate data file. Ensure script run from project root or adjust path.')

df = pd.read_excel(DATA_PATH)
df.columns = [c.strip() for c in df.columns]
print(df.head())
print('Rows:', len(df))

  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  
0 2010-12-01 08:26:00       2.55     17850.0  United Kingdom  
1 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
2 2010-12-01 08:26:00       2.75     17850.0  United Kingdom  
3 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
4 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
Rows: 541909


## Cleaning
- Drop missing InvoiceNo / StockCode
- Fill missing description with 'Unknown'
- Normalize string casing
- Mark returns (negative quantity)
- Anonymous customer for NULL CustomerID

In [3]:
# Clean + derive metrics
# - Remove blank keys, normalize text
# - Mark returns (negative quantity)
# - Derive sales_amount metrics and keep nullable CustomerID

df = df.dropna(subset=['InvoiceNo','StockCode'])
df['Description'] = df['Description'].fillna('Unknown').str.strip()
df['StockCode'] = df['StockCode'].astype(str).str.strip()
df['InvoiceNo'] = df['InvoiceNo'].astype(str).str.strip()
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['Country'] = df['Country'].astype(str).str.strip()
df['CustomerID'] = df['CustomerID'].astype('Int64')

df['is_return'] = (df['Quantity'] < 0).astype(int)
df['sales_amount'] = df['Quantity'] * df['UnitPrice']
df['sales_amount_abs'] = df['sales_amount'].abs()
print('After cleaning rows:', len(df))

After cleaning rows: 541909


## DimDate

In [4]:
min_date = df['InvoiceDate'].min().normalize()
max_date = df['InvoiceDate'].max().normalize()
date_range = pd.date_range(min_date, max_date, freq='D')
dim_date = pd.DataFrame({'full_date': date_range})
dim_date['date_key'] = dim_date['full_date'].dt.strftime('%Y%m%d').astype(int)
dim_date['day'] = dim_date['full_date'].dt.day
dim_date['month'] = dim_date['full_date'].dt.month
dim_date['month_name'] = dim_date['full_date'].dt.strftime('%B')
dim_date['quarter'] = dim_date['full_date'].dt.quarter
dim_date['year'] = dim_date['full_date'].dt.year
dim_date['week_of_year'] = dim_date['full_date'].dt.isocalendar().week.astype(int)
# pandas uses 0=Monday via dayofweek; convert to 1-7 ISO style
dim_date['day_of_week'] = dim_date['full_date'].dt.dayofweek + 1
dim_date['is_weekend'] = dim_date['day_of_week'].isin([6,7]).astype(int)
dim_date = dim_date[['date_key','full_date','day','month','month_name','quarter','year','week_of_year','day_of_week','is_weekend']]
dim_date.head()

Unnamed: 0,date_key,full_date,day,month,month_name,quarter,year,week_of_year,day_of_week,is_weekend
0,20101201,2010-12-01,1,12,December,4,2010,48,3,0
1,20101202,2010-12-02,2,12,December,4,2010,48,4,0
2,20101203,2010-12-03,3,12,December,4,2010,48,5,0
3,20101204,2010-12-04,4,12,December,4,2010,48,6,1
4,20101205,2010-12-05,5,12,December,4,2010,48,7,1


## DimGeography

In [5]:
# Build DimDate spanning min..max invoice dates
min_date = df['InvoiceDate'].min().normalize()
max_date = df['InvoiceDate'].max().normalize()
date_range = pd.date_range(min_date, max_date, freq='D')
dim_date = pd.DataFrame({'full_date': date_range})
dim_date['date_key'] = dim_date['full_date'].dt.strftime('%Y%m%d').astype(int)
dim_date['day'] = dim_date['full_date'].dt.day
dim_date['month'] = dim_date['full_date'].dt.month
dim_date['month_name'] = dim_date['full_date'].dt.strftime('%B')
dim_date['quarter'] = dim_date['full_date'].dt.quarter
dim_date['year'] = dim_date['full_date'].dt.year
dim_date['week_of_year'] = dim_date['full_date'].dt.isocalendar().week.astype(int)
# Convert to ISO 1-7 (Mon-Sun)
dim_date['day_of_week'] = dim_date['full_date'].dt.dayofweek + 1
dim_date['is_weekend'] = dim_date['day_of_week'].isin([6,7]).astype(int)
dim_date = dim_date[['date_key','full_date','day','month','month_name','quarter','year','week_of_year','day_of_week','is_weekend']]
dim_date.head()

Unnamed: 0,date_key,full_date,day,month,month_name,quarter,year,week_of_year,day_of_week,is_weekend
0,20101201,2010-12-01,1,12,December,4,2010,48,3,0
1,20101202,2010-12-02,2,12,December,4,2010,48,4,0
2,20101203,2010-12-03,3,12,December,4,2010,48,5,0
3,20101204,2010-12-04,4,12,December,4,2010,48,6,1
4,20101205,2010-12-05,5,12,December,4,2010,48,7,1


## DimCustomer

In [6]:
# Build DimGeography (country -> surrogate key); region placeholder
countries = sorted(df['Country'].dropna().unique())
dim_geog = pd.DataFrame({'country': countries})
dim_geog['region'] = None  # can enrich later
dim_geog['geography_key'] = range(1, len(dim_geog)+1)
dim_geog = dim_geog[['geography_key','country','region']]
dim_geog.head()

Unnamed: 0,geography_key,country,region
0,1,Australia,
1,2,Austria,
2,3,Bahrain,
3,4,Belgium,
4,5,Brazil,


## DimProduct (simple category inference)

In [7]:
# Build DimCustomer with surrogate keys + anonymous row (key 0)
customer_ids = df['CustomerID'].dropna().unique()
dim_customer = pd.DataFrame({'customer_id': sorted(customer_ids)})
dim_customer['customer_key'] = range(1, len(dim_customer)+1)
dim_customer['customer_name'] = None
dim_customer['gender'] = None
dim_customer['birth_year'] = None
dim_customer['segment'] = 'Retail'
# map predominant country per customer (mode)
cust_country = (df.dropna(subset=['CustomerID'])
                  .groupby('CustomerID')['Country']
                  .agg(lambda x: x.value_counts().idxmax()))
dim_customer = dim_customer.merge(cust_country.rename('Country'), left_on='customer_id', right_index=True, how='left')
dim_customer = dim_customer.merge(dim_geog[['geography_key','country']], left_on='Country', right_on='country', how='left')
dim_customer['customer_since_date_key'] = None
dim_customer = dim_customer[['customer_key','customer_id','customer_name','gender','birth_year','geography_key','customer_since_date_key','segment']]
# anonymous fallback
anon = pd.DataFrame([{'customer_key':0,'customer_id':None,'customer_name':'Anonymous','gender':None,'birth_year':None,'geography_key':dim_geog.sample(1)['geography_key'].iloc[0],'customer_since_date_key':None,'segment':'Retail'}])
dim_customer = pd.concat([anon, dim_customer], ignore_index=True)
dim_customer.head()

Unnamed: 0,customer_key,customer_id,customer_name,gender,birth_year,geography_key,customer_since_date_key,segment
0,0,,Anonymous,,,25,,Retail
1,1,12346.0,,,,37,,Retail
2,2,12347.0,,,,18,,Retail
3,3,12348.0,,,,13,,Retail
4,4,12349.0,,,,20,,Retail


## Create Tables (DDL)

In [8]:
# Build DimProduct; simple keyword-based category inference placeholder

def infer_category(desc: str) -> str:
    if not isinstance(desc, str): return 'Unknown'
    d = desc.lower()
    if 'mug' in d: return 'Mugs'
    if 'bag' in d: return 'Bags'
    if 'card' in d: return 'Cards'
    if 'candle' in d: return 'Candles'
    return 'Other'

products = df[['StockCode','Description']].drop_duplicates().copy()
products['category'] = products['Description'].apply(infer_category)
products['subcategory'] = None
products['unit_of_measure'] = 'each'
products['first_sale_date_key'] = None
products['is_active'] = 1
products['product_key'] = range(1, len(products)+1)

dim_product = products[['product_key','StockCode','Description','category','subcategory','unit_of_measure','first_sale_date_key','is_active']]
dim_product.rename(columns={'StockCode':'stock_code','Description':'description'}, inplace=True)
dim_product.head()

Unnamed: 0,product_key,stock_code,description,category,subcategory,unit_of_measure,first_sale_date_key,is_active
0,1,85123A,WHITE HANGING HEART T-LIGHT HOLDER,Other,,each,,1
1,2,71053,WHITE METAL LANTERN,Other,,each,,1
2,3,84406B,CREAM CUPID HEARTS COAT HANGER,Other,,each,,1
3,4,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,Other,,each,,1
4,5,84029E,RED WOOLLY HOTTIE WHITE HEART.,Other,,each,,1


## Load Dimensions

In [9]:
# Create tables (simplified DDL version; constraints trimmed for clarity)
conn.executescript(
'''
DROP TABLE IF EXISTS FactSales;
DROP TABLE IF EXISTS DimProduct;
DROP TABLE IF EXISTS DimCustomer;
DROP TABLE IF EXISTS DimGeography;
DROP TABLE IF EXISTS DimDate;
CREATE TABLE DimDate (date_key INTEGER PRIMARY KEY, full_date DATE, day INTEGER, month INTEGER, month_name TEXT, quarter INTEGER, year INTEGER, week_of_year INTEGER, day_of_week INTEGER, is_weekend INTEGER);
CREATE TABLE DimGeography (geography_key INTEGER PRIMARY KEY, country TEXT, region TEXT);
CREATE TABLE DimProduct (product_key INTEGER PRIMARY KEY, stock_code TEXT, description TEXT, category TEXT, subcategory TEXT, unit_of_measure TEXT, first_sale_date_key INTEGER, is_active INTEGER);
CREATE TABLE DimCustomer (customer_key INTEGER PRIMARY KEY, customer_id INTEGER, customer_name TEXT, gender TEXT, birth_year INTEGER, geography_key INTEGER, customer_since_date_key INTEGER, segment TEXT);
CREATE TABLE FactSales (fact_sales_key INTEGER PRIMARY KEY, date_key INTEGER, product_key INTEGER, customer_key INTEGER, geography_key INTEGER, invoice_no TEXT, quantity INTEGER, unit_price NUMERIC, sales_amount NUMERIC, sales_amount_abs NUMERIC, is_return INTEGER, load_timestamp DATETIME DEFAULT (datetime('now')));
'''
)
print('Tables created.')

Tables created.


## Prepare Fact Rows

In [10]:
# Load dimension DataFrames into SQLite
for name, frame in [('DimDate',dim_date), ('DimGeography',dim_geog), ('DimProduct',dim_product), ('DimCustomer',dim_customer)]:
    frame.to_sql(name, conn, if_exists='append', index=False)
    print(name, conn.execute(f'SELECT COUNT(*) FROM {name}').fetchone()[0])

DimDate 374
DimGeography 38
DimProduct 5749
DimCustomer 4373


## Load Fact

In [11]:
# Prepare fact rows: map natural keys to surrogate keys
product_map = dim_product.set_index('stock_code')['product_key'].to_dict()
customer_map = dim_customer.set_index('customer_id')['customer_key'].to_dict()
geog_map = dim_geog.set_index('country')['geography_key'].to_dict()
date_map = dim_date.set_index('full_date')['date_key'].to_dict()

tx = df.copy()
tx['date_key'] = tx['InvoiceDate'].dt.normalize().map(date_map)
tx['product_key'] = tx['StockCode'].map(product_map)
tx['customer_key'] = tx['CustomerID'].map(customer_map).fillna(0).astype(int)
tx['geography_key'] = tx['Country'].map(geog_map)

fact = tx[['date_key','product_key','customer_key','geography_key','InvoiceNo','Quantity','UnitPrice','sales_amount','sales_amount_abs','is_return']].copy()
fact.rename(columns={'InvoiceNo':'invoice_no','Quantity':'quantity','UnitPrice':'unit_price'}, inplace=True)
fact = fact.dropna(subset=['date_key','product_key','geography_key'])
print('Fact rows:', len(fact))

Fact rows: 541909


## Example Queries

In [12]:
# Load fact table
fact.to_sql('FactSales', conn, if_exists='append', index=False)
print('FactSales rows:', conn.execute('SELECT COUNT(*) FROM FactSales').fetchone()[0])

FactSales rows: 541909


## Data Quality Checks

In [13]:
# Example analytics queries
import pandas as pd

q1 = '''
SELECT d.year, d.quarter, p.category, ROUND(SUM(f.sales_amount),2) total_sales
FROM FactSales f
JOIN DimDate d ON f.date_key = d.date_key
JOIN DimProduct p ON f.product_key = p.product_key
WHERE f.is_return = 0
GROUP BY d.year, d.quarter, p.category
ORDER BY d.year, d.quarter, p.category;
'''
print('Sales by category per quarter')
display(pd.read_sql(q1, conn))

q2 = '''
SELECT p.category,
  SUM(CASE WHEN f.is_return=1 THEN -f.quantity ELSE 0 END) units_returned,
  SUM(CASE WHEN f.is_return=0 THEN f.quantity ELSE 0 END) units_sold,
  ROUND(1.0 * SUM(CASE WHEN f.is_return=1 THEN -f.quantity ELSE 0 END) / NULLIF(SUM(CASE WHEN f.is_return=0 THEN f.quantity ELSE 0 END),0),4) return_rate
FROM FactSales f
JOIN DimProduct p ON f.product_key = p.product_key
GROUP BY p.category
ORDER BY return_rate DESC;
'''
print('Return rate by category')
display(pd.read_sql(q2, conn))

q3 = '''
SELECT g.country, d.year, d.quarter, ROUND(SUM(f.sales_amount),2) net_sales
FROM FactSales f
JOIN DimDate d ON f.date_key = d.date_key
JOIN DimGeography g ON f.geography_key = g.geography_key
GROUP BY g.country, d.year, d.quarter
ORDER BY g.country, d.year, d.quarter LIMIT 20;
'''
print('Geography sales sample')
display(pd.read_sql(q3, conn))

Sales by category per quarter


Unnamed: 0,year,quarter,category,total_sales
0,2010,4,Bags,39446.91
1,2010,4,Candles,8867.57
2,2010,4,Cards,17231.98
3,2010,4,Mugs,10387.86
4,2010,4,Other,747811.82
5,2011,1,Bags,154435.56
6,2011,1,Candles,22860.45
7,2011,1,Cards,22284.14
8,2011,1,Mugs,24290.11
9,2011,1,Other,1708765.55


Return rate by category


Unnamed: 0,category,units_returned,units_sold,return_rate
0,Unknown,1871,9981,0.1875
1,Other,465824,4776096,0.0975
2,Mugs,3385,72635,0.0466
3,Bags,10392,543308,0.0191
4,Candles,1453,76854,0.0189
5,Cards,1606,182107,0.0088


Geography sales sample


Unnamed: 0,country,year,quarter,net_sales
0,Australia,2010,4,1005.1
1,Australia,2011,1,40700.47
2,Australia,2011,2,39126.68
3,Australia,2011,3,32288.5
4,Australia,2011,4,23956.52
5,Austria,2010,4,257.04
6,Austria,2011,1,2226.48
7,Austria,2011,2,1906.01
8,Austria,2011,3,2708.03
9,Austria,2011,4,3056.76


# Basic data quality checks (0 issues expected ideally)
checks = {
  'amount_consistency': 'SELECT COUNT(*) c FROM FactSales WHERE sales_amount != quantity * unit_price',
  'null_fks': 'SELECT COUNT(*) c FROM FactSales WHERE date_key IS NULL OR product_key IS NULL OR customer_key IS NULL OR geography_key IS NULL',
  'return_flag': 'SELECT COUNT(*) c FROM FactSales WHERE (quantity < 0 AND is_return = 0) OR (quantity > 0 AND is_return=1)'
}
for name, sql in checks.items():
    print(name, conn.execute(sql).fetchone()[0])