In [None]:
import pandas as pd
from faker import Faker
import random
from datetime import datetime, timedelta

# Shared Faker instance; the generator functions below call it for UUIDs,
# names, place names and timestamps.
fake = Faker()

# Function to generate dim_date data for every day of 2024 (January 1 to December 31)
def generate_dim_date():
    """Build the dim_date table with one row per calendar day of 2024.

    Returns:
        pandas.DataFrame with the columns ``date_key`` (``YYYY-MM-DD``
        string), ``date``, ``quarter`` (1-4), ``year``, ``month``, ``day``,
        ``day_of_week`` (``datetime.weekday()``: Monday=0 .. Sunday=6),
        ``day_type`` (``'Weekend'`` or ``'Weekday'``) and three boolean
        flags (``is_holiday``, ``is_campaign_day``, ``is_salary_day``) that
        are assigned at random.
    """
    start_date = datetime(2024, 1, 1)
    end_date = datetime(2024, 12, 31)
    num_days = (end_date - start_date).days + 1  # +1 so end_date is included

    data = []
    for offset in range(num_days):
        date = start_date + timedelta(days=offset)
        weekday = date.weekday()
        data.append({
            'date_key': date.strftime('%Y-%m-%d'),
            'date': date,
            # Derived from the month; a constant 4 would mislabel every
            # date from January through September.
            'quarter': (date.month - 1) // 3 + 1,
            'year': date.year,
            'month': date.month,
            'day': date.day,
            'day_of_week': weekday,
            # weekday() returns 5 for Saturday and 6 for Sunday.
            'day_type': 'Weekend' if weekday >= 5 else 'Weekday',
            'is_holiday': random.choice([True, False]),
            'is_campaign_day': random.choice([True, False]),
            'is_salary_day': random.choice([True, False]),
        })
    return pd.DataFrame(data)

# Function to generate dim_item data
def generate_dim_item(num_records):
    """Build the dim_item table with ``num_records`` synthetic rows.

    Args:
        num_records: number of item rows to generate.

    Returns:
        pandas.DataFrame with one row per generated item. Every identifier
        column is an independent ``fake.uuid4()`` value, text columns come
        from Faker, and numeric and boolean columns come from ``random``.
    """
    def _fake_item():
        # The three timestamps are drawn independently of each other and of
        # is_active, so no ordering or consistency between them is enforced.
        return {
            'sku_key': fake.uuid4(),
            'sku_id': fake.uuid4(),
            'shop_id': fake.uuid4(),
            'listing_id': fake.uuid4(),
            'listing_name': fake.word(),
            'listing_description': fake.sentence(),
            'category_lvl_1': fake.word(),
            'category_lvl_2': fake.word(),
            'model_id': fake.uuid4(),
            'item_id': fake.uuid4(),
            'model_name': fake.word(),
            'item_description': fake.sentence(),
            'weight': random.uniform(0.1, 10.0),
            'length': random.uniform(0.1, 100.0),
            'width': random.uniform(0.1, 100.0),
            'height': random.uniform(0.1, 100.0),
            'item_price': random.uniform(1.0, 1000.0),
            'is_active': random.choice([True, False]),
            'create_time': fake.date_time_this_decade(),
            'banned_time': fake.date_time_this_decade(),
            'last_modified_time': fake.date_time_this_decade(),
            'is_wh': random.choice([True, False]),
        }

    items = [_fake_item() for _ in range(num_records)]
    return pd.DataFrame(items)

# Function to generate dim_warehouse data
def generate_dim_warehouse(num_records):
    """Build the dim_warehouse table with ``num_records`` synthetic rows.

    Args:
        num_records: number of warehouse rows to generate.

    Returns:
        pandas.DataFrame with one row per generated warehouse. ``wh_type``
        is always one of ``'WHA'``, ``'WHB'`` or ``'WHC'``.
    """
    allowed_types = ['WHA', 'WHB', 'WHC']  # the only warehouse types emitted

    def _fake_warehouse():
        return {
            'wh_key': fake.uuid4(),
            'wh_id': fake.uuid4(),
            'wh_name': fake.company(),
            'wh_type': random.choice(allowed_types),
            'wh_region': fake.state(),
            'wh_city': fake.city(),
            'wh_brgy': fake.street_name(),
            'wh_postal_code': fake.postcode(),
            'total_land_area': random.uniform(1000.0, 10000.0),
            # randint is inclusive on both ends: 8 through 24.
            'operating_hours': random.randint(8, 24),
            'is_active': random.choice([True, False]),
        }

    warehouses = [_fake_warehouse() for _ in range(num_records)]
    return pd.DataFrame(warehouses)


# Generate dimension tables
# dim_date takes no size argument (it emits one row per calendar day); the
# item and warehouse dimensions get 100 synthetic rows each. These frames are
# the source of the foreign keys used by the fact tables further down.
date_data = generate_dim_date()
item_data = generate_dim_item(100)
warehouse_data = generate_dim_warehouse(100)

# Function to generate fact_platform_performance_summary data
def generate_fact_platform_performance_summary(num_records, date_keys):
    """Build the fact_platform_performance_summary table.

    Args:
        num_records: number of rows to generate.
        date_keys: sequence of date keys; each row picks one at random, so
            the same key can appear on several rows.

    Returns:
        pandas.DataFrame with a ``date_key`` column followed by one
        ``<window>_<metric>`` column per combination of the windows
        ``l1d``/``l7d``/``l30d`` and the five metrics, each drawn uniformly
        from [0, 100].
    """
    windows = ('l1d', 'l7d', 'l30d')
    metrics = ('ado', 'adgmv', 'avg_active_buyers', 'avg_active_shops', 'otd_time')

    rows = []
    for _ in range(num_records):
        row = {'date_key': random.choice(date_keys)}
        # Metric-major, window-minor: l1d_ado, l7d_ado, l30d_ado, l1d_adgmv, ...
        for metric in metrics:
            for window in windows:
                row[f'{window}_{metric}'] = random.uniform(0, 100)
        rows.append(row)
    return pd.DataFrame(rows)

# Function to generate fact_sku_performance_summary data
def generate_fact_sku_performance_summary(num_records, date_keys, sku_ids):
    """Build the fact_sku_performance_summary table.

    Args:
        num_records: number of rows to generate.
        date_keys: sequence of date keys to sample from.
        sku_ids: sequence of SKU ids to sample from.

    Returns:
        pandas.DataFrame with ``date_key``, ``sku_id`` and the ``ado`` and
        ``adgmv`` metrics over the ``l1d``/``l7d``/``l30d``/``l90d`` windows,
        each drawn uniformly from [0, 100]. The date and SKU are chosen
        independently per row, so pairs may repeat.
    """
    # Same column order as the table layout: all ado windows, then all adgmv.
    metric_columns = [
        f'{window}_{metric}'
        for metric in ('ado', 'adgmv')
        for window in ('l1d', 'l7d', 'l30d', 'l90d')
    ]

    rows = []
    for _ in range(num_records):
        row = {
            'date_key': random.choice(date_keys),
            'sku_id': random.choice(sku_ids),
        }
        for column in metric_columns:
            row[column] = random.uniform(0, 100)
        rows.append(row)
    return pd.DataFrame(rows)

# Function to generate fact_seller_performance_summary data
def generate_fact_seller_performance_summary(num_records, date_keys, shop_ids):
    """Build the fact_seller_performance_summary table.

    Args:
        num_records: number of rows to generate.
        date_keys: sequence of date keys to sample from.
        shop_ids: sequence of shop ids to sample from.

    Returns:
        pandas.DataFrame with ``date_key``, ``shop_id`` and the ``ado`` and
        ``adgmv`` metrics over the ``l1d``/``l7d``/``l30d``/``l90d`` windows,
        each drawn uniformly from [0, 100]. The date and shop are chosen
        independently per row, so pairs may repeat.
    """
    windows = ('l1d', 'l7d', 'l30d', 'l90d')

    def _seller_row():
        row = {
            'date_key': random.choice(date_keys),
            'shop_id': random.choice(shop_ids),
        }
        # All ado windows are drawn before any adgmv window.
        row.update({f'{window}_ado': random.uniform(0, 100) for window in windows})
        row.update({f'{window}_adgmv': random.uniform(0, 100) for window in windows})
        return row

    return pd.DataFrame([_seller_row() for _ in range(num_records)])

# Function to generate fact_warehouse_summary data
def generate_fact_warehouse_summary(num_records, date_keys, dim_warehouse_df):
    """Build the fact_warehouse_summary table with unique (wh_key, date_key) rows.

    Args:
        num_records: number of rows to generate.
        date_keys: sequence of date keys to sample from.
        dim_warehouse_df: warehouse dimension DataFrame; must provide the
            ``wh_type`` and ``wh_key`` columns. Only warehouses whose
            ``wh_type`` is ``'WHA'``, ``'WHB'`` or ``'WHC'`` are used.

    Returns:
        pandas.DataFrame with ``num_records`` rows, no two of which share the
        same (``wh_key``, ``date_key``) pair.

    Raises:
        ValueError: if ``num_records`` is larger than the number of distinct
            (wh_key, date_key) pairs available. Without this check the
            rejection loop below would never terminate.
    """
    allowed_types = ['WHA', 'WHB', 'WHC']
    is_allowed = dim_warehouse_df['wh_type'].isin(allowed_types)
    valid_wh_keys = dim_warehouse_df.loc[is_allowed, 'wh_key'].tolist()

    # Count distinct values: repeated keys in the inputs do not add new pairs.
    max_unique = len(set(valid_wh_keys)) * len(set(date_keys))
    if num_records > max_unique:
        raise ValueError(
            f"Cannot generate {num_records} unique (wh_key, date_key) rows: "
            f"only {max_unique} distinct combinations are available"
        )

    data = []
    existing_combinations = set()
    while len(data) < num_records:
        date_key = random.choice(date_keys)
        wh_key = random.choice(valid_wh_keys)

        combination = (wh_key, date_key)
        if combination in existing_combinations:
            continue  # already emitted this pair; draw again

        existing_combinations.add(combination)
        data.append({
            'date_key': date_key,
            'wh_key': wh_key,
            'total_items': random.randint(0, 1000),
            'total_manhours': random.uniform(0, 1000),
            'total_active_manhours': random.uniform(0, 1000),
            'l1d_ado': random.uniform(0, 100),
            'l7d_ado': random.uniform(0, 100),
            'l30d_ado': random.uniform(0, 100),
            'l1d_adi': random.uniform(0, 100),
            'l7d_adi': random.uniform(0, 100),
            'l30d_adi': random.uniform(0, 100),
            'l1d_prod_rate': random.uniform(0, 100),
            'l7d_prod_rate': random.uniform(0, 100),
            'l30d_prod_rate': random.uniform(0, 100),
            'l1d_idle_rate': random.uniform(0, 100),
            'l7d_idle_rate': random.uniform(0, 100),
            'l30d_idle_rate': random.uniform(0, 100),
        })
    return pd.DataFrame(data)


# Function to generate fact_staff_prod_summary data
def generate_fact_staff_prod_summary(num_records, date_keys, wh_keys):
    """Build the fact_staff_prod_summary table.

    Args:
        num_records: number of rows to generate.
        date_keys: sequence of date keys to sample from.
        wh_keys: sequence of warehouse keys to sample from.

    Returns:
        pandas.DataFrame with one row per generated record. Each row gets a
        fresh ``fake.uuid4()`` as ``staff_key`` plus a randomly chosen date
        and warehouse key.
    """
    def _staff_row():
        return {
            'staff_key': fake.uuid4(),
            'date_key': random.choice(date_keys),
            'wh_key': random.choice(wh_keys),
            'total_items': random.randint(0, 1000),
            'total_manhours': random.uniform(0, 1000),
            'total_active_manhours': random.uniform(0, 1000),
            'prod_rate': random.uniform(0, 100),
            'idle_rate': random.uniform(0, 100),
        }

    staff_rows = [_staff_row() for _ in range(num_records)]
    return pd.DataFrame(staff_rows)

# Retrieve keys for fact table generation
# Each list is pulled from a dimension table so that fact rows only reference
# keys that exist there. Note that shop_ids comes from the item dimension.
date_keys = date_data['date_key'].tolist()
sku_ids = item_data['sku_id'].tolist()
shop_ids = item_data['shop_id'].tolist()
wh_keys = warehouse_data['wh_key'].tolist()

# Generate fact tables
# 1000 rows per fact table. The warehouse summary receives the whole warehouse
# DataFrame (not just wh_keys) because it filters on wh_type itself.
platform_data = generate_fact_platform_performance_summary(1000, date_keys)
sku_data = generate_fact_sku_performance_summary(1000, date_keys, sku_ids)
seller_data = generate_fact_seller_performance_summary(1000, date_keys, shop_ids)
warehouse_data_fact = generate_fact_warehouse_summary(1000, date_keys, warehouse_data)
staff_data = generate_fact_staff_prod_summary(1000, date_keys, wh_keys)

# Save each DataFrame to a CSV file
# Files are written to the current working directory; index=False keeps the
# pandas row index out of the output.
date_data.to_csv('dim_date.csv', index=False)
item_data.to_csv('dim_item.csv', index=False)
warehouse_data.to_csv('dim_warehouse.csv', index=False)
platform_data.to_csv('fact_platform_performance_summary.csv', index=False)
sku_data.to_csv('fact_sku_performance_summary.csv', index=False)
seller_data.to_csv('fact_seller_performance_summary.csv', index=False)
warehouse_data_fact.to_csv('fact_warehouse_summary.csv', index=False)
staff_data.to_csv('fact_staff_prod_summary.csv', index=False)


TypeError: generate_fact_warehouse_summary() missing 1 required positional argument: 'dim_warehouse_df'