In [2]:
"""
Generate sample daily spending data and save as .parquet file
Simulates real-world financial transaction data from banks/platforms like Shopee, Grab, PayNow
- Generates 2000 transactions per person (RECORDS_PER_PERSON), with dates spread over 3 years
- All data stored in single consolidated parquet file (industry standard)
- Maintains realistic data inconsistencies (multiple date formats, currency formats)
"""

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime, timedelta
import random
import os

# Configuration
# Person profiles: one synthetic account holder per entry. Declared first so
# the person count can be derived from it instead of being maintained by hand.
PERSONS = ['John Tan', 'Mary Lim', 'David Wong']

# Derived from PERSONS so the reported totals cannot drift from the list the
# generation loop actually iterates over.
NUM_PERSONS = len(PERSONS)
RECORDS_PER_PERSON = 2000  # transactions generated for each person
START_DATE = datetime(2022, 1, 1)  # earliest possible transaction date
END_DATE = datetime(2024, 12, 31)  # latest possible transaction date
# Whole days between the two bounds; used as the inclusive upper limit of the
# random day offset that is added to START_DATE.
TOTAL_DAYS = (END_DATE - START_DATE).days

# Transaction categories with realistic spending patterns.
# One row per category:
#   (name, locations, descriptions, (min_amount, max_amount), frequency_weight)
# A higher frequency_weight means the category is drawn more often.
_CATEGORY_ROWS = (
    (
        'Food',
        ['Hawker Centre', 'Food Court', 'Restaurant ABC', "McDonald's", 'KFC', 'Starbucks', 'Toast Box'],
        ['Chicken Rice', 'Lunch', 'Dinner', 'Breakfast', 'Coffee', 'Snacks'],
        (3.0, 80.0),
        30,
    ),
    (
        'Transport',
        ['MRT Station', 'Grab', 'Gojek', 'ComfortDelGro', 'Bus Interchange'],
        ['Train Fare', 'Taxi to office', 'Bus fare', 'Airport transfer', 'Shopping trip'],
        (1.5, 45.0),
        25,
    ),
    (
        'Groceries',
        ['NTUC FairPrice', 'Cold Storage', 'Giant', 'Sheng Siong', 'Shopee Mart'],
        ['Weekly Groceries', 'Fresh produce', 'Household items', 'Snacks shopping'],
        (30.0, 200.0),
        10,
    ),
    (
        'Entertainment',
        ['Cinema', 'Netflix', 'Spotify', 'Disney+', 'Steam', 'Nintendo eShop'],
        ['Movie Tickets', 'Streaming subscription', 'Gaming', 'Concert tickets'],
        (10.0, 120.0),
        8,
    ),
    (
        'Shopping',
        ['Shopee', 'Lazada', 'Qoo10', 'Zalora', 'Taobao', 'Amazon'],
        ['Online shopping', 'Clothing', 'Electronics', 'Home decor', 'Books'],
        (15.0, 500.0),
        12,
    ),
    (
        'Utilities',
        ['SP Services', 'PUB', 'Singtel', 'Starhub', 'M1'],
        ['Electricity Bill', 'Water Bill', 'Internet Bill', 'Mobile Bill'],
        (40.0, 180.0),
        3,
    ),
    (
        'Healthcare',
        ['Clinic', 'Hospital', 'Guardian', 'Watsons', 'Dental Clinic'],
        ['Medical checkup', 'Medicine', 'Dental visit', 'Health supplements'],
        (20.0, 300.0),
        5,
    ),
    (
        'Education',
        ['Coursera', 'Udemy', 'Bookstore', 'Library Fine', 'Workshop'],
        ['Online course', 'Books', 'Training', 'Certification'],
        (15.0, 250.0),
        4,
    ),
    (
        'Fitness',
        ['Gym', 'ActiveSG', 'Yoga Studio', 'Swimming Complex'],
        ['Gym membership', 'Fitness class', 'Sports equipment', 'Pool entry'],
        (5.0, 150.0),
        3,
    ),
)

# Expand the rows into the nested mapping the rest of the script reads,
# keeping the row order as the dictionary order.
CATEGORIES = {
    name: {
        'locations': locations,
        'descriptions': descriptions,
        'amount_range': amount_range,
        'frequency_weight': weight,
    }
    for name, locations, descriptions, amount_range, weight in _CATEGORY_ROWS
}

# Payment channels a generated transaction can be tagged with.
PAYMENT_METHODS = [
    'Cash',
    'Credit Card',
    'Debit Card',
    'PayNow',
    'GrabPay',
    'Mobile Payment',
    'Bank Transfer',
    'Apple Pay',
    'Google Pay',
    'EZ-Link',
]

# Date format variations (simulating inconsistent data sources)
def _strftime_renderer(pattern):
    """Return a one-argument callable that formats a date using *pattern*."""
    def render(d):
        return d.strftime(pattern)
    return render


# Each entry takes a date/datetime and returns it as text in one layout.
DATE_FORMATS = [
    _strftime_renderer('%d/%m/%Y'),  # 15/10/2024
    _strftime_renderer('%d-%b-%Y'),  # 16-Oct-2024
    _strftime_renderer('%Y-%m-%d'),  # 2024-10-16
    _strftime_renderer('%d/%m/%y'),  # 17/10/24
]

# Amount format variations (simulating different data sources)
def format_amount(amount):
    """Render *amount* as text in one randomly chosen currency layout.

    The number is always shown with two decimal places; only the decoration
    around it varies: '$' prefix, bare number, ' SGD' suffix or 'SGD ' prefix.
    """
    number = f'{amount:.2f}'
    layouts = (
        '$' + number,      # $12.50
        number,            # 12.50
        number + ' SGD',   # 12.50 SGD
        'SGD ' + number,   # SGD 12.50
    )
    return random.choice(layouts)

# Build the sampling pool: every category name appears frequency_weight times,
# so a uniform pick from this list follows the configured weights.
weighted_categories = [
    name
    for name, spec in CATEGORIES.items()
    for _ in range(spec['frequency_weight'])
]

# Announce the run configuration before any data is generated.
print("=" * 80)
print("GENERATING FINANCIAL TRANSACTION DATA")
print("=" * 80)
print("📊 Configuration:")
print(f"   - Number of persons: {NUM_PERSONS}")
print(f"   - Records per person: {RECORDS_PER_PERSON}")
print(f"   - Date range: {START_DATE.date()} to {END_DATE.date()}")
print(f"   - Total records: {NUM_PERSONS * RECORDS_PER_PERSON}")
# Derive the span from the configured dates instead of hard-coding it, so the
# banner stays truthful when START_DATE/END_DATE change. 365.25 is the average
# year length, and ':g' drops a trailing ".0" (the current range prints "3").
print(f"   - Time span: {round(TOTAL_DAYS / 365.25, 1):g} years")
print("\n🔄 Generating transactions...\n")

# Generate all transactions
def _draw_transaction(owner):
    """Fabricate one transaction record for *owner*.

    The random draws are made in a fixed order: day offset, category,
    location, description, amount, payment method, date layout and finally
    the amount layout (inside format_amount).
    """
    # randint is inclusive on both ends, so END_DATE itself can be produced.
    day_offset = random.randint(0, TOTAL_DAYS)
    when = START_DATE + timedelta(days=day_offset)

    # The category decides which merchants, descriptions and price band apply.
    kind = random.choice(weighted_categories)
    spec = CATEGORIES[kind]

    place = random.choice(spec['locations'])
    note = random.choice(spec['descriptions'])
    low, high = spec['amount_range']
    value = round(random.uniform(low, high), 2)
    paid_with = random.choice(PAYMENT_METHODS)

    # Date and amount are stored as text in a randomly chosen layout.
    render_date = random.choice(DATE_FORMATS)
    return {
        'person_name': owner,
        'spending_date': render_date(when),
        'category': kind,
        'amount': format_amount(value),
        'location': place,
        'description': note,
        'payment_method': paid_with,
    }


all_transactions = []

for person in PERSONS:
    print(f"   Generating {RECORDS_PER_PERSON} transactions for {person}...")
    all_transactions.extend(
        _draw_transaction(person) for _ in range(RECORDS_PER_PERSON)
    )

# Create DataFrame
df = pd.DataFrame(all_transactions)

# Order rows by person, then by the raw date text. The dates are mixed-format
# strings, so this is a lexical ordering rather than a chronological one
# (left that way on purpose to mimic messy source data).
sort_keys = ['person_name', 'spending_date']
df = df.sort_values(by=sort_keys)
df = df.reset_index(drop=True)

# Display sample
banner = "=" * 80
sample_text = df.head(10).to_string(index=False)
print("\n" + banner)
print("SAMPLE DATA (First 10 rows)")
print(banner)
print(sample_text)
print("\n")

# Save as single consolidated .parquet file (industry standard)
# The path is relative, so it is resolved against the current working directory.
output_file = '../data/daily_spending_sample.parquet'
# Create the parent directory of output_file itself instead of repeating the
# directory literal, so editing the path above cannot leave the two out of
# sync. A bare filename has an empty dirname, hence the '.' fallback.
os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
df.to_parquet(output_file, engine='pyarrow', compression='snappy', index=False)

print("=" * 80)
print("FILE SAVED")
print("=" * 80)
print(f"✅ Parquet file created: {output_file}")
# Report the on-disk size in both KB and MB.
file_size_bytes = os.path.getsize(output_file)
file_size_kb = file_size_bytes / 1024
file_size_mb = file_size_kb / 1024
print(f"📊 File size: {file_size_mb:.2f} MB ({file_size_kb:.2f} KB)")
print(f"📁 Location: {os.path.abspath(output_file)}")

# Verify the file by reading it back
df_verify = pd.read_parquet(output_file)
rule = "=" * 80


def _print_counts(counts):
    """Print one '- label: N transactions' line per entry of *counts*."""
    for label, n in counts.items():
        print(f"   - {label}: {n:,} transactions")


print("\n" + rule)
print("VERIFICATION: Data Statistics")
print(rule)
print(f"✓ Total rows: {len(df_verify):,}")
print(f"✓ Columns: {df_verify.columns.tolist()}")

# Row count per configured person, listed in PERSONS order.
print("\n📈 Records per person:")
for person in PERSONS:
    is_person = df_verify['person_name'] == person
    print(f"   - {person}: {int(is_person.sum()):,} transactions")

# Five most frequent categories.
print("\n📊 Category distribution (top 5):")
category_counts = df_verify['category'].value_counts().head(5)
_print_counts(category_counts)

# Five most frequent payment methods.
print("\n💳 Payment method distribution (top 5):")
payment_counts = df_verify['payment_method'].value_counts().head(5)
_print_counts(payment_counts)

print("\n" + rule)
print("✅ Data generation complete! Ready for ETL pipeline processing.")
print(rule)

GENERATING FINANCIAL TRANSACTION DATA
📊 Configuration:
   - Number of persons: 3
   - Records per person: 2000
   - Date range: 2022-01-01 to 2024-12-31
   - Total records: 6000
   - Time span: 3 years

🔄 Generating transactions...

   Generating 2000 transactions for John Tan...
   Generating 2000 transactions for Mary Lim...
   Generating 2000 transactions for David Wong...

SAMPLE DATA (First 10 rows)
person_name spending_date   category     amount        location     description payment_method
 David Wong   01-Apr-2022  Groceries     155.66     Shopee Mart Household items     Debit Card
 David Wong   01-Apr-2023  Transport     $40.10     MRT Station   Shopping trip Mobile Payment
 David Wong   01-Apr-2023   Shopping 333.95 SGD          Zalora Online shopping      Apple Pay
 David Wong   01-Apr-2023       Food  SGD 17.51  Restaurant ABC          Dinner    Credit Card
 David Wong   01-Aug-2023  Transport  SGD 26.22 Bus Interchange  Taxi to office        EZ-Link
 David Wong   01-Aug-2