In [None]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import logging

# Make the directory two levels above the current working directory
# importable, so the project-level imports below can be resolved.
# NOTE(review): this depends on the kernel's working directory being the
# notebook's own folder — confirm when running from elsewhere.
project_root = Path.cwd().parent.parent
if not any(entry == str(project_root) for entry in sys.path):
    # Prepend so the project's modules win over same-named installed packages.
    sys.path.insert(0, str(project_root))

from config import DATA_DIR_BRONZE, DATA_DIR_SILVER, LISTINGS_BRONZE, LISTINGS_SILVER
from elferspot_listings.data_processing import process_bronze_to_silver
from elferspot_listings.utils.helpers import setup_logging, load_data

# Setup logging
# `setup_logging` is a project helper imported from
# `elferspot_listings.utils.helpers`; the object it returns is bound to
# `logger` and reused by the later cells for info/warning messages.
# NOTE(review): presumably a standard `logging.Logger` — confirm in helpers.
logger = setup_logging(level='INFO')
logger.info("Bronze to Silver transformation initialized")

## Step 1: Load Bronze Data

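Load the most recent bronze data file: the last `listings_bronze*.xlsx` file in the bronze data directory when sorted by file name. If no such file exists, the path configured as `LISTINGS_BRONZE` is used instead.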

In [None]:
# Collect every bronze export and order them by file name; the last entry is
# treated as the most recent one.
# NOTE(review): this relies on the file names sorting chronologically
# (e.g. a timestamp suffix) — confirm against the bronze writer.
bronze_files = sorted(DATA_DIR_BRONZE.glob("listings_bronze*.xlsx"))

if not bronze_files:
    # Nothing matched the pattern: fall back to the path defined in config.
    bronze_path = LISTINGS_BRONZE
    logger.warning(f"No bronze files found, using: {bronze_path}")
else:
    bronze_path = bronze_files[-1]
    logger.info(f"Using bronze file: {bronze_path.name}")

# Fail early with a clear message instead of letting the loader error out.
if not bronze_path.exists():
    raise FileNotFoundError(f"Bronze file not found: {bronze_path}")

# Load data and report its basic dimensions.
df_bronze = load_data(bronze_path)
for summary_line in (
    f"✓ Loaded {len(df_bronze):,} rows from bronze",
    f"  Columns: {len(df_bronze.columns)}",
    f"  File: {bronze_path.name}",
):
    print(summary_line)

df_bronze.head()

## Step 2: Initial Data Quality Check

In [None]:
# Quick health check of the raw bronze frame before it is transformed.
print("=== Data Quality Report ===")
print(f"\nShape: {df_bronze.shape}")

# Guard the lookup: a bronze export without a 'URL' column would otherwise
# abort the whole report with a KeyError.
if 'URL' in df_bronze.columns:
    print(f"\nDuplicates (by URL): {df_bronze['URL'].duplicated().sum()}")
else:
    print("\nDuplicates (by URL): n/a ('URL' column not present)")

# Count nulls once and reuse the result for both the filter and the display;
# only columns that actually contain missing values are listed.
missing_counts = df_bronze.isnull().sum()
print("\nMissing Values:")
print(missing_counts[missing_counts > 0])

# How many columns there are of each dtype.
print("\nData Types:")
print(df_bronze.dtypes.value_counts())

## Step 3: Run Bronze to Silver Transformation

Use the automated pipeline to clean and standardize the data.

In [None]:
# Define output path: the timestamp suffix gives each run its own file.
# Create the target directory first so the write cannot fail on a fresh
# checkout (harmless if the pipeline or config already created it).
DATA_DIR_SILVER.mkdir(parents=True, exist_ok=True)
silver_path = DATA_DIR_SILVER / f"listings_silver_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.xlsx"

# Run transformation: the project pipeline reads `bronze_path`, is given
# `silver_path` as its output location, and returns the silver DataFrame.
df_silver = process_bronze_to_silver(
    bronze_path=bronze_path,
    silver_path=silver_path,
    drop_shop_links=True
)

# Summarise how many rows the transformation removed. The percentage is
# guarded so an empty bronze frame reports 0.0% instead of raising
# ZeroDivisionError.
n_bronze = len(df_bronze)
n_silver = len(df_silver)
pct_removed = (1 - n_silver / n_bronze) * 100 if n_bronze else 0.0

print("\n✓ Transformation complete!")
print(f"  Bronze rows: {n_bronze:,}")
print(f"  Silver rows: {n_silver:,}")
print(f"  Rows removed: {n_bronze - n_silver:,} ({pct_removed:.1f}%)")

## Step 4: Review Transformed Data

In [None]:
# Display sample
print("=== Silver Data Sample ===")
display(df_silver.head(10))

# Key columns check: report how well populated each expected column is.
# Columns that are not present in the silver frame are skipped silently.
print("\n=== Key Columns ===")
key_cols = ['Model', 'Series', 'Mileage_km', 'price_in_eur', 'Year of construction']
n_rows = len(df_silver)
for col in key_cols:
    if col in df_silver.columns:
        # Count non-null values once and reuse the figure for the percentage.
        non_null = df_silver[col].notna().sum()
        # An empty silver frame reports 0.0% instead of raising
        # ZeroDivisionError.
        coverage = non_null / n_rows * 100 if n_rows else 0.0
        print(f"{col}: {non_null} non-null ({coverage:.1f}%)")

## Step 5: Data Quality Validation

In [None]:
# Mileage: summary statistics plus the observed min/max range.
# The section header is printed even when the column is absent.
print("=== Mileage Statistics ===")
if 'Mileage_km' in df_silver.columns:
    mileage = df_silver['Mileage_km']
    print(mileage.describe())
    mileage_min, mileage_max = mileage.min(), mileage.max()
    print(f"\nMileage range: {mileage_min:,.0f} - {mileage_max:,.0f} km")

# Price: the same checks for the EUR price column.
print("\n=== Price Statistics (EUR) ===")
if 'price_in_eur' in df_silver.columns:
    price = df_silver['price_in_eur']
    print(price.describe())
    price_min, price_max = price.min(), price.max()
    print(f"\nPrice range: €{price_min:,.0f} - €{price_max:,.0f}")

# Derived features: show the value distribution of each flag column that
# exists in the silver frame.
print("\n=== Derived Features ===")
derived_features = ['owners_known', 'is_fully_restored', 'Paint-to-Sample (PTS)']
for feat in (name for name in derived_features if name in df_silver.columns):
    value_distribution = df_silver[feat].value_counts().to_dict()
    print(f"{feat}: {value_distribution}")

## Step 6: Visualize Data Distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def _plot_histogram(ax, values, title, xlabel):
    """Draw a 50-bin histogram of the non-null entries of ``values`` on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        values: pandas Series to plot; nulls are dropped first.
        title: Title shown above the panel.
        xlabel: Label for the x axis (the y axis is always 'Frequency').
    """
    ax.hist(values.dropna(), bins=50, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')


# 2x2 overview of the silver data. A panel whose column is missing is hidden
# rather than left as an empty, unlabeled frame.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Price distribution
if 'price_in_eur' in df_silver.columns:
    _plot_histogram(axes[0, 0], df_silver['price_in_eur'],
                    'Price Distribution (EUR)', 'Price (EUR)')
else:
    axes[0, 0].set_visible(False)

# Mileage distribution
if 'Mileage_km' in df_silver.columns:
    _plot_histogram(axes[0, 1], df_silver['Mileage_km'],
                    'Mileage Distribution', 'Mileage (km)')
else:
    axes[0, 1].set_visible(False)

# Model distribution
if 'Model' in df_silver.columns:
    model_counts = df_silver['Model'].value_counts().head(10)
    axes[1, 0].barh(model_counts.index, model_counts.values)
    # barh draws the first entry at the bottom; flip the axis so the most
    # frequent model is at the top of the "Top 10" list.
    axes[1, 0].invert_yaxis()
    axes[1, 0].set_title('Top 10 Models')
    axes[1, 0].set_xlabel('Count')
else:
    axes[1, 0].set_visible(False)

# Year distribution (listing counts ordered by year value)
if 'Year of construction' in df_silver.columns:
    year_counts = df_silver['Year of construction'].value_counts().sort_index()
    axes[1, 1].plot(year_counts.index, year_counts.values)
    axes[1, 1].set_title('Listings by Year')
    axes[1, 1].set_xlabel('Year')
    axes[1, 1].set_ylabel('Count')
else:
    axes[1, 1].set_visible(False)

plt.tight_layout()
plt.show()

## Summary

✓ **Bronze to Silver transformation complete**

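- **Input:** the bronze file selected in Step 1 (`bronze_path.name`) — the last `listings_bronze*.xlsx` by file name, or the configured `LISTINGS_BRONZE` fallback
- **Output:** the timestamped `listings_silver_<YYYYMMDD_HHMMSS>.xlsx` file written in Step 3 (`silver_path.name`)
- **Rows processed:** bronze → silver row counts as printed at the end of Step 3 (`len(df_bronze)` → `len(df_silver)`)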
- **Data quality:** Cleaned, standardized, and validated

**Next Step:** Run `03_silver_to_gold.ipynb` to apply feature engineering and prepare for modeling.