In [None]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import logging

# Put the project root on the import path so the project-local imports
# that follow in this cell can be resolved. The root is taken to be two
# directory levels above the current working directory.
project_root = Path.cwd().parent.parent
if sys.path.count(str(project_root)) == 0:
    sys.path.insert(0, str(project_root))

from config import DATA_DIR_SILVER, DATA_DIR_GOLD, LISTINGS_SILVER, LISTINGS_GOLD
from elferspot_listings.data_processing import process_silver_to_gold
from elferspot_listings.utils.helpers import setup_logging, load_data

# Create the notebook-wide logger at INFO level via the project helper,
# then emit a first message so the log shows where this run starts.
logger = setup_logging(level="INFO")
logger.info('Silver to Gold transformation initialized')

## Step 1: Load Silver Data

In [None]:
# Collect every silver export in the silver directory. The list is sorted by
# file name and the last entry is used (presumably the newest export, if the
# names carry a sortable timestamp - TODO confirm the naming scheme).
silver_files = sorted(DATA_DIR_SILVER.glob("listings_silver*.xlsx"))

if not silver_files:
    # Nothing matched the pattern: fall back to the path from the config module.
    silver_path = LISTINGS_SILVER
    logger.warning(f"No silver files found, using: {silver_path}")
else:
    silver_path = silver_files[-1]
    logger.info(f"Using silver file: {silver_path.name}")

# Fail early with a clear message instead of letting the loader error out.
if not silver_path.exists():
    raise FileNotFoundError(f"Silver file not found: {silver_path}")

df_silver = load_data(silver_path)
print(
    f"✓ Loaded {len(df_silver):,} rows from silver",
    f"  Columns: {len(df_silver.columns)}",
    f"  File: {silver_path.name}",
    sep="\n",
)

# Last expression of the cell: rich preview of the first rows.
df_silver.head()

## Step 2: Analyze Data Distribution

Check for outliers and data quality before transformation.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Two panels: log-price histogram (left) and price-vs-mileage scatter (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Everything price-related, including the outlier summary printed at the end,
# is only possible when the price column exists. Remember that here so the
# summary does not fail with a NameError when the column is absent.
has_silver_price = 'price_in_eur' in df_silver.columns
outliers_count = None

if has_silver_price:
    prices = df_silver['price_in_eur'].dropna()
    # log() is only defined for strictly positive values. Zero or negative
    # prices would become -inf/NaN and corrupt the mean, the standard
    # deviation and the histogram range, so they are excluded here.
    prices = prices[prices > 0]
    log_prices = np.log(prices)
    mean_log = log_prices.mean()
    std_log = log_prices.std()

    # ±3 standard deviations around the mean of log(price).
    lower_bound = mean_log - 3 * std_log
    upper_bound = mean_log + 3 * std_log

    axes[0].hist(log_prices, bins=50, edgecolor='black', alpha=0.7)
    axes[0].axvline(lower_bound, color='r', linestyle='--', label='-3σ')
    axes[0].axvline(upper_bound, color='r', linestyle='--', label='+3σ')
    axes[0].set_title('Log(Price) Distribution')
    axes[0].set_xlabel('Log(Price EUR)')
    axes[0].set_ylabel('Frequency')
    axes[0].legend()

    # Price vs Mileage scatter (both axes on a log scale)
    if 'Mileage_km' in df_silver.columns:
        axes[1].scatter(df_silver['Mileage_km'], df_silver['price_in_eur'], alpha=0.5)
        axes[1].set_title('Price vs Mileage')
        axes[1].set_xlabel('Mileage (km)')
        axes[1].set_ylabel('Price (EUR)')
        axes[1].set_xscale('log')
        axes[1].set_yscale('log')

    # Count rows whose log(price) lies outside the ±3σ band.
    outliers_count = int(((log_prices < lower_bound) | (log_prices > upper_bound)).sum())

plt.tight_layout()
plt.show()

# Outlier summary; the percentage is relative to the positive, non-null prices.
if has_silver_price:
    # Guard the division: there may be no usable prices at all.
    outliers_pct = outliers_count / len(prices) * 100 if len(prices) else 0.0
    print(f"\nPotential outliers (±3σ): {outliers_count} ({outliers_pct:.1f}%)")
else:
    print("\nColumn 'price_in_eur' not found - outlier check skipped")

## Step 3: Run Silver to Gold Transformation

Apply feature engineering and outlier removal.

In [None]:
# Output file name carries a second-resolution timestamp, so each run gets
# its own file name.
gold_path = DATA_DIR_GOLD / f"listings_gold_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
# Make sure the target directory exists before anything is written there.
# NOTE(review): harmless if process_silver_to_gold already creates it - confirm.
gold_path.parent.mkdir(parents=True, exist_ok=True)

# Run the project transformation: reads the silver file, writes the gold file
# and returns the gold frame. Price outliers beyond 3 standard deviations are
# requested to be removed (exact semantics live in process_silver_to_gold).
df_gold = process_silver_to_gold(
    silver_path=silver_path,
    gold_path=gold_path,
    remove_price_outliers=True,
    outlier_std=3.0
)

# Row accounting. The percentage is guarded so that an empty silver frame
# does not raise ZeroDivisionError.
rows_removed = len(df_silver) - len(df_gold)
removed_pct = rows_removed / len(df_silver) * 100 if len(df_silver) else 0.0

print("\n✓ Transformation complete!")
print(f"  Silver rows: {len(df_silver):,}")
print(f"  Gold rows: {len(df_gold):,}")
print(f"  Rows removed: {rows_removed:,} ({removed_pct:.1f}%)")

## Step 4: Review Engineered Features

In [None]:
# Columns worth eyeballing after the transformation; only those that are
# actually present in the gold frame are shown.
feature_cols = [
    'Model', 'model_category', 'Mileage_km', 'log_mileage', 'Mileage_sq',
    'price_in_eur', 'log_price', 'listing_score',
]
available_cols = [name for name in feature_cols if name in df_gold.columns]

print("=== Gold Data Sample ===")
display(df_gold[available_cols].head(10))

# Summary statistics for each engineered feature that exists in the frame.
print("\n=== Engineered Feature Statistics ===")
for col in ('log_mileage', 'Mileage_sq', 'log_price', 'listing_score'):
    if col not in df_gold.columns:
        continue
    print(f"\n{col}:")
    print(df_gold[col].describe())

## Step 5: Model Category Analysis

In [None]:
# Per-category statistics and charts. Runs only when the gold frame has a
# 'model_category' column.
if 'model_category' in df_gold.columns:
    print("=== Model Category Distribution ===")

    # Aggregate only the columns that exist. Asking groupby().agg() for an
    # absent column raises KeyError, and the other cells of this notebook
    # already treat these columns as optional.
    agg_candidates = {
        'price_in_eur': ['count', 'mean', 'median', 'std'],
        'Mileage_km': 'mean',
        'listing_score': 'mean',
    }
    agg_spec = {name: funcs for name, funcs in agg_candidates.items()
                if name in df_gold.columns}
    has_price = 'price_in_eur' in agg_spec

    if agg_spec:
        category_stats = df_gold.groupby('model_category').agg(agg_spec).round(2)
        if has_price:
            # Largest categories first (by number of priced listings).
            display(category_stats.sort_values(('price_in_eur', 'count'), ascending=False))
        else:
            display(category_stats)

    # Visualize
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: number of listings per category
    cat_counts = df_gold['model_category'].value_counts()
    axes[0].barh(cat_counts.index, cat_counts.values)
    axes[0].set_title('Listings by Model Category')
    axes[0].set_xlabel('Count')

    # Right panel: average price per category (needs the price column)
    if has_price:
        cat_prices = df_gold.groupby('model_category')['price_in_eur'].mean().sort_values()
        axes[1].barh(cat_prices.index, cat_prices.values)
        axes[1].set_title('Average Price by Category')
        axes[1].set_xlabel('Average Price (EUR)')
    else:
        axes[1].set_title('Average Price by Category (price_in_eur unavailable)')

    plt.tight_layout()
    plt.show()

## Step 6: Listing Score Analysis

In [None]:
# Distribution of the listing score and its relation to price. Runs only
# when the gold frame has a 'listing_score' column.
if 'listing_score' in df_gold.columns:
    print("=== Listing Quality Score Analysis ===")
    print(df_gold['listing_score'].describe())

    # Decide once whether price-based output is possible. The scatter plot
    # and the correlation both need 'price_in_eur', so the check must happen
    # before the scatter, not only before the correlation.
    has_price = 'price_in_eur' in df_gold.columns

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: score distribution
    axes[0].hist(df_gold['listing_score'], bins=30, edgecolor='black')
    axes[0].set_title('Listing Score Distribution')
    axes[0].set_xlabel('Score')
    axes[0].set_ylabel('Frequency')

    # Right panel: score vs price (log-scaled price axis)
    if has_price:
        axes[1].scatter(df_gold['listing_score'], df_gold['price_in_eur'], alpha=0.5)
        axes[1].set_title('Listing Score vs Price')
        axes[1].set_xlabel('Listing Score')
        axes[1].set_ylabel('Price (EUR)')
        axes[1].set_yscale('log')
    else:
        axes[1].set_title('Listing Score vs Price (price_in_eur unavailable)')

    plt.tight_layout()
    plt.show()

    # Correlation between score and price
    if has_price:
        corr = df_gold[['listing_score', 'price_in_eur']].corr().iloc[0, 1]
        print(f"\nCorrelation (listing_score vs price): {corr:.3f}")

## Step 7: Feature Correlation Analysis

In [None]:
# Names of all numeric columns in the gold frame.
numeric_features = list(df_gold.select_dtypes(include=[np.number]).columns)

# Keep the heatmap readable: restrict it to a fixed shortlist of features,
# and of those only the ones that are present and numeric.
key_features = [
    name
    for name in (
        'price_in_eur', 'log_price', 'Mileage_km', 'log_mileage',
        'Mileage_sq', 'listing_score', 'Year of construction',
    )
    if name in numeric_features
]

# A correlation matrix needs at least two features to be informative.
if len(key_features) >= 2:
    plt.figure(figsize=(10, 8))
    corr_matrix = df_gold[key_features].corr()
    sns.heatmap(
        corr_matrix,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
    )
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.show()

## Step 8: Final Data Validation

In [None]:
# Final sanity report on the gold frame: shape, missing values in the key
# columns, and value ranges for price and mileage.
print("=== Final Gold Data Validation ===")
print(f"\nShape: {df_gold.shape}")
print("\nMissing values in key columns:")
key_cols = ['Model', 'price_in_eur', 'Mileage_km', 'log_price', 'log_mileage']
n_rows = len(df_gold)
absent_cols = []
for col in key_cols:
    if col not in df_gold.columns:
        # Report absent columns explicitly instead of skipping them silently.
        absent_cols.append(col)
        print(f"  {col}: column not present")
        continue
    missing = df_gold[col].isnull().sum()
    # Guard the percentage so an empty frame does not raise ZeroDivisionError.
    missing_pct = missing / n_rows * 100 if n_rows else 0.0
    print(f"  {col}: {missing} ({missing_pct:.1f}%)")

# Value ranges, printed only for columns that exist (same guard as the loop above).
if 'price_in_eur' in df_gold.columns:
    print(f"\nPrice range: €{df_gold['price_in_eur'].min():,.0f} - €{df_gold['price_in_eur'].max():,.0f}")
if 'Mileage_km' in df_gold.columns:
    print(f"Mileage range: {df_gold['Mileage_km'].min():,.0f} - {df_gold['Mileage_km'].max():,.0f} km")

# Only announce readiness when every key column exists and there is data.
if absent_cols:
    print(f"\n⚠ Validation incomplete - missing key columns: {', '.join(absent_cols)}")
elif n_rows == 0:
    print("\n⚠ Validation incomplete - gold data contains no rows")
else:
    print("\nData is ready for modeling!")

## Summary

✓ **Silver to Gold transformation complete**

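- **Input:** the last `listings_silver*.xlsx` file (by name) found in Step 1, or the configured `LISTINGS_SILVER` path as a fallback
- **Output:** the timestamped `listings_gold_<YYYYMMDD_HHMMSS>.xlsx` file written in Step 3
- **Rows processed:** silver → gold row counts are printed at the end of Step 3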
- **Features engineered:** log transforms, quality scores, model categories
- **Data quality:** Outliers removed, features validated

**Next Step:** Run notebooks in `04_modeling/` to train price prediction models.