# Exploratory Data Analysis - Retail Demand Forecasting

This notebook performs comprehensive EDA on the retail pricing and demand dataset to identify:
1. Main demand drivers (price, promo, holiday, weather, patterns)
2. Price elasticity by SKU and promo effect sizes
3. Stockout detection and impact on demand estimation

---

## 1. Setup and Data Loading

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Silence library warnings so they do not clutter the rendered cell outputs.
warnings.filterwarnings('ignore')

# Notebook-wide figure defaults: theme, colour palette and default canvas size.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

ModuleNotFoundError: No module named 'scipy'

In [None]:
# Load the data
# Candidate files are tried in order of preference: the full dataset first,
# then the sample one directory up, then the sample next to the notebook.
DATA_CANDIDATES = [
    ('../retail_pricing_demand_2024.csv', 'Full'),
    ('../retail_pricing_demand_2024_sample.csv', 'Sample'),
    ('retail_pricing_demand_2024_sample.csv', 'Sample'),
]

df = None
for csv_path, dataset_label in DATA_CANDIDATES:
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        # Only a missing file triggers the fallback; parse errors and other
        # failures surface immediately instead of silently switching datasets.
        continue
    print(f"✓ {dataset_label} dataset loaded: {len(df):,} rows")
    break

if df is None:
    tried = ', '.join(path for path, _ in DATA_CANDIDATES)
    raise FileNotFoundError(f"No dataset found. Tried: {tried}")

# Convert date column
df['date'] = pd.to_datetime(df['date'])

# Display basic information
print(f"\nDate range: {df['date'].min()} to {df['date'].max()}")
print(f"Stores: {df['store_id'].nunique()} unique")
print(f"SKUs: {df['sku_id'].nunique()} unique")
print(f"Store-SKU combinations: {df.groupby(['store_id', 'sku_id']).ngroups}")

# Display first few rows
df.head()

## 2. Data Quality Assessment

In [None]:
# Per-column overview: dtype, populated rows, and how much is missing
print("Data Types and Missing Values:")
print("=" * 60)

# Count missing values once and reuse for both the absolute and relative columns.
missing_counts = df.isnull().sum().values
missing_share = (missing_counts / len(df) * 100).round(2)

info_df = pd.DataFrame({
    'Column': df.columns,
    'Type': df.dtypes.values,
    'Non-Null': df.count().values,
    'Missing': missing_counts,
    'Missing %': missing_share
})

# Shade the 'Missing %' column so sparse columns stand out.
styled_info = info_df.style.background_gradient(subset=['Missing %'], cmap='Reds')
display(styled_info)

In [None]:
# Stockout analysis
# Plots the per-date sum of `stockout_flag` and compares mean units sold
# between rows flagged as stockouts and rows that are not.
if 'stockout_flag' not in df.columns:
    print("No stockout data available in dataset")
else:
    stockout_rate = df['stockout_flag'].mean() * 100

    # Two side-by-side panels: timeline on the left, demand comparison on the right
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    ax_timeline, ax_impact = axes

    # Stockout frequency (sum of the flag per date)
    stockout_counts = df.groupby('date')['stockout_flag'].sum()
    ax_timeline.plot(stockout_counts.index, stockout_counts.values, alpha=0.7)
    ax_timeline.set_title('Stockout Occurrences Over Time')
    ax_timeline.set_xlabel('Date')
    ax_timeline.set_ylabel('Number of Stockouts')
    ax_timeline.grid(True, alpha=0.3)

    # Impact on demand: mean units sold with and without the stockout flag
    normal_demand = df.loc[df['stockout_flag'] == 0, 'units_sold'].mean()
    stockout_demand = df.loc[df['stockout_flag'] == 1, 'units_sold'].mean()

    demand_comparison = pd.DataFrame({
        'Condition': ['Normal', 'Stockout'],
        'Average Demand': [normal_demand, stockout_demand]
    })

    ax_impact.bar(
        demand_comparison['Condition'],
        demand_comparison['Average Demand'],
        color=['green', 'red'],
        alpha=0.7,
    )
    ax_impact.set_title('Demand Impact of Stockouts')
    ax_impact.set_ylabel('Average Units Sold')

    plt.tight_layout()
    plt.show()

    # Relative change in mean units sold on stockout rows vs. normal rows
    stockout_impact_pct = (stockout_demand - normal_demand) / normal_demand * 100

    print("\nStockout Statistics:")
    print(f"  • Stockout Rate: {stockout_rate:.2f}%")
    print(f"  • Total Stockout Days: {df['stockout_flag'].sum()}")
    print(f"  • Average Demand (Normal): {normal_demand:.2f} units")
    print(f"  • Average Demand (Stockout): {stockout_demand:.2f} units")
    print(f"  • Stockout Impact: {stockout_impact_pct:.1f}%")

## 3. Demand Analysis

In [None]:
# Demand distribution analysis
# Four views of `units_sold`: overall histogram, spread per store, spread per
# SKU, and the mean over all rows for each date.
units = df['units_sold']

# Computed once; reused by the histogram legend and the printed summary.
mean_units = units.mean()
median_units = units.median()
std_units = units.std()

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Distribution plot
axes[0, 0].hist(units, bins=50, edgecolor='black', alpha=0.7)
axes[0, 0].axvline(mean_units, color='red', linestyle='--', label=f'Mean: {mean_units:.1f}')
axes[0, 0].axvline(median_units, color='green', linestyle='--', label=f'Median: {median_units:.1f}')
axes[0, 0].set_title('Distribution of Units Sold')
axes[0, 0].set_xlabel('Units Sold')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].legend()

# Box plots by store and by SKU (same layout, different grouping column)
boxplot_specs = [
    (axes[0, 1], 'store_id', 'Demand Distribution by Store', 'Store ID'),
    (axes[1, 0], 'sku_id', 'Demand Distribution by SKU', 'SKU ID'),
]
for box_ax, group_col, panel_title, x_label in boxplot_specs:
    df.boxplot(column='units_sold', by=group_col, ax=box_ax)
    box_ax.set_title(panel_title)
    box_ax.set_xlabel(x_label)
    box_ax.set_ylabel('Units Sold')
    box_ax.tick_params(axis='x', labelrotation=0)

# A grouped pandas boxplot sets a figure-level "Boxplot grouped by <column>"
# suptitle; on this shared 2x2 figure it would label all four panels with the
# last grouping column, so clear it.
fig.suptitle('')

# Time series of average daily demand
daily_demand = df.groupby('date')['units_sold'].mean()
axes[1, 1].plot(daily_demand.index, daily_demand.values, alpha=0.7)
axes[1, 1].set_title('Average Daily Demand Over Time')
axes[1, 1].set_xlabel('Date')
axes[1, 1].set_ylabel('Average Units Sold')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary statistics
print("\nDemand Statistics:")
print(f"  • Mean: {mean_units:.2f} units")
print(f"  • Median: {median_units:.2f} units")
print(f"  • Std Dev: {std_units:.2f} units")
print(f"  • Min: {units.min():.0f} units")
print(f"  • Max: {units.max():.0f} units")
print(f"  • Coefficient of Variation: {(std_units / mean_units):.2f}")

In [None]:
# Store-SKU performance heatmap
# Mean units sold for every (store, SKU) pair, shown as a store x SKU grid.
performance_matrix = df.pivot_table(
    values='units_sold',
    index='store_id',
    columns='sku_id',
    aggfunc='mean'
)

plt.figure(figsize=(10, 6))
sns.heatmap(performance_matrix, annot=True, fmt='.1f', cmap='YlOrRd',
            cbar_kws={'label': 'Average Units Sold'})
plt.title('Store-SKU Performance Matrix\n(Average Daily Units Sold)')
plt.xlabel('SKU ID')
plt.ylabel('Store ID')
plt.show()

# Top and bottom performers
# `all_combos` is sorted ascending, so the weakest pairs come first and the
# strongest pairs come last.
all_combos = df.groupby(['store_id', 'sku_id'])['units_sold'].mean().sort_values()

print("\nTop 3 Performers:")
# Reverse the tail so the best pair is listed first, matching the bottom list,
# which starts with the worst pair.
for (store, sku), demand in all_combos.tail(3).iloc[::-1].items():
    print(f"  • {store}-{sku}: {demand:.2f} units/day")

print("\nBottom 3 Performers:")
for (store, sku), demand in all_combos.head(3).items():
    print(f"  • {store}-{sku}: {demand:.2f} units/day")

## 4. Price Analysis

In [None]:
# Price analysis
# Compares base / final / competitor prices, derives discount and
# price-vs-competitor columns on `df`, and plots final price against demand.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Price distributions
price_data = pd.DataFrame({
    'Base Price': df['base_price'],
    'Final Price': df['final_price'],
    'Competitor Price': df['competitor_price']
})

price_data.boxplot(ax=axes[0, 0])
axes[0, 0].set_title('Price Distributions Comparison')
axes[0, 0].set_ylabel('Price ($)')
axes[0, 0].grid(True, alpha=0.3)

# Discount distribution
df['discount'] = df['base_price'] - df['final_price']
df['discount_pct'] = (df['discount'] / df['base_price'] * 100)
discounted = df[df['discount'] > 0]  # rows sold below base price

axes[0, 1].hist(discounted['discount_pct'], bins=30, edgecolor='black', alpha=0.7)
axes[0, 1].set_title('Distribution of Discount Percentages\n(When Discounted)')
axes[0, 1].set_xlabel('Discount %')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].grid(True, alpha=0.3)

# Price competitiveness
# A zero competitor price yields +/-inf; store it as NaN so the histogram range
# and the mean ratio below stay finite.
df['price_vs_comp'] = (df['final_price'] / df['competitor_price']).replace([np.inf, -np.inf], np.nan)
axes[1, 0].hist(df['price_vs_comp'].dropna(), bins=50, edgecolor='black', alpha=0.7)
axes[1, 0].axvline(1.0, color='red', linestyle='--', label='Price Parity')
axes[1, 0].set_title('Price Competitiveness\n(Final Price / Competitor Price)')
axes[1, 0].set_xlabel('Price Ratio')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Scatter: Price vs Demand
axes[1, 1].scatter(df['final_price'], df['units_sold'], alpha=0.5)
axes[1, 1].set_title('Price vs Demand Relationship')
axes[1, 1].set_xlabel('Final Price ($)')
axes[1, 1].set_ylabel('Units Sold')
axes[1, 1].grid(True, alpha=0.3)

# Add trendline
# Fit on complete rows only: a single NaN would make np.polyfit fail or return
# NaN coefficients.
trend_points = df[['final_price', 'units_sold']].dropna()
z = np.polyfit(trend_points['final_price'], trend_points['units_sold'], 1)
p = np.poly1d(z)
sorted_prices = trend_points['final_price'].sort_values()
axes[1, 1].plot(sorted_prices, p(sorted_prices),
               "r--", alpha=0.8, label=f'Trend: y = {z[0]:.2f}x + {z[1]:.2f}')
axes[1, 1].legend()

plt.tight_layout()
plt.show()

# Price statistics
# Each discount line reports the dollar and percent figures over the same set
# of rows, so the two numbers on a line are comparable.
print("\nPrice Statistics:")
print(f"  • Base Price Mean: ${df['base_price'].mean():.2f}")
print(f"  • Final Price Mean: ${df['final_price'].mean():.2f}")
print(f"  • Average Discount (All Rows): ${df['discount'].mean():.2f} ({df['discount_pct'].mean():.1f}%)")
print(f"  • Average Discount (When Discounted): ${discounted['discount'].mean():.2f} ({discounted['discount_pct'].mean():.1f}%)")
print(f"  • Competitor Price Mean: ${df['competitor_price'].mean():.2f}")
print(f"  • Undercut Rate: {(df['final_price'] < df['competitor_price']).mean() * 100:.1f}%")
print(f"  • Average Price Ratio (vs Competitor): {df['price_vs_comp'].mean():.3f}")

## 5. Price Elasticity Analysis

In [None]:
# Calculate price elasticity by SKU
# For each SKU, regress log(units_sold) on log(final_price); the slope is the
# elasticity estimate. Every SKU is fitted; only the first few fits are plotted.
# NOTE(review): rows flagged as stockouts are not excluded here — confirm
# whether censored-demand rows should be dropped before fitting.
MAX_ELASTICITY_PLOTS = 3   # number of panels in the figure
MIN_OBSERVATIONS = 10      # a SKU needs more than this many usable rows

elasticities = []
plotted = 0

fig, axes = plt.subplots(1, MAX_ELASTICITY_PLOTS, figsize=(15, 4))

for sku in df['sku_id'].unique():
    # Remove zero-sales rows: log(0) is undefined
    sku_data = df[(df['sku_id'] == sku) & (df['units_sold'] > 0)]
    if len(sku_data) <= MIN_OBSERVATIONS:
        continue

    # Log-log regression for elasticity
    log_price = np.log(sku_data['final_price'])
    log_demand = np.log(sku_data['units_sold'])

    # Remove infinite / NaN values (e.g. from non-positive or missing prices)
    mask = np.isfinite(log_price) & np.isfinite(log_demand)
    if mask.sum() <= MIN_OBSERVATIONS:
        continue
    # A SKU whose price never changes has no slope to estimate.
    if log_price[mask].nunique() < 2:
        continue

    slope, intercept, r_value, p_value, std_err = stats.linregress(
        log_price[mask], log_demand[mask]
    )

    elasticities.append({
        'SKU': sku,
        'Elasticity': slope,
        'R_squared': r_value**2,
        'P_value': p_value,
        'Interpretation': 'Elastic' if abs(slope) > 1 else 'Inelastic'
    })

    # Plot only the first few fitted SKUs; the table below covers all of them
    if plotted < MAX_ELASTICITY_PLOTS:
        panel = axes[plotted]
        panel.scatter(log_price[mask], log_demand[mask], alpha=0.5)
        panel.plot(log_price[mask], slope * log_price[mask] + intercept,
                   'r-', label=f'Elasticity: {slope:.2f}')
        panel.set_title(f'{sku} Price Elasticity')
        panel.set_xlabel('Log(Price)')
        panel.set_ylabel('Log(Demand)')
        panel.legend()
        panel.grid(True, alpha=0.3)
        plotted += 1

# Hide panels that received no fit so the figure has no empty axes
for unused_panel in axes[plotted:]:
    unused_panel.set_visible(False)

plt.tight_layout()
plt.show()

# Display elasticity results
elasticity_df = pd.DataFrame(elasticities)
print("\nPrice Elasticity by SKU:")

if len(elasticity_df) > 0:
    display(elasticity_df.style.background_gradient(subset=['Elasticity'], cmap='RdYlGn_r'))

    avg_elasticity = elasticity_df['Elasticity'].mean()
    # A negative elasticity means demand falls as price rises; report the
    # direction that matches the sign instead of assuming a decrease.
    demand_direction = 'decrease' if avg_elasticity < 0 else 'increase'
    print(f"\n📈 Average Price Elasticity: {avg_elasticity:.3f}")
    print(f"   • Interpretation: A 1% price increase leads to {abs(avg_elasticity):.2f}% demand {demand_direction}")
else:
    # An empty frame has no 'Elasticity' column, so styling it would raise.
    print("  No SKU had enough usable observations to estimate elasticity")
## 6. Promotion Effect Analysis

In [None]:
# Promotion analysis
# Share of rows on promotion per date, mean demand with and without a promo,
# and (when `promo_depth` is present) the depth distribution and its relation
# to units sold.
has_promo_depth = 'promo_depth' in df.columns
on_promo = df['promo_flag'] == 1
off_promo = df['promo_flag'] == 0

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax_freq, ax_compare = axes[0, 0], axes[0, 1]
ax_depth, ax_lift = axes[1, 0], axes[1, 1]

# Promotion frequency over time
promo_daily = df.groupby('date')['promo_flag'].mean() * 100
ax_freq.plot(promo_daily.index, promo_daily.values, alpha=0.7)
ax_freq.set_title('Promotion Frequency Over Time')
ax_freq.set_xlabel('Date')
ax_freq.set_ylabel('% of Products on Promotion')
ax_freq.grid(True, alpha=0.3)

# Demand comparison: Promo vs No Promo
promo_comparison = df.groupby('promo_flag')['units_sold'].mean()
ax_compare.bar(['No Promo', 'Promo'], promo_comparison.values,
               color=['blue', 'orange'], alpha=0.7)
ax_compare.set_title('Average Demand: Promotion Impact')
ax_compare.set_ylabel('Average Units Sold')

if has_promo_depth:
    # Promotion depth distribution (promo rows only, scaled by 100 for the % axis)
    promo_depths = df.loc[on_promo, 'promo_depth'] * 100
    ax_depth.hist(promo_depths, bins=20, edgecolor='black', alpha=0.7)
    ax_depth.set_title('Distribution of Promotion Depths')
    ax_depth.set_xlabel('Discount %')
    ax_depth.set_ylabel('Frequency')
    ax_depth.grid(True, alpha=0.3)

    # Relationship between discount depth and demand lift
    promo_data = df[on_promo].copy()
    if len(promo_data) > 10:
        ax_lift.scatter(promo_data['promo_depth'] * 100, promo_data['units_sold'], alpha=0.5)
        ax_lift.set_title('Discount Depth vs Demand')
        ax_lift.set_xlabel('Discount %')
        ax_lift.set_ylabel('Units Sold')
        ax_lift.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Promotion statistics
promo_rate = df['promo_flag'].mean() * 100
no_promo_demand = df.loc[off_promo, 'units_sold'].mean()
promo_demand = df.loc[on_promo, 'units_sold'].mean()
# Relative change in mean units sold on promo rows vs. non-promo rows
promo_lift = (promo_demand - no_promo_demand) / no_promo_demand * 100

print("\n🎯 Promotion Analysis:")
print(f"  • Promotion Rate: {promo_rate:.1f}%")
print(f"  • Average Demand (No Promo): {no_promo_demand:.2f} units")
print(f"  • Average Demand (With Promo): {promo_demand:.2f} units")
print(f"  • Promotion Lift: {promo_lift:.1f}%")

if has_promo_depth:
    avg_discount = df.loc[on_promo, 'promo_depth'].mean() * 100
    print(f"  • Average Discount Depth: {avg_discount:.1f}%")

## 7. Temporal Patterns Analysis

In [None]:
# Temporal patterns
# Derives calendar features from `date` and compares mean demand by day of
# week, week of year, month, and weekday vs. weekend.
df['day_of_week'] = df['date'].dt.dayofweek
df['day_name'] = df['date'].dt.day_name()
df['month'] = df['date'].dt.month
# The weekly panel needs `week_of_year`; derive it from the date when the
# loaded file does not already provide it, like the other calendar features.
if 'week_of_year' not in df.columns:
    df['week_of_year'] = df['date'].dt.isocalendar().week.astype(int)

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Day of week patterns
dow_demand = df.groupby('day_name')['units_sold'].mean()
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dow_demand = dow_demand.reindex(day_order)

axes[0, 0].bar(range(7), dow_demand.values, tick_label=day_order, alpha=0.7)
axes[0, 0].set_title('Average Demand by Day of Week')
axes[0, 0].set_ylabel('Average Units Sold')
# `tick_label` already sets the day names; only the rotation is needed here.
axes[0, 0].tick_params(axis='x', labelrotation=45)
axes[0, 0].grid(True, alpha=0.3, axis='y')

# Weekly patterns
weekly_demand = df.groupby('week_of_year')['units_sold'].mean()
axes[0, 1].plot(weekly_demand.index, weekly_demand.values, alpha=0.7)
axes[0, 1].set_title('Average Demand by Week of Year')
axes[0, 1].set_xlabel('Week of Year')
axes[0, 1].set_ylabel('Average Units Sold')
axes[0, 1].grid(True, alpha=0.3)

# Monthly patterns
monthly_demand = df.groupby('month')['units_sold'].mean()
axes[1, 0].bar(monthly_demand.index, monthly_demand.values, alpha=0.7)
axes[1, 0].set_title('Average Demand by Month')
axes[1, 0].set_xlabel('Month')
axes[1, 0].set_ylabel('Average Units Sold')
axes[1, 0].set_xticks(range(1, 13))
axes[1, 0].grid(True, alpha=0.3, axis='y')

# Weekend vs Weekday (dayofweek 5 and 6 are Saturday and Sunday)
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
# Reindex so both categories exist (NaN when absent); the two-bar chart and the
# [0]/[1] lookups below then work even for data covering only weekdays.
weekend_comparison = df.groupby('is_weekend')['units_sold'].mean().reindex([0, 1])
axes[1, 1].bar(['Weekday', 'Weekend'], weekend_comparison.values,
              color=['gray', 'green'], alpha=0.7)
axes[1, 1].set_title('Weekday vs Weekend Demand')
axes[1, 1].set_ylabel('Average Units Sold')

plt.tight_layout()
plt.show()

# Temporal statistics
weekend_diff_pct = (weekend_comparison[1] - weekend_comparison[0]) / weekend_comparison[0] * 100

print("\nTemporal Patterns:")
print(f"  • Peak Day: {dow_demand.idxmax()} ({dow_demand.max():.2f} units)")
print(f"  • Low Day: {dow_demand.idxmin()} ({dow_demand.min():.2f} units)")
print(f"  • Weekend vs Weekday: {weekend_diff_pct:.1f}% difference")
print(f"  • Peak Week: Week {weekly_demand.idxmax()} ({weekly_demand.max():.2f} units)")
print(f"  • Weekly Volatility (CV): {(weekly_demand.std() / weekly_demand.mean()):.2f}")

## 8. Holiday and Weather Impact

In [None]:
# Holiday and weather analysis
# Compares mean demand on holiday vs. regular rows, and relates
# `weather_index` to units sold (scatter, linear trend, three segments).
WEATHER_BINS = [0, 0.33, 0.66, 1.0]
WEATHER_LABELS = ['Poor', 'Fair', 'Good']

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Holiday impact
holiday_comparison = df.groupby('holiday_flag')['units_sold'].mean()
axes[0, 0].bar(['Regular Day', 'Holiday'], holiday_comparison.values,
              color=['blue', 'red'], alpha=0.7)
axes[0, 0].set_title('Holiday Impact on Demand')
axes[0, 0].set_ylabel('Average Units Sold')

# Weather distribution
axes[0, 1].hist(df['weather_index'], bins=30, edgecolor='black', alpha=0.7)
axes[0, 1].set_title('Weather Index Distribution')
axes[0, 1].set_xlabel('Weather Index (0-1)')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].grid(True, alpha=0.3)

# Weather vs Demand
axes[1, 0].scatter(df['weather_index'], df['units_sold'], alpha=0.3)
axes[1, 0].set_title('Weather Impact on Demand')
axes[1, 0].set_xlabel('Weather Index')
axes[1, 0].set_ylabel('Units Sold')

# Add trendline
# Fit on complete rows only: a single NaN would make np.polyfit fail or return
# NaN coefficients.
trend_points = df[['weather_index', 'units_sold']].dropna()
z = np.polyfit(trend_points['weather_index'], trend_points['units_sold'], 1)
p = np.poly1d(z)
sorted_weather = trend_points['weather_index'].sort_values()
axes[1, 0].plot(sorted_weather, p(sorted_weather),
               "r--", alpha=0.8, label=f'Trend: y = {z[0]:.2f}x + {z[1]:.2f}')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Weather segments analysis
# pd.cut intervals are left-open by default, so without include_lowest a
# weather_index of exactly 0 would land in no segment (NaN).
df['weather_segment'] = pd.cut(df['weather_index'],
                               bins=WEATHER_BINS,
                               labels=WEATHER_LABELS,
                               include_lowest=True)
# observed=False keeps all three segments in the result (NaN when empty), so
# the values always line up with the three bar labels regardless of the pandas
# default for categorical groupers.
weather_demand = df.groupby('weather_segment', observed=False)['units_sold'].mean()
axes[1, 1].bar(WEATHER_LABELS, weather_demand.values,
              color=['gray', 'yellow', 'green'], alpha=0.7)
axes[1, 1].set_title('Demand by Weather Condition')
axes[1, 1].set_ylabel('Average Units Sold')

plt.tight_layout()
plt.show()

# Holiday and weather statistics
holiday_rate = df['holiday_flag'].mean() * 100
no_holiday_demand = df[df['holiday_flag'] == 0]['units_sold'].mean()
holiday_demand = df[df['holiday_flag'] == 1]['units_sold'].mean()
holiday_lift = (holiday_demand - no_holiday_demand) / no_holiday_demand * 100

weather_corr = df['weather_index'].corr(df['units_sold'])

print("\nHoliday Impact:")
print(f"  • Holiday Rate: {holiday_rate:.1f}%")
print(f"  • Average Demand (Regular): {no_holiday_demand:.2f} units")
print(f"  • Average Demand (Holiday): {holiday_demand:.2f} units")
print(f"  • Holiday Lift: {holiday_lift:.1f}%")

print("\nWeather Impact:")
print(f"  • Weather Index Mean: {df['weather_index'].mean():.3f}")
print(f"  • Correlation with Demand: {weather_corr:.3f}")
for condition in WEATHER_LABELS:
    # Skip segments with no rows instead of printing "nan units"
    if condition in weather_demand.index and pd.notna(weather_demand[condition]):
        print(f"  • {condition} Weather: {weather_demand[condition]:.2f} units")

## 9. Correlation Analysis

In [None]:
# Correlation analysis
# Pairwise correlations between demand and its candidate drivers, drawn as a
# lower-triangle heatmap and then listed against `units_sold`.
num_cols = ['units_sold', 'final_price', 'base_price', 'competitor_price',
            'promo_flag', 'promo_depth', 'holiday_flag', 'weather_index']

# Keep only the columns this dataset actually has
available_cols = list(filter(lambda col: col in df.columns, num_cols))
corr_matrix = df[available_cols].corr()

# Hide the upper triangle (diagonal included): it mirrors the lower one
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

heatmap_options = dict(
    annot=True, fmt='.2f', cmap='coolwarm',
    center=0, vmin=-1, vmax=1, mask=mask,
    square=True, linewidths=1, cbar_kws={"shrink": 0.8},
)
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix', fontsize=16)
plt.show()

# Correlations with units_sold, strongest positive first
corr_with_demand = corr_matrix['units_sold'].sort_values(ascending=False)
print("\nCorrelations with Units Sold:")
for feature, corr in corr_with_demand.items():
    if feature == 'units_sold':
        continue  # self-correlation carries no information

    # Bucket by absolute size: > 0.5 strong, > 0.3 moderate, otherwise weak
    if abs(corr) > 0.5:
        strength = 'Strong'
    elif abs(corr) > 0.3:
        strength = 'Moderate'
    else:
        strength = 'Weak'

    if corr > 0:
        direction = 'Positive'
    else:
        direction = 'Negative'

    print(f"  • {feature:20s}: {corr:7.3f} ({strength} {direction})")

## 10. Key Insights and Recommendations

In [None]:
# Generate key insights
# Turns the metrics computed by the earlier cells (`elasticity_df`,
# `promo_lift`, `holiday_lift`, `weather_corr`) into a numbered summary,
# followed by a fixed list of modeling recommendations.
# Cutoff above which a coefficient of variation is reported as high.
# NOTE(review): 0.2 is the value the weekly check already used; confirm it is
# also the right cutoff for store-level variation.
CV_THRESHOLD = 0.2

print("=" * 80)
print("KEY INSIGHTS SUMMARY")
print("=" * 80)

# Findings are collected without numbers and numbered at the end, so skipping
# an optional finding never leaves a gap in the sequence.
findings = []

# Price elasticity insight
if len(elasticity_df) > 0:
    avg_elasticity = elasticity_df['Elasticity'].mean()
    findings.append(f"Average price elasticity is {avg_elasticity:.2f}, indicating {'elastic' if abs(avg_elasticity) > 1 else 'inelastic'} demand")
else:
    findings.append("Price elasticity analysis requires more data points")

# Promotion effectiveness (wording follows the sign of the measured lift)
promo_verb = 'increase' if promo_lift >= 0 else 'decrease'
findings.append(f"Promotions {promo_verb} demand by {abs(promo_lift):.1f}% on average")

# Holiday impact (wording follows the sign of the measured lift)
holiday_verb = 'boost' if holiday_lift >= 0 else 'reduce'
findings.append(f"Holidays {holiday_verb} demand by {abs(holiday_lift):.1f}%")

# Weather correlation
if abs(weather_corr) > 0.1:
    findings.append(f"Weather has {'positive' if weather_corr > 0 else 'negative'} impact on demand (correlation: {weather_corr:.3f})")
else:
    findings.append(f"Weather shows minimal impact on demand (correlation: {weather_corr:.3f})")

# Price competitiveness
undercut_rate = (df['final_price'] < df['competitor_price']).mean() * 100
findings.append(f"Products are priced below competitors {undercut_rate:.1f}% of the time")

# Stockout impact
if 'stockout_flag' in df.columns:
    stockout_rate = df['stockout_flag'].mean() * 100
    if stockout_rate > 0:
        findings.append(f"Stockouts occur {stockout_rate:.1f}% of the time, significantly impacting demand estimation")

# Store variation (only called "high" when the CV actually exceeds the cutoff)
store_demand = df.groupby('store_id')['units_sold'].mean()
store_cv = store_demand.std() / store_demand.mean()
if store_cv > CV_THRESHOLD:
    findings.append(f"High variation across stores (CV: {store_cv:.2f}), suggesting location-specific strategies needed")
else:
    findings.append(f"Limited variation across stores (CV: {store_cv:.2f})")

# Weekly patterns
weekly_demand = df.groupby('week_of_year')['units_sold'].mean()
weekly_cv = weekly_demand.std() / weekly_demand.mean()
if weekly_cv > CV_THRESHOLD:
    findings.append(f"Strong weekly seasonality detected (CV: {weekly_cv:.2f})")

insights = [f"{number}. {text}" for number, text in enumerate(findings, start=1)]

for insight in insights:
    print(f"\n{insight}")

print("\n" + "=" * 80)
print("RECOMMENDATIONS FOR MODELING")
print("=" * 80)

recommendations = [
    "1. Include lagged demand features (7-day, 14-day) to capture autocorrelation",
    "2. Create price elasticity features that vary by SKU",
    "3. Engineer interaction terms between price and promotions",
    "4. Account for stockouts using censored demand estimation",
    "5. Use store-specific fixed effects or hierarchical modeling",
    "6. Include competitor price gap as a key feature",
    "7. Model holiday and weather effects with appropriate transformations",
    "8. Consider separate models for each SKU given different elasticities",
    "9. Implement ensemble methods to capture non-linear relationships",
    "10. Use time-based validation to avoid data leakage"
]

for rec in recommendations:
    print(f"\n{rec}")

print("\n" + "=" * 80)
print("EDA COMPLETE - Ready for modeling phase")
print("=" * 80)

## 11. Export Results

In [None]:
# Save key results to CSV for later use
# Relies on `promo_lift`, `holiday_lift`, `elasticity_df`, `weather_corr` and
# `undercut_rate` computed by the earlier cells.
# Metrics that could not be computed are written as NaN (an empty CSV field)
# rather than a placeholder number, so downstream consumers cannot mistake a
# made-up value for a measured one.
avg_elasticity_value = elasticity_df['Elasticity'].mean() if len(elasticity_df) > 0 else np.nan
stockout_rate_value = df['stockout_flag'].mean() * 100 if 'stockout_flag' in df.columns else np.nan

results = {
    'metric': [
        'avg_demand',
        'demand_std',
        'promo_lift_pct',
        'holiday_lift_pct',
        'avg_elasticity',
        'weather_correlation',
        'stockout_rate',
        'undercut_rate'
    ],
    'value': [
        df['units_sold'].mean(),
        df['units_sold'].std(),
        promo_lift,
        holiday_lift,
        avg_elasticity_value,
        weather_corr,
        stockout_rate_value,
        undercut_rate
    ]
}

results_df = pd.DataFrame(results)
results_df.to_csv('eda_results.csv', index=False)

print("EDA results saved to 'eda_results.csv'")
print("\nSummary of Key Metrics:")
display(results_df.round(2))