# Sales Forecasting System - Exploratory Data Analysis (EDA)

This notebook explores the cleaned sales dataset to uncover insights, trends, and relationships between features.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the 'seaborn-v0_8' matplotlib style sheet globally, so every figure
# created after this line picks it up.
# NOTE(review): this style name presumably requires matplotlib >= 3.6 — confirm
# against the environment the notebook is expected to run in.
plt.style.use('seaborn-v0_8')

# The cleaned export would normally be loaded here:
#   df = pd.read_csv('cleaned_sales_data.csv')
# For demonstration, a synthetic daily dataset similar to that output is
# generated instead, so the notebook runs without the CSV.
np.random.seed(42)
dates = pd.date_range('2020-01-01', '2023-12-31', freq='D')
n_days = len(dates)

# Draw order matters: with the global seed fixed, each column's values depend
# on how many random numbers were consumed before it.
sales_draw = np.clip(np.random.normal(5000, 1500, n_days), 0, None)  # floor at 0
quantity_draw = np.random.randint(50, 500, n_days)
price_draw = np.random.uniform(10, 100, n_days)
region_draw = np.random.choice(['North', 'South', 'East', 'West'], n_days)
category_draw = np.random.choice(['Electronics', 'Clothing', 'Food', 'Books'], n_days)

df = pd.DataFrame({
    'date': dates,
    'sales': sales_draw,
    'quantity': quantity_draw,
    'price': price_draw,
    'region': region_draw,
    'product_category': category_draw,
}).assign(
    # Calendar features derived from the date column.
    date=lambda d: pd.to_datetime(d['date']),
    month=lambda d: d['date'].dt.to_period('M').dt.to_timestamp(),  # first day of the month
    dayofweek=lambda d: d['date'].dt.dayofweek,                     # Monday == 0
    weekofyear=lambda d: d['date'].dt.isocalendar().week.astype(int),
)
df.head()

## Summary Statistics & Missingness

In [None]:
# Descriptive statistics for every column (numeric, datetime and categorical),
# with one extra 'missing' row holding the per-column null count so the
# missingness check promised by this section sits next to the summary.
summary_stats = df.describe(include='all')
missing_counts = df.isna().sum().rename('missing').to_frame().T
pd.concat([summary_stats, missing_counts])

In [None]:
# Pairwise scatter plots / histograms of the numeric features on a random subset
# (plotting every row is slow and overplots).
# The sample size is capped at the number of rows so a dataset smaller than 500
# rows does not raise ValueError, and random_state is pinned so re-running this
# cell alone shows the same points.
pairplot_n = min(500, len(df))
pairplot_sample = df[['sales', 'quantity', 'price']].sample(n=pairplot_n, random_state=42)
sns.pairplot(pairplot_sample)
# y slightly above 1 keeps the title clear of the top row of panels.
plt.suptitle('Pairplot of Numerical Features', y=1.02)
plt.show()

## Seasonal Patterns

In [None]:
# Total sales and quantity per calendar month.
monthly = df.groupby('month')[['sales', 'quantity']].sum().reset_index()

# Two stacked panels sharing the month axis: sales on top, quantity below.
fig, ax = plt.subplots(2, 1, figsize=(12, 8), sharex=True)
panel_specs = [
    ('sales', 'Monthly Sales', {}),
    ('quantity', 'Monthly Quantity', {'color': 'orange'}),
]
for panel, (column, panel_title, line_kwargs) in zip(ax, panel_specs):
    panel.plot(monthly['month'], monthly[column], marker='o', **line_kwargs)
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

## Category and Region Insights

In [None]:
def plot_total_sales(frame, group_col, title):
    """Draw a bar chart of total ``sales`` for each value of ``group_col``.

    The totals are computed explicitly with ``groupby().sum()`` and drawn with
    matplotlib, so the chart does not rely on seaborn's ``ci`` argument
    (deprecated in favour of ``errorbar`` in seaborn 0.12) and behaves the same
    on any seaborn version. Bars appear in sorted group order rather than in
    order of first appearance in the data.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing a ``sales`` column and the ``group_col`` column.
    group_col : str
        Name of the categorical column to group by.
    title : str
        Figure title.
    """
    totals = frame.groupby(group_col)['sales'].sum()
    bar_fig, bar_ax = plt.subplots(figsize=(12, 4))
    bar_ax.bar(totals.index.astype(str), totals.to_numpy())
    bar_ax.set_xlabel(group_col)
    bar_ax.set_ylabel('sales')
    bar_ax.set_title(title)
    plt.show()


plot_total_sales(df, 'product_category', 'Total Sales by Product Category')
plot_total_sales(df, 'region', 'Total Sales by Region')

## Correlations

In [None]:
# Pairwise correlation between the three numeric features.
numeric_features = ['sales', 'quantity', 'price']
corr = df[numeric_features].corr()

# Fix the colour scale to the full [-1, 1] range so shades are comparable
# regardless of the values actually observed.
plt.figure(figsize=(5, 4))
sns.heatmap(
    corr,
    annot=True,
    cmap='Blues',
    vmin=-1,
    vmax=1,
)
plt.title('Correlation Heatmap')
plt.show()