In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Load the three raw tables (features, sales, stores) from the shared data folder.
df_features, df_sales, df_stores = map(
    pd.read_csv,
    ('../data/features.csv', '../data/sales.csv', '../data/stores.csv'),
)

In [None]:
# Both tables store Date as day/month/year text; convert it to datetime64 in place
# so the two frames can later be joined and indexed on real timestamps.
for frame in (df_features, df_sales):
    frame['Date'] = pd.to_datetime(frame['Date'], format='%d/%m/%Y')


In [None]:
# List the column labels of the features table.
df_features.columns

In [None]:
# Preview the first rows of the features table.
df_features.head()

In [None]:
# List the column labels of the sales table.
df_sales.columns

In [None]:
# Preview the first rows of the sales table.
df_sales.head()

In [None]:
# List the column labels of the stores table.
df_stores.columns

In [None]:
# Preview the first rows of the stores table.
df_stores.head()

In [None]:
# Rename the features-side holiday flag so it does not collide with the
# `IsHoliday` column that is grouped on later in the merged frame.
df_features = df_features.rename(columns={'IsHoliday': 'IsHolidayFeatures'})

# Attach store attributes to every sales row (left join keeps all sales rows).
df_combined = df_sales.merge(df_stores, how='left', on='Store')

# Attach the per-store, per-date features, derive calendar fields from Date,
# then make Date the (sorted) index of the final frame.
df_final = (
    df_combined
    .merge(df_features, how='left', on=['Store', 'Date'])
    .assign(
        week=lambda frame: frame['Date'].dt.isocalendar().week,  # ISO week number
        month=lambda frame: frame['Date'].dt.month,
        day=lambda frame: frame['Date'].dt.day,  # day of the month, not weekday
    )
    .set_index('Date')
    .sort_index()
)


In [None]:

# Sum Weekly_Sales over all rows sharing a date. `Date` is the index of
# df_final, so reset_index() turns it back into a column for seaborn.
weekly_sales = (
    df_final
    .groupby('Date')['Weekly_Sales']
    .sum()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x='Date', y='Weekly_Sales', data=weekly_sales, color='b', marker='o', ax=ax)
ax.set_title('Weekly sales over time', fontsize=16)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Total sales', fontsize=12)
plt.xticks(rotation=45)  # tilt the date labels so they do not overlap
fig.tight_layout()
plt.show()


In [None]:
# Average Weekly_Sales for each ISO week number, pooled over every year in the data.
weekly_sales_by_week = (
    df_final
    .groupby('week')['Weekly_Sales']
    .mean()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x='week', y='Weekly_Sales', data=weekly_sales_by_week, color='b', marker='o', ax=ax)
ax.set_title('Mean weekly sales trend by week of year', fontsize=16)
ax.set_xlabel('week', fontsize=12)
ax.set_ylabel('Mean sales', fontsize=12)
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between sales and the four external indicators.
df_corr = df_final.loc[
    :, ['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
]
correlation_matrix = df_corr.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    vmin=-1,
    vmax=1,  # pin the colour scale to the full correlation range
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
    cbar=True,
    ax=ax,
)
ax.set_title('Correlation Between Environmental Factors and Sales', fontsize=16)
fig.tight_layout()
plt.show()

In [None]:
# Row-wise total of the five markdown columns, stored on df_final.
df_final['Total_Discount'] = df_final[
    ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
].sum(axis=1)

# Mean Weekly_Sales for holiday vs non-holiday rows.
sales_by_holiday = (
    df_final
    .groupby('IsHoliday')['Weekly_Sales']
    .mean()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(10, 6))
# NOTE(review): newer seaborn releases deprecate `palette` without `hue` —
# check the installed version before changing this call.
sns.barplot(x='IsHoliday', y='Weekly_Sales', data=sales_by_holiday, palette='Blues', ax=ax)
ax.set_title('Average Weekly Sales During Holidays vs Non-Holidays', fontsize=16)
ax.set_xlabel('Is Holiday?', fontsize=12)
ax.set_ylabel('Average Weekly Sales ($)', fontsize=12)
fig.tight_layout()
plt.show()


In [None]:
# Classify each row as Weekday/Weekend from the actual weekday of its Date index.
# The `day` column cannot be used for this: it holds the day of the month (1-31),
# so testing it against [6, 7] would tag the 6th and 7th of every month as
# weekends. DatetimeIndex.dayofweek counts Monday=0 ... Sunday=6, so >= 5 means
# Saturday or Sunday.
# NOTE(review): Weekly_Sales looks like one figure per week; if every Date falls
# on the same weekday only a single bar will appear — confirm this comparison
# is meaningful for the dataset.
df_final['day_type'] = np.where(df_final.index.dayofweek >= 5, 'Weekend', 'Weekday')
avg_sales = df_final.groupby('day_type')['Weekly_Sales'].mean()

# Tie colours to labels so a missing category cannot shift the colour mapping.
day_type_colors = {'Weekday': 'blue', 'Weekend': 'orange'}
plt.bar(
    avg_sales.index,
    avg_sales.values,
    color=[day_type_colors[label] for label in avg_sales.index],
)
plt.title('Average Weekly Sales: Weekdays vs. Weekends')
plt.ylabel('Average Weekly Sales')
plt.xlabel('Day Type')
plt.show()
