In [None]:
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor




In [None]:
# Load the dataset from disk once. `df` feeds the exploratory cells
# (describe / var / head); `df2` is the frame the model is built from.
df = pd.read_csv('../data/improved_data_52.csv')
# Deep copy instead of a second read of the same CSV: identical contents,
# half the I/O and parsing, and the two frames still cannot affect each other.
df2 = df.copy()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns in `df`.
df.describe()

In [None]:
# Sample variance of the target column (pandas' default ddof=1).
# NOTE(review): presumably shown as a reference scale for the MSE printed later — confirm intent.
df['Weekly_Sales'].var()

In [None]:
# Preview the first five rows to eyeball column names and value formats.
df.head()

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from xgboost import plot_importance

# Name of the column the model is trained to predict.
target = 'Weekly_Sales'

# Separate predictors from the target; `df2` itself is left untouched.
X = df2.drop(columns=[target])
y = df2.loc[:, target]

# Reproducible 80/20 random hold-out split.
# NOTE(review): the rows carry dates (`original_date` is parsed later on), so a
# random split mixes past and future observations — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Column groups consumed by the preprocessing step. No `remainder` is passed to
# the ColumnTransformer, so columns not listed here do not reach the model.
categorical_features = ['Store', 'Dept', 'IsHoliday', 'Type']
numerical_features = [
    'Size', 'Fuel_Price', 'week', 'month', 'day', 'Temperature',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    'CPI', 'Unemployment',
]

# Scale numeric inputs; one-hot encode categoricals. Categories unseen during
# fitting are ignored at predict time rather than raising.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Order matters: numeric columns come first, one-hot columns second. The
# feature-importance table built further down relies on this ordering.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Gradient-boosted regressor with a fixed seed for reproducibility.
model = XGBRegressor(random_state=42, objective='reg:squarederror')

# Preprocessing and model chained so both are fitted on the training rows only.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Fit preprocessing + model on the training rows, then predict the hold-out rows.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Hold-out regression metrics.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report each metric rounded to two decimals.
for metric_label, metric_value in (
    ("Mean Squared Error (MSE)", mse),
    ("Mean Absolute Error (MAE)", mae),
    ("R-squared (R2)", r2),
):
    print(f"{metric_label}: {metric_value:.2f}")

# Hold-out errors (actual minus predicted) and their distribution.
residuals = y_test - y_pred

residual_fig, residual_ax = plt.subplots(figsize=(10, 6))
sns.histplot(residuals, bins=30, kde=True, ax=residual_ax)
residual_ax.set_title('Residual Distribution')
residual_ax.set_xlabel('Residuals')
residual_ax.set_ylabel('Frequency')
plt.show()

# Predictions against actual values; points on the dashed red diagonal are perfect predictions.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_ax.scatter(y_test, y_pred, alpha=0.6)

# Diagonal reference line spanning the observed range of the true values.
true_range = [y_test.min(), y_test.max()]
scatter_ax.plot(true_range, true_range, 'r--', lw=2)

scatter_ax.set_title('Predicted vs True Values')
scatter_ax.set_xlabel('True Values')
scatter_ax.set_ylabel('Predictions')
plt.show()

# Plot the ten most-used features of the fitted booster ('weight' importance).
xgb_model = pipeline.named_steps['model']
if hasattr(xgb_model, 'feature_importances_'):
    # plot_importance creates its own figure when no axes is supplied, which
    # left the previously opened 10x6 figure empty. Passing the axes explicitly
    # draws into the intended figure and avoids the stray blank output.
    importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
    plot_importance(
        xgb_model,
        ax=importance_ax,
        max_num_features=10,
        importance_type='weight',
        title='Top 10 Feature Importances',
    )
    # NOTE(review): the model is fitted on the preprocessor's output, so the
    # bars are presumably labelled f0, f1, ... rather than by column name — verify.
    plt.show()

# Tabulate the model's feature importances by name, hiding the one-hot columns
# of the high-cardinality categoricals. This is best-effort: a failure is
# reported and the notebook carries on.
try:
    # Look the encoder up by its step name ('cat') instead of by position, so
    # reordering the ColumnTransformer steps cannot silently pick the wrong one.
    fitted_preprocessor = pipeline.named_steps['preprocessor']
    ohe_columns = fitted_preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)

    # Same order as the ColumnTransformer output: numeric columns, then one-hot columns.
    all_feature_names = np.concatenate([numerical_features, ohe_columns])

    # One-hot prefixes that are left out of the printed table.
    excluded_prefixes = ('Dept_', 'Store_', 'Type_')
    filtered_feature_names = [name for name in all_feature_names if not name.startswith(excluded_prefixes)]

    xgb_model = pipeline.named_steps['model']
    feature_importances = pd.DataFrame({'Feature': all_feature_names,
                                        'Importance': xgb_model.feature_importances_})

    feature_importances_filtered = (
        feature_importances[feature_importances['Feature'].isin(filtered_feature_names)]
        .sort_values(by='Importance', ascending=False)
    )

    # The heading used to mention only Dept_ although Store_ and Type_ are
    # filtered out as well; it now lists every excluded prefix.
    print("\nFeature Importances (excluding Dept_, Store_, Type_):\n", feature_importances_filtered.head(10))

except Exception as e:
    # Deliberately broad: this table is diagnostic only, so any failure is
    # printed rather than allowed to stop the notebook.
    print("\nFeature importance calculation failed:", e)



In [None]:
# Attach the parsed date, the model's predictions and the actual target to the
# hold-out rows. `assign` builds a new frame rather than writing columns into
# the object returned by train_test_split, which can raise
# SettingWithCopyWarning depending on the pandas / scikit-learn versions.
# `y_pred` is positional (one value per hold-out row); `y_test` aligns on the index.
X_test = X_test.assign(
    original_date=pd.to_datetime(X_test['original_date']),
    Predicted_Weekly_Sales=y_pred,
    True_Weekly_Sales=y_test,
)




In [None]:

# Actual sales summed per date. Only the hold-out rows in `X_test` are included,
# so these totals cover the test split rather than the full dataset.
weekly_sales_true = (
    X_test
    .groupby('original_date')['True_Weekly_Sales']
    .sum()
    .reset_index()
)

true_fig, true_ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    x='original_date',
    y='True_Weekly_Sales',
    data=weekly_sales_true,
    color='b',
    marker='o',
    ax=true_ax,
)
true_ax.set_title('Weekly sales over time', fontsize=16)
true_ax.set_xlabel('Date', fontsize=12)
true_ax.set_ylabel('Total sales', fontsize=12)
plt.xticks(rotation=45)  # tilt the date labels so they do not overlap
plt.tight_layout()
plt.show()


In [None]:
# Predicted sales summed per date. Only the hold-out rows in `X_test` are
# included, so these totals cover the test split rather than the full dataset.
weekly_sales_pred = X_test.groupby('original_date')['Predicted_Weekly_Sales'].sum().reset_index()

plt.figure(figsize=(12, 6))
sns.lineplot(data=weekly_sales_pred, x='original_date', y='Predicted_Weekly_Sales', marker='o', color='b')
# The title previously matched the actual-sales figure word for word, so the two
# plots could not be told apart; it now says that these are predictions.
plt.title('Predicted weekly sales over time', fontsize=16)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Total sales', fontsize=12)
plt.xticks(rotation=45)  # tilt the date labels so they do not overlap
plt.tight_layout()
plt.show()