In [1]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
from datetime import datetime
import os
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle
import warnings
warnings.filterwarnings('ignore')

# Set Up Logging
Set up logging to track the progress and save logs to a file with a timestamp.

In [2]:
# Set up logging
logging.basicConfig(
    filename=f'sales_forecast_{datetime.now().strftime("%Y-%m-%d")}.log',
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('sales_forecast')
logger.info("Starting sales forecasting analysis for Rossmann Pharmaceuticals")

# Load and Explore Data
Load the datasets (store.csv, train.csv, test.csv) and display their shapes and first few rows.

In [None]:
# Load the data
logger.info("Loading data files")
store_data = pd.read_csv('store.csv')
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Display basic information about the datasets
logger.info("Displaying basic information about the datasets")
print("Store Data Shape:", store_data.shape)
print("Train Data Shape:", train_data.shape)
print("Test Data Shape:", test_data.shape)

# Display the first few rows of each dataset
print("\nStore Data Preview:")
print(store_data.head())

print("\nTrain Data Preview:")
print(train_data.head())

print("\nTest Data Preview:")
print(test_data.head())

Store Data Shape: (1115, 10)
Train Data Shape: (1017209, 9)
Test Data Shape: (41088, 8)

Store Data Preview:
   Store StoreType Assortment  CompetitionDistance  CompetitionOpenSinceMonth  \
0      1         c          a               1270.0                        9.0   
1      2         a          a                570.0                       11.0   
2      3         a          a              14130.0                       12.0   
3      4         c          c                620.0                        9.0   
4      5         a          a              29910.0                        4.0   

   CompetitionOpenSinceYear  Promo2  Promo2SinceWeek  Promo2SinceYear  \
0                    2008.0       0              NaN              NaN   
1                    2007.0       1             13.0           2010.0   
2                    2006.0       1             14.0           2011.0   
3                    2009.0       0              NaN              NaN   
4                    2015.0       0    

# Check for Missing Values
Check for missing values in the datasets and log the results.

In [None]:
# Check for missing values
logger.info("Checking for missing values")
print("\nMissing values in Store Data:")
print(store_data.isnull().sum())

print("\nMissing values in Train Data:")
print(train_data.isnull().sum())

print("\nMissing values in Test Data:")
print(test_data.isnull().sum())


Missing values in Store Data:
Store                          0
StoreType                      0
Assortment                     0
CompetitionDistance            3
CompetitionOpenSinceMonth    354
CompetitionOpenSinceYear     354
Promo2                         0
Promo2SinceWeek              544
Promo2SinceYear              544
PromoInterval                544
dtype: int64

Missing values in Train Data:
Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64

Missing values in Test Data:
Id                0
Store             0
DayOfWeek         0
Date              0
Open             11
Promo             0
StateHoliday      0
SchoolHoliday     0
dtype: int64


# Data Preprocessing
Merge train and store data, convert date columns, and extract date-related features such as year, month, and day.

In [None]:
# Data preprocessing
logger.info("Starting data preprocessing")

# Merge train data with store data
train_store = pd.merge(train_data, store_data, on='Store', how='left')
print("\nMerged Train and Store Data Shape:", train_store.shape)

# Convert Date column to datetime
train_store['Date'] = pd.to_datetime(train_store['Date'])

# Extract date features
logger.info("Extracting date features")
train_store['Year'] = train_store['Date'].dt.year
train_store['Month'] = train_store['Date'].dt.month
train_store['Day'] = train_store['Date'].dt.day
train_store['DayOfWeek'] = train_store['Date'].dt.dayofweek
train_store['WeekOfYear'] = train_store['Date'].dt.isocalendar().week


Merged Train and Store Data Shape: (1017209, 18)


# Exploratory Data Analysis (EDA)
Perform EDA by plotting sales distribution, average sales by day of the week, and other visualizations to understand the data.

In [None]:
# EDA - Exploratory Data Analysis
logger.info("Starting Exploratory Data Analysis")

# Plot sales distribution
plt.figure(figsize=(12, 6))
sns.histplot(train_store['Sales'], kde=True)
plt.title('Sales Distribution')
plt.xlabel('Sales')
plt.ylabel('Frequency')
plt.savefig('sales_distribution.png')
plt.close()

# Plot average sales by day of week
plt.figure(figsize=(10, 6))
avg_sales_by_dow = train_store.groupby('DayOfWeek')['Sales'].mean().reset_index()
sns.barplot(x='DayOfWeek', y='Sales', data=avg_sales_by_dow)
plt.title('Average Sales by Day of Week')
plt.xlabel('Day of Week (0=Monday, 6=Sunday)')
plt.ylabel('Average Sales')
plt.savefig('avg_sales_by_dow.png')
plt.close()

# Feature Engineering
Create new features such as weekend flag, month part categories, and handle missing values in the dataset.

In [None]:
# Create weekend flag
train_store['IsWeekend'] = train_store['DayOfWeek'].apply(lambda x: 1 if x >= 5 else 0)

# Create month part categories (beginning, middle, end)
train_store['MonthPart'] = train_store['Day'].apply(
    lambda x: 'Beginning' if x <= 10 else ('Middle' if x <= 20 else 'End')
)

# Handle missing values
logger.info("Handling missing values")
train_store['CompetitionDistance'].fillna(train_store['CompetitionDistance'].max() * 2, inplace=True)
train_store['CompetitionOpenSinceMonth'].fillna(train_store['CompetitionOpenSinceMonth'].median(), inplace=True)
train_store['CompetitionOpenSinceYear'].fillna(train_store['CompetitionOpenSinceYear'].median(), inplace=True)
train_store['Promo2SinceWeek'].fillna(0, inplace=True)
train_store['Promo2SinceYear'].fillna(0, inplace=True)
train_store['PromoInterval'].fillna('', inplace=True)

# Prepare Data for Modeling
Convert categorical variables to numeric using one-hot encoding, drop unnecessary columns, and split the data into training and validation sets.

In [None]:
# Convert categorical variables to numeric using one-hot encoding
logger.info("Converting categorical variables to numeric")
train_store = pd.get_dummies(train_store, columns=['StoreType', 'Assortment', 'StateHoliday', 'MonthPart'])

# Drop unnecessary columns
train_store.drop(['Date', 'PromoInterval'], axis=1, inplace=True)

# Feature selection for model
logger.info("Selecting features for modeling")
features = train_store.drop(['Sales', 'Customers'], axis=1)
target = train_store['Sales']

# Split data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.2, random_state=42)

# Train Random Forest Model
Train a Random Forest Regressor using a pipeline with StandardScaler and evaluate its performance on the validation set.

In [None]:
# Create and train a RandomForestRegressor with a pipeline
logger.info("Training Random Forest model")
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

pipeline.fit(X_train, y_train)

# Make predictions on validation set
y_pred = pipeline.predict(X_val)

# Evaluate Model Performance
Calculate evaluation metrics such as MAE, RMSE, and R2 score for the model.

In [None]:
# Evaluate the model
logger.info("Evaluating model performance")
mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2 = r2_score(y_val, y_pred)

print("\nModel Evaluation Metrics:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R2 Score: {r2:.4f}")


Model Evaluation Metrics:
Mean Absolute Error (MAE): 486.38
Root Mean Squared Error (RMSE): 836.73
R2 Score: 0.9527


# Feature Importance Analysis
Analyze and plot the feature importance to understand which features contribute the most to the model.

In [None]:
# Feature importance
rf = pipeline.named_steps['rf']
feature_importance = pd.DataFrame({
    'Feature': features.columns,
    'Importance': rf.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

# Plot feature importance
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance.head(15))
plt.title('Feature Importance')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()


Top 10 Most Important Features:
                      Feature  Importance
2                        Open    0.459827
5         CompetitionDistance    0.105815
0                       Store    0.084487
3                       Promo    0.073461
7    CompetitionOpenSinceYear    0.043970
6   CompetitionOpenSinceMonth    0.037660
1                   DayOfWeek    0.032290
13                        Day    0.023369
10            Promo2SinceYear    0.021889
14                 WeekOfYear    0.020745


# Save the Trained Model
Serialize the trained model using pickle and save it with a timestamped filename.

In [None]:
# Serialize the model with timestamp
logger.info("Serializing the model")
timestamp = datetime.now().strftime("%d-%m-%Y-%H-%M-%S-%f")
model_filename = f'rf_model_{timestamp}.pkl'

with open(model_filename, 'wb') as file:
    pickle.dump(pipeline, file)

print(f"\nModel saved as {model_filename}")
logger.info(f"Analysis completed. Model saved as {model_filename}")


Model saved as rf_model_28-04-2025-10-53-15-922500.pkl
