In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
from datetime import datetime
import os
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Set up logging
logging.basicConfig(
    filename=f'sales_forecast_{datetime.now().strftime("%Y-%m-%d")}.log',
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('sales_forecast')

logger.info("Starting sales forecasting analysis for Rossmann Pharmaceuticals")


In [4]:
# Load the data
logger.info("Loading data files")
store_data = pd.read_csv('store.csv')
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')


In [6]:
# Display basic information about the datasets
logger.info("Displaying basic information about the datasets")
print("Store Data Shape:", store_data.shape)
print("Train Data Shape:", train_data.shape)
print("Test Data Shape:", test_data.shape)
# Display the first few rows of each dataset
print("\nStore Data Preview:")
print(store_data.head())

print("\nTrain Data Preview:")
print(train_data.head())

print("\nTest Data Preview:")
print(test_data.head())



Store Data Shape: (1115, 10)
Train Data Shape: (1017209, 9)
Test Data Shape: (41088, 8)

Store Data Preview:
   Store StoreType Assortment  CompetitionDistance  CompetitionOpenSinceMonth  \
0      1         c          a               1270.0                        9.0   
1      2         a          a                570.0                       11.0   
2      3         a          a              14130.0                       12.0   
3      4         c          c                620.0                        9.0   
4      5         a          a              29910.0                        4.0   

   CompetitionOpenSinceYear  Promo2  Promo2SinceWeek  Promo2SinceYear  \
0                    2008.0       0              NaN              NaN   
1                    2007.0       1             13.0           2010.0   
2                    2006.0       1             14.0           2011.0   
3                    2009.0       0              NaN              NaN   
4                    2015.0       0    

In [7]:
# Check for missing values
logger.info("Checking for missing values")
print("\nMissing values in Store Data:")
print(store_data.isnull().sum())

print("\nMissing values in Train Data:")
print(train_data.isnull().sum())

print("\nMissing values in Test Data:")
print(test_data.isnull().sum())



Missing values in Store Data:
Store                          0
StoreType                      0
Assortment                     0
CompetitionDistance            3
CompetitionOpenSinceMonth    354
CompetitionOpenSinceYear     354
Promo2                         0
Promo2SinceWeek              544
Promo2SinceYear              544
PromoInterval                544
dtype: int64

Missing values in Train Data:
Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64

Missing values in Test Data:
Id                0
Store             0
DayOfWeek         0
Date              0
Open             11
Promo             0
StateHoliday      0
SchoolHoliday     0
dtype: int64


In [8]:
# Data preprocessing
logger.info("Starting data preprocessing")


In [9]:
# Merge train data with store data
train_store = pd.merge(train_data, store_data, on='Store', how='left')
print("\nMerged Train and Store Data Shape:", train_store.shape)

# Convert Date column to datetime
train_store['Date'] = pd.to_datetime(train_store['Date'])



Merged Train and Store Data Shape: (1017209, 18)


In [10]:
# Extract date features
logger.info("Extracting date features")
train_store['Year'] = train_store['Date'].dt.year
train_store['Month'] = train_store['Date'].dt.month
train_store['Day'] = train_store['Date'].dt.day
train_store['DayOfWeek'] = train_store['Date'].dt.dayofweek
train_store['WeekOfYear'] = train_store['Date'].dt.isocalendar().week


In [11]:
# Create weekend flag
train_store['IsWeekend'] = train_store['DayOfWeek'].apply(lambda x: 1 if x >= 5 else 0)

# Create month part categories (beginning, middle, end)
train_store['MonthPart'] = train_store['Day'].apply(
    lambda x: 'Beginning' if x <= 10 else ('Middle' if x <= 20 else 'End')
)

In [12]:
# EDA - Exploratory Data Analysis
logger.info("Starting Exploratory Data Analysis")

# Plot sales distribution
plt.figure(figsize=(12, 6))
sns.histplot(train_store['Sales'], kde=True)
plt.title('Sales Distribution')
plt.xlabel('Sales')
plt.ylabel('Frequency')
plt.savefig('sales_distribution.png')
plt.close()

In [13]:
# Plot average sales by day of week
plt.figure(figsize=(10, 6))
avg_sales_by_dow = train_store.groupby('DayOfWeek')['Sales'].mean().reset_index()
sns.barplot(x='DayOfWeek', y='Sales', data=avg_sales_by_dow)
plt.title('Average Sales by Day of Week')
plt.xlabel('Day of Week (0=Monday, 6=Sunday)')
plt.ylabel('Average Sales')
plt.savefig('avg_sales_by_dow.png')
plt.close()

In [14]:
# Correlation between sales and customers
plt.figure(figsize=(8, 6))
sns.scatterplot(x='Customers', y='Sales', data=train_store)
plt.title('Correlation between Sales and Number of Customers')
plt.xlabel('Number of Customers')
plt.ylabel('Sales')
plt.savefig('sales_vs_customers.png')
plt.close()

In [15]:
# Correlation coefficient
corr = train_store['Sales'].corr(train_store['Customers'])
print(f"\nCorrelation between Sales and Customers: {corr:.4f}")



Correlation between Sales and Customers: 0.8947


In [16]:
#Effect of promotions on sales
plt.figure(figsize=(10, 6))
sns.boxplot(x='Promo', y='Sales', data=train_store)
plt.title('Effect of Promotions on Sales')
plt.xlabel('Promotion (0=No, 1=Yes)')
plt.ylabel('Sales')
plt.savefig('promo_effect_on_sales.png')
plt.close()

In [17]:
# Effect of store type on sales
plt.figure(figsize=(10, 6))
sns.boxplot(x='StoreType', y='Sales', data=train_store)
plt.title('Sales by Store Type')
plt.xlabel('Store Type')
plt.ylabel('Sales')
plt.savefig('sales_by_store_type.png')
plt.close()

# Effect of assortment on sales
plt.figure(figsize=(10, 6))
sns.boxplot(x='Assortment', y='Sales', data=train_store)
plt.title('Sales by Assortment Type')
plt.xlabel('Assortment Type')
plt.ylabel('Sales')
plt.savefig('sales_by_assortment.png')
plt.close()

In [18]:
# Sales during state holidays
plt.figure(figsize=(10, 6))
sns.boxplot(x='StateHoliday', y='Sales', data=train_store[train_store['StateHoliday'] != '0'])
plt.title('Sales during Different State Holidays')
plt.xlabel('State Holiday Type')
plt.ylabel('Sales')
plt.savefig('sales_during_holidays.png')
plt.close()

# Effect of competition distance on sales
plt.figure(figsize=(12, 6))
# Group by competition distance ranges
train_store['CompDistRange'] = pd.cut(train_store['CompetitionDistance'], 
                                     bins=[0, 1000, 5000, 10000, 20000, 200000], 
                                     labels=['<1km', '1-5km', '5-10km', '10-20km', '>20km'])
sns.boxplot(x='CompDistRange', y='Sales', data=train_store.dropna(subset=['CompDistRange']))
plt.title('Effect of Competition Distance on Sales')
plt.xlabel('Competition Distance Range')
plt.ylabel('Sales')
plt.savefig('competition_distance_effect.png')
plt.close()

In [19]:
# Check sales behavior before, during, and after holidays
# For this, we need to create features for days before and after holidays
logger.info("Analyzing sales behavior around holidays")

# First, let's create a dataframe with just the holiday dates
holiday_dates = train_store[train_store['StateHoliday'] != '0'][['Date', 'StateHoliday']].drop_duplicates()

# Function to calculate days to closest holiday
def days_to_holiday(date, holiday_dates):
    if holiday_dates.empty:
        return np.nan
    days = (holiday_dates['Date'] - date).dt.days
    future_days = days[days > 0]
    if future_days.empty:
        return np.nan
    return future_days.min()

In [20]:
# Function to calculate days since closest holiday
def days_since_holiday(date, holiday_dates):
    if holiday_dates.empty:
        return np.nan
    days = (date - holiday_dates['Date']).dt.days
    past_days = days[days > 0]
    if past_days.empty:
        return np.nan
    return past_days.min()


In [21]:
# This operation can be time-consuming for large datasets, so let's use a sample for demonstration
sample_dates = pd.DataFrame({'Date': pd.date_range(start='2015-01-01', end='2015-01-31')})
sample_dates['DaysToHoliday'] = sample_dates['Date'].apply(lambda x: days_to_holiday(x, holiday_dates))
sample_dates['DaysSinceHoliday'] = sample_dates['Date'].apply(lambda x: days_since_holiday(x, holiday_dates))
print("\nSample of days to/since holiday calculation:")
print(sample_dates.head())




Sample of days to/since holiday calculation:
        Date  DaysToHoliday  DaysSinceHoliday
0 2015-01-01              5                 6
1 2015-01-02              4                 1
2 2015-01-03              3                 2
3 2015-01-04              2                 3
4 2015-01-05              1                 4


In [22]:
# Check how Promo2 affects sales
plt.figure(figsize=(10, 6))
sns.boxplot(x='Promo2', y='Sales', data=train_store)
plt.title('Effect of Promo2 on Sales')
plt.xlabel('Promo2 Participation (0=No, 1=Yes)')
plt.ylabel('Sales')
plt.savefig('promo2_effect_on_sales.png')
plt.close()

In [23]:
# Prepare data for modeling
logger.info("Preparing data for modeling")

# Let's handle missing values first
train_store_model = train_store.copy()

# Fill CompetitionDistance NAs with a large value (assuming no nearby competition)
train_store_model['CompetitionDistance'].fillna(train_store_model['CompetitionDistance'].max() * 2, inplace=True)

# Fill CompetitionOpenSince Month/Year with median values
train_store_model['CompetitionOpenSinceMonth'].fillna(train_store_model['CompetitionOpenSinceMonth'].median(), inplace=True)
train_store_model['CompetitionOpenSinceYear'].fillna(train_store_model['CompetitionOpenSinceYear'].median(), inplace=True)

# For Promo2 related columns, fill with 0s for stores not participating
train_store_model['Promo2SinceWeek'].fillna(0, inplace=True)
train_store_model['Promo2SinceYear'].fillna(0, inplace=True)
train_store_model['PromoInterval'].fillna('', inplace=True)
# Convert categorical variables to numeric using one-hot encoding
logger.info("Converting categorical variables to numeric")
train_store_model = pd.get_dummies(train_store_model, columns=['StoreType', 'Assortment', 'StateHoliday', 'MonthPart'])

# Ensure all columns are numeric or handle categorical columns properly
for col in train_store_model.columns:
    try:
        if train_store_model[col].dtype == 'object':  # Handle object columns
            logger.warning(f"Column {col} has non-numeric data. Attempting to convert.")
            # Check for non-numeric values and replace them with NaN
            train_store_model[col] = pd.to_numeric(train_store_model[col], errors='coerce')
        elif pd.api.types.is_categorical_dtype(train_store_model[col]):  # Handle categorical columns
            logger.warning(f"Column {col} is categorical. Adding '0' as a category and filling missing values.")
            train_store_model[col] = train_store_model[col].cat.add_categories([0]).fillna(0)
    except ValueError as ve:
        logger.error(f"ValueError processing column {col}: {ve}")
        print(f"ValueError processing column {col}: {ve}")
    except Exception as e:
        logger.error(f"Error processing column {col}: {e}")
        print(f"Error processing column {col}: {e}")
        

Error processing column StateHoliday_0: 'DataFrame' object has no attribute 'dtype'
Error processing column StateHoliday_0: 'DataFrame' object has no attribute 'dtype'


In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
import logging

logger = logging.getLogger(__name__)

# Optional: map non-numeric distance values (example column)
if 'Distance' in train_store_model.columns:
    distance_mapping = {
        '<1km': 0,
        '1-2km': 1,
        '2-5km': 2,
        '5-10km': 3,
        '>10km': 4
    }
    train_store_model['Distance'] = train_store_model['Distance'].map(distance_mapping)

# One-hot encode all categorical columns
categorical_cols = train_store_model.select_dtypes(include=['object', 'category']).columns
train_store_model = pd.get_dummies(train_store_model, columns=categorical_cols)

# Fill any remaining NaN values
train_store_model.fillna(0, inplace=True)

# Drop unnecessary columns
train_store_model.drop(['Date', 'PromoInterval'], axis=1, inplace=True, errors='ignore')

# Define features and target
logger.info("Selecting features for modeling")
features = train_store_model.drop(['Sales', 'Customers'], axis=1)
target = train_store_model['Sales']

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.2, random_state=42)

# Create pipeline and train model
logger.info("Training Random Forest model")
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

pipeline.fit(X_train, y_train)

# Predict on validation data
y_pred = pipeline.predict(X_val)


In [25]:
# Evaluate the model
logger.info("Evaluating model performance")
mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2 = r2_score(y_val, y_pred)

print("\nModel Evaluation Metrics:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R2 Score: {r2:.4f}")



Model Evaluation Metrics:
Mean Absolute Error (MAE): 486.06
Root Mean Squared Error (RMSE): 836.13
R2 Score: 0.9527


In [26]:
# Feature importance
rf = pipeline.named_steps['rf']
feature_importance = pd.DataFrame({
    'Feature': features.columns,
    'Importance': rf.feature_importances_
}).sort_values('Importance', ascending=False)
print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))



Top 10 Most Important Features:
                      Feature  Importance
2                        Open    0.459827
5         CompetitionDistance    0.101005
0                       Store    0.082170
3                       Promo    0.073461
7    CompetitionOpenSinceYear    0.043484
6   CompetitionOpenSinceMonth    0.037005
1                   DayOfWeek    0.032394
13                        Day    0.023301
10            Promo2SinceYear    0.021833
14                 WeekOfYear    0.021146


In [27]:
# Plot feature importance
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance.head(15))
plt.title('Feature Importance')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()
