In [15]:
import pandas as pd
import holidays
import numpy as np

Read in .csv file.

In [16]:
df = pd.read_csv('/workspaces/Room_7_Bakery_Prediction/0_DataPreparation/Data/complete_dataset.csv')

Convert to datetime.

In [17]:
df['date'] = pd.to_datetime(df['date'])
df.dtypes

id                            float64
date                   datetime64[ns]
Warengruppe                   float64
umsatz                        float64
KielerWoche                   float64
Bewoelkung                    float64
Temperatur                    float64
Windgeschwindigkeit           float64
Wettercode                    float64
dtype: object

Dataset information.

In [18]:
print(f"Columns: {df.columns.tolist()}")
print(f"Shape: {df.shape}")
print(f"Time period: {df['date'].min()} to {df['date'].max()}")
print(f"Number of rows: {len(df):,}")
print(f"Number of product groups: {df['Warengruppe'].nunique()}")
print("\nFirst rows:")
print(df.head(10))

Columns: ['id', 'date', 'Warengruppe', 'umsatz', 'KielerWoche', 'Bewoelkung', 'Temperatur', 'Windgeschwindigkeit', 'Wettercode']
Shape: (11164, 9)
Time period: 2013-07-01 00:00:00 to 2019-07-30 00:00:00
Number of rows: 11,164
Number of product groups: 6

First rows:
          id       date  Warengruppe      umsatz  KielerWoche  Bewoelkung  \
0  1307015.0 2013-07-01          5.0  317.475875          NaN         6.0   
1  1307014.0 2013-07-01          4.0   65.890169          NaN         6.0   
2  1307012.0 2013-07-01          2.0  535.856285          NaN         6.0   
3  1307013.0 2013-07-01          3.0  201.198426          NaN         6.0   
4  1307011.0 2013-07-01          1.0  148.828353          NaN         6.0   
5  1307021.0 2013-07-02          1.0  159.793757          NaN         3.0   
6  1307022.0 2013-07-02          2.0  546.780787          NaN         3.0   
7  1307023.0 2013-07-02          3.0  265.261254          NaN         3.0   
8  1307024.0 2013-07-02          4.0   7

Add public holidays.

In [19]:
# Public holidays for Schleswig-Holstein from 2013-2019
de_sh_holidays = holidays.Germany(prov='SH', years=range(2013, 2020))

# Feature: Is_Holiday
df['Is_Holiday'] = df['date'].apply(lambda x: 1 if x in de_sh_holidays else 0)

# Sort for correct lag features
df = df.sort_values(['date', 'Warengruppe']).reset_index(drop=True)

Adding additional features 'Day_Before_Holiday' and 'Day_After_Holiday'.

In [20]:
# Feature: Day before holiday (important for bakeries - higher sales!)
# Grouped by product group to avoid incorrect shifts across groups
df['Day_Before_Holiday'] = df.groupby('Warengruppe')['Is_Holiday'].shift(-1).fillna(0).astype(int)

# Feature: Day after holiday
df['Day_After_Holiday'] = df.groupby('Warengruppe')['Is_Holiday'].shift(1).fillna(0).astype(int)

print(f"   • Number of holidays: {df['Is_Holiday'].sum()} days")
print(f"   • Day before holiday: {df['Day_Before_Holiday'].sum()} days")
print(f"   • Day after holiday: {df['Day_After_Holiday'].sum()} days")


   • Number of holidays: 120 days
   • Day before holiday: 120 days
   • Day after holiday: 120 days


Adjust timeframe to our data.

In [22]:
# Which holidays are in the time period?
# Convert pandas Timestamps to date objects for comparison
min_date = df['date'].min().date()
max_date = df['date'].max().date()

holidays_in_period = sorted([date for date in de_sh_holidays 
                             if min_date <= date <= max_date])
print(f"\n   Holidays in period ({len(holidays_in_period)} days):")
for holiday in holidays_in_period[:10]:  # Show first 10
    print(f"   - {holiday.strftime('%d.%m.%Y')} ({de_sh_holidays.get(holiday)})")
if len(holidays_in_period) > 10:
    print(f"   ... and {len(holidays_in_period) - 10} more")


   Holidays in period (56 days):
   - 03.10.2013 (German Unity Day)
   - 25.12.2013 (Christmas Day)
   - 26.12.2013 (Second Day of Christmas)
   - 01.01.2014 (New Year's Day)
   - 18.04.2014 (Good Friday)
   - 21.04.2014 (Easter Monday)
   - 01.05.2014 (Labor Day)
   - 29.05.2014 (Ascension Day)
   - 09.06.2014 (Whit Monday)
   - 03.10.2014 (German Unity Day)
   ... and 46 more
