# Additional Features

Goal of this notebook is to add additional features to our dataset.

In [77]:
import pandas as pd
import numpy as np

In [78]:
df = pd.read_csv('/workspaces/Room_7_Bakery_Prediction/0_DataPreparation/0.2 Additional Data/complete_dataset_with_features.csv')
df['date'] = pd.to_datetime(df['date'])
df.dtypes

id                            float64
date                   datetime64[ns]
Warengruppe                   float64
umsatz                        float64
KielerWoche                   float64
Bewoelkung                    float64
Temperatur                    float64
Windgeschwindigkeit           float64
Wettercode                    float64
Is_Holiday                      int64
Day_Before_Holiday              int64
Day_After_Holiday               int64
Is_Vacation                     int64
Vacation_Type                  object
dtype: object

## Days, Months, Weekend

Adding day_of_the_week, month and is_weekend to the dataset.

In [79]:
# Add day_of_the_week (0=Monday, 6=Sunday)
df['day_of_the_week'] = df['date'].dt.dayofweek

# Add month (1-12)
df['month'] = df['date'].dt.month

# Add is_weekend (True for Saturday and Sunday)
df['is_weekend'] = df['day_of_the_week'].isin([5, 6])

# Display the first few rows to verify
df[['date', 'day_of_the_week', 'month', 'is_weekend']].head(10)

Unnamed: 0,date,day_of_the_week,month,is_weekend
0,2013-07-01,0,7,False
1,2013-07-01,0,7,False
2,2013-07-01,0,7,False
3,2013-07-01,0,7,False
4,2013-07-01,0,7,False
5,2013-07-02,1,7,False
6,2013-07-02,1,7,False
7,2013-07-02,1,7,False
8,2013-07-02,1,7,False
9,2013-07-02,1,7,False


Days until/since weekend

In [80]:
# Days until next weekend (Saturday=5, Sunday=6)
# Negative = days since last weekend, Positive = days until next weekend
def days_to_weekend(day_of_week):
    # 0=Monday, 6=Sunday
    if day_of_week in [5, 6]:  # Already weekend
        return 0
    else:
        # Days until Saturday (5)
        return (5 - day_of_week) % 7

df['days_to_weekend'] = df['day_of_the_week'].apply(days_to_weekend)

## Weather

**Adding features for the weather data.**  
Converting the Okta-Values of 'Bewölkung' into categorical data.  
    - **0** = Clear sky  
	- **1-2** = Partly cloudy  
	- **3-4** = Cloudy  
	- **5-6** = Very cloudy  
	- **7-8** = Overcast  

In [81]:
# Bin Okta values of 'Bewölkung' into categories
okta_bins = [-0.1, 0.5, 2.5, 4.5, 6.5, 8.5]  # covers 0, 1-2, 3-4, 5-6, 7-8
okta_labels = ["Clear sky", "Partly cloudy", "Cloudy", "Very cloudy", "Overcast"]

# Ensure numeric, then create categorical column
bew = pd.to_numeric(df['Bewoelkung'], errors='coerce')
df['bewoelkung_category'] = pd.cut(
    bew,
    bins=okta_bins,
    labels=okta_labels,
    include_lowest=True
)

df[['Bewoelkung', 'bewoelkung_category']].head(10)

Unnamed: 0,Bewoelkung,bewoelkung_category
0,6.0,Very cloudy
1,6.0,Very cloudy
2,6.0,Very cloudy
3,6.0,Very cloudy
4,6.0,Very cloudy
5,3.0,Cloudy
6,3.0,Cloudy
7,3.0,Cloudy
8,3.0,Cloudy
9,3.0,Cloudy


In [82]:
print(np.sort(df['Wettercode'].dropna().unique()).tolist())

[0.0, 3.0, 5.0, 10.0, 17.0, 20.0, 21.0, 22.0, 28.0, 45.0, 49.0, 53.0, 55.0, 61.0, 63.0, 65.0, 68.0, 69.0, 71.0, 73.0, 75.0, 77.0, 79.0, 95.0]


Converting WMO-Code of 'Wettercode' into categorical data.  
0.0 = Cloud development not observed or not observable  
3.0 = Clouds generally forming or developing  
5.0 = Haze  
10.0 = Mist  
17.0 = Thunderstorm, but no precipitation at the time of observation  
20.0 = Drizzle (not freezing) or snow grains  
21.0 = Rain (not freezing)  
22.0 = Snow  
28.0 = Fog or ice fog  
45.0 = Fog or ice fog, sky invisible  
49.0 = Fog, depositing rime, sky invisible  
53.0 = Moderate drizzle, not freezing, continuous  
55.0 = Heavy drizzle, not freezing, continuous  
61.0 = Slight rain, not freezing, intermittent  
63.0 = Moderate rain, not freezing, continuous  
65.0 = Heavy rain, not freezing, continuous  
68.0 = Rain or drizzle and snow, slight  
69.0 = Rain or drizzle and snow, moderate or heavy  
71.0 = Slight continuous fall of snowflakes  
73.0 = Moderate continuous fall of snowflakes  
75.0 = Heavy continuous fall of snowflakes  
77.0 = Snow grains (with or without fog)  
79.0 = Ice pellets  
95.0 = Thunderstorm, slight or moderate, without hail but with rain and/or snow at time of observation

In [83]:
# Map WMO weather codes to categories
wmo_map = {
    0.0: "Cloud development not observed or not observable",
    3.0: "Clouds generally forming or developing",
    5.0: "Haze",
    10.0: "Mist",
    17.0: "Thunderstorm, no precipitation at observation",
    20.0: "Drizzle (not freezing) or snow grains",
    21.0: "Rain (not freezing)",
    22.0: "Snow",
    28.0: "Fog or ice fog",
    45.0: "Fog or ice fog, sky invisible",
    49.0: "Fog, depositing rime, sky invisible",
    53.0: "Moderate drizzle, not freezing, continuous",
    55.0: "Heavy drizzle, not freezing, continuous",
    61.0: "Slight rain, not freezing, intermittent",
    63.0: "Moderate rain, not freezing, continuous",
    65.0: "Heavy rain, not freezing, continuous",
    68.0: "Rain or drizzle and snow, slight",
    69.0: "Rain or drizzle and snow, moderate or heavy",
    71.0: "Slight continuous fall of snowflakes",
    73.0: "Moderate continuous fall of snowflakes",
    75.0: "Heavy continuous fall of snowflakes",
    77.0: "Snow grains (with or without fog)",
    79.0: "Ice pellets",
    95.0: "Thunderstorm, slight/moderate, no hail but rain/snow"
}

# Ensure numeric then map; keep unknowns as 'Other/Unknown'
wmo_numeric = pd.to_numeric(df['Wettercode'], errors='coerce')
df['wettercode_category'] = wmo_numeric.map(wmo_map).fillna('Missing')

df[['Wettercode', 'wettercode_category']].head(10)

Unnamed: 0,Wettercode,wettercode_category
0,20.0,Drizzle (not freezing) or snow grains
1,20.0,Drizzle (not freezing) or snow grains
2,20.0,Drizzle (not freezing) or snow grains
3,20.0,Drizzle (not freezing) or snow grains
4,20.0,Drizzle (not freezing) or snow grains
5,,Missing
6,,Missing
7,,Missing
8,,Missing
9,,Missing


Adding a new categorie for season.  
**Spring** = March - May  
**Summer** = June - August  
**Autumn** = September - November  
**Winter** = December - February

In [84]:
# Add season based on month
def get_season(month):
    if month in [3, 4, 5]:
        return "Spring"
    elif month in [6, 7, 8]:
        return "Summer"
    elif month in [9, 10, 11]:
        return "Autumn"
    else:  # 12, 1, 2
        return "Winter"

df['season'] = df['month'].apply(get_season)

df[['date', 'month', 'season']].head(10)


Unnamed: 0,date,month,season
0,2013-07-01,7,Summer
1,2013-07-01,7,Summer
2,2013-07-01,7,Summer
3,2013-07-01,7,Summer
4,2013-07-01,7,Summer
5,2013-07-02,7,Summer
6,2013-07-02,7,Summer
7,2013-07-02,7,Summer
8,2013-07-02,7,Summer
9,2013-07-02,7,Summer


Adding categorical data for temperature.  
Low: ≤ 10°C (cold, winter-like conditions)  
Medium: 10°C - 20°C (mild, spring/autumn conditions)  
High: > 20°C (warm, summer conditions)

In [85]:
# Categorize temperature into Low, Medium, and High
# Temperature thresholds (in Celsius)
temp_bins = [-np.inf, 0, 10, 20, np.inf]
temp_labels = ["Freezing", "Low", "Medium", "High"]

# Ensure numeric then categorize
temp_numeric = pd.to_numeric(df['Temperatur'], errors='coerce')
df['temperature_category'] = pd.cut(
    temp_numeric,
    bins=temp_bins,
    labels=temp_labels,
    include_lowest=True
)

df[['Temperatur', 'temperature_category']].head(10)

Unnamed: 0,Temperatur,temperature_category
0,17.8375,Medium
1,17.8375,Medium
2,17.8375,Medium
3,17.8375,Medium
4,17.8375,Medium
5,17.3125,Medium
6,17.3125,Medium
7,17.3125,Medium
8,17.3125,Medium
9,17.3125,Medium


Day-over-day temperature change

In [86]:
# Day-over-day temperature change
temp_numeric = pd.to_numeric(df['Temperatur'], errors='coerce')
df['temp_change_1d'] = temp_numeric.diff()

Temperature trend (3-day direction)

In [87]:
# Temperature trend: 3-day direction (positive = warming, negative = cooling)
temp_numeric = pd.to_numeric(df['Temperatur'], errors='coerce')
df['temp_trend_3d'] = temp_numeric.diff(periods=3)

## Stochastic

7-day & 14-day rolling averages of umsatz

In [88]:
# 7-day and 14-day rolling averages of umsatz
df['umsatz_ma7'] = df['umsatz'].rolling(window=7, min_periods=1).mean()
df['umsatz_ma14'] = df['umsatz'].rolling(window=14, min_periods=1).mean()

7-day rolling std dev (volatility)

In [89]:
# 7-day rolling standard deviation (volatility indicator)
df['umsatz_volatility_7d'] = df['umsatz'].rolling(window=7, min_periods=1).std()

7-day rolling sum

In [90]:
# 7-day rolling sum of umsatz (weekly total)
df['umsatz_sum_7d'] = df['umsatz'].rolling(window=7, min_periods=1).sum()

Lag 1 & Lag 7 of 'umsatz'

In [91]:
# Lag 1 (previous day) and Lag 7 (same day last week)
df['umsatz_lag1'] = df['umsatz'].shift(1)
df['umsatz_lag7'] = df['umsatz'].shift(7)

Lag 1 & overall percentage change

In [92]:
# Percentage change in umsatz (day-over-day)
df['umsatz_pct_change'] = df['umsatz'].pct_change()

# Lag 1 percentage change
df['umsatz_pct_change_lag1'] = df['umsatz_pct_change'].shift(1)

  df['umsatz_pct_change'] = df['umsatz'].pct_change()


## Interaction Features

is_weekend * season interaction feature

In [93]:
# Create interaction feature: is_weekend * season
# Convert categorical to numeric for interaction
season_map = {"Winter": 1, "Spring": 2, "Summer": 3, "Autumn": 4}
df['season_numeric'] = df['season'].map(season_map)
df['is_weekend_int'] = df['is_weekend'].astype(int)

# Interaction feature
df['weekend_season_interaction'] = df['is_weekend_int'] * df['season_numeric']

# Optional: create categorical labels for the interaction
def create_interaction_label(row):
    if row['is_weekend']:
        return f"Weekend_{row['season']}"
    else:
        return f"Weekday_{row['season']}"

df['weekend_season_category'] = df.apply(create_interaction_label, axis=1)

Is_Holiday * temperature_category

In [105]:
# Interaction: Holiday * Temperature
temp_map = {"Freezing": 1, "Low": 2, "Medium": 3, "High": 4}
temp_cat_numeric = pd.to_numeric(df['temperature_category'].map(temp_map), errors='coerce')
df['holiday_temp_interaction'] = df['Is_Holiday'] * temp_cat_numeric

# Also create categorical labels
def create_holiday_temp_label(row):
    if row['Is_Holiday'] == 0:
        return 'No_Holiday'
    else:
        return f"Holiday_{row['temperature_category']}"

df['holiday_temp_category'] = df.apply(create_holiday_temp_label, axis=1)

Day_Before_Holiday * is_weekend

In [96]:
# Interaction: Day Before Holiday * Weekend
df['day_before_holiday_int'] = df['Day_Before_Holiday'].astype(int)
df['day_before_holiday_weekend_interaction'] = df['day_before_holiday_int'] * df['is_weekend_int']

# Categorical labels
def create_day_before_holiday_label(row):
    if row['Day_Before_Holiday'] == 0:
        return 'Regular_Day'
    elif row['is_weekend']:
        return 'Day_Before_Holiday_Weekend'
    else:
        return 'Day_Before_Holiday_Weekday'

df['day_before_holiday_weekend_category'] = df.apply(create_day_before_holiday_label, axis=1)

KielerWoche * temperature_category

In [106]:
# Interaction: KielerWoche * Temperature
# Handle KielerWoche: convert to binary (1 if in Kiel Week, 0 otherwise)
df['KielerWoche_binary'] = df['KielerWoche'].notna().astype(int)

temp_map = {"Freezing": 1, "Low": 2, "Medium": 3, "High": 4}
temp_cat_numeric = pd.to_numeric(df['temperature_category'].map(temp_map), errors='coerce')

df['kielerweek_temp_interaction'] = df['KielerWoche_binary'] * temp_cat_numeric

# Categorical labels
def create_kielerweek_temp_label(row):
    if row['KielerWoche_binary'] == 0:
        return 'No_KielerWoche'
    else:
        return f"KielerWoche_{row['temperature_category']}"

df['kielerweek_temp_category'] = df.apply(create_kielerweek_temp_label, axis=1)


temperature_category * season

In [102]:
# Interaction: Temperature Category * Season
# Ensure numeric conversion
temp_cat_numeric = pd.to_numeric(df['temperature_category'].map(temp_map), errors='coerce')
season_numeric = pd.to_numeric(df['season_numeric'], errors='coerce')

df['temp_season_interaction'] = temp_cat_numeric * season_numeric

# Categorical labels
def create_temp_season_label(row):
    return f"{row['temperature_category']}_{row['season']}"

df['temp_season_category'] = df.apply(create_temp_season_label, axis=1)

is_weekend * temperature_category

In [103]:
# Interaction: Weekend * Temperature Category
temp_cat_numeric = pd.to_numeric(df['temperature_category'].map(temp_map), errors='coerce')
df['weekend_temp_interaction'] = df['is_weekend_int'] * temp_cat_numeric

# Categorical labels
def create_weekend_temp_label(row):
    day_type = "Weekend" if row['is_weekend'] else "Weekday"
    return f"{day_type}_{row['temperature_category']}"

df['weekend_temp_category'] = df.apply(create_weekend_temp_label, axis=1)

bewoelkung_category * is_weekend

In [104]:
# Interaction: Cloud Cover (Bewölkung) * Weekend
# Convert bewoelkung_category to numeric
bewoelkung_map = {"Clear sky": 1, "Partly cloudy": 2, "Cloudy": 3, "Very cloudy": 4, "Overcast": 5}
bewoelkung_numeric = pd.to_numeric(df['bewoelkung_category'].map(bewoelkung_map), errors='coerce')

df['bewoelkung_weekend_interaction'] = bewoelkung_numeric * df['is_weekend_int']

# Categorical labels
def create_bewoelkung_weekend_label(row):
    day_type = "Weekend" if row['is_weekend'] else "Weekday"
    return f"{day_type}_{row['bewoelkung_category']}"

df['bewoelkung_weekend_category'] = df.apply(create_bewoelkung_weekend_label, axis=1)

In [107]:
df.info

<bound method DataFrame.info of               id       date  Warengruppe      umsatz  KielerWoche  Bewoelkung  \
0      1307011.0 2013-07-01          1.0  148.828353          NaN         6.0   
1      1307012.0 2013-07-01          2.0  535.856285          NaN         6.0   
2      1307013.0 2013-07-01          3.0  201.198426          NaN         6.0   
3      1307014.0 2013-07-01          4.0   65.890169          NaN         6.0   
4      1307015.0 2013-07-01          5.0  317.475875          NaN         6.0   
...          ...        ...          ...         ...          ...         ...   
11159  1907301.0 2019-07-30          1.0         NaN          NaN         7.0   
11160  1907302.0 2019-07-30          2.0         NaN          NaN         7.0   
11161  1907303.0 2019-07-30          3.0         NaN          NaN         7.0   
11162  1907304.0 2019-07-30          4.0         NaN          NaN         7.0   
11163  1907305.0 2019-07-30          5.0         NaN          NaN         7.0

## Export

In [108]:
output_file = 'complete_dataset_with_additional_features.csv'
df.to_csv(output_file, index=False)