Download the test dataset from Kaggle and merge it with your current dataset.

Split your dataset into a training set from 01.07.2013 to 31.07.2017, a validation set from 01.08.2017 to 31.07.2018, and a test set from 01.08.2018 to 31.07.2019.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

# Read both inputs: the Kaggle test file (relative to the working directory)
# and the feature-enriched bakery table produced by the earlier preparation step.
print("Loading datasets...")
test_df = pd.read_csv('test.csv')
merged_bakery_df = pd.read_csv('/workspaces/Room_7_Bakery_Prediction/0_DataPreparation/0.3 Additional Features/complete_dataset_with_additional_features.csv')

# Report the dimensions of each table.
print(f"\nTest dataset shape: {test_df.shape}")
print(f"Merged bakery dataset shape: {merged_bakery_df.shape}")

# List the column names so the schemas can be compared before merging.
print("\nTest dataset columns:", list(test_df.columns), sep="\n")
print("\nMerged bakery dataset columns:", list(merged_bakery_df.columns), sep="\n")

# Preview the first five rows of each table.
print("\nFirst few rows of test dataset:", test_df.iloc[:5], sep="\n")
print("\nFirst few rows of merged bakery dataset:", merged_bakery_df.iloc[:5], sep="\n")

Loading datasets...

Test dataset shape: (1830, 3)
Merged bakery dataset shape: (11164, 50)

Test dataset columns:
['id', 'Datum', 'Warengruppe']

Merged bakery dataset columns:
['id', 'date', 'Warengruppe', 'umsatz', 'KielerWoche', 'Bewoelkung', 'Temperatur', 'Windgeschwindigkeit', 'Wettercode', 'Is_Holiday', 'Day_Before_Holiday', 'Day_After_Holiday', 'Is_Vacation', 'Vacation_Type', 'day_of_the_week', 'month', 'is_weekend', 'days_to_weekend', 'bewoelkung_category', 'wettercode_category', 'season', 'temperature_category', 'temp_change_1d', 'temp_trend_3d', 'umsatz_ma7', 'umsatz_ma14', 'umsatz_volatility_7d', 'umsatz_sum_7d', 'umsatz_lag1', 'umsatz_lag7', 'umsatz_pct_change', 'umsatz_pct_change_lag1', 'season_numeric', 'is_weekend_int', 'weekend_season_interaction', 'weekend_season_category', 'holiday_temp_interaction', 'holiday_temp_category', 'day_before_holiday_int', 'day_before_holiday_weekend_interaction', 'day_before_holiday_weekend_category', 'KielerWoche_binary', 'kielerweek_tem

In [2]:
# Bring the Kaggle test rows into the same layout as the bakery table and
# attach the per-date weather information to them.
print("Preparing test data for merging...")

# Work on a renamed copy: 'Datum' becomes 'date' (parsed to datetime) and an
# all-NaN 'umsatz' column is added, since sales are the prediction target.
test_df_prepared = (
    test_df
    .rename(columns={'Datum': 'date'})
    .assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        umsatz=np.nan,
    )
)

# The bakery table's date column is converted in place so that both tables
# share a datetime key for the merge below.
merged_bakery_df['date'] = pd.to_datetime(merged_bakery_df['date'])

test_dates = test_df_prepared['date']
bakery_dates = merged_bakery_df['date']
print(f"Test data date range: {test_dates.min()} to {test_dates.max()}")
print(f"Merged bakery data date range: {bakery_dates.min()} to {bakery_dates.max()}")

# One weather record per calendar day (first occurrence wins).
# NOTE(review): only these columns are carried over to the test rows; any other
# feature column of merged_bakery_df will be missing for them - confirm intended.
weather_columns = ['date', 'KielerWoche', 'Bewoelkung', 'Temperatur', 'Windgeschwindigkeit', 'Wettercode']
weather_data = merged_bakery_df.loc[:, weather_columns].drop_duplicates(subset='date')

# Left join keeps every test row, even when no weather record exists for its date.
test_df_merged = test_df_prepared.merge(weather_data, how='left', on='date')

print(f"\nTest data after merging with weather: {test_df_merged.shape}")
print("Sample of merged test data:", test_df_merged.head(), sep="\n")

Preparing test data for merging...
Test data date range: 2018-08-01 00:00:00 to 2019-07-30 00:00:00
Merged bakery data date range: 2013-07-01 00:00:00 to 2019-07-30 00:00:00

Test data after merging with weather: (1830, 9)
Sample of merged test data:
        id       date  Warengruppe  umsatz  KielerWoche  Bewoelkung  \
0  1808011 2018-08-01            1     NaN          NaN         0.0   
1  1808021 2018-08-02            1     NaN          NaN         0.0   
2  1808031 2018-08-03            1     NaN          NaN         1.0   
3  1808041 2018-08-04            1     NaN          NaN         4.0   
4  1808051 2018-08-05            1     NaN          NaN         7.0   

   Temperatur  Windgeschwindigkeit  Wettercode  
0     23.7625                 10.0         0.0  
1     26.1875                 10.0         0.0  
2     27.6625                 10.0         0.0  
3     25.1375                 12.0         NaN  
4     21.3000                 14.0        61.0  


In [3]:
# Combine the historical (labelled) rows with the prepared test rows and
# define the calendar boundaries used for the train / validation / test split.
print("Combining all datasets...")

# Keep only rows that have both a product group and a sales value; rows
# without a target cannot be used for training or validation.
historical_data = merged_bakery_df.dropna(subset=['Warengruppe', 'umsatz']).copy()

print(f"Historical data (after removing NaN): {historical_data.shape}")
print(f"Historical data date range: {historical_data['date'].min()} to {historical_data['date'].max()}")

# Stack historical and test rows. Columns that exist only in historical_data
# are filled with NaN for the test rows by pd.concat.
# NOTE(review): test_df_merged carries far fewer columns than historical_data,
# so most feature columns will be NaN for the test rows - confirm this is intended.
all_data = pd.concat([historical_data, test_df_merged], ignore_index=True)

# Several rows share the same date (one per product group). The default
# quicksort is not stable, so the relative order of those rows would not be
# guaranteed; mergesort is stable and keeps them in their concatenation order.
all_data = all_data.sort_values('date', kind='mergesort').reset_index(drop=True)

print(f"Combined dataset shape: {all_data.shape}")
print(f"Combined data date range: {all_data['date'].min()} to {all_data['date'].max()}")

# Define date ranges for splitting (both ends are treated as inclusive by the
# cell that performs the split).
train_start = pd.to_datetime('2013-07-01')
train_end = pd.to_datetime('2017-07-31')
val_start = pd.to_datetime('2017-08-01')
val_end = pd.to_datetime('2018-07-31')
test_start = pd.to_datetime('2018-08-01')
test_end = pd.to_datetime('2019-07-31')

print("\nSplit ranges:")
print(f"Training: {train_start.date()} to {train_end.date()}")
print(f"Validation: {val_start.date()} to {val_end.date()}")
print(f"Test: {test_start.date()} to {test_end.date()}")

Combining all datasets...
Historical data (after removing NaN): (9334, 50)
Historical data date range: 2013-07-01 00:00:00 to 2018-07-31 00:00:00
Combined dataset shape: (11164, 50)
Combined data date range: 2013-07-01 00:00:00 to 2019-07-30 00:00:00

Split ranges:
Training: 2013-07-01 to 2017-07-31
Validation: 2017-08-01 to 2018-07-31
Test: 2018-08-01 to 2019-07-31


In [4]:
# Carve the combined table into three consecutive, non-overlapping date windows.
print("Splitting data into train, validation, and test sets...")

# Series.between is inclusive on both ends, matching the task's date ranges:
#   training   01.07.2013 - 31.07.2017
#   validation 01.08.2017 - 31.07.2018
#   test       01.08.2018 - 31.07.2019
train_data = all_data.loc[all_data['date'].between(train_start, train_end)].copy()
val_data = all_data.loc[all_data['date'].between(val_start, val_end)].copy()
test_data = all_data.loc[all_data['date'].between(test_start, test_end)].copy()

# (label, frame, note appended to the missing-value report)
split_frames = (
    ("Training", train_data, ""),
    ("Validation", val_data, ""),
    ("Test", test_data, " (expected - these are prediction targets)"),
)

# Size and actual date coverage of each split.
print("\nDataset splits:")
for split_name, split_df, _ in split_frames:
    first_day = split_df['date'].min().date()
    last_day = split_df['date'].max().date()
    print(f"{split_name} set: {len(split_df)} samples ({first_day} to {last_day})")

# Count rows without a sales value in each split.
print("\nMissing umsatz values:")
for split_name, split_df, note in split_frames:
    n_missing = split_df['umsatz'].isna().sum()
    print(f"{split_name} set: {n_missing} / {len(split_df)}{note}")

# Preview the first three rows of each split.
for split_name, split_df, _ in split_frames:
    print(f"\nSample from {split_name.lower()} set:")
    print(split_df.head(3))

Splitting data into train, validation, and test sets...

Dataset splits:
Training set: 7493 samples (2013-07-01 to 2017-07-31)
Validation set: 1841 samples (2017-08-01 to 2018-07-31)
Test set: 1830 samples (2018-08-01 to 2019-07-30)

Missing umsatz values:
Training set: 0 / 7493
Validation set: 0 / 1841
Test set: 1830 / 1830 (expected - these are prediction targets)

Sample from training set:
          id       date  Warengruppe      umsatz  KielerWoche  Bewoelkung  \
0  1307011.0 2013-07-01          1.0  148.828353          NaN         6.0   
1  1307012.0 2013-07-01          2.0  535.856285          NaN         6.0   
2  1307013.0 2013-07-01          3.0  201.198426          NaN         6.0   

   Temperatur  Windgeschwindigkeit  Wettercode  Is_Holiday  ...  \
0     17.8375                 15.0        20.0         0.0  ...   
1     17.8375                 15.0        20.0         0.0  ...   
2     17.8375                 15.0        20.0         0.0  ...   

   day_before_holiday_week

In [7]:
# Save the three splits and the complete combined dataset to CSV files.
print("Saving datasets to CSV files...")

# Single definition of the output location, so that the paths written and the
# paths reported below cannot drift apart.
output_dir = '/workspaces/Room_7_Bakery_Prediction/0_DataPreparation/0.6 Merge with test dataset and split'

# File name -> frame, in the order the files are written.
output_files = {
    'train_data.csv': train_data,
    'val_data.csv': val_data,
    'test_data.csv': test_data,
    'complete_dataset.csv': all_data,
}

for file_name, frame in output_files.items():
    frame.to_csv(f"{output_dir}/{file_name}", index=False)

print("✅ Datasets saved successfully!")

# Report the locations that were actually written.
print("\nFiles created:")
for file_name in output_files:
    print(f"- {output_dir}/{file_name}")

# Summarise each split using the dates actually present in the data instead of
# hard-coded literals.
print("\n📊 Final Summary:")
for split_name, split_df in (("Training", train_data), ("Validation", val_data), ("Test", test_data)):
    first_day = split_df['date'].min().date()
    last_day = split_df['date'].max().date()
    print(f"{split_name} period: {first_day} to {last_day} ({split_df.shape[0]} samples)")
print(f"Total samples: {train_data.shape[0] + val_data.shape[0] + test_data.shape[0]}")
print("\n✨ Ready for model training and evaluation!")

Saving datasets to CSV files...
✅ Datasets saved successfully!

Files created:
- data/train_data.csv
- data/val_data.csv
- data/test_data.csv
- data/complete_dataset.csv

📊 Final Summary:
Training period: 2013-07-01 to 2017-07-31 (7493 samples)
Validation period: 2017-08-01 to 2018-07-31 (1841 samples)
Test period: 2018-08-01 to 2019-07-30 (1830 samples)
Total samples: 11164

✨ Ready for model training and evaluation!
