In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os

# Read the lag-augmented training table and the matching test table.
train_csv = "../../../0_DataPreparation/00_data/data_with_lag/data_with_lag.csv"
test_csv = "../../../0_DataPreparation/00_data/data_with_lag/Testdaten_with_lag.csv"
data = pd.read_csv(train_csv)
data_test = pd.read_csv(test_csv)

# Min-Max-Skalierung und andere Transformationen gleichzeitig auf beiden DataFrames anwenden
def preprocess_data(df):
    """Add the engineered columns and keep only the rows of product group 4.

    Two columns are derived before the rows are filtered:

    * ``Number_of_ships_scaled_current``: min-max scaling of
      ``Number_of_ships_current`` using the minimum and maximum of the frame
      that is passed in (computed on all rows, i.e. before the filter).
      NOTE: every frame is therefore scaled with its own range; values scaled
      on two different frames are not on a shared scale.
    * ``Cloud_ok_current``: 1 where ``Cloud_Clear_current`` plus
      ``Cloud_Partly_Cloudy_current`` is greater than 0, otherwise 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Number_of_ships_current``, ``Cloud_Clear_current``,
        ``Cloud_Partly_Cloudy_current`` and ``Warengruppe_4``.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame holding the rows with ``Warengruppe_4 == 1``
        (original index labels preserved) plus the two derived columns.
        The input frame is left unmodified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    out = df.copy()

    # Min-max scaling
    ships = out['Number_of_ships_current']
    ships_min = ships.min()
    ships_range = ships.max() - ships_min
    if ships_range > 0:
        out['Number_of_ships_scaled_current'] = (ships - ships_min) / ships_range
    else:
        # Constant column: dividing by the zero range would turn every value
        # into NaN (0/0). Map it to 0.0 instead; multiplying keeps entries that
        # were already missing as NaN and leaves an empty column empty.
        out['Number_of_ships_scaled_current'] = (ships - ships_min) * 0.0

    # Collapse the two "acceptable" cloud categories into a single 0/1 flag.
    out['Cloud_ok_current'] = (
        out[['Cloud_Clear_current', 'Cloud_Partly_Cloudy_current']].sum(axis=1) > 0
    ).astype(int)

    # Keep only product group 4; the explicit copy makes the result independent
    # so later column assignments do not trigger SettingWithCopyWarning.
    return out[out["Warengruppe_4"] == 1].copy()

# Run the same preprocessing on the training frame and on the test frame.
data, data_test = map(preprocess_data, (data, data_test))

# Columns to keep for modelling. Names that appear only inside comments are
# candidate features that are currently switched off.
# Reminder carried over from the original note: whatever is removed here must
# in the end also be removed from the test-data selection further down.
desired_columns = [
    "Datum",
    "Umsatz",
    # Temperature -- off: 'Temp_Very_Cold_current', 'Temp_Mild_current'
    "Temp_Cold_current", "Temp_Warm_current", "Temp_Hot_current",
    # Cloud cover -- off: 'Cloud_Clear', 'Cloud_Partly_Cloudy'
    "Cloud_ok_current", "Cloud_Cloudy_current",
    # Wind / weather class -- all off: 'Wind_Light', 'Wind_Moderate', 'Wind_ok',
    # 'Wind_Strong', 'Weather_Good', 'Weather_Light_Issues', 'Weather_Moderate',
    # 'Weather_Severe'
    "KielerWoche_current",
    # Weekdays -- off: 'Samstag_current'
    "Montag_current", "Dienstag_current", "Mittwoch_current",
    "Donnerstag_current", "Freitag_current", "Sonntag_current",
    "VPI_current",
    # Off: 'Number_of_ships', 'Number_of_ships_scaled_current', 'Ship',
    # 'Heimspiel' (marked "leave out" in the original note)
    "Feiertag_current", "is_holiday_current", "Weihnachtsmarkt_current",
    "Markt_current",
    # Off: 'Ostertag_current', 'Silvester_current'
    "Werktag_current",
    # Seasons -- off: 'Herbst_current'
    "Frühling_current", "Sommer_current", "Winter_current",
    "wetter_sehr_schlecht_current", "wetter_sehr_schön_current",
    # Months -- off: 'Monat_1_current', 'Monat_11'
    "Monat_2_current", "Monat_8_current", "Monat_7_current",
    "Monat_6_current", "Monat_12_current",
    # Off: 'zwischen_den_jahren', 'wetter_sehr_schön_prev', 'Temp_Hot_prev',
    # 'is_holiday_next'
    # Columns with a _prev / _next suffix
    "Heimspiel_prev", "Ostertag_prev", "KielerWoche_next",
]

# Reduce the training frame to the selected columns (target 'Umsatz' included).
data = data.loc[:, desired_columns]

# The test data has no 'Umsatz' column, so take it out of the list (in place)
# before applying the same selection to the test frame.
del desired_columns[desired_columns.index('Umsatz')]
data_test = data_test.loc[:, desired_columns]


# Quick look at both frames. Only the value of the cell's last expression is
# rendered by the notebook, i.e. the test-data preview.
data.head()
data_test.head()

Unnamed: 0,Datum,Temp_Cold_current,Temp_Warm_current,Temp_Hot_current,Cloud_ok_current,Cloud_Cloudy_current,KielerWoche_current,Montag_current,Dienstag_current,Mittwoch_current,...,wetter_sehr_schlecht_current,wetter_sehr_schön_current,Monat_2_current,Monat_8_current,Monat_7_current,Monat_6_current,Monat_12_current,Heimspiel_prev,Ostertag_prev,KielerWoche_next
1065,2018-08-01,0,1,0,1,0,0,0,0,1,...,0,1,0,1,0,0,0,0.0,0.0,0.0
1066,2018-08-02,0,0,1,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0.0,0.0,0.0
1067,2018-08-03,0,0,1,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0.0,0.0,0.0
1068,2018-08-04,0,0,1,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0.0,0.0,0.0
1069,2018-08-05,0,1,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0.0,0.0,0.0


In [2]:
# Drop every training row that has at least one missing value.
# Note that data_test is NOT filtered here; see the check at the end of the cell.
data = data.dropna()

# Chronological split.
# The thresholds are compared directly against the 'Datum' column; this
# presumably relies on 'Datum' holding ISO 'YYYY-MM-DD' strings -- TODO confirm.
# NOTE(review): both thresholds are identical, so the validation window
# (train_end_date, validation_end_date] cannot contain any row. The values are
# kept as they are in case training on the full range is intended; move
# train_end_date to an earlier date to obtain a non-empty validation set.
train_end_date = '2018-07-31'
validation_end_date = '2018-07-31'

# Split the data based on the date thresholds
train_data = data[data['Datum'] <= train_end_date]
vali_data = data[(data['Datum'] > train_end_date) & (data['Datum'] <= validation_end_date)]
test_data = data_test

# Check the dimensions of the datasets
print("Training dataset dimensions:", train_data.shape)
print("Validation dataset dimensions:", vali_data.shape)
print("Test dataset dimensions:", test_data.shape)

# Separate features and labels. 'Datum' is only needed for the split and
# 'Umsatz' is the label, so neither belongs in the feature frames.
training_features = train_data.drop(columns=['Umsatz', 'Datum'])
validation_features = vali_data.drop(columns=['Umsatz', 'Datum'])
test_features = test_data.drop(columns=['Datum'])

training_labels = train_data[['Umsatz']]
validation_labels = vali_data[['Umsatz']]
# No test labels: the test data does not contain an 'Umsatz' column.

# Print dimensions of the dataframes
print()
print()
print("Training features dimensions:", training_features.shape)
print("Validation features dimensions:", validation_features.shape)
print("Test features dimensions:", test_features.shape)
print()
print("Training labels dimensions:", training_labels.shape)
print("Validation labels dimensions:", validation_labels.shape)

# Make problems visible instead of exporting them silently.
split_warnings = []
if vali_data.empty:
    split_warnings.append(
        f"validation set is empty - no rows with {train_end_date} < Datum <= {validation_end_date}"
    )
missing_in_test = int(test_features.isna().sum().sum())
if missing_in_test:
    split_warnings.append(
        f"test features contain {missing_in_test} missing values "
        "(dropna() was applied to the training data only)"
    )
if split_warnings:
    print()
    for message in split_warnings:
        print("WARNING:", message)

Training dataset dimensions: (1766, 33)
Validation dataset dimensions: (0, 33)
Test dataset dimensions: (354, 32)


Training features dimensions: (1766, 31)
Validation features dimensions: (0, 31)
Test features dimensions: (354, 31)

Training labels dimensions: (1766, 1)
Validation labels dimensions: (0, 1)


In [3]:
# Target folder for the exported frames; created on first run, reused afterwards.
subdirectory = "pickle_data_Konditorei"
os.makedirs(subdirectory, exist_ok=True)

# Write each prepared frame to "<subdirectory>/<name>.pkl".
# There is no test_labels file because no test labels exist.
pickle_target = subdirectory + "/{}.pkl"
training_features.to_pickle(pickle_target.format("training_features"))
validation_features.to_pickle(pickle_target.format("validation_features"))
test_features.to_pickle(pickle_target.format("test_features"))
training_labels.to_pickle(pickle_target.format("training_labels"))
validation_labels.to_pickle(pickle_target.format("validation_labels"))