In [35]:
# Import necessary libraries
import pandas as pd  # Data manipulation and handling
import statsmodels.api as sm  # Building and summarizing regression models
from sklearn.preprocessing import StandardScaler  # Scaling features for regression
from sklearn.model_selection import train_test_split  # Splitting data into train and test sets
import matplotlib.pyplot as plt  # Visualization library
import seaborn as sns  # Advanced data visualizations

In [36]:
#Load the data
data = pd.read_csv('/workspaces/bakery_sales_prediction/2_BaselineModel/00_Julius/test/data_simple.csv')
# Display the first few rows of the dataset for inspection
print(data.head())

        Datum      Umsatz  Warengruppe_1  Warengruppe_2  Warengruppe_3  \
0  2013-07-01  148.828353              1              0              0   
1  2013-07-02  159.793757              1              0              0   
2  2013-07-03  111.885594              1              0              0   
3  2013-07-04  168.864941              1              0              0   
4  2013-07-05  171.280754              1              0              0   

   Warengruppe_4  Warengruppe_5  Warengruppe_6  Temp_Very_Cold  Temp_Cold  \
0              0              0              0               0          0   
1              0              0              0               0          0   
2              0              0              0               0          0   
3              0              0              0               0          0   
4              0              0              0               0          0   

   ...  Cloud_Partly_Cloudy  Cloud_Cloudy  Wind_Light  Wind_Moderate  \
0  ...              

In [37]:
#Because of all the temperature variables, here is a function to split the copied string from the csv headers into a list of strings with all the variables
def split_temperature_cloud_string(input_string):
    return input_string.split(',')

# Example usage
input_str = "Kuchen,Saisonbrot,Temp_Very_Cold,Temp_Cold,Temp_Mild,Temp_Warm,Temp_Hot,Cloud_Clear,Cloud_Partly_Cloudy,Cloud_Cloudy,Wind_Light,Wind_Moderate,Wind_Strong,Weather_Good,Weather_Light_Issues,Weather_Moderate,Weather_Severe,Weather_Unknown,VPI,is_holiday"
result = split_temperature_cloud_string(input_str)
print(result)

['Kuchen', 'Saisonbrot', 'Temp_Very_Cold', 'Temp_Cold', 'Temp_Mild', 'Temp_Warm', 'Temp_Hot', 'Cloud_Clear', 'Cloud_Partly_Cloudy', 'Cloud_Cloudy', 'Wind_Light', 'Wind_Moderate', 'Wind_Strong', 'Weather_Good', 'Weather_Light_Issues', 'Weather_Moderate', 'Weather_Severe', 'Weather_Unknown', 'VPI', 'is_holiday']


In [38]:
# Select features and target variable
features = ['Warengruppe_1', 'Warengruppe_2', 'Warengruppe_3', 'Warengruppe_4', 'Warengruppe_5', 'Warengruppe_6', 'Temp_Very_Cold', 'Temp_Cold', 'Temp_Mild', 'Temp_Warm', 'Temp_Hot', 'Cloud_Clear', 'Cloud_Partly_Cloudy', 'Cloud_Cloudy', 'Wind_Light', 'Wind_Moderate', 'Wind_Strong', 'Weather_Good', 'Weather_Light_Issues', 'Weather_Moderate', 'Weather_Severe', 'Weather_Unknown']
X = data[features[0:5]]
y = data['Umsatz']  # Target variable: Umsatz_total
print(X)
print(y)

      Warengruppe_1  Warengruppe_2  Warengruppe_3  Warengruppe_4  \
0                 1              0              0              0   
1                 1              0              0              0   
2                 1              0              0              0   
3                 1              0              0              0   
4                 1              0              0              0   
...             ...            ...            ...            ...   
9329              0              0              0              0   
9330              0              0              0              0   
9331              0              0              0              0   
9332              0              0              0              0   
9333              0              0              0              0   

      Warengruppe_5  
0                 0  
1                 0  
2                 0  
3                 0  
4                 0  
...             ...  
9329              0  
9330   

In [39]:
# Scale the feature data to standardize the range
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [40]:
# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [41]:
# Add a constant term to the training data for the regression intercept
X_train = sm.add_constant(X_train)

In [42]:
# Train the linear regression model using Statsmodels
model = sm.OLS(y_train, X_train).fit()

In [43]:
# Print the summary of the regression model
# Includes metrics like R-squared, coefficients, and p-values for each feature
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 Umsatz   R-squared:                       0.638
Model:                            OLS   Adj. R-squared:                  0.638
Method:                 Least Squares   F-statistic:                     2633.
Date:                Fri, 06 Dec 2024   Prob (F-statistic):               0.00
Time:                        21:56:30   Log-Likelihood:                -44081.
No. Observations:                7467   AIC:                         8.817e+04
Df Residuals:                    7461   BIC:                         8.822e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        207.6030      1.026    202.304      0.0

In [49]:
df_testdata = pd.read_csv('/workspaces/bakery_sales_prediction/2_BaselineModel/00_Julius/test/testdata_simple.csv')


In [50]:
prediction = model.predict(df_testdata.drop(['ID'],axis=1))

KeyError: "['ID'] not found in axis"