In [1]:
# Centralizing data processing.
""" 

In this section we are centralizing testing code before deployment by testing and decoding the various steps.
"""
import pandas as pd
import os
def prepare_sales_data(file_path):
    # Load the sales data
    df = pd.read_csv(file_path)
    df = pd.melt(df,id_vars=['Product Code', 'Product Name'], var_name='Date', value_name='Sales')
    # Example data preparation steps
    # Convert date columns to periods
    df['Date'] = pd.to_datetime(df['Date'])
        
    # Potentially other preprocessing like filling missing values, etc.
    # df = ...

    return df

In [2]:

df=prepare_sales_data(os.path.join("data", "sales_data.csv"))
# Date features
df['weekday'] = pd.to_datetime(df['Date']).dt.weekday
df['month'] = pd.to_datetime(df['Date']).dt.month
df['year'] = pd.to_datetime(df['Date']).dt.year
df['dayofyear'] = pd.to_datetime(df['Date']).dt.dayofyear
df['weekofyear'] = pd.to_datetime(df['Date']).dt.isocalendar().week

# Rolling window features Fill NaNs in rolling features with the overall mean and std (assuming that initial periods can use overall stats)
df['rolling_mean_7'] = df.groupby(['Product Code','Product Name'])['Sales'].shift(1).rolling(window=7).mean().fillna(df['Sales'].mean())
df['rolling_mean_30'] = df.groupby(['Product Code','Product Name'])['Sales'].shift(1).rolling(window=30).mean().fillna(df['Sales'].mean())
df['rolling_std_7'] = df.groupby(['Product Code','Product Name'])['Sales'].shift(1).rolling(window=7).std().fillna(df['Sales'].mean())
df['rolling_std_30'] = df.groupby(['Product Code','Product Name'])['Sales'].shift(1).rolling(window=30).std().fillna(df['Sales'].mean())

                    

In [3]:
# Define the features and target
features = ['weekday', 'month', 'year', 'dayofyear', 'weekofyear', 'rolling_mean_7', 'rolling_mean_30',
            'rolling_std_7', 'rolling_std_30']
target = 'Sales'

# Split features and target
X_product = df[features]
y_product = df[target]

# Display the first few rows of features and target
print(X_product.head())
print(y_product.head())

   weekday  month  year  dayofyear  weekofyear  rolling_mean_7  \
0        6      1  2023          1          52         15.2528   
1        6      1  2023          1          52         15.2528   
2        6      1  2023          1          52         15.2528   
3        6      1  2023          1          52         15.2528   
4        6      1  2023          1          52         15.2528   

   rolling_mean_30  rolling_std_7  rolling_std_30  
0          15.2528        15.2528         15.2528  
1          15.2528        15.2528         15.2528  
2          15.2528        15.2528         15.2528  
3          15.2528        15.2528         15.2528  
4          15.2528        15.2528         15.2528  
0    0
1    0
2    5
3    2
4    0
Name: Sales, dtype: int64


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_product, y_product, test_size=0.2, random_state=42)

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Display the first few rows of the scaled training data
print(pd.DataFrame(X_train_scaled, columns=features).head())

    weekday     month      year  dayofyear  weekofyear  rolling_mean_7  \
0 -0.502924 -0.725227  1.559396  -0.817928   -0.807999       -0.395664   
1 -1.001913  0.447006 -0.641274   0.325527    0.335992       -1.245198   
2  1.493032  1.033123 -0.641274   0.911668    0.874341       -0.829099   
3  0.495054  0.153948 -0.641274   0.287092    0.268698       -0.413001   
4 -1.500902  0.153948 -0.641274   0.046870    0.066818        0.349845   

   rolling_mean_30  rolling_std_7  rolling_std_30  
0         1.727924       0.186625        0.754991  
1        -0.849904      -1.123902       -0.692279  
2        -0.909655      -0.001049       -0.607692  
3        -0.474327      -0.640184       -0.718926  
4        -0.499934      -0.037420       -0.415720  


In [5]:
import numpy as np
import pandas as pd

# Fill NaNs in X_train_scaled with the mean of the respective columns
X_train_scaled = pd.DataFrame(X_train_scaled, columns=features)
X_train_scaled.fillna(X_train_scaled.mean(), inplace=True)

# Fill NaNs in X_val_scaled with the mean of the respective columns
X_val_scaled = pd.DataFrame(X_val_scaled, columns=features)
X_val_scaled.fillna(X_val_scaled.mean(), inplace=True)

# Convert back to numpy arrays
X_train_scaled = X_train_scaled.values
X_val_scaled = X_val_scaled.values

# Check for NaNs again
print("NaNs in X_train_scaled after filling:", np.isnan(X_train_scaled).sum())
print("NaNs in X_val_scaled after filling:", np.isnan(X_val_scaled).sum())

NaNs in X_train_scaled after filling: 0
NaNs in X_val_scaled after filling: 0
