In [1]:
import pandas as pd
import sklearn as sk
from pandas.core.interchange.dataframe_protocol import Column

In [7]:
df = pd.read_csv('zara_cleaned.csv')
df.head()

Unnamed: 0,Product ID,Promotion,Seasonal,Sales Volume,name,price,section,Aisle,End-cap,Front of Store,jackets,jeans,shoes,sweaters,t-shirts
0,185102,False,False,2823,BASIC PUFFER JACKET,19.99,False,True,False,False,True,False,False,False,False
1,188771,False,False,654,TUXEDO JACKET,169.0,False,True,False,False,True,False,False,False,False
2,180176,True,True,2220,SLIM FIT SUIT JACKET,129.0,False,False,True,False,True,False,False,False,False
3,112917,True,True,1568,STRETCH SUIT JACKET,129.0,False,True,False,False,True,False,False,False,False
4,192936,False,True,2942,DOUBLE FACED JACKET,139.0,False,False,True,False,True,False,False,False,False


In [12]:
X = df.drop(columns=["Sales Volume", "name", "Product ID"])
y = df["Sales Volume"]

X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.2)

In [28]:
X.mean()

Promotion          0.476190
Seasonal           0.507937
price             86.252540
section            0.134921
Aisle              0.384921
End-cap            0.341270
Front of Store     0.273810
jackets            0.555556
jeans              0.031746
shoes              0.123016
sweaters           0.162698
t-shirts           0.126984
dtype: float64

In [29]:
y.mean()

np.float64(1823.702380952381)

# Random Forest Regressor

In [14]:
rf = sk.ensemble.RandomForestRegressor()
rf.fit(X_train, y_train)

In [22]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Predicting on the test data
y_pred = rf.predict(X_test)

# Calculating Mean Squared Error (MSE)
VAR = mean_squared_error(y_test, y_pred)
print("VAR:", VAR)

# Calculating Root Mean Squared Error (RMSE)
rmse = np.sqrt(VAR)
print("σ:", rmse)


VAR: 623506.4276176834
σ: 789.6242319088767


# Neural Network

In [42]:
from sklearn.neural_network import MLPRegressor

nn = MLPRegressor(
    hidden_layer_sizes=(100,),  # One hidden layer with 100 neurons
    activation='relu',          # 'relu' activation function
    solver='lbfgs',              # 'adam' solver for weight optimization
    max_iter=5000,               # Set the number of iterations (epochs)
    random_state=42             # Set the seed for reproducibility
)
nn.fit(X_train, y_train)

In [43]:
# Predicting on the test data
y_pred = nn.predict(X_test)

# Calculating Mean Squared Error (MSE)
VAR = mean_squared_error(y_test, y_pred)
print("VAR:", VAR)

# Calculating Root Mean Squared Error (RMSE)
rmse = np.sqrt(VAR)
print("σ:", rmse)

VAR: 567219.7848035875
σ: 753.139950343618


# Hyperparameter Tuning

In [44]:
from sklearn.model_selection import GridSearchCV

# Define the model
mlp = MLPRegressor(max_iter=1000, random_state=42)

# Define the hyperparameter grid
param_grid = {
    'hidden_layer_sizes': [(50, ), (100,), (150, ), (50, 50), (50, 100), (100, 50), (50, 50, 50), (50, 100, 50)],  # Different layer structures
    'activation': ['relu', 'tanh'],  # Activation functions
    'solver': ['lbfgs', 'adam'],     # Optimizers
    'alpha': [0.0001, 0.001, 0.01],  # Regularization term
    'learning_rate': ['constant', 'adaptive']  # Learning rate options
}

# Setup GridSearchCV
grid_search = GridSearchCV(mlp, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model
hyper_model = grid_search.best_estimator_

# Train and predict using the best model
hyper_model.fit(X_train, y_train)
y_pred = hyper_model.predict(X_test)


Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'lbfgs'}
Mean Squared Error: 537379.8057559978
R-squared: -0.051992972534542714


In [45]:
# Predicting on the test data
y_pred = hyper_model.predict(X_test)

# Calculating Mean Squared Error (MSE)
VAR = mean_squared_error(y_test, y_pred)
print("VAR:", VAR)

# Calculating Root Mean Squared Error (RMSE)
rmse = np.sqrt(VAR)
print("σ:", rmse)

VAR: 537379.8057559978
σ: 733.0619385536244
