# Building a Model to Predict Sales

In [3]:
# Import Dataset and useful libraries 
import os 
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
os.chdir('../scripts/')
import utils as util 

# Locations of the preprocessed CSVs. The paths are relative, so they are
# resolved against the working directory set by the os.chdir call above.
source1 = "../../Data/test_preprocessed.csv"
source2 = "../../Data/train_preprocessed.csv"

# read_csv_file presumably returns a dict-like result with the frame stored
# under 'data' -- verify against utils.read_csv_file.
test_preprocessed = util.read_csv_file(source1).get('data')
train_preprocessed = util.read_csv_file(source2).get('data')

# .get() yields None when the 'data' key is missing; stop here with a clear
# message rather than letting a later cell fail on a None object.
if test_preprocessed is None:
    raise ValueError(f"No 'data' returned when reading {source1!r}")
if train_preprocessed is None:
    raise ValueError(f"No 'data' returned when reading {source2!r}")


In [4]:
# Preview the first five rows of the preprocessed test set
test_preprocessed.head(n=5)

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,...,IsEndOfMonth,StoreType_b,StoreType_c,StoreType_d,Assortment_b,Assortment_c,StateHoliday_a,"PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",PromoInterval_Unknown
0,1,1,4,2015-09-17,1.0,1,0,1270.0,9.0,2008.0,...,0,0,1,0,0,0,0,0,0,1
1,2,3,4,2015-09-17,1.0,1,0,14130.0,12.0,2006.0,...,0,0,0,0,0,0,0,1,0,0
2,3,7,4,2015-09-17,1.0,1,0,24000.0,4.0,2013.0,...,0,0,0,0,0,1,0,0,0,1
3,4,8,4,2015-09-17,1.0,1,0,7520.0,10.0,2014.0,...,0,0,0,0,0,0,0,0,0,1
4,5,9,4,2015-09-17,1.0,1,0,2030.0,8.0,2000.0,...,0,0,0,0,0,1,0,0,0,1


To implement a machine learning model for this regression problem, we’ll:

- Use the Random Forest Regressor, a robust tree-based algorithm suitable for structured data.
- Implement the model using scikit-learn Pipelines to streamline preprocessing, model training, and inference.
### Pipeline Design
The pipeline will:

#### Preprocess the data
- Standard scaling (already performed for numerical features in preprocessing).
- Ensure features are consistent for train and test datasets.

#### Train the model
- Use RandomForestRegressor from sklearn.ensemble.
- Adjust hyperparameters for better performance.

#### Evaluate performance
- Use cross-validation to test generalization.

In [10]:
# Separate the target ('Sales') from the predictor columns
y = train_preprocessed['Sales']
X = train_preprocessed.drop(columns=['Sales'])

# Hold out 20% of the rows for validation with a fixed seed. 'Date' is removed
# from the model inputs (ignored if the column is absent); X itself keeps it.
# NOTE(review): the test preview shows an 'Unnamed: 0' column; if the training
# frame carries it too, it is being fed to the model as a feature -- confirm.
X_train, X_val, y_train, y_val = train_test_split(
    X.drop(columns=['Date'], errors='ignore'),
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Single-step pipeline: a 100-tree random forest regressor with a fixed seed
pipeline = Pipeline(steps=[
    ('model', RandomForestRegressor(random_state=42, n_estimators=100)),
])

In [12]:
# Fit every pipeline step on the training split
pipeline.fit(X_train, y=y_train)

In [13]:
# Predict on the held-out validation rows and report the mean squared error
y_pred = pipeline.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.013164093763764739
