In [59]:
# Data Handling
import numpy as np
import pandas as pd
import seaborn as sns

# Visualization
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.patches as mpatches
import seaborn as sns
import plotly.express as px

# Statistical Analysis
from statsmodels.tsa.stattools import adfuller
from scipy.stats import ttest_ind
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error


# Feature Processing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Modelling
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.model_selection import RandomizedSearchCV
import pickle

# Other Packages
import warnings

warnings.filterwarnings("ignore")



In [60]:
train_df = pd.read_csv("data/store-sales-time-series-forecasting/train.csv")
test_df = pd.read_csv("data/store-sales-time-series-forecasting/test.csv")
stores_df = pd.read_csv("data/store-sales-time-series-forecasting/stores.csv")
oil_df = pd.read_csv("data/store-sales-time-series-forecasting/oil.csv")
holidays_events_df = pd.read_csv("data/store-sales-time-series-forecasting/holidays_events.csv")
transactions_df = pd.read_csv("data/store-sales-time-series-forecasting/transactions.csv")

In [5]:
# Converting the 'date' column in the datasets to datetime format
# Train dataset
train_df['date'] = pd.to_datetime(train_df['date'])

# Test dataset
test_df['date'] = pd.to_datetime(test_df['date'])

# Holiday Events dataset
holidays_events_df['date'] = pd.to_datetime(holidays_events_df['date'])

# Oil dataset
oil_df['date'] = pd.to_datetime(oil_df['date'])

# Transactions dataset
transactions_df['date'] = pd.to_datetime(transactions_df['date'])

In [6]:
# Check the completeness of the test dataset
min_date = train_df['date'].min()
max_date = train_df['date'].max()
expected_dates = pd.date_range(start=min_date, end=max_date)

missing_dates = expected_dates[~expected_dates.isin(train_df['date'])]

if len(missing_dates) == 0:
    print("The train dataset is complete. It includes all the required dates.")
else:
    print("The train dataset is incomplete. The following dates are missing:")
    print(missing_dates)

The train dataset is incomplete. The following dates are missing:
DatetimeIndex(['2013-12-25', '2014-12-25', '2015-12-25', '2016-12-25'], dtype='datetime64[ns]', freq=None)


In [7]:
# Complete the missing dates in the train dataset
# Create an index of the missing dates as a DatetimeIndex object
missing_dates = pd.Index(['2013-12-25', '2014-12-25', '2015-12-25', '2016-12-25'], dtype='datetime64[ns]')

# Create a DataFrame with the missing dates, using the 'date' column
missing_data = pd.DataFrame({'date': missing_dates})

# Concatenate the original train dataset and the missing data DataFrame
# ignore_index=True ensures a new index is assigned to the resulting DataFrame
train_df = pd.concat([train_df, missing_data], ignore_index=True)

# Sort the DataFrame based on the 'date' column in ascending order
train_df.sort_values('date', inplace=True)

In [8]:
# Check the completeness of the test dataset
min_date = train_df['date'].min()
max_date = train_df['date'].max()
expected_dates = pd.date_range(start=min_date, end=max_date)

missing_dates = expected_dates[~expected_dates.isin(train_df['date'])]

if len(missing_dates) == 0:
    print("The train dataset is complete. It includes all the required dates.")
else:
    print("The train dataset is incomplete. The following dates are missing:")
    print(missing_dates)

The train dataset is complete. It includes all the required dates.


In [9]:
# Merging the common columns ('store_nbr' and 'date') in the datasets using the inner merge() function
# Merge train_df with stores_df based on 'store_nbr' column
merged_df_train = train_df.merge(stores_df, on='store_nbr', how='left')

# Merge merged_df1 with transactions_df based on 'date' and 'store_nbr' columns
# merged_df2 = merged_df_test.merge(transactions_df, on=['date', 'store_nbr'], how='left')

# # Merge merged_df2 with holidays_events_df based on 'date' column
# merged_df3_test = merged_df2.merge(holidays_events_df, on='date', how='left')

# # Merge merged_df3 with oil_df based on 'date' column
# merged_df4_test = merged_df3_test.merge(oil_df, on='date', how='left')

# # View the first five rows of the merged dataset
# merged_df.head()

In [11]:
merged_df_train.shape

(3000892, 10)

In [12]:
merged_df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000892 entries, 0 to 3000891
Data columns (total 10 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           float64       
 1   date         datetime64[ns]
 2   store_nbr    float64       
 3   family       object        
 4   sales        float64       
 5   onpromotion  float64       
 6   city         object        
 7   state        object        
 8   type         object        
 9   cluster      float64       
dtypes: datetime64[ns](1), float64(5), object(4)
memory usage: 229.0+ MB


In [13]:
# Renaming the columns with the appropriate names
merged_df_train = merged_df_train.rename(columns={"type": "store_type"})
merged_df_train.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,store_type,cluster
0,0.0,2013-01-01,1.0,AUTOMOTIVE,0.0,0.0,Quito,Pichincha,D,13.0
1,1194.0,2013-01-01,42.0,CELEBRATION,0.0,0.0,Cuenca,Azuay,D,2.0
2,1193.0,2013-01-01,42.0,BREAD/BAKERY,0.0,0.0,Cuenca,Azuay,D,2.0
3,1192.0,2013-01-01,42.0,BOOKS,0.0,0.0,Cuenca,Azuay,D,2.0
4,1191.0,2013-01-01,42.0,BEVERAGES,0.0,0.0,Cuenca,Azuay,D,2.0


In [14]:
merged_df_train['date'] = pd.to_datetime(merged_df_train['date'])
merged_df_train['year'] = merged_df_train['date'].dt.year

In [15]:
merged_df_train['month'] = merged_df_train['date'].dt.month
merged_df_train['day'] = merged_df_train['date'].dt.day

In [16]:
merged_df_train['day_of_week'] = merged_df_train['date'].dt.dayofweek

In [17]:
merged_df_train.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,store_type,cluster,year,month,day,day_of_week
0,0.0,2013-01-01,1.0,AUTOMOTIVE,0.0,0.0,Quito,Pichincha,D,13.0,2013,1,1,1
1,1194.0,2013-01-01,42.0,CELEBRATION,0.0,0.0,Cuenca,Azuay,D,2.0,2013,1,1,1
2,1193.0,2013-01-01,42.0,BREAD/BAKERY,0.0,0.0,Cuenca,Azuay,D,2.0,2013,1,1,1
3,1192.0,2013-01-01,42.0,BOOKS,0.0,0.0,Cuenca,Azuay,D,2.0,2013,1,1,1
4,1191.0,2013-01-01,42.0,BEVERAGES,0.0,0.0,Cuenca,Azuay,D,2.0,2013,1,1,1


In [18]:
merged_df_train.shape

(3000892, 14)

In [19]:
columns_to_drop = ['date','id', 'store_type', 'state']
merged_df_train = merged_df_train.drop(columns=columns_to_drop)

merged_df_train.head()

Unnamed: 0,store_nbr,family,sales,onpromotion,city,cluster,year,month,day,day_of_week
0,1.0,AUTOMOTIVE,0.0,0.0,Quito,13.0,2013,1,1,1
1,42.0,CELEBRATION,0.0,0.0,Cuenca,2.0,2013,1,1,1
2,42.0,BREAD/BAKERY,0.0,0.0,Cuenca,2.0,2013,1,1,1
3,42.0,BOOKS,0.0,0.0,Cuenca,2.0,2013,1,1,1
4,42.0,BEVERAGES,0.0,0.0,Cuenca,2.0,2013,1,1,1


In [20]:
merged_df_train.shape

(3000892, 10)

In [21]:
unique_families = merged_df_train['family'].unique()
unique_families

array(['AUTOMOTIVE', 'CELEBRATION', 'BREAD/BAKERY', 'BOOKS', 'BEVERAGES',
       'BEAUTY', 'BABY CARE', 'SEAFOOD', 'SCHOOL AND OFFICE SUPPLIES',
       'PRODUCE', 'PREPARED FOODS', 'POULTRY', 'PLAYERS AND ELECTRONICS',
       'PET SUPPLIES', 'PERSONAL CARE', 'MEATS', 'MAGAZINES',
       'LIQUOR,WINE,BEER', 'LINGERIE', 'LAWN AND GARDEN', 'LADIESWEAR',
       'HOME CARE', 'HOME APPLIANCES', 'CLEANING', 'DAIRY', 'DELI',
       'EGGS', 'HOME AND KITCHEN II', 'HOME AND KITCHEN I', 'HARDWARE',
       'GROCERY II', 'GROCERY I', 'FROZEN FOODS', nan], dtype=object)

In [22]:
# Define the category lists for each product category
food_families = ['BEVERAGES', 'BREAD/BAKERY', 'FROZEN FOODS', 'MEATS', 'PREPARED FOODS', 'DELI','PRODUCE', 'DAIRY','POULTRY','EGGS','SEAFOOD']
home_families = ['HOME AND KITCHEN I', 'HOME AND KITCHEN II', 'HOME APPLIANCES']
clothing_families = ['LINGERIE', 'LADYSWARE']
grocery_families = ['GROCERY I', 'GROCERY II']
stationery_families = ['BOOKS', 'MAGAZINES','SCHOOL AND OFFICE SUPPLIES']
cleaning_families = ['HOME CARE', 'BABY CARE','PERSONAL CARE']
hardware_families = ['PLAYERS AND ELECTRONICS','HARDWARE']

# Categorize the 'family' column based on the product categories
merged_df_train['family'] = np.where(merged_df_train['family'].isin(food_families), 'FOODS', merged_df_train['family'])
merged_df_train['family'] = np.where(merged_df_train['family'].isin(home_families), 'HOME', merged_df_train['family'])
merged_df_train['family'] = np.where(merged_df_train['family'].isin(clothing_families), 'CLOTHING', merged_df_train['family'])
merged_df_train['family'] = np.where(merged_df_train['family'].isin(grocery_families), 'GROCERY', merged_df_train['family'])
merged_df_train['family'] = np.where(merged_df_train['family'].isin(stationery_families), 'STATIONERY', merged_df_train['family'])
merged_df_train['family'] = np.where(merged_df_train['family'].isin(cleaning_families), 'CLEANING', merged_df_train['family'])
merged_df_train['family'] = np.where(merged_df_train['family'].isin(hardware_families), 'HARDWARE', merged_df_train['family'])

# Print the updated DataFrame
merged_df_train.head()

Unnamed: 0,store_nbr,family,sales,onpromotion,city,cluster,year,month,day,day_of_week
0,1.0,AUTOMOTIVE,0.0,0.0,Quito,13.0,2013,1,1,1
1,42.0,CELEBRATION,0.0,0.0,Cuenca,2.0,2013,1,1,1
2,42.0,FOODS,0.0,0.0,Cuenca,2.0,2013,1,1,1
3,42.0,STATIONERY,0.0,0.0,Cuenca,2.0,2013,1,1,1
4,42.0,FOODS,0.0,0.0,Cuenca,2.0,2013,1,1,1


In [23]:
# Define the categorical columns to encode
categorical_columns = ["family", "city"]

# Perform one-hot encoding
encoder = OneHotEncoder()
one_hot_encoded_train_data = encoder.fit_transform(merged_df_train[categorical_columns])

# Create column names for the one-hot encoded data
column_names = encoder.get_feature_names_out(categorical_columns)

# Convert the one-hot encoded data to a DataFrame
merged_df_train_encoded = pd.DataFrame(one_hot_encoded_train_data.toarray(), columns=column_names)

# Concatenate the original dataframe with the one-hot encoded data
merged_df_train_encoded = pd.concat([merged_df_train, merged_df_train_encoded], axis=1)

# Drop the original categorical columns
merged_df_train_encoded.drop(categorical_columns, axis=1, inplace=True)

# Print the head of the encoded DataFrame
merged_df_train_encoded.head()

Unnamed: 0,store_nbr,sales,onpromotion,cluster,year,month,day,day_of_week,family_AUTOMOTIVE,family_BEAUTY,...,city_Machala,city_Manta,city_Playas,city_Puyo,city_Quevedo,city_Quito,city_Riobamba,city_Salinas,city_Santo Domingo,city_nan
0,1.0,0.0,0.0,13.0,2013,1,1,1,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,42.0,0.0,0.0,2.0,2013,1,1,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,42.0,0.0,0.0,2.0,2013,1,1,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,42.0,0.0,0.0,2.0,2013,1,1,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,42.0,0.0,0.0,2.0,2013,1,1,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
merged_df_train_encoded.shape

(3000892, 46)

In [25]:
pd.set_option('display.max_columns', None)

In [26]:
merged_df_train_encoded.head()

Unnamed: 0,store_nbr,sales,onpromotion,cluster,year,month,day,day_of_week,family_AUTOMOTIVE,family_BEAUTY,family_CELEBRATION,family_CLEANING,family_CLOTHING,family_FOODS,family_GROCERY,family_HARDWARE,family_HOME,family_LADIESWEAR,family_LAWN AND GARDEN,"family_LIQUOR,WINE,BEER",family_PET SUPPLIES,family_STATIONERY,family_nan,city_Ambato,city_Babahoyo,city_Cayambe,city_Cuenca,city_Daule,city_El Carmen,city_Esmeraldas,city_Guaranda,city_Guayaquil,city_Ibarra,city_Latacunga,city_Libertad,city_Loja,city_Machala,city_Manta,city_Playas,city_Puyo,city_Quevedo,city_Quito,city_Riobamba,city_Salinas,city_Santo Domingo,city_nan
0,1.0,0.0,0.0,13.0,2013,1,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,42.0,0.0,0.0,2.0,2013,1,1,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,42.0,0.0,0.0,2.0,2013,1,1,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,42.0,0.0,0.0,2.0,2013,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,42.0,0.0,0.0,2.0,2013,1,1,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
merged_df_train_encoded = merged_df_train_encoded.drop('family_nan', axis=1)

In [28]:
merged_df_train_encoded.head()

Unnamed: 0,store_nbr,sales,onpromotion,cluster,year,month,day,day_of_week,family_AUTOMOTIVE,family_BEAUTY,family_CELEBRATION,family_CLEANING,family_CLOTHING,family_FOODS,family_GROCERY,family_HARDWARE,family_HOME,family_LADIESWEAR,family_LAWN AND GARDEN,"family_LIQUOR,WINE,BEER",family_PET SUPPLIES,family_STATIONERY,city_Ambato,city_Babahoyo,city_Cayambe,city_Cuenca,city_Daule,city_El Carmen,city_Esmeraldas,city_Guaranda,city_Guayaquil,city_Ibarra,city_Latacunga,city_Libertad,city_Loja,city_Machala,city_Manta,city_Playas,city_Puyo,city_Quevedo,city_Quito,city_Riobamba,city_Salinas,city_Santo Domingo,city_nan
0,1.0,0.0,0.0,13.0,2013,1,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,42.0,0.0,0.0,2.0,2013,1,1,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,42.0,0.0,0.0,2.0,2013,1,1,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,42.0,0.0,0.0,2.0,2013,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,42.0,0.0,0.0,2.0,2013,1,1,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
merged_df_train_encoded.shape

(3000892, 45)

In [30]:
merged_df_train_encoded = merged_df_train_encoded.drop('city_nan', axis=1)

In [31]:
merged_df_train_encoded.shape

(3000892, 44)

In [35]:
merged_df_train_encoded.to_csv("data/transformed data/train_modifiedd.csv")

Model Training

In [36]:
train_set = merged_df_train_encoded.loc[merged_df_train_encoded['year'].isin([2013, 2014, 2015, 2016])]
eval_set = merged_df_train_encoded.loc[merged_df_train_encoded['year'] == 2017]

In [37]:
train_set.shape

(2596378, 44)

In [38]:
eval_set.shape

(404514, 44)

In [39]:
train_set = train_set.dropna(subset=['store_nbr', 'sales','onpromotion', 'cluster' ])

In [40]:
# Separate the target variable and features for training and testing
X_train = train_set.drop('sales', axis=1)
y_train = train_set['sales'] 


X_eval = eval_set.drop('sales', axis=1)  
y_eval = eval_set['sales']

In [41]:
# Initialize the results dataframe
results_df = pd.DataFrame(columns=['Model', 'RMSLE', 'RMSE', 'MSE', 'MAE'])

In [42]:
# Linear Regression Model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_eval)

# Calculate metrics
lr_mse = mean_squared_error(y_eval, lr_predictions)
lr_mae = mean_absolute_error(y_eval, lr_predictions)

# Apply the absolute value function to both y_eval and lr_predictions
y_eval_abs = abs(y_eval)
lr_predictions_abs = abs(lr_predictions)

# Calculate the Root Mean Squared Logarithmic Error (RMSLE)
lr_rmsle = np.sqrt(mean_squared_log_error(y_eval_abs, lr_predictions_abs))

# Create a DataFrame to store results for Linear Regression
results_lr = pd.DataFrame({'Model': ['Linear Regression'],
                            'RMSLE': [lr_rmsle],
                            'RMSE': [np.sqrt(lr_mse)],
                            'MSE': [lr_mse],
                            'MAE': [lr_mae]}).round(2)

# Print the results_lr dataframe
results_lr

Unnamed: 0,Model,RMSLE,RMSE,MSE,MAE
0,Linear Regression,2.73,1048.66,1099696.72,462.89


In [43]:
# Random Forest Regression Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_eval)

# Calculate metrics
rf_mse = mean_squared_error(y_eval, rf_predictions)
rf_mae = mean_absolute_error(y_eval, rf_predictions)

# Apply the absolute value function to both y_eval and rf_predictions
y_eval_abs = abs(y_eval)
rf_predictions_abs = abs(rf_predictions)

# Calculate the Root Mean Squared Logarithmic Error (RMSLE)
rf_rmsle = np.sqrt(mean_squared_log_error(y_eval_abs, rf_predictions_abs))

# Create a DataFrame to store results for Random Forest
results_rf = pd.DataFrame({'Model': ['Random Forest'],
                            'RMSLE': [rf_rmsle],
                            'RMSE': [np.sqrt(rf_mse)],
                            'MSE': [rf_mse],
                            'MAE': [rf_mae]}).round(2)

# Print the results_rf dataframe
results_rf

Unnamed: 0,Model,RMSLE,RMSE,MSE,MAE
0,Random Forest,1.42,898.44,807185.48,327.86


In [44]:
# Define the parameter grid for tuning the random forest model
param_grid_rf = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['sqrt', 'log2', 0.5]
}

# Create Random Forest model
rf_model = RandomForestRegressor(random_state=42)

# Initialize RandomizedSearchCV
random_search_rf = RandomizedSearchCV(rf_model, param_distributions=param_grid_rf,
                                      n_iter=10, scoring='neg_mean_squared_error', cv=5,
                                      n_jobs=-1, random_state=42)

# Fit RandomizedSearchCV to the data
random_search_rf.fit(X_train, y_train)

# Get the best model and its hyperparameters
best_rf_model = random_search_rf.best_estimator_
best_rf_params = random_search_rf.best_params_

# Make predictions using the best model
best_rf_predictions = best_rf_model.predict(X_eval)

# Calculate metrics for the best model
best_rf_mse = mean_squared_error(y_eval, best_rf_predictions)
best_rf_rmse = np.sqrt(best_rf_mse)
best_rf_mae = mean_absolute_error(y_eval, best_rf_predictions)

# Apply absolute value to both predicted and target values
abs_best_rf_predictions = np.abs(best_rf_predictions)
abs_y_eval = np.abs(y_eval)

# Calculate RMSLE using the absolute values
best_rf_rmsle = np.sqrt(mean_squared_log_error(abs_y_eval, abs_best_rf_predictions))

# Create a DataFrame to store results for the best Random Forest model
best_results_rf = pd.DataFrame({'Model': ['Best Random Forest'],
                                'RMSLE': [best_rf_rmsle],
                                'RMSE': [best_rf_rmse],
                                'MSE': [best_rf_mse],
                                'MAE': [best_rf_mae]}).round(2)

# Print the best_results_rf dataframe
best_results_rf


Unnamed: 0,Model,RMSLE,RMSE,MSE,MAE
0,Best Random Forest,1.66,768.56,590680.74,310.93


In [45]:
# Define the key components
key_components = {
    'model': best_rf_model, 
    'best_params': best_rf_params,
    'best_score': best_rf_rmsle
}

# Save the key components in a file using pickle
with open('best_rf_model_components_trans_removed.pkl', 'wb') as file:
    pickle.dump(key_components, file)

In [50]:
test = pd.read_csv("test_modified.csv")

In [51]:
test.shape

(28512, 43)

In [53]:
results = best_rf_model.predict(test)

In [54]:
results.shape

(28512,)

In [55]:
# Extract 'id' column from the original DataFrame
id_column = test_df['id']

# Merge 'id' column with the new numpy array
new_column = pd.Series(results, name='sales')
scaled_results_df = pd.concat([id_column, new_column], axis=1)


In [56]:
scaled_results_df

Unnamed: 0,id,sales
0,3000888,10.444090
1,3000889,138.389590
2,3000890,49.243823
3,3000891,1434.002804
4,3000892,3.605678
...,...,...
28507,3029395,547.152435
28508,3029396,462.824326
28509,3029397,547.152435
28510,3029398,276.817989


In [58]:


scaled_results_df.to_csv("./submission/submission_random_regressor.csv", index=False)