In [None]:
# IDENTIFIERS

# shop_ID : Shop's unique identifier.
# day_of_the_week : Encoded from 0 to 6.
# date : day, month and year of the data point.
# number of customers : Quantity of customers that showed up that day.
# open : Binary variable equal to 0 if shop closed that day and 1 if shop open.
# promotion : Binary variable equal to 0 if shop had no promotions that day and 1 if it did.
# state_holiday : Encoded 0, a, b, c indicating if there was a state holiday at all (0 if not), and otherwise, the number indicates which state holiday it was.
# school_holiday : Binary variable equal to 0 if there was a school holiday that day and 1 if not

# Import Data

In [1]:
# import all the necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [3]:
data = pd.read_csv('sales.csv')

# Read Data

In [4]:
# CHECK DATA

display(data.head(30))
display(data.shape)
display(data.dtypes)

Unnamed: 0.1,Unnamed: 0,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales
0,425390,366,4,2013-04-18,517,1,0,0,0,4422
1,291687,394,6,2015-04-11,694,1,0,0,0,8297
2,411278,807,4,2013-08-29,970,1,1,0,0,9729
3,664714,802,2,2013-05-28,473,1,1,0,0,6513
4,540835,726,4,2013-10-10,1068,1,1,0,0,10882
5,687329,674,2,2013-03-19,1016,1,1,0,0,8406
6,600327,659,7,2014-06-08,0,0,0,0,0,0
7,96265,27,3,2014-07-16,1106,1,1,0,1,11162
8,364435,347,6,2013-05-11,448,1,0,0,0,5559
9,163146,288,2,2013-06-25,291,1,0,0,0,3997


(640840, 10)

Unnamed: 0              int64
store_ID                int64
day_of_week             int64
date                   object
nb_customers_on_day     int64
open                    int64
promotion               int64
state_holiday          object
school_holiday          int64
sales                   int64
dtype: object

In [5]:
# CHECK NULL VALUES & DUPLICATES

display(data.isnull().sum())
display(data.duplicated().sum())

Unnamed: 0             0
store_ID               0
day_of_week            0
date                   0
nb_customers_on_day    0
open                   0
promotion              0
state_holiday          0
school_holiday         0
sales                  0
dtype: int64

0

In [6]:
# CHECK VALUE COUNTS


display(data['store_ID'].value_counts())
display(data['day_of_week'].value_counts())
display(data['promotion'].value_counts())
display(data['state_holiday'].value_counts())
display(data['school_holiday'].value_counts())
display(data['open'].value_counts())

1045    645
309     636
754     635
432     634
286     634
       ... 
1004    448
287     448
1065    445
81      438
542     436
Name: store_ID, Length: 1115, dtype: int64

5    92138
4    91972
2    91686
3    91651
6    91347
7    91075
1    90971
Name: day_of_week, dtype: int64

0    396220
1    244620
Name: promotion, dtype: int64

0    621160
a     12842
b      4214
c      2624
Name: state_holiday, dtype: int64

0    526468
1    114372
Name: school_holiday, dtype: int64

1    532016
0    108824
Name: open, dtype: int64

In [7]:
display(pd.pivot_table(data, values='sales', index='promotion', aggfunc='mean'))
display(pd.pivot_table(data, values='sales', index='state_holiday', aggfunc='mean'))
display(pd.pivot_table(data, values='sales', index='day_of_week', aggfunc='mean'))
display(pd.pivot_table(data, values='sales', index='school_holiday', aggfunc='mean'))

Unnamed: 0_level_0,sales
promotion,Unnamed: 1_level_1
0,4410.200517
1,7992.084016


Unnamed: 0_level_0,sales
state_holiday,Unnamed: 1_level_1
0,5952.206403
a,282.368167
b,252.936877
c,178.606707


Unnamed: 0_level_0,sales
day_of_week,Unnamed: 1_level_1
1,7812.766948
2,7004.780054
3,6555.350536
4,6255.380268
5,6726.462697
6,5850.03282
7,210.680439


Unnamed: 0_level_0,sales
school_holiday,Unnamed: 1_level_1
0,5625.813723
1,6475.556447


In [8]:
pivot_table = pd.pivot_table(data, values='sales', index='day_of_week', aggfunc='mean')
pivot_table

Unnamed: 0_level_0,sales
day_of_week,Unnamed: 1_level_1
1,7812.766948
2,7004.780054
3,6555.350536
4,6255.380268
5,6726.462697
6,5850.03282
7,210.680439


In [9]:
display(data[(data['date'] == '2013-12-25') & ((data['open'] == '1'))])

Unnamed: 0.1,Unnamed: 0,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales


In [10]:
display(data[data['state_holiday'] == 'c'].head(50))
display(data[data['school_holiday'] == '0'].head(50))

Unnamed: 0.1,Unnamed: 0,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales
281,593012,567,4,2013-12-26,0,0,0,c,1,0
425,614063,113,3,2013-12-25,0,0,0,c,1,0
1828,702061,754,5,2014-12-26,0,0,0,c,1,0
1939,335440,109,3,2013-12-25,0,0,0,c,1,0
2048,341529,351,5,2014-12-26,0,0,0,c,1,0
2063,657306,584,4,2013-12-26,0,0,0,c,1,0
2088,168895,223,3,2013-12-25,0,0,0,c,1,0
2178,555822,239,3,2013-12-25,0,0,0,c,1,0
2258,651421,854,5,2014-12-26,0,0,0,c,1,0
2417,53200,293,3,2013-12-25,0,0,0,c,1,0


Unnamed: 0.1,Unnamed: 0,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales


In [None]:
# min_date = data['date'].min()
# max_date = data['date'].max()

# # print("Minimum date:", min_date.strftime('%d-%m-%Y'))
# # print("Maximum date:", max_date.strftime('%d-%m-%Y'))

# Clean Data

In [11]:
data['day_of_week'] = data['day_of_week'].astype(str)
data['store_ID'] = data['store_ID'].astype(str)
data['promotion'] = data['promotion'].astype(str)
data['date'] = pd.to_datetime(data['date'])
data['state_holiday'] = data['state_holiday'].astype(str)
data['school_holiday'] = data['school_holiday'].astype(str)
data['open'] = data['open'].astype(str)

data.dtypes

Unnamed: 0                      int64
store_ID                       object
day_of_week                    object
date                   datetime64[ns]
nb_customers_on_day             int64
open                           object
promotion                      object
state_holiday                  object
school_holiday                 object
sales                           int64
dtype: object

In [None]:
# data['year'] = data['date'].dt.year
# data['month'] = data['date'].dt.month

In [None]:
# data['year'] = data['year'].astype(str)
# data['month'] = data['month'].astype(str)
display(data.head(10))
display(data.dtypes)

In [None]:
# def find_outliers_iqr(data, factor=1.5):
#     # Calculating the first quartile (Q1) and third quartile (Q3) for each column
#     Q1 = df.quantile(0.25)
#     Q3 = df.quantile(0.75)
#     # Calculating the IQR (Interquartile Range) for each column
#     IQR = Q3 - Q1
#     # Defining the lower and upper bounds for identifying outliers
#     lower_bound = Q1 - factor * IQR
#     upper_bound = Q3 + factor * IQR
#     # Finding outliers by comparing values to the bounds
#     outliers = ((df < lower_bound) | (df > upper_bound))
#     # Count the number of outliers in each column
#     outlier_count = outliers.sum()
#     outlier_info = pd.DataFrame({'Columns': outlier_count.index, 'Outlier Count': outlier_count.values})
#     return outliers, outlier_info
# # Set the IQR factor for outlier detection (default is 1.5)
# iqr_factor = 1.5
# # Find outliers in the continuous_df DataFrame and get outlier counts
# outliers, outlier_info = find_outliers_iqr(numericals_df, factor=iqr_factor)
# # Display the DataFrame of outliers (True indicates an outlier)
# print("Outliers:")
# print(outliers)
# # Display the DataFrame with outlier counts
# print("\nOutlier Counts:")
# outlier_info

In [None]:
# CREATE AVERAGE SALE PER CUSTOMER

# data['avg_sales_pc'] = (data['sales'] / data['nb_customers_on_day']).round(2)
# data['avg_sales_pc'].fillna(0, inplace=True)
data.head(10)

In [12]:
# DROP UNECESSARY COLUMNS

data = data.drop(['Unnamed: 0', 'date'], axis=1)

display(data)

Unnamed: 0,store_ID,day_of_week,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales
0,366,4,517,1,0,0,0,4422
1,394,6,694,1,0,0,0,8297
2,807,4,970,1,1,0,0,9729
3,802,2,473,1,1,0,0,6513
4,726,4,1068,1,1,0,0,10882
...,...,...,...,...,...,...,...,...
640835,409,6,483,1,0,0,0,4553
640836,97,1,987,1,1,0,0,12307
640837,987,1,925,1,0,0,0,6800
640838,1084,4,725,1,0,0,0,5344


In [13]:
clean_df = data.groupby(['store_ID', 'day_of_week','open', 'state_holiday','school_holiday'], as_index=False).agg({
    'sales': 'sum',
    'nb_customers_on_day': 'sum'
})

display(clean_df.head(30))
clean_df.shape

Unnamed: 0,store_ID,day_of_week,open,state_holiday,school_holiday,sales,nb_customers_on_day
0,1,1,0,a,0,0,0
1,1,1,0,b,1,0,0
2,1,1,1,0,0,358774,41658
3,1,1,1,0,1,78397,8795
4,1,2,0,a,1,0,0
5,1,2,1,0,0,334561,38841
6,1,2,1,0,1,77374,9358
7,1,3,0,a,0,0,0
8,1,3,0,a,1,0,0
9,1,3,0,c,1,0,0


(25927, 7)

# Visualise Data

In [None]:
# plt.bar(data['day_of_week'], data['sales'])
# plt.xlabel('Day of Week')
# plt.ylabel('Sales')
# plt.show()

In [None]:
# plt.bar(data['month'], data['sales'])
# plt.xlabel('Month')
# plt.ylabel('Sales')
# plt.show()

In [None]:
# plt.bar(data['day_of_week'], data['nb_customers_on_day'])
# plt.xlabel('Day of Week')
# plt.ylabel('No. Customers')
# plt.show()

In [None]:
# plt.scatter(data['day_of_week'], data['sales'])
# plt.xlabel('Day of Week')
# plt.ylabel('Sales')
# plt.show()

In [None]:
# plt.scatter(data['day_of_week'], data['nb_customers_on_day'])
# plt.xlabel('Day of Week')
# plt.ylabel('Customers')
# plt.show()

In [None]:
# sns.boxplot(data['nb_customers_on_day'])
# plt.xlabel('nb_customers_on_day')
# plt.show()

# sns.boxplot(data['sales'])
# plt.xlabel('sales')
# plt.show()

In [None]:
# correlation_matrix = data.corr()

# # Plotting the correlation matrix as a heatmap using Seaborn
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
# plt.title('Correlation Matrix')
# plt.show()

# Train Test Split

In [22]:
display(clean_df.head(30))
clean_df.tail(30)

Unnamed: 0,store_ID,day_of_week,open,state_holiday,school_holiday,sales,nb_customers_on_day
0,1,1,0,a,0,0,0
1,1,1,0,b,1,0,0
2,1,1,1,0,0,358774,41658
3,1,1,1,0,1,78397,8795
4,1,2,0,a,1,0,0
5,1,2,1,0,0,334561,38841
6,1,2,1,0,1,77374,9358
7,1,3,0,a,0,0,0
8,1,3,0,a,1,0,0
9,1,3,0,c,1,0,0


Unnamed: 0,store_ID,day_of_week,open,state_holiday,school_holiday,sales,nb_customers_on_day
25897,998,5,0,c,1,0,0
25898,998,5,1,0,0,230330,29081
25899,998,5,1,0,1,82714,10484
25900,998,6,1,0,0,323290,41623
25901,998,7,0,0,0,0,0
25902,998,7,0,a,0,0,0
25903,999,1,0,a,0,0,0
25904,999,1,0,a,1,0,0
25905,999,1,0,b,1,0,0
25906,999,1,1,0,0,694032,46077


In [23]:
X = clean_df.drop(['sales'], axis=1)
y = clean_df['sales']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
X_train_cat = X_train.select_dtypes(object)
X_train_num = X_train.select_dtypes(np.number)

X_test_cat = X_test.select_dtypes(object)
X_test_num = X_test.select_dtypes(np.number)

In [27]:
transformer = StandardScaler().fit(X_train_num)

X_train_scaled = pd.DataFrame(transformer.transform(X_train_num), columns=X_train_num.columns)
X_test_scaled = pd.DataFrame(transformer.transform(X_test_num), columns=X_test_num.columns)

display(X_train_scaled.head())
display(X_test_scaled.head())

display(X_train.shape)
display(X_test.shape)

Unnamed: 0,nb_customers_on_day
0,0.91853
1,0.616022
2,-0.642451
3,-0.642451
4,-0.642451


Unnamed: 0,nb_customers_on_day
0,0.290123
1,-0.157874
2,-0.642451
3,-0.642451
4,-0.642451


(20741, 6)

(5186, 6)

In [None]:
# X_train_cat = X_train_cat.drop(columns=['store_ID']) 
# X_store_id = clean_df['store_ID']

In [28]:
# Fit on the training data
encoder = OneHotEncoder(drop="first", sparse=False)
encoded_train = encoder.fit_transform(X_train_cat)
X_train_cat_encoded = pd.DataFrame(encoded_train, columns=encoder.get_feature_names_out())

# Transform the testing data using the same encoder and feature names
encoded_test = encoder.transform(X_test_cat)
X_test_cat_encoded = pd.DataFrame(encoded_test, columns=encoder.get_feature_names_out())

# Reset index
X_train_cat_encoded.reset_index(drop=True, inplace=True)
X_test_cat_encoded.reset_index(drop=True, inplace=True)

# Display the head of the encoded DataFrames

display(X_train_cat_encoded)
display(X_train_scaled)

display(X_test_cat_encoded)
display(X_test_scaled)



Unnamed: 0,store_ID_10,store_ID_100,store_ID_1000,store_ID_1001,store_ID_1002,store_ID_1003,store_ID_1004,store_ID_1005,store_ID_1006,store_ID_1007,...,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,day_of_week_7,open_1,state_holiday_a,state_holiday_b,state_holiday_c,school_holiday_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
20737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
20738,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
20739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,nb_customers_on_day
0,0.918530
1,0.616022
2,-0.642451
3,-0.642451
4,-0.642451
...,...
20736,0.841431
20737,2.033338
20738,-0.642451
20739,-0.642451


Unnamed: 0,store_ID_10,store_ID_100,store_ID_1000,store_ID_1001,store_ID_1002,store_ID_1003,store_ID_1004,store_ID_1005,store_ID_1006,store_ID_1007,...,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,day_of_week_7,open_1,state_holiday_a,state_holiday_b,state_holiday_c,school_holiday_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
5183,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
5184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


Unnamed: 0,nb_customers_on_day
0,0.290123
1,-0.157874
2,-0.642451
3,-0.642451
4,-0.642451
...,...
5181,-0.642451
5182,-0.595088
5183,2.700523
5184,1.666298


In [30]:
X_train_cat_encoded.reset_index(drop=True, inplace=True)
X_test_cat_encoded.reset_index(drop=True, inplace=True)

X_train_scaled.reset_index(drop=True, inplace=True)
X_test_scaled.reset_index(drop=True, inplace=True)

In [32]:
X_test_scaled.isna().sum().sum()

0

In [34]:
display(X_train_cat_encoded.head())
display(X_test_cat_encoded.head())

display(X_train_scaled.head())
display(X_test_scaled.head())

Unnamed: 0,store_ID_10,store_ID_100,store_ID_1000,store_ID_1001,store_ID_1002,store_ID_1003,store_ID_1004,store_ID_1005,store_ID_1006,store_ID_1007,...,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,day_of_week_7,open_1,state_holiday_a,state_holiday_b,state_holiday_c,school_holiday_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


Unnamed: 0,store_ID_10,store_ID_100,store_ID_1000,store_ID_1001,store_ID_1002,store_ID_1003,store_ID_1004,store_ID_1005,store_ID_1006,store_ID_1007,...,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,day_of_week_7,open_1,state_holiday_a,state_holiday_b,state_holiday_c,school_holiday_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


Unnamed: 0,nb_customers_on_day
0,0.91853
1,0.616022
2,-0.642451
3,-0.642451
4,-0.642451


Unnamed: 0,nb_customers_on_day
0,0.290123
1,-0.157874
2,-0.642451
3,-0.642451
4,-0.642451


In [35]:
X_train_processed = pd.concat([X_train_cat_encoded, X_train_scaled], axis=1)
X_test_processed = pd.concat([X_test_cat_encoded, X_test_scaled], axis=1)

In [41]:
display(X_train_processed)
display(X_test_processed)

Unnamed: 0,store_ID_10,store_ID_100,store_ID_1000,store_ID_1001,store_ID_1002,store_ID_1003,store_ID_1004,store_ID_1005,store_ID_1006,store_ID_1007,...,day_of_week_4,day_of_week_5,day_of_week_6,day_of_week_7,open_1,state_holiday_a,state_holiday_b,state_holiday_c,school_holiday_1,nb_customers_on_day
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.918530
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.616022
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.642451
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.642451
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.642451
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.841431
20737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.033338
20738,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,-0.642451
20739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.642451


Unnamed: 0,store_ID_10,store_ID_100,store_ID_1000,store_ID_1001,store_ID_1002,store_ID_1003,store_ID_1004,store_ID_1005,store_ID_1006,store_ID_1007,...,day_of_week_4,day_of_week_5,day_of_week_6,day_of_week_7,open_1,state_holiday_a,state_holiday_b,state_holiday_c,school_holiday_1,nb_customers_on_day
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.290123
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.157874
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.642451
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-0.642451
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.642451
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.642451
5182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,-0.595088
5183,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.700523
5184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.666298


In [44]:
model = LinearRegression()
model.fit(X_train_processed, y_train)

y_pred = model.predict(X_test_processed)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f'R-squared: {r2:.4f}')
print(f'Mean Absolute Error: {mae:.2f}')
print(f'Root Mean Squared Error: {rmse:.2f}')

R-squared: 0.9545
Mean Absolute Error: 30035.05
Root Mean Squared Error: 44505.95
