In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder


In [2]:
# Load the data using the specified file path
df = pd.read_csv('sales.csv')

In [3]:
# Inspect the data
print(df.head())
print(df.info())

   Unnamed: 0  store_ID  day_of_week        date  nb_customers_on_day  open  \
0      425390       366            4  2013-04-18                  517     1   
1      291687       394            6  2015-04-11                  694     1   
2      411278       807            4  2013-08-29                  970     1   
3      664714       802            2  2013-05-28                  473     1   
4      540835       726            4  2013-10-10                 1068     1   

   promotion state_holiday  school_holiday  sales  
0          0             0               0   4422  
1          0             0               0   8297  
2          1             0               0   9729  
3          1             0               0   6513  
4          1             0               0  10882  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640840 entries, 0 to 640839
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0

In [4]:
# Preprocess the data
# Convert 'date' column to datetime using the correct format
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

In [5]:
# Extract day, month, year from 'date'
df['day'] = df['date'].dt.day
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

In [7]:
# Convert categorical variables to numeric
label_encoder = LabelEncoder()

df['day_of_week'] = label_encoder.fit_transform(df['day_of_week'])

'''df['state_holiday'] = label_encoder.fit_transform(df['state_holiday'])'''

df['state_holiday'] = df['state_holiday'].apply(lambda x: 0 if x == '0' else 1)

df.head()

Unnamed: 0.1,Unnamed: 0,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales,day,month,year
0,425390,366,3,2013-04-18,517,1,0,1,0,4422,18,4,2013
1,291687,394,5,2015-04-11,694,1,0,1,0,8297,11,4,2015
2,411278,807,3,2013-08-29,970,1,1,1,0,9729,29,8,2013
3,664714,802,1,2013-05-28,473,1,1,1,0,6513,28,5,2013
4,540835,726,3,2013-10-10,1068,1,1,1,0,10882,10,10,2013


In [188]:
df['state_holiday'].value_counts()

state_holiday
0    621160
1     19680
Name: count, dtype: int64

In [8]:
# Select features and target variable
features = ['store_ID', 'day_of_week', 'nb_customers_on_day', 'open', 'promotion', 'state_holiday', 'school_holiday','month']
X = df[features]
y = df['sales']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [182]:
'''correlation_matrix = X.corr()

plt.figure(figsize = (10,8))
sns.heatmap(correlation_matrix, cmap = 'coolwarm', annot=True, fmt=".2f")
plt.show()'''


'correlation_matrix = X.corr()\n\nplt.figure(figsize = (10,8))\nsns.heatmap(correlation_matrix, cmap = \'coolwarm\', annot=True, fmt=".2f")\nplt.show()'

In [173]:
# Train a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

In [174]:
# Predict sales on the testing set
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)



In [175]:
# Calculate the R^2 score
r2_train = r2_score(y_train, y_train_pred)
r2_test  = r2_score(y_test, y_test_pred)

print('r2_train:', r2_train)
print('r2_test:', r2_test)


r2_train: 0.8541481232545788
r2_test: 0.8515010917013247


In [176]:
'''plt.scatter(y_test, y_pred)
plt.xlabel('Actual Sales')
plt.ylabel('Predicted Sales')
plt.title('Actual vs Predicted Sales')
plt.show()'''

"plt.scatter(y_test, y_pred)\nplt.xlabel('Actual Sales')\nplt.ylabel('Predicted Sales')\nplt.title('Actual vs Predicted Sales')\nplt.show()"

In [177]:
dtr_model = DecisionTreeRegressor()
dtr_model.fit(X_train, y_train)

dtr_y_train_pred = dtr_model.predict(X_train)
dtr_y_test_pred  = dtr_model.predict(X_test)

dtr_r2_train = r2_score(y_train, dtr_y_train_pred)
dtr_r2_test = r2_score(y_test, dtr_y_test_pred)

print(f"DTR R2 Score train: {dtr_r2_train:.5f}")
print(f"DTR R2 Score test: {dtr_r2_test:.5f}")

DTR R2 Score train: 0.99996
DTR R2 Score test: 0.92841


In [178]:
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', max_depth=3, n_estimators=100)
xgb_model.fit(X_train, y_train)

# Make predictions on the test data
xg_y_train_preds = xgb_model.predict(X_train)
xg_y_test_preds = xgb_model.predict(X_test)

# Calculate the R2 score
xr2_train = r2_score(y_train, xg_y_train_preds)
xr2_test = r2_score(y_test, xg_y_test_preds)

print(f'XG R2 score train: {xr2_train:.5f}')
print(f'XG R2 score test: {xr2_test:.5f}')

XG R2 score train: 0.90336
XG R2 score test: 0.90188


In [9]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

rf_y_train_pred = rf_model.predict(X_train)
rf_y_test_pred = rf_model.predict(X_test)

rf_r2_train = r2_score(y_train, rf_y_train_pred)
rf_r2_test = r2_score(y_test, rf_y_test_pred)

print("R2 Score train:", rf_r2_train)
print("R2 Score test:", rf_r2_test)

R2 Score train: 0.9944456929998814
R2 Score test: 0.9602194728645569


In [16]:
# Load and preprocess the new data
new_df = pd.read_csv('REAL_DATA.csv')

# Convert 'date' column to datetime using the correct format
new_df['date'] = pd.to_datetime(new_df['date'], format='%d/%m/%Y')

# Extract day, month, year from 'date'
new_df['day'] = new_df['date'].dt.day
new_df['month'] = new_df['date'].dt.month
new_df['year'] = new_df['date'].dt.year

# Convert categorical variables to numeric
label_encoder = LabelEncoder()

new_df['day_of_week'] = label_encoder.fit_transform(new_df['day_of_week'])

'''df['state_holiday'] = label_encoder.fit_transform(df['state_holiday'])'''

new_df['state_holiday'] = new_df['state_holiday'].apply(lambda x: 0 if x == '0' else 1)

# Select features and target variable
features = ['store_ID', 'day_of_week', 'nb_customers_on_day', 'open', 'promotion', 'state_holiday', 'school_holiday','month']
X_new = new_df[features]

# Make predictions
predictions = rf_model.predict(X_new)

# Add predictions to the DataFrame
new_df['sales predictions'] = predictions

# Drop original column after extraction
new_df.drop(columns=['day', 'month', 'year'], inplace=True)

# Save or append predictions to the CSV file
existing_csv = 'G5.csv'
new_df.to_csv(existing_csv, mode='a', header=True, index=False)