# **Importing Modules**

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
import statsmodels.api as sm

from sklearn.preprocessing import MinMaxScaler

import pickle 
from os import path

from sklearn import metrics
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

from keras.models import Sequential
from keras.layers import Dense

In [None]:
from scikeras.wrappers import KerasRegressor

# **Importing Datasets**

In [None]:
#data = pd.read_csv('/content/drive/MyDrive/walmart-recruiting-store-sales-forecasting/train.csv')
#stores = pd.read_csv('/content/drive/MyDrive/walmart-recruiting-store-sales-forecasting/stores.csv')
#features = pd.read_csv('/content/drive/MyDrive/walmart-recruiting-store-sales-forecasting/features.csv')

data = pd.read_csv('../PF dataset/train.csv')
stores = pd.read_csv('../PF dataset/stores.csv')
features = pd.read_csv('../PF dataset/features.csv')

**Training Dataset**

In [None]:
data.shape

In [None]:
data.tail()

In [None]:
data.info()

**Dataset containing info of Stores**

In [None]:
stores.shape

In [None]:
stores.tail()

In [None]:
stores.info()

**Dataset containing additional data of Stores**

In [None]:
features.shape

In [None]:
features.tail()

In [None]:
features.info()

# **Handling missing values of features dataset**

In [None]:
# Impute the missing macro-economic indicators with each column's median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: that chained form is deprecated
# and does not modify `features` when pandas Copy-on-Write is enabled.
features["CPI"] = features["CPI"].fillna(features["CPI"].median())
features["Unemployment"] = features["Unemployment"].fillna(features["Unemployment"].median())

In [None]:
# Clean the five MarkDown columns: negative amounts are floored at 0 and
# missing values are treated as "no markdown" (0).
for i in range(1,6):
    markdown_col = "MarkDown" + str(i)
    # clip() is the vectorised equivalent of the per-element lambda; the result
    # is assigned back rather than filled in place on a column selection, which
    # would be a chained assignment.
    features[markdown_col] = features[markdown_col].clip(lower=0).fillna(0)

In [None]:
features.info()

# **Merging Training Dataset and merged stores-features Dataset**

In [None]:
data = pd.merge(data,stores,on='Store',how='left')

In [None]:
data = pd.merge(data,features,on=['Store','Date'],how='left')

In [None]:
data['Date'] = pd.to_datetime(data['Date'])

In [None]:
data.sort_values(by=['Date'],inplace=True)

In [None]:
data.set_index(data.Date, inplace=True)

In [None]:
# Check row by row that the holiday flag from train.csv (IsHoliday_x) agrees
# with the one from features.csv (IsHoliday_y). isin() would only test whether
# each value occurs *anywhere* in the other column, which says nothing about
# per-row agreement.
(data['IsHoliday_x'] == data['IsHoliday_y']).all()

In [None]:
# Keep a single holiday flag: discard the copy that came from train.csv and
# give the features.csv copy the plain name.
data = (
    data
    .drop(columns='IsHoliday_x')
    .rename(columns={"IsHoliday_y" : "IsHoliday"})
)
data.info()

In [None]:
data.head()

# **Splitting Date Column**

In [None]:
# Derive calendar features (year, month, ISO week number) from the Date column.
date_accessor = data['Date'].dt
iso_week = date_accessor.isocalendar().week
data['Year'] = date_accessor.year
data['Month'] = date_accessor.month
data['Week'] = iso_week.astype(int)


In [None]:
data.tail(10)

# **Outlier Detection and Abnormalities**

**Outliers**

In [None]:
# Summary statistics of Weekly_Sales for every (Store, Dept) pair, followed by
# a per-column count of missing values in the result.
sales_by_store_dept = data.groupby(['Store', 'Dept'])['Weekly_Sales']
agg_data = sales_by_store_dept.agg(['max', 'min', 'mean', 'median', 'std']).reset_index()
agg_data.isnull().sum()

In [None]:
# Attach the per-(Store, Dept) statistics to every row, then drop rows that
# contain any missing value.
data = (
    pd.merge(left=data, right=agg_data, on=['Store', 'Dept'], how='left')
    .dropna()
    .copy()
)

In [None]:
# Restore chronological order and the Date index (the Date column itself is
# kept alongside the index).
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values(by=['Date'])
data = data.set_index('Date', drop=False)
data.head()

In [None]:
# Collapse the five MarkDown columns into a single Total_MarkDown feature.
markdown_cols = ['MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5']
markdown_total = data[markdown_cols[0]]
for markdown_name in markdown_cols[1:]:
    # Added one column at a time, left to right, in the listed order.
    markdown_total = markdown_total + data[markdown_name]
data['Total_MarkDown'] = markdown_total
data.drop(columns=markdown_cols, inplace=True)

In [None]:
numeric_col = ['Weekly_Sales','Size','Temperature','Fuel_Price','CPI','Unemployment','Total_MarkDown']
data_numeric = data[numeric_col].copy()

In [None]:
data.shape

In [None]:
# Keep only rows whose numeric features all lie within 2.5 standard deviations
# of their column mean.
abs_z_scores = np.abs(stats.zscore(data_numeric))
within_threshold = (abs_z_scores < 2.5).all(axis = 1)
data = data[within_threshold]
data.shape

In [None]:
data=data[data['Weekly_Sales']>=0]

In [None]:
data.shape

In [None]:
# Encode the boolean holiday flag as 0/1. `data` is the result of boolean
# filtering at this point, so assigning a column on it directly can raise
# SettingWithCopyWarning; assign() returns a new frame instead.
data = data.assign(IsHoliday=data['IsHoliday'].astype('int'))

In [None]:
data

In [None]:
data.to_csv('../data/preprocessed_walmart_dataset.csv')

# **Data Visualizations**

**Average Monthly Sales**

In [None]:
# Bar chart of Weekly_Sales per calendar month (seaborn's default estimator
# is the mean).
fig, ax = plt.subplots(figsize=(14,8))
sns.barplot(x='Month', y='Weekly_Sales', data=data, ax=ax)
ax.set_ylabel('Sales', fontsize=14)
ax.set_xlabel('Months', fontsize=14)
ax.set_title('Average Monthly Sales', fontsize=16)
ax.grid()

**Monthly Sales for Each Year**

In [None]:
data_monthly = pd.crosstab(data["Year"], data["Month"], values=data["Weekly_Sales"],aggfunc='sum')
data_monthly

In [None]:
# One small line chart per calendar month (1..12) showing how that month's
# total sales changed from year to year.
fig, axes = plt.subplots(3,4,figsize=(16,8))
plt.suptitle('Monthly Sales for each Year', fontsize=18)
# Spacing is the same for the whole figure, so it is set once, not per subplot.
plt.subplots_adjust(wspace=0.4,hspace=0.32)

# axes.flat walks the 3x4 grid row by row, matching months 1..12.
for k, ax in enumerate(axes.flat, start=1):
    sns.lineplot(ax=ax, data=data_monthly[k])
    # Label the subplot being drawn; plt.xlabel/plt.ylabel would only affect
    # the figure's current axes (the last one created), leaving the other
    # panels unlabeled.
    ax.set_ylabel(str(k), fontsize=12)
    ax.set_xlabel('Years', fontsize=12)

plt.show()

**Average Weekly Sales Store wise**

In [None]:
# Bar chart of Weekly_Sales per store (seaborn's default estimator is the mean).
fig, ax = plt.subplots(figsize=(20,8))
sns.barplot(x='Store', y='Weekly_Sales', data=data, ax=ax)
ax.grid()
ax.set_title('Average Sales per Store', fontsize=18)
ax.set_ylabel('Sales', fontsize=16)
ax.set_xlabel('Store', fontsize=16)
plt.show()

In [None]:
# Bar chart of Weekly_Sales per department (seaborn's default estimator is
# the mean).
fig, ax = plt.subplots(figsize=(20,8))
sns.barplot(x='Dept', y='Weekly_Sales', data=data, ax=ax)
ax.grid()
ax.set_title('Average Sales per Department', fontsize=18)
ax.set_ylabel('Sales', fontsize=16)
ax.set_xlabel('Department', fontsize=16)
plt.show()

**Temperature Distribution**

In [None]:
# Distribution of the Temperature feature: density histogram with a KDE curve.
plt.figure(figsize=(10,8))
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat='density' is its documented replacement for a histogram + KDE overlay.
sns.histplot(data['Temperature'], kde=True, stat='density')
plt.title('Effect of Temperature',fontsize=15)
plt.xlabel('Temperature',fontsize=14)
plt.ylabel('Density',fontsize=14)
plt.show()

**Holiday Distribution**

In [None]:
# Share of holiday vs. non-holiday rows.
plt.figure(figsize=(8,8))
holiday_counts = data['IsHoliday'].value_counts()
# Derive each wedge's label from the value it counts instead of assuming
# value_counts() returns the non-holiday class first.
holiday_labels = holiday_counts.index.map({0: 'No Holiday', 1: 'Holiday'}).tolist()
plt.pie(holiday_counts,labels=holiday_labels,autopct='%0.2f%%')
plt.title("Pie chart distribution",fontsize=14)
plt.legend()

plt.show()

# **Time Series Decomposition**

In [None]:
sm.tsa.seasonal_decompose(data['Weekly_Sales'].resample('MS').mean(), model='additive').plot()
plt.show()

# **One-hot-encoding**

In [None]:
cat_col = ['Store','Dept','Type']
data_cat = data[cat_col].copy()

In [None]:
data_cat.tail()

In [None]:
data_cat = pd.get_dummies(data_cat,columns=cat_col)

In [None]:
data_cat.head()

In [None]:
data.shape

In [None]:
data = pd.concat([data, data_cat],axis=1)

In [None]:
data.shape

In [None]:
data.drop(columns=cat_col,inplace=True)

In [None]:
data.drop(columns=['Date'],inplace=True)

In [None]:
data.shape

# **Data Normalization**

In [None]:
num_col = ['Weekly_Sales','Size','Temperature','Fuel_Price','CPI','Unemployment','Total_MarkDown','max','min','mean','median','std']

In [None]:
minmax_scale = MinMaxScaler(feature_range=(0, 1))
def normalization(df,col):
    """Min-max scale the given columns of ``df``.

    Each column named in ``col`` is fitted and transformed on its own with the
    module-level ``minmax_scale`` scaler, so after the call the scaler only
    holds the fit of the last column processed.

    Args:
        df: DataFrame to rescale; it is modified in place.
        col: Iterable of column names to rescale.

    Returns:
        The same ``df`` object, with the listed columns rescaled.
    """
    for column_name in col:
        values = np.array(df[column_name])
        # The scaler expects a 2-D (n_samples, 1) array for a single feature.
        column_matrix = values.reshape(len(values), 1)
        df[column_name] = minmax_scale.fit_transform(column_matrix)
    return df

In [None]:
data.head()

In [None]:
data = normalization(data.copy(),num_col)

In [None]:
data.head()

# **Correlation between features of dataset**

In [None]:
plt.figure(figsize=(15,8))
corr = data[num_col].corr()
sns.heatmap(corr,vmax=1.0,annot=True)
plt.title('Correlation Matrix',fontsize=16)
plt.show()

# **Feature Selection (Random Forest Importance Ranking)**

In [None]:
feature_col = data.columns.difference(['Weekly_Sales'])
feature_col

In [None]:

# param_grid={'n_estimators':np.arange(10,25)}
# tree=GridSearchCV(RandomForestRegressor(oob_score=False,warm_start=True),param_grid,cv=5)
# tree.fit(data_train[feature_col],data_train['Weekly_Sales'])


In [None]:
# tree.best_params_

In [None]:
# Random forest fitted on all candidate features; its feature_importances_
# are used below to rank and select features.
# A fixed random_state makes the ranking, and therefore the selected feature
# set, reproducible between runs.
# NOTE(review): feature_col still contains 'max', 'min', 'mean', 'median' and
# 'std', which were computed from Weekly_Sales itself - confirm this target
# leakage is acceptable.
radm_clf = RandomForestRegressor(oob_score=True,n_estimators=23,random_state=42)
radm_clf.fit(data[feature_col], data['Weekly_Sales'])

In [None]:
# Persist the feature-ranking forest. An existing pickle is never overwritten,
# so a stale file on disk wins over a freshly trained model.
pkl_filename = "../models/feature_elim_regressor.pkl"
if path.isfile(pkl_filename):
    print("Model already saved")
else:
    with open(pkl_filename, 'wb') as model_file:
        pickle.dump(radm_clf, model_file)
    print("Saved model to disk")

In [None]:
# Rank every candidate feature by the forest's importance score, highest first.
importances = radm_clf.feature_importances_
indices = np.argsort(importances)[::-1]

# Build the table in one step instead of appending rows with .loc in a loop:
# this avoids repeated reallocation and gives the columns proper numeric
# dtypes rather than object.
feature_rank = pd.DataFrame({
    'rank': np.arange(1, len(indices) + 1),
    'feature': np.asarray(data[feature_col].columns)[indices],
    'importance': importances[indices],
})

feature_rank

In [None]:
# Names of the 23 top-ranked features (.loc slicing is inclusive, so rows
# 0 through 22 are taken).
top_rows = feature_rank.loc[0:22, ['feature']]
x = top_rows['feature'].tolist()
print(x)

In [None]:
X = data[x]
Y = data['Weekly_Sales']

In [None]:
data = pd.concat([X,Y],axis=1)

In [None]:
data

In [None]:
data.to_csv('../data/final_data.csv')

# **Splitting Data into Training and Test Sets**

In [None]:
X = data.drop(['Weekly_Sales'],axis=1)
Y = data.Weekly_Sales

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.20, random_state=50)

# **Linear Regression Model**

In [None]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
lr_acc = lr.score(X_test,y_test)*100
print("Linear Regressor Accuracy - ",lr_acc)

In [None]:
y_pred = lr.predict(X_test)

In [None]:
# Error metrics of the linear regression on the held-out test set.
mse = metrics.mean_squared_error(y_test, y_pred)
print("MAE" , metrics.mean_absolute_error(y_test, y_pred))
print("MSE" , mse)
print("RMSE" , np.sqrt(mse))
# The line labelled "R2" must use r2_score: explained_variance_score is a
# different metric that does not penalise a constant offset in the predictions.
print("R2" , metrics.r2_score(y_test, y_pred))

In [None]:
lr_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
lr_df

In [None]:
plt.figure(figsize=(20,8))
plt.plot(lr.predict(X_test[:200]), label="prediction", linewidth=2.0,color='blue')
plt.plot(y_test[:200].values, label="real_values", linewidth=2.0,color='lightcoral')
plt.legend(loc="best")

plt.show()

**Saving trained model**

In [None]:
# Persist the linear regressor. An existing pickle is never overwritten, so a
# stale file on disk wins over a freshly trained model.
pkl_filename = "../models/linear_regressor.pkl"
if path.isfile(pkl_filename):
    print("Model already saved")
else:
    with open(pkl_filename, 'wb') as model_file:
        pickle.dump(lr, model_file)
    print("Saved model to disk")

# **Random Forest Regressor Model**

In [None]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

In [None]:
rf_acc = rf.score(X_test,y_test)*100
print("Random Forest Regressor Accuracy - ",rf_acc)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
# Error metrics of the random forest on the held-out test set.
mse = metrics.mean_squared_error(y_test, y_pred)
print("MAE" , metrics.mean_absolute_error(y_test, y_pred))
print("MSE" , mse)
print("RMSE" , np.sqrt(mse))
# The line labelled "R2" must use r2_score: explained_variance_score is a
# different metric that does not penalise a constant offset in the predictions.
print("R2" , metrics.r2_score(y_test, y_pred))

In [None]:
rf_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
rf_df

In [None]:
plt.figure(figsize=(20,8))
plt.plot(rf.predict(X_test[:200]), label="prediction", linewidth=2.0,color='blue')
plt.plot(y_test[:200].values, label="real_values", linewidth=2.0,color='lightcoral')
plt.legend(loc="best")
plt.show()

**Saving trained model**

In [None]:
# Persist the random forest regressor. An existing pickle is never
# overwritten, so a stale file on disk wins over a freshly trained model.
pkl_filename = "../models/randomforest_regressor.pkl"
if path.isfile(pkl_filename):
    print("Model already saved")
else:
    with open(pkl_filename, 'wb') as model_file:
        pickle.dump(rf, model_file)
    print("Saved model to disk")

# **Custom Deep Learning Neural Network**

In [None]:
def create_model():
    """Build and compile the feed-forward regression network.

    The network has three Dense layers (64 -> 32 -> 1 units). Its input width
    is read from the global ``X_train``. It is compiled with mean absolute
    error loss and the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model.
    """
    n_features = X_train.shape[1]
    # NOTE(review): only the first layer is given an activation; confirm the
    # 32-unit layer is meant to have none.
    network = Sequential([
        Dense(64, input_dim=n_features, kernel_initializer='normal', activation='relu'),
        Dense(32, kernel_initializer='normal'),
        Dense(1, kernel_initializer='normal'),
    ])
    network.compile(loss='mean_absolute_error', optimizer='adam')
    return network

In [None]:
# Wrap the Keras model builder in a scikit-learn style regressor.
# NOTE(review): the fit call in the following cell passes epochs=500, which
# differs from the epochs=100 configured here - confirm which value is intended.
estimator_model = KerasRegressor(
    model=create_model,
    epochs=100,
    batch_size=5000,
    verbose=1,
)

In [None]:
history = estimator_model.fit(X_train, y_train, validation_split=0.1, epochs=500, batch_size=5000)

In [None]:
# Plot the loss curves recorded during training, when the estimator has any.
if not hasattr(estimator_model, "history_"):
    print("No history_ found on estimator_model")
else:
    h = estimator_model.history_
    # history_ may wrap the per-epoch values in a `.history` attribute;
    # unwrap it when present, otherwise use the object as is.
    h = getattr(h, "history", h)

    train_loss = h.get("loss", [])
    val_loss = h.get("val_loss", [])

    plt.figure(figsize=(8,4))
    plt.plot(train_loss, label="Train Loss")
    # A validation curve exists only if a validation split was used.
    if len(val_loss) > 0:
        plt.plot(val_loss, label="Val Loss")
    plt.title("Model loss")
    plt.ylabel("loss")
    plt.xlabel("epochs")
    plt.legend(loc="upper right")
    plt.show()

In [None]:
# Score the neural network on the held-out test set.
y_pred = estimator_model.predict(X_test)
# r2_score takes (y_true, y_pred) in that order; R^2 is not symmetric, so
# swapping the arguments reports a different, incorrect number.
dnn_acc = metrics.r2_score(y_test, y_pred)*100
print("Deep Neural Network accuracy - ",dnn_acc)

In [None]:
# Error metrics of the neural network on the held-out test set.
mse = metrics.mean_squared_error(y_test, y_pred)
print("MAE" , metrics.mean_absolute_error(y_test, y_pred))
print("MSE" , mse)
print("RMSE" , np.sqrt(mse))
# The line labelled "R2" must use r2_score: explained_variance_score is a
# different metric that does not penalise a constant offset in the predictions.
print("R2" , metrics.r2_score(y_test, y_pred))

In [None]:
dnn_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
dnn_df

In [None]:
plt.figure(figsize=(20,8))
plt.plot(estimator_model.predict(X_test[200:300]), label="prediction", linewidth=2.0,color='blue')
plt.plot(y_test[200:300].values, label="real_values", linewidth=2.0,color='lightcoral')
plt.legend(loc="best")

In [None]:
import os
filepath = '../models/dnn_regressor.json'
weightspath = '../models/dnn_regressor.weights.h5'
os.makedirs(os.path.dirname(filepath), exist_ok=True)
model = estimator_model.model_   

model_json = model.to_json()
with open(filepath, "w") as json_file:
    json_file.write(model_json)

model.save_weights(weightspath)

print("Model architecture & weights saved:")
print(f"  - JSON: {filepath}")
print(f"  - H5 weights: {weightspath}")


In [None]:
from keras.models import model_from_json

# Load the architecture
with open(filepath, "r") as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# Load the weights
loaded_model.load_weights(weightspath)
print("Loaded model from disk")

loaded_model.compile(loss='mean_absolute_error', optimizer='adam')

# Predict with the reloaded model and score it on the held-out test set
y_pred = loaded_model.predict(X_test)
# r2_score takes (y_true, y_pred) in that order; R^2 is not symmetric, so
# swapping the arguments reports a different, incorrect number.
dnn_acc = metrics.r2_score(y_test, y_pred)*100
print("Deep Neural Network accuracy - ",dnn_acc)

# **Comparing Models**

In [None]:
acc = {'model':['lr_acc','rf_acc','dnn_acc'],'accuracy':[lr_acc,rf_acc,dnn_acc]}

In [None]:
acc_df = pd.DataFrame(acc)
acc_df

In [None]:
plt.figure(figsize=(10,8))
sns.barplot(x='model',y='accuracy',data=acc_df)
plt.show()