**Importing essential Datasets and Libraries**

In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.layers import LeakyReLU, Dropout
from tensorflow.keras import backend as K
import kerastuner as kt

  import kerastuner as kt


In [8]:
df_original = pd.read_csv('preprocessed_table.csv')

In [9]:
df_original

Unnamed: 0,Day,Month,Year,Week,Date,Store,Dept,Type,Size,Temperature,...,CPI,Unemployment,IsHoliday,Weekly_Sales,Markdown,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5
0,5,2,2010,5,2010-02-05,1,1,1,151315,42.31,...,211.096358,8.106000,0,24924.50,0,5502.9669,1474.233,236.8201,14827.2141,5103.4707
1,5,2,2010,5,2010-02-05,1,2,1,151315,42.31,...,211.096358,8.106000,0,50605.27,0,5502.9669,1474.233,236.8201,14827.2141,5103.4707
2,5,2,2010,5,2010-02-05,1,3,1,151315,42.31,...,211.096358,8.106000,0,13740.12,0,5502.9669,1474.233,236.8201,14827.2141,5103.4707
3,5,2,2010,5,2010-02-05,1,4,1,151315,42.31,...,211.096358,8.106000,0,39954.04,0,5502.9669,1474.233,236.8201,14827.2141,5103.4707
4,5,2,2010,5,2010-02-05,1,5,1,151315,42.31,...,211.096358,8.106000,0,32229.38,0,5502.9669,1474.233,236.8201,14827.2141,5103.4707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551474,28,6,2013,26,2013-06-28,45,98,2,118221,76.05,...,194.049252,8.299803,0,,1,4842.2900,975.030,3.0000,2449.9700,3169.6900
551475,5,7,2013,27,2013-07-05,45,98,2,118221,77.50,...,194.118327,8.295893,0,,1,9090.4800,2268.580,582.7400,5797.4700,1514.9300
551476,12,7,2013,28,2013-07-12,45,98,2,118221,79.37,...,194.187760,8.291982,0,,1,3789.9400,1827.310,85.7200,744.8400,2150.3600
551477,19,7,2013,29,2013-07-19,45,98,2,118221,82.84,...,194.257397,8.288071,0,,1,2961.4900,1047.070,204.1900,363.0000,1059.4600


**Filtering null values and non-null value in different dataframes**

In [10]:
df_null = df_original[df_original["Weekly_Sales"].isnull()]
df_null.reset_index(drop=True, inplace=True)
print("1.null shape",df_null.shape)

df = df_original[df_original["Weekly_Sales"].notnull()]
df.reset_index(drop=True, inplace=True)
print("2.not null ",df.shape)

1.null shape (131267, 21)
2.not null  (420212, 21)


**Removing Outliers in Weekly_Sales**

In [11]:
Q1 = df['Weekly_Sales'].quantile(0.25)
Q3 = df['Weekly_Sales'].quantile(0.75)

IQR = Q3 - Q1

l = Q1 - 1.5 * IQR
u = Q3 + 1.5 * IQR

df = df[(df['Weekly_Sales'] >= l) & (df['Weekly_Sales'] <= u)]

**Creating new feature -> Weekly Sales Lag.**

**This feature will extract the previous week's sales Data**

In [12]:
df['Weekly_Sales_Lag1'] = df['Weekly_Sales'].shift(1)

**1st Week's will be missing due to lag feature. So filling it with 1st week's mean value.**

In [13]:
df1 = df[df["Store"]==1]

**Selecting only appropriate features with Markdown that is necessary to build a perfect model.**

**Square rooting the Weekly_Sales column to reduce skewness.**

In [14]:
features = ['Store','Type','Size','Day', 'Month', 'Year', 'Dept', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'IsHoliday','MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5', 'Weekly_Sales_Lag1']
x = df[features]
y = df['Weekly_Sales']
x.fillna(df1["Weekly_Sales_Lag1"].mean(),inplace = True)
y_transformed = np.sqrt(y)

scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x_scaled, y_transformed, test_size=0.2, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.fillna(df1["Weekly_Sales_Lag1"].mean(),inplace = True)


In [15]:
# Defining the ANN model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(512, activation='relu', input_shape=(x_train.shape[1],), kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='linear')
])

# Compile the model with learning rate
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005), loss='mean_squared_error')

# Train the model with EarlyStopping and Reducing the Learning Rate
history = model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=100, batch_size=32, callbacks=[
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10)
])


y_pred = model.predict(x_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

metrics = {
    'R2': r2,
    'Mean Absolute Error': mae,
    'Mean Squared Error': mse,
    'Root Mean Squared Error': rmse
}

print(metrics)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m9621/9621[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 3ms/step - loss: 2595.8896 - val_loss: 1832.4745 - learning_rate: 5.0000e-04
Epoch 2/100
[1m9621/9621[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 3ms/step - loss: 1903.1234 - val_loss: 1684.9916 - learning_rate: 5.0000e-04
Epoch 3/100
[1m9621/9621[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 3ms/step - loss: 1801.9327 - val_loss: 1606.4099 - learning_rate: 5.0000e-04
Epoch 4/100
[1m9621/9621[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2ms/step - loss: 1750.7939 - val_loss: 1548.5500 - learning_rate: 5.0000e-04
Epoch 5/100
[1m9621/9621[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2ms/step - loss: 1701.2429 - val_loss: 1517.9486 - learning_rate: 5.0000e-04
Epoch 6/100
[1m9621/9621[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 3ms/step - loss: 1667.4309 - val_loss: 1431.4082 - learning_rate: 5.0000e-04
Epoch 7/100
[1m9621/9621[0m [32m━━━━━━━━━━━━━━━━━

**Saving the model to find the null values**

In [17]:
import pickle

In [18]:
with open("deep_learning_with_markdown.pkl", 'wb') as file:
    pickle.dump(model, file)

In [19]:
with open("scalar_with_markdown.pkl",'wb') as f:
    pickle.dump(scaler, f)