In [71]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline 
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.metrics import r2_score
from torchsummary import summary
from sklearn.preprocessing import MinMaxScaler
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, TensorDataset
os.chdir("E:/Training/AV/Big Mart III")

In [72]:
train = pd.read_csv("train_v9rqX0R.csv")
test = pd.read_csv("test_AbJTz2l.csv")
print(train.columns)

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')


In [73]:
train.shape, test.shape

((8523, 12), (5681, 11))

In [74]:
train.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [75]:
test.isnull().sum()

Item_Identifier                 0
Item_Weight                   976
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [76]:
items_trn = train[['Item_Identifier', 'Item_Weight']]
items_trn = items_trn.drop_duplicates()
items_tst = test[['Item_Identifier', 'Item_Weight']]
items_tst = items_tst.drop_duplicates()
items = pd.concat([items_trn, items_tst])
items = items.dropna()
items = items.drop_duplicates()
items.columns = ['Item_Identifier', 'I_Weight']
train_1 = train.merge(items, how='left', on='Item_Identifier')
train_1 = train_1.drop('Item_Weight', axis=1)
outlets = train_1[['Outlet_Identifier', 'Outlet_Establishment_Year',
       'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']]
outlets = outlets.drop_duplicates()

In [77]:
def clean_data(df):
    df_1 = df.merge(items, how='left', on='Item_Identifier')
    df_1 = df_1.drop('Item_Weight', axis=1)

    outlets = df_1[['Outlet_Identifier','Outlet_Size']]
    outlets = outlets.drop_duplicates()
    size_map = {
        "OUT010": "Small",
        "OUT017": "High",
        "OUT045": "High"
    }

    df_1["Outlet_Size"] = df_1["Outlet_Size"].fillna(
        df_1["Outlet_Identifier"].map(size_map)
    )

    df_1['Item_Fat_Content'] = df_1['Item_Fat_Content'].replace({'reg':'Regular',
                                   'LF':'Low Fat',
                                   'low fat':'Low Fat'})
    return df_1

train_1 = clean_data(train)
test_1 = clean_data(test)
# Validating the processing
#pd.crosstab( train_1['Outlet_Type'] , train_1['Outlet_Size'] , margins=True)
pd.crosstab( test_1['Outlet_Type'] , test_1['Outlet_Size'] , margins=True)

Outlet_Size,High,Medium,Small,All
Outlet_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Grocery Store,0,0,722,722
Supermarket Type1,1857,620,1240,3717
Supermarket Type2,0,618,0,618
Supermarket Type3,0,624,0,624
All,1857,1862,1962,5681


In [78]:
train_1.shape, test_1.shape

((8523, 12), (5681, 11))

In [79]:
train.shape, test.shape

((8523, 12), (5681, 11))

In [80]:
test_1.isnull().sum()

Item_Identifier              0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
I_Weight                     0
dtype: int64

In [81]:
X = train_1.drop(['Item_Identifier',
                                 'Outlet_Identifier','Item_Outlet_Sales'], axis=1)
y = train_1['Item_Outlet_Sales'].values
ohe = OneHotEncoder(sparse_output=False).set_output(transform='pandas')
col_trnf = ColumnTransformer([('OHE',ohe, make_column_selector(dtype_include=object) )],
                             remainder='passthrough', verbose_feature_names_out=False)
col_trnf = col_trnf.set_output(transform='pandas')
pipe = Pipeline([('TRNS', col_trnf)])
X = pipe.fit_transform(X)

In [82]:
scaler_x = MinMaxScaler()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,
                                                    random_state=25)
X_scl_trn = scaler_x.fit_transform(X_train)
X_scl_tst = scaler_x.transform(X_test)
#y_scl_trn = scaler_y.fit_transform(y_train.reshape(-1,1))
#y_scl_tst = scaler_y.transform(y_test.reshape(-1,1))
y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

In [83]:
X_torch = torch.from_numpy(X_scl_trn)
y_torch = torch.from_numpy(y_train)
X_torch_test = torch.from_numpy(X_scl_tst)
y_torch_test = torch.from_numpy(y_test)
print(X_torch.size())
print(y_torch.size())
print(X_torch_test.size())
print(y_torch_test.size())

torch.Size([6818, 32])
torch.Size([6818, 1])
torch.Size([1705, 32])
torch.Size([1705, 1])


In [84]:
joint_dataset = TensorDataset(X_torch.float(), y_torch.float())
torch.manual_seed(25)
data_loader = DataLoader(dataset=joint_dataset, batch_size=100, shuffle=True)

In [85]:
torch.manual_seed(25)
class MLPRegressor(torch.nn.Module):
    def __init__(self, input_features):
        super().__init__()
        self.linear1 = nn.Linear(input_features, 20)
        self.bn1 = nn.BatchNorm1d(20)
        self.linear2 = nn.Linear(20, 10)
        self.bn2 = nn.BatchNorm1d(10)
        self.linear3 = nn.Linear(10, 5)
        self.linear4 = nn.Linear(5, 1)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.bn1(x)
        x = self.linear2(x)
        x = self.relu(x)
        x = self.bn2(x)
        x = self.linear3(x)
        x = self.relu(x)
        x = self.linear4(x)
        x = self.relu(x)
        return x
model = MLPRegressor(X_scl_trn.shape[1])
summary(model, input_size=(X_scl_trn.shape[1],))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 20]             660
              ReLU-2                   [-1, 20]               0
       BatchNorm1d-3                   [-1, 20]              40
            Linear-4                   [-1, 10]             210
              ReLU-5                   [-1, 10]               0
       BatchNorm1d-6                   [-1, 10]              20
            Linear-7                    [-1, 5]              55
              ReLU-8                    [-1, 5]               0
            Linear-9                    [-1, 1]               6
             ReLU-10                    [-1, 1]               0
Total params: 991
Trainable params: 991
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Siz

In [86]:
X_torch_test = torch.from_numpy(X_scl_tst)

In [87]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
for epoch in np.arange(0,1000):
    for i, batch in enumerate(data_loader, 1):
      y_pred_prob = model(batch[0].float())
      loss = criterion(y_pred_prob, batch[1].float())
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
    test_loss = criterion(model(X_torch_test.float()), y_torch_test.float())
    if epoch%100 == 0:
          print('epoch: ', epoch+1,' train loss: ', loss.item(), 
                ' test loss: ', test_loss.item())

epoch:  1  train loss:  6636763.0  test loss:  7888355.5
epoch:  101  train loss:  2004410.0  test loss:  1217829.5
epoch:  201  train loss:  1462877.0  test loss:  1218775.5
epoch:  301  train loss:  1088200.625  test loss:  1225655.875
epoch:  401  train loss:  1145360.0  test loss:  1219377.625
epoch:  501  train loss:  1067454.375  test loss:  1221712.625
epoch:  601  train loss:  1540305.5  test loss:  1226813.75
epoch:  701  train loss:  1250297.625  test loss:  1228604.0
epoch:  801  train loss:  1176428.875  test loss:  1226588.75
epoch:  901  train loss:  720082.125  test loss:  1234127.75


In [88]:
y_pred = model(X_torch_test.float()).detach().numpy()
print(r2_score(y_test,y_pred))

0.5861363113477895


In [95]:
test_ohe = scaler_x.transform( pipe.transform( test_1 ) )
test_ohe_torch = torch.from_numpy(test_ohe)
predictions = model(test_ohe_torch.float()).detach().numpy()
predictions.shape

(5681, 1)

In [97]:
ss = pd.read_csv("sample_submission_8RXa3c6.csv")
ss['Item_Outlet_Sales'] = predictions.ravel()
ss.to_csv("bigmart_submission_pytorch_bn.csv", index=False)