In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

## Data Preprocessing

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [4]:
#Seperate categorical and numeric columns
cat_cols = list(df.select_dtypes("O").columns)
num_cols = [col for col in df.columns if col not in cat_cols]
print(f"categorical columns:\n {cat_cols}")

categorical columns:
 ['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']


In [5]:
# check if any numeric columns are discrete
discrete_cols = [col for col in num_cols if len(df[col].unique()) < 20]
continous_cols = [col for col in num_cols if col not in discrete_cols]
print(f"Discrete col: \n {discrete_cols} \n Continous_cols: \n {continous_cols}")

Discrete col: 
 ['Outlet_Establishment_Year'] 
 Continous_cols: 
 ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Item_Outlet_Sales']


In [6]:
df.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [7]:
discrete_and_categorical_cols = []
discrete_and_categorical_cols.extend(cat_cols)
print(discrete_and_categorical_cols)

['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']


In [8]:
discrete_and_categorical_cols.extend(discrete_cols)
print(discrete_and_categorical_cols)

['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Outlet_Establishment_Year']


## Impute missing Values

In [9]:
#Impute discrete and categorical columns with mode

for col in discrete_and_categorical_cols:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].mode()[0])
        
df.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                     0
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [10]:
#Imputing continous numeric columns with mean
for col in continous_cols:
    if df[col].isnull().sum()>0:
        df[col] = df[col].fillna(df[col].mean())
df.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [11]:
## Converting categorical to numeric
#before that check if there is any category column that is a unique identifier
remove_cols = []
for col in cat_cols:
    if len(df[col].unique()) > 1000:
        remove_cols.append(col)
print(remove_cols)

['Item_Identifier']


In [12]:
df.drop(remove_cols, inplace = True, axis = 1)

In [13]:
df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38
4,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [14]:
for col in df.select_dtypes("O").columns[2:]:
    print(col)
    print(df[col].value_counts())

Outlet_Identifier
OUT027    935
OUT013    932
OUT049    930
OUT046    930
OUT035    930
OUT045    929
OUT018    928
OUT017    926
OUT010    555
OUT019    528
Name: Outlet_Identifier, dtype: int64
Outlet_Size
Medium    5203
Small     2388
High       932
Name: Outlet_Size, dtype: int64
Outlet_Location_Type
Tier 3    3350
Tier 2    2785
Tier 1    2388
Name: Outlet_Location_Type, dtype: int64
Outlet_Type
Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2     928
Name: Outlet_Type, dtype: int64


In [15]:
nominal_cat_cols = ["Item_Type", "Outlet_Type","Outlet_Identifier","Outlet_Location_Type","Outlet_Type"]
ordinal_cat_cols = ["Item_Fat_Content", "Outlet_Size"]

In [16]:
for col in ordinal_cat_cols:
    print(df[col].value_counts())

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64
Medium    5203
Small     2388
High       932
Name: Outlet_Size, dtype: int64


In [17]:
Item_Fat_Content_mapping = {"Low Fat" : 0, "low fat" : 0, "LF" : 0,
                            "Regular" : 1, "reg" : 1}

Outlet_Size_mapping = {"Small": 0, "Medium": 1, "High": 2}

In [18]:
df["Item_Fat_Content"] = df["Item_Fat_Content"].map(Item_Fat_Content_mapping)
df["Outlet_Size"] = df["Outlet_Size"].map(Outlet_Size_mapping)

In [19]:
df[ordinal_cat_cols].head()

Unnamed: 0,Item_Fat_Content,Outlet_Size
0,0,1
1,1,1
2,0,1
3,1,1
4,0,2


In [20]:
df[nominal_cat_cols].head()

Unnamed: 0,Item_Type,Outlet_Type,Outlet_Identifier,Outlet_Location_Type,Outlet_Type.1
0,Dairy,Supermarket Type1,OUT049,Tier 1,Supermarket Type1
1,Soft Drinks,Supermarket Type2,OUT018,Tier 3,Supermarket Type2
2,Meat,Supermarket Type1,OUT049,Tier 1,Supermarket Type1
3,Fruits and Vegetables,Grocery Store,OUT010,Tier 3,Grocery Store
4,Household,Supermarket Type1,OUT013,Tier 3,Supermarket Type1


In [21]:
df_ohe = pd.get_dummies(df,drop_first=True)

In [22]:
df_ohe.shape

(8523, 36)

In [23]:
X = df_ohe.drop("Item_Outlet_Sales", axis = 1)
y = df_ohe["Item_Outlet_Sales"]

### Train/Test split

In [24]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.3)

### Scaling

In [25]:
print(num_cols)

['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year', 'Item_Outlet_Sales']


In [26]:
for col in num_cols:
    min_val = df_ohe[col].min()
    max_val = df_ohe[col].max()
    df_ohe[col] = (df_ohe[col] - min_val)/(max_val - min_val)


Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Item_Outlet_Sales,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,...,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,0.282525,0,0.048866,0.927507,0.583333,1,0.283587,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,0.081274,1,0.058705,0.072068,1.0,1,0.031419,0,0,0,...,0,0,0,0,0,0,1,0,1,0
2,0.770765,0,0.051037,0.468288,0.583333,1,0.158115,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,0.871986,1,0.0,0.640093,0.541667,1,0.053555,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0.260494,0,0.0,0.095805,0.083333,2,0.073651,0,0,0,...,0,0,0,0,0,0,1,1,0,0


In [27]:
df_ohe[num_cols].head()


Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
0,0.282525,0.048866,0.927507,0.583333,0.283587
1,0.081274,0.058705,0.072068,1.0,0.031419
2,0.770765,0.051037,0.468288,0.583333,0.158115
3,0.871986,0.0,0.640093,0.541667,0.053555
4,0.260494,0.0,0.095805,0.083333,0.073651


In [28]:
##Model architecture
import keras
from keras.layers import InputLayer, Dense
from keras.models import Sequential
from keras.activations import linear, sigmoid, relu, leaky_relu
from keras.optimizers import Adam, RMSprop, SGD

In [29]:
input_neurons = X_train.shape[1]
neurons_hidden_layer_1 = 100
neurons_hidden_layer_2 = 40
neurons_hidden_layer_3 = 10
output_neurons = 1

In [30]:
model = Sequential()
model.add(InputLayer(input_shape = (input_neurons,)))
model.add(Dense(units = neurons_hidden_layer_1, activation=relu))
model.add(Dense(units = neurons_hidden_layer_2, activation=sigmoid))
model.add(Dense(units = neurons_hidden_layer_3, activation=relu))
model.add(Dense(units=1, activation=linear))



In [31]:
model.summary()

### Compiling the model - Defining the loss fucntion and optimizer

In [45]:
model.compile(loss=keras.losses.MeanSquaredLogarithmicError, optimizer="adam", metrics = ["msle"])

In [46]:
model.compile?

[1;31mSignature:[0m
[0mmodel[0m[1;33m.[0m[0mcompile[0m[1;33m([0m[1;33m
[0m    [0moptimizer[0m[1;33m=[0m[1;34m'rmsprop'[0m[1;33m,[0m[1;33m
[0m    [0mloss[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mloss_weights[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmetrics[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mweighted_metrics[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mrun_eagerly[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0msteps_per_execution[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0mjit_compile[0m[1;33m=[0m[1;34m'auto'[0m[1;33m,[0m[1;33m
[0m    [0mauto_scale_loss[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Configures the model for training.

Example:

```python
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.BinaryCrossentropy(),
    m

In [47]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs = 1000)

Epoch 1/1000
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.2735 - msle: 0.2735 - val_loss: 0.3145 - val_msle: 0.3145
Epoch 2/1000
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2884 - msle: 0.2884 - val_loss: 0.3040 - val_msle: 0.3040
Epoch 3/1000
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2723 - msle: 0.2723 - val_loss: 0.2684 - val_msle: 0.2684
Epoch 4/1000
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2969 - msle: 0.2969 - val_loss: 0.2716 - val_msle: 0.2716
Epoch 5/1000
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2709 - msle: 0.2709 - val_loss: 0.2862 - val_msle: 0.2862
Epoch 6/1000
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2662 - msle: 0.2662 - val_loss: 0.2707 - val_msle: 0.2707
Epoch 7/1000
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x1c4a5948f10>

In [48]:
pred_y_val = model.predict(X_val)

[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [49]:
from sklearn.metrics import mean_squared_log_error
mean_squared_log_error(y_val, pred_y_val)

0.2704593855552971

In [50]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_val, pred_y_val)

1180608.6223342223

In [51]:
model.compile(loss=keras.losses.mean_absolute_error, optimizer="adam", metrics = ["mae"])
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs = 1000)

Epoch 1/1000
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 754.8895 - mae: 754.8895 - val_loss: 743.9387 - val_mae: 743.9387
Epoch 2/1000
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 754.8705 - mae: 754.8705 - val_loss: 745.0402 - val_mae: 745.0402
Epoch 3/1000
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 756.0360 - mae: 756.0360 - val_loss: 746.5102 - val_mae: 746.5102
Epoch 4/1000
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 744.9127 - mae: 744.9127 - val_loss: 751.2628 - val_mae: 751.2628
Epoch 5/1000
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 753.3550 - mae: 753.3550 - val_loss: 735.9861 - val_mae: 735.9861
Epoch 6/1000
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 761.8997 - mae: 761.8997 - val_loss: 752.1109 - val_mae: 752.1109
Epoch 7/1000
[1m187/187[0m

KeyboardInterrupt: 

In [52]:
pred_y_val = model.predict(X_val)


from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_val, pred_y_val)

[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


740.6446414208089