In [1]:
import pandas as pd
import numpy as np
import keras
from sklearn.model_selection import train_test_split

In [2]:
from keras.layers import InputLayer, Dense

In [3]:
# Load the raw data from test.csv (path is relative to the notebook's working directory).
df = pd.read_csv("test.csv")
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5681 entries, 0 to 5680
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            5681 non-null   object 
 1   Item_Weight                4705 non-null   float64
 2   Item_Fat_Content           5681 non-null   object 
 3   Item_Visibility            5681 non-null   float64
 4   Item_Type                  5681 non-null   object 
 5   Item_MRP                   5681 non-null   float64
 6   Outlet_Identifier          5681 non-null   object 
 7   Outlet_Establishment_Year  5681 non-null   int64  
 8   Outlet_Size                4075 non-null   object 
 9   Outlet_Location_Type       5681 non-null   object 
 10  Outlet_Type                5681 non-null   object 
dtypes: float64(3), int64(1), object(7)
memory usage: 488.3+ KB


In [4]:
# Object-dtype columns with more than 500 distinct values (NaN counts as a value).
high_col = [
    name
    for name in df.select_dtypes("O").columns
    if df[name].nunique(dropna=False) > 500
]

In [5]:
# Show which object columns exceeded the 500-distinct-values threshold.
print(high_col)

['Item_Identifier']


In [6]:
# Names of all object-dtype columns.
cat_cols = df.select_dtypes(include="O").columns
# Columns (of any dtype) that have at least one missing value and fewer than
# 20 distinct values, NaN included in the count.
cat_na = [
    name
    for name in df.columns
    if df[name].isna().any() and df[name].nunique(dropna=False) < 20
]

In [7]:
# Fill the gaps in each low-cardinality column with its most frequent value.
# The result is assigned back to df[col] instead of calling
# df[col].fillna(..., inplace=True): the in-place form operates on a temporary
# selected from the frame, which raises a FutureWarning on pandas 2.x and leaves
# df unchanged once Copy-on-Write is active.
for col in cat_na:
    # mode() returns the most frequent value(s); [0] takes the first one.
    df[col] = df[col].fillna(df[col].mode()[0])

In [8]:
# Every column that still contains at least one missing value at this point.
num_na = df.columns[df.isna().any()].tolist()
num_na

['Item_Weight']

In [9]:
# Fill the remaining gaps with the column mean.
# The result is assigned back to df[col] instead of calling
# df[col].fillna(..., inplace=True): the in-place form operates on a temporary
# selected from the frame, which raises a FutureWarning on pandas 2.x and leaves
# df unchanged once Copy-on-Write is active.
for col in num_na:
    df[col] = df[col].fillna(df[col].mean())

In [10]:
# Object columns with fewer than 10 distinct values (NaN counts as a value).
cat_label_cols = [
    name
    for name in cat_cols
    if df[name].nunique(dropna=False) < 10
]

In [11]:
# Display the low-cardinality object columns selected in the previous cell.
cat_label_cols

['Item_Fat_Content', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

In [12]:
# For each low-cardinality column, print its name, the frequency of every
# distinct value, and a blank separator line.
for name in cat_label_cols:
    print(name, df[name].value_counts(), "", sep="\n")

Item_Fat_Content
Low Fat    3396
Regular    1935
LF          206
reg          78
low fat      66
Name: Item_Fat_Content, dtype: int64

Outlet_Size
Medium    3468
Small     1592
High       621
Name: Outlet_Size, dtype: int64

Outlet_Location_Type
Tier 3    2233
Tier 2    1856
Tier 1    1592
Name: Outlet_Location_Type, dtype: int64

Outlet_Type
Supermarket Type1    3717
Grocery Store         722
Supermarket Type3     624
Supermarket Type2     618
Name: Outlet_Type, dtype: int64



In [13]:
# Integer codes for Item_Fat_Content: the three low-fat spellings share code 0
# and the two regular spellings share code 1.
Item_Fat_Content_mappings = {
    **dict.fromkeys(("Low Fat", "low fat", "LF"), 0),
    **dict.fromkeys(("reg", "Regular"), 1),
}
# Integer codes for Outlet_Size, increasing from "Small" to "High".
outlet_size_mapping = dict(Small=0, Medium=1, High=2)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [14]:
# Replace each categorical label with its integer code, column by column.
# Series.map yields NaN for any label missing from the corresponding dict.
label_codes = {
    "Item_Fat_Content": Item_Fat_Content_mappings,
    "Outlet_Size": outlet_size_mapping,
    "Outlet_Location_Type": {"Tier 1": 0, "Tier 2": 1, "Tier 3": 2},
    "Outlet_Type": {
        "Supermarket Type1": 1,
        "Grocery Store": 2,
        "Supermarket Type3": 3,
        "Supermarket Type2": 4,
    },
}
for column, codes in label_codes.items():
    df[column] = df[column].map(codes)



In [15]:
# Re-check dtypes and non-null counts after imputation and label encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5681 entries, 0 to 5680
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            5681 non-null   object 
 1   Item_Weight                5681 non-null   float64
 2   Item_Fat_Content           5681 non-null   int64  
 3   Item_Visibility            5681 non-null   float64
 4   Item_Type                  5681 non-null   object 
 5   Item_MRP                   5681 non-null   float64
 6   Outlet_Identifier          5681 non-null   object 
 7   Outlet_Establishment_Year  5681 non-null   int64  
 8   Outlet_Size                5681 non-null   int64  
 9   Outlet_Location_Type       5681 non-null   int64  
 10  Outlet_Type                5681 non-null   int64  
dtypes: float64(3), int64(5), object(3)
memory usage: 488.3+ KB


In [16]:
# List the column names present before dropping the identifier and one-hot encoding.
df.columns

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type'],
      dtype='object')

In [17]:
# Names of the columns that are still object dtype at this point.
cat_cols = df.select_dtypes("O").columns
# Drop the item identifier column. errors="ignore" keeps this cell re-runnable:
# without it, a second execution raises KeyError because the column is already gone.
df = df.drop(columns="Item_Identifier", errors="ignore")
# One-hot encode the remaining object columns. The explicit float dtype avoids the
# boolean dummy columns that pandas 2.0+ produces by default; boolean columns do
# not support the subtraction used by the min-max scaling applied to df_oh later.
df_oh = pd.get_dummies(df, dtype="float64")

In [18]:
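# Disabled: test.csv has no "Item_Outlet_Sales" column (see the df.info() output),
# so there is no target to separate and nothing to split in this notebook.
# X = df_oh.drop("Item_Outlet_Sales", axis = 1)
# y = df_oh["Item_Outlet_Sales"]
# X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)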

In [19]:
# Disabled: depends on the X_train / X_test split that is commented out in the previous cell.
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# scaler_train = scaler.fit(X_train)
# scaler_train.transform(X_train)
# scaler_train.transform(X_test)

In [20]:
def scaler_columns(df):
    """Min-max scale every column of ``df`` into the range [0, 1].

    Each column is rescaled as ``(x - min) / (max - min)`` using that column's
    own minimum and maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns can all be converted to float (numeric or boolean).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns, holding float64 values.
        The input frame is left unmodified.

    Notes
    -----
    A column whose values are all equal has ``max - min == 0``; it is mapped to
    0.0 instead of dividing by zero, which would turn the whole column into NaN.
    Missing values stay missing.
    """
    # Work on a copy so the caller's frame is not rewritten as a side effect.
    scaled = df.copy()
    for col in scaled.columns:
        # Cast first: boolean columns (e.g. one-hot dummies) do not support "-".
        values = scaled[col].astype("float64")
        min_val = values.min()
        span = values.max() - min_val
        shifted = values - min_val
        # Constant column: keep the shifted zeros rather than computing 0 / 0.
        scaled[col] = shifted if span == 0 else shifted / span
    return scaled

In [21]:
# Scale every feature except Outlet_Establishment_Year, which is set aside here
# and re-attached unscaled in the next cell.
# NOTE(review): despite its name, X_train holds features derived from test.csv.
features_to_scale = df_oh.drop(columns="Outlet_Establishment_Year")
X_train = scaler_columns(features_to_scale)

In [22]:
# Append the unscaled Outlet_Establishment_Year as the last column, next to the
# scaled features.
year_column = df_oh[["Outlet_Establishment_Year"]]
df_final = pd.concat((X_train, year_column), axis="columns")

In [23]:
# Confirm the final frame's columns, dtypes and non-null counts before saving.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5681 entries, 0 to 5680
Data columns (total 34 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Item_Weight                      5681 non-null   float64
 1   Item_Fat_Content                 5681 non-null   float64
 2   Item_Visibility                  5681 non-null   float64
 3   Item_MRP                         5681 non-null   float64
 4   Outlet_Size                      5681 non-null   float64
 5   Outlet_Location_Type             5681 non-null   float64
 6   Outlet_Type                      5681 non-null   float64
 7   Item_Type_Baking Goods           5681 non-null   float64
 8   Item_Type_Breads                 5681 non-null   float64
 9   Item_Type_Breakfast              5681 non-null   float64
 10  Item_Type_Canned                 5681 non-null   float64
 11  Item_Type_Dairy                  5681 non-null   float64
 12  Item_Type_Frozen Foo

In [24]:
# Write the preprocessed frame to CSV; index=False omits the row index from the file.
df_final.to_csv("test_preprocessed_data.csv", index= False)