In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
# Apply seaborn's "whitegrid" style once, so every figure drawn below uses it.
sns.set(style = "whitegrid")

In [2]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("data/data.csv")
# NOTE(review): the preview below lists an "Unnamed: 0" column. If that is a row
# index that was saved into the CSV, it is loaded here as an ordinary column and
# will later be treated as a feature - confirm, and consider index_col=0 if so.
df.head(5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052



The goal is to predict the sales of each product at a particular outlet.

Here's the plan
1. Exploratory data analysis
2. Outlier Detection
3. Impute missing values
4. Correlation plot
5. Standardization of data
6. Modelling

First, we filter the data down to a single item and outlet, then build the model on top of that subset.


In [3]:
# Distinct item categories, in order of first appearance in the data.
items = df["Item_Type"].drop_duplicates().tolist()
items

['Dairy',
 'Soft Drinks',
 'Meat',
 'Fruits and Vegetables',
 'Household',
 'Baking Goods',
 'Snack Foods',
 'Frozen Foods',
 'Breakfast',
 'Health and Hygiene',
 'Hard Drinks',
 'Canned',
 'Breads',
 'Starchy Foods',
 'Others',
 'Seafood']

In [4]:
# Distinct outlet sizes; the output below includes nan, i.e. this column has missing values.
df.Outlet_Size.unique()

array(['Medium', nan, 'High', 'Small'], dtype=object)

In [None]:
# Per-column dtype and non-null count, to see which columns need imputation.
df.info()

In [None]:
# Map every spelling variant of the fat-content label to one canonical value.
# NOTE(review): replace() matches case-sensitively, so a spelling that is not a
# key here (for example an upper-case "LF") stays unmapped - verify against
# df["Item_Fat_Content"].unique().
content_type = {
    "Low Fat": "Low Fat",
    "Regular": "Regular",
    "low fat": "Low Fat",
    "Lf": "Low Fat",
    "reg": "Regular"
}
# Assign the result back instead of calling replace(inplace=True) on
# df.Item_Fat_Content: that is a chained in-place update on a column selection,
# which pandas warns about and which does not modify df under Copy-on-Write.
df["Item_Fat_Content"] = df["Item_Fat_Content"].replace(to_replace=content_type)

#### Data - Visualizations

In [None]:
# Distribution of item weights.
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot - confirm against the installed version before upgrading.
sns.distplot(df.Item_Weight)

In [None]:
# One horizontal bar per outlet showing its Item_Outlet_Sales.
fig, ax = plt.subplots(figsize=(12, 4))
sns.barplot(data=df, x="Item_Outlet_Sales", y="Outlet_Identifier", ax=ax)

In [None]:
# One horizontal bar per outlet location tier showing its Item_Outlet_Sales.
fig, ax = plt.subplots(figsize=(12, 4))
sns.barplot(data=df, x="Item_Outlet_Sales", y="Outlet_Location_Type", ax=ax)

In [None]:
# Total sales per outlet type.
# DataFrame.plot opens its own figure when no `ax` is given, so a bare
# plt.figure(figsize=...) stays empty and the chart ignores the requested size.
# Create the axes explicitly and hand them to pandas.
fig, ax = plt.subplots(figsize=(12, 4))
df.groupby(['Outlet_Type']).agg({"Item_Outlet_Sales": "sum"}).plot(kind="bar", ax=ax)

In [None]:
# Distribution of sales for each item type.
figure, ax = plt.subplots(figsize=(12, 4))
ax = sns.violinplot(x="Item_Type", y="Item_Outlet_Sales", data=df, ax=ax)
# Rotate the labels seaborn already placed instead of overwriting them with
# df['Item_Type'].unique(): the overwrite is only correct while unique() happens
# to come back in the same order seaborn used for the categories.
ax.tick_params(axis="x", labelrotation=45)

In [None]:
# Distribution of sales for each outlet size.
fig, ax = plt.subplots(figsize=(12, 4))
sns.violinplot(data=df, x="Outlet_Size", y="Item_Outlet_Sales", ax=ax)

In [None]:
# Item price (MRP) against sales, one point per row.
fig, ax = plt.subplots(figsize=(12, 4))
sns.scatterplot(data=df, x="Item_MRP", y="Item_Outlet_Sales", ax=ax)

In [None]:
# Number of rows per item type, with a reference line at the average count.
# avg_mrp is still computed in case other cells read it, but it is a price and
# is no longer drawn on this count axis.
avg_mrp = df.Item_MRP.mean()
figure, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x="Item_Type", data=df, ax=ax)
# Rotate seaborn's own tick labels rather than overwriting them with unique(),
# which is only correct while both happen to be in the same order.
ax.tick_params(axis="x", labelrotation=45)
ax.set_ylabel(ylabel="ItemType Count")
# The y axis is a count, so the reference line is the mean number of rows per
# item type (value_counts ignores missing values).
ax.axhline(y=df.Item_Type.value_counts().mean(), linewidth=1, color='r')

In [None]:
# Number of rows per outlet size, with a reference line at the average count.
figure, ax = plt.subplots(figsize=(12, 3))
sns.countplot(x="Outlet_Size", data=df, ax=ax)
# The y axis is a count, so compare against the mean category count rather than
# the mean of Item_Outlet_Sales, which is in different units.
ax.axhline(y=df.Outlet_Size.value_counts().mean(), color='r')

In [None]:
# Number of rows per fat-content label, with a reference line at the average count.
figure, ax = plt.subplots(figsize=(12, 3))
sns.countplot(x="Item_Fat_Content", data=df, ax=ax)
# The y axis is a count, so compare against the mean category count rather than
# the mean of Item_Outlet_Sales, which is in different units.
ax.axhline(y=df.Item_Fat_Content.value_counts().mean(), color='r')

#### Data - Preprocessing

In [None]:
# Fraction of missing values in each column.
df.isna().mean()

In [None]:
# dtype of every column, used below to split float64 and object columns.
df.dtypes

In [None]:
# Column names grouped by dtype: float64 columns and object columns.
# Only exact float64 matches count as numeric here; integer columns are in neither group.
num_cols = df.dtypes.loc[lambda dtypes: dtypes == "float64"].index
cat_cols = df.dtypes.loc[lambda dtypes: dtypes == "object"].index

In [None]:
# Names of the float64 columns.
num_cols

In [None]:
# Names of the object-dtype columns.
cat_cols

In [None]:
def labelencoding(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with every object-dtype column label-encoded.

    Each object column is cast to ``category`` and replaced by its integer
    category codes. Missing values receive the code ``-1`` (pandas' convention
    for "no category"). Columns of any other dtype are copied unchanged.

    Args:
        df: Frame to encode. It is not modified; callers must use the return
            value (``df = labelencoding(df)``).

    Returns:
        A new DataFrame with the same columns and index as ``df``.

    Raises:
        TypeError: If ``df`` is not a pandas DataFrame.
    """
    # Fail loudly: printing the error and returning None would let
    # `df = labelencoding(df)` silently replace the caller's frame with None.
    if not isinstance(df, pd.DataFrame):
        raise TypeError(
            f"labelencoding expects a pandas DataFrame, got {type(df).__name__}"
        )

    encoded = df.copy()
    # Resolve the object columns once instead of on every loop iteration.
    for col in encoded.select_dtypes(include="object").columns:
        encoded[col] = encoded[col].astype("category").cat.codes
    return encoded

In [None]:
# Replace df with its label-encoded version (object columns become integer codes).
# NOTE: from this cell on, df no longer holds the original string labels, so the
# plotting cells above must be run before it.
df = labelencoding(df)
df.head()

In [None]:
# Check the dtypes after encoding: no object columns should remain.
df.dtypes

In [None]:
# (rows, columns) of the encoded frame.
df.shape

In [None]:
from sklearn.preprocessing import StandardScaler

def scale_num_col(df: pd.DataFrame, target_col: str) ->pd.DataFrame:
    """Standardize every non-object column of ``df`` except the target.

    The feature columns (all non-object columns other than ``target_col``) are
    passed through a freshly fitted ``StandardScaler``. The remaining columns
    (object columns and the target) are kept unchanged.

    Args:
        df: Input frame; it is not modified.
        target_col: Name of the column that must not be scaled.

    Returns:
        A new DataFrame with the untouched columns first, followed by the
        scaled feature columns, on the same index as ``df``.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
    """
    if target_col not in df.columns:
        raise KeyError(f"target column {target_col!r} not found in DataFrame")

    # errors="ignore": an object-dtype target is already excluded by
    # select_dtypes, so there is nothing left to drop in that case.
    features = df.select_dtypes(exclude="object").drop(
        columns=target_col, errors="ignore"
    )
    if features.shape[1] == 0:
        # Nothing to scale; StandardScaler rejects input with zero features.
        return df.copy()

    scaled = pd.DataFrame(
        StandardScaler().fit_transform(features),
        columns=features.columns,
        # Keep the original index: with a default RangeIndex here, concat would
        # misalign rows whenever df has been filtered, shuffled or re-indexed.
        index=features.index,
    )
    untouched = df.drop(columns=features.columns)
    return pd.concat([untouched, scaled], axis=1)


In [None]:
# Standardize every non-object column except the target, Item_Outlet_Sales.
ndf = scale_num_col(df=df, target_col='Item_Outlet_Sales')
ndf.head()

In [None]:


# Ad-hoc scaling of the float64 columns only. Unlike scale_num_col, nothing is
# excluded here, so a float64 target column is standardized as well.
num_df = df.select_dtypes(include="float64")
scaled_df = pd.DataFrame(
    StandardScaler().fit_transform(num_df),
    columns=num_df.columns,
    # Keep df's index so scaled_df stays row-aligned with df.
    index=num_df.index,
)

In [None]:
# Preview the scaled columns; a few rows are enough to check the result.
scaled_df.head()

In [None]:
# Preview df; scale_num_col and the ad-hoc scaling above built new frames, so df itself is unscaled.
df.head()

In [None]:
# NOTE(review): df was already encoded by the earlier `df = labelencoding(df)`
# cell. When the notebook runs top to bottom no object columns remain, so this
# call changes nothing - candidate for removal.
df = labelencoding(df=df)

In [None]:
# Preview df after the repeated encoding call.
df.head()

In [None]:
# Category codes for Item_Identifier.
# NOTE(review): when the notebook runs top to bottom this column has already
# been label-encoded, so this re-encodes integer codes - confirm it is still needed.
df.Item_Identifier.astype('category').cat.codes

In [None]:
# Distinct outlet identifiers.
# NOTE(review): after the label-encoding cells these are integer codes, not the
# original outlet ID strings - confirm that is what is wanted here.
df.Outlet_Identifier.unique()