# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Cleaning

In [2]:
# Load data
df = pd.read_csv("C:\\Users\\hamma\\Documents\\GitHub\\Project-1-Linear-Regression-Project\\sales_predictions.csv")
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [3]:
# Set column headers to lowercase
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,item_identifier,item_weight,item_fat_content,item_visibility,item_type,item_mrp,outlet_identifier,outlet_establishment_year,outlet_size,outlet_location_type,outlet_type,item_outlet_sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
# Remove identifier columns
df = df.drop(columns=['item_identifier','outlet_identifier'])
df.head()

Unnamed: 0,item_weight,item_fat_content,item_visibility,item_type,item_mrp,outlet_establishment_year,outlet_size,outlet_location_type,outlet_type,item_outlet_sales
0,9.3,Low Fat,0.016047,Dairy,249.8092,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,5.92,Regular,0.019278,Soft Drinks,48.2692,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,17.5,Low Fat,0.01676,Meat,141.618,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,1998,,Tier 3,Grocery Store,732.38
4,8.93,Low Fat,0.0,Household,53.8614,1987,High,Tier 3,Supermarket Type1,994.7052


In [5]:
# Check and remove duplicates
dup = df.duplicated().sum()
df.drop_duplicates()
dup_2 = df.duplicated().sum()
print(f'There are {dup} duplicates. {dup_2} duplicates were removed.')

There are 0 duplicates. 0 duplicates were removed.


In [6]:
# Check for missing values
miss_val = df.isna().sum().sum()
print(f'There are {miss_val} missing values.')

There are 3873 missing values.


In [7]:
# Confirm dtypes are correct
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   item_weight                7060 non-null   float64
 1   item_fat_content           8523 non-null   object 
 2   item_visibility            8523 non-null   float64
 3   item_type                  8523 non-null   object 
 4   item_mrp                   8523 non-null   float64
 5   outlet_establishment_year  8523 non-null   int64  
 6   outlet_size                6113 non-null   object 
 7   outlet_location_type       8523 non-null   object 
 8   outlet_type                8523 non-null   object 
 9   item_outlet_sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(5)
memory usage: 666.0+ KB


> The datatype for each column appear to be correct

In [17]:
# Use placeholder for missing values
# For loop to replace missing values with 'MISSING'
for col in df:
    df[col].fillna(value='MISSING', inplace=True)
print(df.isna().sum())
df.head()

item_weight                  0
item_fat_content             0
item_visibility              0
item_type                    0
item_mrp                     0
outlet_establishment_year    0
outlet_size                  0
outlet_location_type         0
outlet_type                  0
item_outlet_sales            0
dtype: int64


Unnamed: 0,item_weight,item_fat_content,item_visibility,item_type,item_mrp,outlet_establishment_year,outlet_size,outlet_location_type,outlet_type,item_outlet_sales
0,9.3,Low Fat,0.016047,Dairy,249.8092,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,5.92,Regular,0.019278,Soft Drinks,48.2692,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,17.5,Low Fat,0.01676,Meat,141.618,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,1998,MISSING,Tier 3,Grocery Store,732.38
4,8.93,Low Fat,0.0,Household,53.8614,1987,High,Tier 3,Supermarket Type1,994.7052


In [23]:
# Check for inconsistencies in values
# For loop to print unique values for each column with dtype as object
cols = df.select_dtypes(include='object')
for col in cols:
    values = cols[col].value_counts().sum()
    print(values, '\n')

8523 

8523 

8523 

8523 

8523 

8523 



In [24]:
cols

Unnamed: 0,item_weight,item_fat_content,item_type,outlet_size,outlet_location_type,outlet_type
0,9.3,Low Fat,Dairy,Medium,Tier 1,Supermarket Type1
1,5.92,Regular,Soft Drinks,Medium,Tier 3,Supermarket Type2
2,17.5,Low Fat,Meat,Medium,Tier 1,Supermarket Type1
3,19.2,Regular,Fruits and Vegetables,MISSING,Tier 3,Grocery Store
4,8.93,Low Fat,Household,High,Tier 3,Supermarket Type1
...,...,...,...,...,...,...
8518,6.865,Low Fat,Snack Foods,High,Tier 3,Supermarket Type1
8519,8.38,Regular,Baking Goods,MISSING,Tier 2,Supermarket Type1
8520,10.6,Low Fat,Health and Hygiene,Small,Tier 2,Supermarket Type1
8521,7.21,Regular,Snack Foods,Medium,Tier 3,Supermarket Type2
