In [1]:
# Mount Google Drive at /content/gdrive so files stored there can be read by path.
# The google.colab module is imported here, so this cell needs a Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Step 1: Load Data

In [4]:
# Imports and initial data load
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Location of the raw sales CSV on the mounted Drive.
# The folder name contains a trailing space before the final "/"; it is part of the path as given.
sales_csv_path = "/content/gdrive/MyDrive/Machine Learning Dataset and project /ecommerce_products_sales.csv"

# Load the full file into the notebook-wide frame and preview the first rows.
df = pd.read_csv(sales_csv_path)
df.head()


Unnamed: 0,product_id,title,description,category,price,quantity,order_date,region
0,P0000000,Within finish Republican.,This sports product is made of Cotton and offe...,Sports,246.57,2,2024-02-07,Bahia
1,P0000001,Remember leave family bed doctor agreement.,This electronics product is made of Aluminum a...,Electronics,268.05,3,2023-10-11,São Paulo
2,P0000002,Think article well behavior natural.,This electronics product is made of PU Leather...,Electronics,289.75,2,2021-01-29,São Paulo
3,P0000003,Worker writer person various question election...,This phone accessories product is made of Stee...,Phone Accessories,343.24,5,2023-05-06,Minas Gerais
4,P0000004,Stock learn lawyer quite next.,This books product is made of Cotton and offer...,Books,13.08,5,2024-07-16,Rio Grande do Sul


Step 2: Basic Dataset Health

In [5]:
# Structural overview: dtypes and non-null counts per column.
df.info()

# Fraction of missing values per column, worst offenders first.
null_share = df.isna().mean()
null_share.sort_values(ascending=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   product_id   150000 non-null  object 
 1   title        150000 non-null  object 
 2   description  150000 non-null  object 
 3   category     150000 non-null  object 
 4   price        150000 non-null  float64
 5   quantity     150000 non-null  int64  
 6   order_date   150000 non-null  object 
 7   region       150000 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 9.2+ MB


Unnamed: 0,0
product_id,0.0
title,0.0
description,0.0
category,0.0
price,0.0
quantity,0.0
order_date,0.0
region,0.0


Step 3: Schema Validation

In [14]:
# Column names this notebook expects to find in the loaded frame.
expected_columns = set(
    [
        "product_id",
        "title",
        "description",
        "category",
        "price",
        "quantity",
        "order_date",
        "region",
    ]
)

# Column names actually present after loading.
actual_columns = set(df.columns)

# Expected-but-absent names, and present-but-unexpected names.
missing = expected_columns.difference(actual_columns)
extra = actual_columns.difference(expected_columns)

missing, extra


(set(), set())

CRITICAL NULL CHECK

In [15]:
# Columns this notebook treats as critical; reused by the final quality summary.
critical_columns = ["product_id", "category", "price", "quantity", "order_date"]

# Fraction of missing values in each critical column.
df.loc[:, critical_columns].isna().mean()


Unnamed: 0,0
product_id,0.0
category,0.0
price,0.0
quantity,0.0
order_date,0.0


In [13]:
# Print every column name wrapped in single quotes, one per line.
# NOTE(review): presumably done to expose stray leading/trailing whitespace in headers — confirm.
for col in df.columns:
    print(f"'{col}'")


'product_id'
'title'
'description'
'category'
'price'
'quantity'
'order_date'
'region'


PRICE & QUANTITY VALIDATION

In [16]:
# Summary statistics for the frame, narrowed to the two numeric sales fields.
df.describe().loc[:, ["price", "quantity"]]


Unnamed: 0,price,quantity
count,150000.0,150000.0
mean,252.293009,2.99692
std,142.979134,1.414045
min,5.0,1.0
25%,128.3375,2.0
50%,252.44,3.0
75%,376.265,4.0
max,500.0,5.0


In [17]:
# Share of rows (a 0-1 fraction) whose value is zero or negative.
# The mean of a boolean mask is the proportion of True entries.
invalid_price = df["price"].le(0).mean()
invalid_quantity = df["quantity"].le(0).mean()

invalid_price, invalid_quantity


(np.float64(0.0), np.float64(0.0))

DATE QUALITY

In [18]:
# Parse order_date into datetimes; with errors="coerce" unparseable values become NaT
# rather than raising, so the null share below measures missing-or-unparseable dates.
parsed_order_dates = pd.to_datetime(df["order_date"], errors="coerce")
df["order_date"] = parsed_order_dates

parsed_order_dates.isna().mean()


np.float64(0.0)

In [19]:
# Earliest and latest order dates, as a (min, max) pair.
order_dates = df["order_date"]
(order_dates.min(), order_dates.max())


(Timestamp('2021-01-01 00:00:00'), Timestamp('2025-01-01 00:00:00'))

TEXT FIELDS SANITY

In [20]:
# Distribution of title lengths in characters.
title_lengths = df["title"].str.len()
title_lengths.describe()


Unnamed: 0,title
count,150000.0
mean,36.0001
std,10.67465
min,10.0
25%,28.0
50%,36.0
75%,44.0
max,80.0


In [21]:
# Distribution of description lengths in characters.
description_lengths = df["description"].str.len()
description_lengths.describe()


Unnamed: 0,description
count,150000.0
mean,125.395833
std,6.684414
min,111.0
25%,121.0
50%,125.0
75%,131.0
max,142.0


FINAL QUALITY SUMMARY

In [22]:
# Roll the individual checks into a single summary dict.
#
# The "_%" keys hold true percentages (0-100). The inputs are 0-1 fractions
# produced by .mean() (the null share computed here, and invalid_price /
# invalid_quantity computed in the price & quantity validation cell), so each
# is scaled by 100 to match its label. Without the scaling, a 2% null rate
# would be reported as 0.02 under a "%" key.
quality_summary = {
    "rows": len(df),
    # Worst (largest) null share among the critical columns.
    "null_critical_%": df[critical_columns].isnull().mean().max() * 100,
    "invalid_price_%": invalid_price * 100,
    "invalid_quantity_%": invalid_quantity * 100,
}

quality_summary


{'rows': 150000,
 'null_critical_%': 0.0,
 'invalid_price_%': np.float64(0.0),
 'invalid_quantity_%': np.float64(0.0)}