In [1]:
import pandas as pd
import numpy as np
from IPython.display import display

CUSTOMER_PATH = "/s3-datalake/source/customer_data.parquet"
SALES_PATH = "/s3-datalake/source/sales_data.csv"

In [2]:
# -----------------------------------------------------------------------
# 1. Load data
# -----------------------------------------------------------------------
customer = pd.read_parquet(CUSTOMER_PATH)
sales = pd.read_csv(SALES_PATH)

print("Customer shape:", customer.shape)
print("Sales shape:", sales.shape)

Customer shape: (99457, 4)
Sales shape: (99457, 7)


In [3]:
# -----------------------------------------------------------------------
# 2. Quick schema / dtypes check
# -----------------------------------------------------------------------
print("\nCustomer dtypes:\n", customer.dtypes)
print("\nSales dtypes:\n", sales.dtypes)

# Try converting invoice_date to datetime
sales["invoice_date"] = pd.to_datetime(sales["invoice_date"],
                                       format="%d-%m-%Y",
                                       errors="coerce")

display(customer.head())
display(sales.head())


Customer dtypes:
 customer_id        object
gender             object
age               float64
payment_method     object
dtype: object

Sales dtypes:
 invoice_no        object
customer_id       object
category          object
quantity           int64
price            float64
invoice_date      object
shopping_mall     object
dtype: object


Unnamed: 0,customer_id,gender,age,payment_method
0,C241288,Female,28.0,Credit Card
1,C111565,Male,21.0,Debit Card
2,C266599,Male,20.0,Cash
3,C988172,Female,66.0,Credit Card
4,C189076,Female,53.0,Cash


Unnamed: 0,invoice_no,customer_id,category,quantity,price,invoice_date,shopping_mall
0,I138884,C241288,Clothing,5,1500.4,2022-08-05,Kanyon
1,I317333,C111565,Shoes,3,1800.51,2021-12-12,Forum Istanbul
2,I127801,C266599,Clothing,1,300.08,2021-11-09,Metrocity
3,I173702,C988172,Shoes,5,3000.85,2021-05-16,Metropol AVM
4,I337046,C189076,Books,4,60.6,2021-10-24,Kanyon


In [4]:
# -----------------------------------------------------------------------
# 3. Null-count & basic descriptive stats
# -----------------------------------------------------------------------
def null_report(df, name):
    n = df.isna().sum()
    pct = (n / len(df)).round(3) * 100
    rep = pd.DataFrame({"nulls": n, "pct_null": pct})
    print(f"\n=== NULL REPORT: {name} ===")
    display(rep.sort_values("pct_null", ascending=False))

null_report(customer, "CUSTOMER")
null_report(sales, "SALES")

print("\nCustomer numerical summary:")
display(customer.describe())

print("\nSales numerical summary:")
display(sales.describe())


=== NULL REPORT: CUSTOMER ===


Unnamed: 0,nulls,pct_null
age,119,0.1
customer_id,0,0.0
gender,0,0.0
payment_method,0,0.0



=== NULL REPORT: SALES ===


Unnamed: 0,nulls,pct_null
invoice_no,0,0.0
customer_id,0,0.0
category,0,0.0
quantity,0,0.0
price,0,0.0
invoice_date,0,0.0
shopping_mall,0,0.0



Customer numerical summary:


Unnamed: 0,age
count,99338.0
mean,43.425859
std,14.9894
min,18.0
25%,30.0
50%,43.0
75%,56.0
max,69.0



Sales numerical summary:


Unnamed: 0,quantity,price,invoice_date
count,99457.0,99457.0,99457
mean,3.003429,689.256321,2022-02-04 02:46:59.783424
min,1.0,5.23,2021-01-01 00:00:00
25%,2.0,45.45,2021-07-19 00:00:00
50%,3.0,203.3,2022-02-05 00:00:00
75%,4.0,1200.32,2022-08-22 00:00:00
max,5.0,5250.0,2023-03-08 00:00:00
std,1.413025,941.184567,


In [5]:
# -----------------------------------------------------------------------
# 4. Value sanity checks / outlier spotting / duplicate checking
# -----------------------------------------------------------------------
bad_qty   = sales.loc[sales["quantity"] <= 0]
bad_price = sales.loc[sales["price"]   <= 0]
bad_age   = customer.loc[(customer["age"] < 13) | (customer["age"] > 100)]
bad_date  = sales.loc[sales["invoice_date"].isna() |
                      (sales["invoice_date"] > pd.Timestamp.today())]

print(f"\nRows with non-positive quantity: {len(bad_qty)}")
print(f"Rows with non-positive price   : {len(bad_price)}")
print(f"Suspicious ages (<13 or >100)  : {len(bad_age)}")
print(f"Unparseable / future dates     : {len(bad_date)}")

duplicate_sales = sales.duplicated().sum()
print(f"Number of duplicate sales: {duplicate_sales}")

duplicate_customers = customer.duplicated().sum()
print(f"Number of duplicate customers: {duplicate_customers}")


Rows with non-positive quantity: 0
Rows with non-positive price   : 0
Suspicious ages (<13 or >100)  : 0
Unparseable / future dates     : 0
Number of duplicate sales: 0
Number of duplicate customers: 0


In [6]:
# -----------------------------------------------------------------------
# 5. Cardinality / unique values for categoricals
# -----------------------------------------------------------------------
cat_columns = ["gender", "payment_method", "category", "shopping_mall"]
for col in cat_columns:
    if col in customer.columns:
        vals = customer[col].unique()[:10]
        print(f"\n{col} – unique sample:", vals)
    if col in sales.columns:
        vals = sales[col].unique()[:10]
        print(f"\n{col} – unique sample:", vals)


gender – unique sample: ['Female' 'Male']

payment_method – unique sample: ['Credit Card' 'Debit Card' 'Cash']

category – unique sample: ['Clothing' 'Shoes' 'Books' 'Cosmetics' 'Food & Beverage' 'Toys'
 'Technology' 'Souvenir']

shopping_mall – unique sample: ['Kanyon' 'Forum Istanbul' 'Metrocity' 'Metropol AVM' 'Istinye Park'
 'Mall of Istanbul' 'Emaar Square Mall' 'Cevahir AVM' 'Viaport Outlet'
 'Zorlu Center']


In [7]:
# -----------------------------------------------------------------------
# 6. Profiler report using pandas-profiling
# -----------------------------------------------------------------------
from ydata_profiling import ProfileReport
profile = ProfileReport(
            pd.concat([customer.add_prefix("cust_"), sales.add_prefix("sales_")], axis=1),
            title="Data Profiling Report", minimal=True)
profile.to_file("profiling_report.html")
print("\nProfiling report written to profiling_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/11 [00:00<?, ?it/s][A
100%|██████████| 11/11 [00:00<00:00, 14.63it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]


Profiling report written to profiling_report.html


The rendered HTML of the profiling report is available:
    **[Profiling Report](./profiling_report.html)**

In [8]:
# Display the report
display(profile)

