In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Input and output locations, relative to the notebook's working directory.
DATA_RAW, DATA_INTERIM = Path("../data/raw"), Path("../data/interim")

# Create the interim output folder (and any missing parents) up front;
# this is a no-op when the folder already exists.
DATA_INTERIM.mkdir(exist_ok=True, parents=True)


In [2]:
# Read each raw table from DATA_RAW into its own DataFrame.
product = pd.read_csv(DATA_RAW.joinpath("product.csv"))
sales = pd.read_csv(DATA_RAW.joinpath("sales.csv"))
discount = pd.read_csv(DATA_RAW.joinpath("discount.csv"))
marketing = pd.read_csv(DATA_RAW.joinpath("marketing.csv"))

# Quick sanity check: (rows, columns) of every table, in load order.
print(*(frame.shape for frame in (product, sales, discount, marketing)))


(50, 3) (26130, 5) (4150, 4) (494, 3)


In [4]:
# Preview the first five rows of the product table.
product.head(n=5)


Unnamed: 0,product_id,category,cost_price
0,P001,Dairy,31.77
1,P002,Household,26.05
2,P003,Beverages,23.56
3,P004,Snacks,22.08
4,P005,Fresh,14.62


In [5]:
# Preview the first five rows of the sales table.
sales.head(n=5)



Unnamed: 0,week,store_id,product_id,units_sold,selling_price
0,1,S01,P001,17,51.95
1,1,S01,P002,7,38.57
2,1,S01,P003,25,34.52
3,1,S01,P004,29,35.43
4,1,S01,P005,19,19.55


In [6]:
# Preview the first five rows of the discount table.
discount.head(n=5)


Unnamed: 0,week,store_id,product_id,discount_percent
0,1,S02,P033,20
1,1,S03,P038,5
2,1,S04,P033,15
3,1,S05,P038,5
4,1,S06,P006,10


In [7]:
# Preview the first five rows of the marketing table.
marketing.head(n=5)


Unnamed: 0,week,store_id,marketing_spend
0,1,S01,8047.96
1,2,S01,6862.98
2,3,S01,9418.4
3,4,S01,6656.55
4,5,S01,7178.85


In [9]:
# Count sales rows that repeat an earlier (week, store_id, product_id) combination.
sales_key_cols = ["week", "store_id", "product_id"]
dup_count = sales[sales_key_cols].duplicated().sum()

dup_count


np.int64(129)

In [10]:
# Keep only the first row for each (week, store_id, product_id) key; take a copy
# so that later edits to sales_clean do not touch `sales`.
is_repeat = sales.duplicated(subset=["week", "store_id", "product_id"], keep="first")
sales_clean = sales.loc[~is_repeat].copy()

sales_clean.shape


(26001, 5)

In [None]:
# Product IDs present in the product table act as the reference set.
valid_products = set(product["product_id"])

# Flag sales rows whose product_id is absent from that reference set, then count them.
is_known_product = sales_clean["product_id"].isin(valid_products)
invalid_rows = ~is_known_product
invalid_rows.sum()




np.int64(25)

In [12]:
# Drop sales rows whose product_id does not appear in the product table.
# The membership mask is rebuilt from the current sales_clean instead of reusing
# `invalid_rows`, so this cell gives the same result when re-run and never
# indexes with a mask that was built against an earlier version of the frame.
has_valid_product = sales_clean["product_id"].isin(set(product["product_id"]))
sales_clean = sales_clean[has_valid_product].copy()
sales_clean.shape


(25976, 5)

In [14]:
# Cap extreme unit counts at the 99th percentile of units_sold.
UNITS_CAP_QUANTILE = 0.99

units_sold = sales_clean["units_sold"]
upper_cap = units_sold.quantile(UNITS_CAP_QUANTILE)

# Series.quantile returns a float, and capping an integer column against a float
# silently converts every count to float (and can introduce fractional units
# when the quantile is not a whole number). For integer data, round the cap
# down to a whole unit so the column keeps its integer dtype.
if pd.api.types.is_integer_dtype(units_sold.dtype) and np.isfinite(upper_cap):
    upper_cap = int(np.floor(upper_cap))

# clip() returns a new Series; values above the cap are replaced by the cap.
sales_clean["units_sold"] = units_sold.clip(upper=upper_cap)

sales_clean["units_sold"].describe()



count    25976.000000
mean        21.801817
std         14.135519
min          0.000000
25%         12.000000
50%         18.000000
75%         28.000000
max         75.000000
Name: units_sold, dtype: float64

In [15]:
# Number of discount rows repeating an earlier (week, store_id, product_id) key.
discount[["week", "store_id", "product_id"]].duplicated().sum()


np.int64(0)

In [16]:
# Number of marketing rows repeating an earlier (week, store_id) key.
marketing[["week", "store_id"]].duplicated().sum()


np.int64(0)

In [17]:
# Write every table to DATA_INTERIM as CSV, omitting the DataFrame index.
interim_outputs = {
    "product_clean.csv": product,
    "sales_clean.csv": sales_clean,
    "discount_clean.csv": discount,
    "marketing_clean.csv": marketing,
}
for filename, frame in interim_outputs.items():
    frame.to_csv(DATA_INTERIM / filename, index=False)

