In [15]:
import pandas as pd, json, datetime, os

from diffprivlib.mechanisms import Laplace
# Raw GitHub URL of the closet inventory CSV (points at the repo's `main` branch).
closet = "https://raw.githubusercontent.com/itsmekhang/ClosetAI-Development/main/data/closet.csv"
# Load the CSV into the DataFrame that the following cells validate and summarise.
df = pd.read_csv(closet)
# Last expression of the cell: preview the first rows.
df.head()


Unnamed: 0,Item_ID,Type,Color,Season,Occasion,Material
0,1,Sweater,Blue,Winter,Casual,Wool
1,2,Blazer,Navy,Fall,Business,Wool
2,3,Chinos,Khaki,Fall,Business,Cotton
3,4,Raincoat,Olive,Spring,Casual,Polyester
4,5,Sneakers,White,All,Casual,Leather


In [9]:
# Print column dtypes, non-null counts and memory usage.
df.info()
# Summary statistics for every column (numeric and non-numeric), shown as the cell output.
df.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Item_ID   10 non-null     int64 
 1   Type      10 non-null     object
 2   Color     10 non-null     object
 3   Season    10 non-null     object
 4   Occasion  10 non-null     object
 5   Material  10 non-null     object
dtypes: int64(1), object(5)
memory usage: 612.0+ bytes


Unnamed: 0,Item_ID,Type,Color,Season,Occasion,Material
count,10.0,10,10,10,10,10
unique,,10,9,4,3,5
top,,Sweater,White,Winter,Casual,Cotton
freq,,1,2,3,5,4
mean,5.5,,,,,
std,3.02765,,,,,
min,1.0,,,,,
25%,3.25,,,,,
50%,5.5,,,,,
75%,7.75,,,,,


In [23]:
# Schema check: required columns, non-null Item_ID, allowed Season values, duplicates.
season = {"Winter","Fall","Spring","Summer","All"}
feat_names = ["Item_ID","Type","Color","Season","Occasion","Material"]

# Every expected feature column must be present before any further validation.
missing = [c for c in feat_names if c not in df.columns]
assert not missing, f"Missing columns: {missing}"
assert df["Item_ID"].notna().all(), "Item_ID cannot be null"

# Any Season value outside the allowed set fails validation.
# The condition and the message both use `bad`, the name bound on the line above,
# so the cell runs on a fresh kernel.
bad = set(df["Season"]) - season
assert not bad, f"Invalid Season values: {bad}"

# Drop fully duplicated rows (all columns equal) and report how many were removed.
dupes = df.duplicated().sum()
if dupes:
    df = df.drop_duplicates().reset_index(drop=True)
    print("Dropped duplicates:", dupes)

In [24]:
# Distribution check
def print_share(col, features):
    """Print the percentage share of each distinct value in ``col``.

    Args:
        col: pandas Series whose values are tallied with ``value_counts()``.
        features: label used in the header line of the printed report.

    Prints a header followed by one ``"  <value>: <pct>%"`` line per distinct
    value, ordered by ``sorted()`` on the values, with percentages rounded to
    one decimal place. Returns nothing.
    """
    tallies = col.value_counts()
    denom = tallies.sum()
    print(f"\n{features} share:")
    report = [
        f"  {label}: {round((tallies[label] / denom) * 100, 1)}%"
        for label in sorted(tallies.index)
    ]
    for entry in report:
        print(entry)

# Per-category share report for the two columns checked for balance.
print_share(df["Season"], "Season")
print_share(df["Occasion"], "Occasion")

# Imbalance flag: a column is flagged when its most frequent value exceeds 60% of rows.
# NOTE: `season` is rebound here from the allowed-values set used by the schema
# check to a Series of relative frequencies.
season = df["Season"].value_counts(normalize=True)
occasion = df["Occasion"].value_counts(normalize=True)

season_imbalanced, occasion_imbalanced = (
    season.max() > 0.60,
    occasion.max() > 0.60,
)

print("\nSeason imbalance?", season_imbalanced)
print("Occasion imbalance?", occasion_imbalanced)



Season share:
  All: 30.0%
  Fall: 20.0%
  Spring: 20.0%
  Winter: 30.0%

Occasion share:
  Business: 30.0%
  Casual: 50.0%
  Formal: 20.0%

Season imbalance? False
Occasion imbalance? False


In [11]:
# Differential-privacy demo: release a Laplace-noised count of Winter items.
# Cast to a plain int so the mechanism receives a built-in numeric type.
winter = int((df["Season"] == "Winter").sum())
# A counting query changes by at most 1 when one row is added or removed, hence sensitivity=1.
laplace = Laplace(epsilon=1.0, sensitivity=1)
dp = laplace.randomise(winter)   # may be a float; that's fine

# Report the noised value `dp` under the dp_noised label; the true count is shown
# next to it for comparison only.
print(f"Winter count true={winter}  dp_noised={round(dp)}  (ε=1.0)")

Winter count true=3  dp_noised=3  (ε=1.0)


In [38]:
# Persist run metadata and the risk register under data/, the directory this
# cell creates and the location named in the confirmation message below.
out_dir = "data"
os.makedirs(out_dir, exist_ok=True)

# Snapshot of the validated DataFrame plus provenance notes.
metadata = {
    "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
    "rows": len(df),
    "columns": len(df.columns),
    "columns name": df.columns.tolist(),
    "sources": ["closet.csv"],
    "notes": "Validated schema; duplicates handling; distribution & Differential Privacy check."
}
with open(os.path.join(out_dir, "metadata.json"), "w") as f:
    json.dump(metadata, f, indent=2)

# Risk register: one row per identified risk with its mitigation and residual level.
risks = [
    {"Phase":"Data Collection","Risk":"Representativeness bias","Mitigation":"Distribution checks and targeted augmentation","Remaining Risk":"Moderate"},
    {"Phase":"Privacy","Risk":"Summary Leakage","Mitigation":"Differential-privacy noise on summaries","Remaining Risk":"Low"},
    {"Phase":"Data Quality","Risk":"Schema drift","Mitigation":"Controlled mispelling and validation; Prevent ingestion if validation failed","Remaining Risk":"Low"}
]
pd.DataFrame(risks).to_csv(os.path.join(out_dir, "risk.csv"), index=False)

print("Wrote data/metadata.json and data/risk.csv")


Wrote data/metadata.json and data/risk.csv
