In [1]:
# ============================================
# Phase 2: Clean + ETL
# ============================================

import pandas as pd
from pathlib import Path

# The project root is pinned to an absolute path so this cell resolves the
# same data folders no matter where the notebook is launched from
# (e.g. from inside notebooks/).
BASE = Path(r"C:\Users\lavan\OneDrive\Desktop\PredictiveMaintenanceProject")

# Input (raw) and output (staging) folders under <BASE>/data.
RAW = BASE.joinpath("data", "raw")
ST = BASE.joinpath("data", "staging")

# Read the three raw CSV exports, parsing each table's date column on load.
devices = pd.read_csv(
    RAW.joinpath("hostel_device_inventory_small.csv"),
    parse_dates=["InstallDate"],
)
usage = pd.read_csv(
    RAW.joinpath("hostel_usage_logs_small.csv"),
    parse_dates=["Date"],
)
maint = pd.read_csv(
    RAW.joinpath("hostel_maintenance_logs_small.csv"),
    parse_dates=["Date"],
)

# -----------------------------
# Cleaning and standardization
# -----------------------------

# Cast the DeviceID column to int in all three tables so the key has a
# consistent type everywhere.
for frame in [devices, usage, maint]:
    frame["DeviceID"] = frame["DeviceID"].astype(int)

# Maintenance log: discard rows that are exact duplicates.
maint = maint.drop_duplicates()

# Usage log: a missing FailureFlag is stored as 0 and the column is cast to
# int. This happens before de-duplication, so rows that only differed by a
# missing flag are compared using the filled value.
usage["FailureFlag"] = usage["FailureFlag"].fillna(value=0).astype(int)
usage = usage.drop_duplicates()

# Ensure every DeviceID that appears in the usage log also exists in the
# device inventory. IDs found only in usage get a placeholder inventory row
# whose descriptive fields are "Unknown" and whose InstallDate is NaT.
missing = set(usage["DeviceID"]) - set(devices["DeviceID"])
if missing:
    extra = pd.DataFrame({
        # Sets have no defined order; sort so the placeholder rows are
        # appended in a stable, reproducible order in the staged CSV.
        "DeviceID": sorted(missing),
        "DeviceType": "Unknown",
        "Location": "Unknown",
        "RoomNo": "Unknown",
        "InstallDate": pd.NaT,
    })
    # Append the placeholders after the real inventory and renumber the index.
    devices = pd.concat([devices, extra], ignore_index=True)

# -----------------------------
# Save cleaned versions to staging
# -----------------------------
# Create the staging folder (and any missing parents) before writing:
# to_csv does not create directories and fails with OSError when the
# target folder does not exist. exist_ok=True makes re-runs a no-op.
ST.mkdir(parents=True, exist_ok=True)

# Write each cleaned table to staging without the pandas row index.
devices.to_csv(ST / "hostel_device_inventory_clean.csv", index=False)
usage.to_csv(ST / "hostel_usage_logs_clean.csv", index=False)
maint.to_csv(ST / "hostel_maintenance_logs_clean.csv", index=False)

print("✅ Cleaned files saved to:", ST)


✅ Cleaned files saved to: C:\Users\lavan\OneDrive\Desktop\PredictiveMaintenanceProject\data\staging
