In [None]:
import pandas as pd

# Location of the raw charging-session export (relative path).
path = "../data_raw/charging_sessions.csv"

# 1) Load the CSV and drop the explicit index column stored in it
df = pd.read_csv(path).drop(columns=["Unnamed: 0"])

# 2) Parse the timestamp columns as timezone-aware UTC datetimes.
#    errors="coerce" turns unparseable entries into NaT instead of raising.
time_cols = ["connectionTime", "disconnectTime", "doneChargingTime"]
df = df.assign(
    **{
        name: pd.to_datetime(df[name], utc=True, errors="coerce")
        for name in time_cols
    }
)

# 3) Repair doneChargingTime

# A session that delivered energy but has no doneChargingTime gets its
# disconnectTime as the end of charging.
mask = df["doneChargingTime"].isna() & (df["kWhDelivered"] > 0)
df["doneChargingTime"] = df["doneChargingTime"].mask(mask, df["disconnectTime"])

# Clamp doneChargingTime into [connectionTime, disconnectTime].
# Both masks are evaluated before any value is overwritten, so the upper-bound
# check still sees the un-clamped values.
mask_1 = df["doneChargingTime"].lt(df["connectionTime"])
mask_2 = df["doneChargingTime"].gt(df["disconnectTime"])
# print('COUNT doneChargingTime < connectionTime: ', mask_1.sum())
# print('COUNT doneChargingTime > disconnectTime: ', mask_2.sum())
df["doneChargingTime"] = (
    df["doneChargingTime"]
    .mask(mask_1, df["connectionTime"])   # too early -> start of session
    .mask(mask_2, df["disconnectTime"])   # too late  -> end of session
)

# 4) Remove sessions where disconnectTime < connectionTime
# print('COUNT disconnectTime < connectionTime: ', (df["disconnectTime"] < df["connectionTime"]).sum())
mask_bad_order = df["disconnectTime"] < df["connectionTime"]
df = df.loc[~mask_bad_order]

# 5) Remove negative energy sessions
# Rows whose kWhDelivered is missing also fail the `>= 0` comparison and are
# therefore dropped here as well.
# print('COUNT negative energy rows: ', (df["kWhDelivered"] < 0).sum())
df = df.loc[df["kWhDelivered"] >= 0]

# 6) Session duration checks
# Upper bound (in hours) above which a session is treated as an extreme outlier.
MAX_SESSION_HOURS = 200
duration_h = (df["disconnectTime"] - df["connectionTime"]).dt.total_seconds() / 3600.0
# print('COUNT of sessions <= 0 h: ', (duration_h <= 0).sum())
# print('COUNT of sessions > 200 h: ', (duration_h > MAX_SESSION_HOURS).sum())

# Apply both duration bounds in a single pass while `duration_h` is still
# aligned row-for-row with `df`; filtering in two steps would reuse a mask
# built for a frame that no longer has the same rows. Rows with a missing
# connection/disconnect time have a NaN duration and fail both comparisons.
mask_valid_duration = (duration_h > 0) & (duration_h <= MAX_SESSION_HOURS)
df = df.loc[mask_valid_duration].reset_index(drop=True)

# 7) Clean categorical cols
# Use the nullable "string" dtype and normalise formatting (strip surrounding
# whitespace, lower-case). Unlike astype(str), the "string" dtype keeps missing
# values as <NA> instead of turning them into the literal text "nan".
# NOTE(review): lower-casing `timezone` yields names such as
# "america/los_angeles"; confirm downstream time-zone lookups accept that.
cat_cols = ["siteID", "spaceID", "stationID", "timezone", "id", "sessionID"]
for col in cat_cols:
    # print(f'Unique values in {col}: ', df[col].nunique(), f' vs. Unique values in {col} after cleaning: ', df[col].astype("string").str.strip().str.lower().nunique())
    df[col] = df[col].astype("string").str.strip().str.lower()

# 8) Handle duplicates on sessionID
# After sorting, the last row of each sessionID group is the one with the
# latest connectionTime, so keep="last" retains the most recent record.
sort_cols = ["sessionID", "connectionTime"]
df = df.sort_values(sort_cols)
# Rows without a sessionID are not duplicates of one another; only rows that
# share an actual sessionID value are collapsed.
mask_repeat = df["sessionID"].notna() & df.duplicated(subset=["sessionID"], keep="last")
df = df.loc[~mask_repeat]

doneChargingTime < connectionTime:  27
doneChargingTime > disconnectTime:  4692
Number of sessions <= 0 h:  0
Number of sessions > 200 h:  2
65035
65035
65035
