In [35]:
import os
# Cap the OpenMP thread count at 4. This is set before the numerical libraries
# below are imported, presumably so their OpenMP runtime picks the value up at
# load time (e.g. for scikit-learn's KMeans) — TODO confirm.
os.environ["OMP_NUM_THREADS"] = "4"  # Further reduce parallel processing issues


import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
# Step 1: Load & Preprocess Data
def load_and_preprocess(file_path):
    """Read the medication-event CSV and return it ordered by patient and date.

    The file's columns are renamed positionally to
    ``pnr, eksd, perday, ATC, dur_original`` (so the CSV must have exactly
    five columns), ``eksd`` is parsed to datetimes, and the rows are sorted
    by ``pnr`` and then ``eksd``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The renamed, date-parsed, sorted events (original row index kept).
    """
    column_names = ["pnr", "eksd", "perday", "ATC", "dur_original"]

    events = pd.read_csv(file_path)
    events.columns = column_names

    # Parse the dates and order chronologically within each patient in one chain.
    events = (
        events
        .assign(eksd=pd.to_datetime(events["eksd"]))
        .sort_values(by=["pnr", "eksd"])
    )

    print("✅ Data loaded and preprocessed.")
    return events

# Relative path: the CSV is looked up in the kernel's current working directory.
df = load_and_preprocess("ATC_med_events.csv")

✅ Data loaded and preprocessed.


In [29]:
# Step 2: Compute Event Intervals
def compute_event_intervals(df):
    """Compute the number of days between consecutive events of each patient.

    For every row, ``prev_eksd`` is the ``eksd`` of the preceding row with the
    same ``pnr`` and ``event_interval`` is ``eksd - prev_eksd`` in whole days.
    Rows for which no interval can be computed (each patient's first event, or
    rows with a missing date) are removed.

    The input must already be ordered by ``eksd`` within each ``pnr``; the
    function relies on row order and does not sort.

    Parameters
    ----------
    df : pandas.DataFrame
        Events with at least a ``pnr`` column and a datetime ``eksd`` column.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added ``prev_eksd`` and ``event_interval`` columns.
    """
    # Copy first: adding columns to the argument would silently change the
    # caller's frame and make re-running the cell behave differently.
    intervals = df.copy()

    intervals["prev_eksd"] = intervals.groupby("pnr")["eksd"].shift(1)
    intervals["event_interval"] = (
        intervals["eksd"] - intervals["prev_eksd"]
    ).dt.days

    # Only drop rows that lack an interval. A bare dropna() would also discard
    # events that merely have a missing value in an unrelated column.
    intervals = intervals.dropna(subset=["event_interval"])

    print("✅ Event intervals computed.")
    return intervals

# Rebinds `df` to the result. Not idempotent: re-running this cell alone
# recomputes intervals on the already-reduced frame and drops more rows.
df = compute_event_intervals(df)

# Step 3: Generate ECDF & Retain 80%
def retain_lower_ecdf(df, threshold=0.8):
    """Keep the rows whose event interval falls in the lower part of the ECDF.

    An empirical CDF is fitted to ``df["event_interval"]``, its value at each
    row's interval is stored in a new ``ecdf`` column, and only rows with
    ``ecdf <= threshold`` are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``event_interval`` column. It is not modified.
    threshold : float, default 0.8
        Upper ECDF bound (inclusive) for a row to be retained.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``ecdf`` column, restricted to the retained rows.

    Raises
    ------
    KeyError
        If ``df`` has no ``event_interval`` column.
    """
    # Copy so the "ecdf" column is not added to the caller's frame.
    trimmed = df.copy()
    intervals = trimmed["event_interval"]

    if trimmed.empty:
        # An ECDF cannot be fitted to zero observations; return the empty
        # frame with the expected column instead of failing.
        trimmed["ecdf"] = pd.Series(dtype="float64")
    else:
        ecdf = sm.distributions.ECDF(intervals)
        trimmed["ecdf"] = ecdf(intervals)
        trimmed = trimmed[trimmed["ecdf"] <= threshold]

    # Report the threshold actually used (prints "80%" for the default).
    print(f"✅ Retained lower {threshold * 100:g}% of ECDF.")
    return trimmed

# Rebinds `df` to the trimmed frame. Not idempotent: re-running this cell alone
# fits a new ECDF to the already-trimmed data and trims it again.
df = retain_lower_ecdf(df)

✅ Event intervals computed.
✅ Retained lower 80% of ECDF.
