In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Step 1: Load & Preprocess Data
def load_and_preprocess(file_path):
    """Read an events CSV and give it the column layout the later steps expect.

    The columns are relabelled purely by position, so the file must contain
    exactly five columns (pandas raises ``ValueError`` otherwise). The
    ``eksd`` column is then parsed into datetimes.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``pnr``, ``eksd`` (datetime), ``perday``, ``ATC`` and
        ``dur_original``.
    """
    expected_columns = ["pnr", "eksd", "perday", "ATC", "dur_original"]

    events = pd.read_csv(file_path)
    events.columns = expected_columns  # positional rename; original header names are discarded
    events = events.assign(eksd=pd.to_datetime(events["eksd"]))

    print("✅ Data loaded and preprocessed.")
    return events

# The path is relative, so the CSV must sit in the kernel's working directory;
# if it is missing this cell raises FileNotFoundError and `df` is never defined.
df = load_and_preprocess("med_events_ATC.csv")


FileNotFoundError: [Errno 2] No such file or directory: 'med_events_ATC.csv'

In [4]:
# Step 2: Compute Event Intervals
def compute_event_intervals(df):
    """Compute the number of days between consecutive events within each ``pnr``.

    Rows are ordered by ``pnr`` then ``eksd``; each row is paired with the
    previous ``eksd`` of the same ``pnr`` and the gap is stored in days.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pnr`` and a datetime ``eksd`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) sorted by ``pnr``/``eksd`` with
        two extra columns, ``prev_eksd`` and ``event_interval`` (days). Rows
        for which no interval can be computed -- the first event of each
        ``pnr``, or rows with a missing date -- are removed.
    """
    ordered = df.sort_values(by=["pnr", "eksd"])  # returns a new frame
    ordered["prev_eksd"] = ordered.groupby("pnr")["eksd"].shift(1)
    ordered["event_interval"] = (ordered["eksd"] - ordered["prev_eksd"]).dt.days

    # Only drop rows that lack an interval. A bare dropna() would also discard
    # rows whose *other* columns happen to contain a missing value, silently
    # shrinking the sample that feeds the ECDF and the clustering.
    ordered = ordered.dropna(subset=["event_interval"])

    print("✅ Event intervals computed.")
    return ordered

# Rebinds `df`, so this cell is not idempotent: re-running it on the already
# processed frame removes one more row per `pnr`. Re-run from the load cell instead.
df = compute_event_intervals(df)

NameError: name 'df' is not defined

In [5]:
# Step 3: Generate ECDF & Retain 80%
def retain_lower_ecdf(df, threshold=0.8):
    """Keep the rows whose ``event_interval`` lies in the lower part of its ECDF.

    The empirical CDF of ``event_interval`` is evaluated at every row and
    stored in an ``ecdf`` column; rows with ``ecdf <= threshold`` are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``event_interval`` column. It is not modified.
    threshold : float, default 0.8
        Upper bound (inclusive) on the ECDF value of retained rows.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows, including the ``ecdf`` column.
    """
    intervals = df["event_interval"]
    ecdf = sm.distributions.ECDF(intervals)

    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain an "ecdf" column as a side effect of calling this function.
    scored = df.assign(ecdf=ecdf(intervals))

    # .copy() makes the result a standalone frame; later steps add columns to
    # it, which would otherwise trigger SettingWithCopyWarning on a filtered slice.
    kept = scored[scored["ecdf"] <= threshold].copy()

    # Report the threshold that was actually used (prints "80%" for the default).
    print(f"✅ Retained lower {threshold * 100:g}% of ECDF.")
    return kept

# Rebinds `df` to the trimmed frame; re-running this cell recomputes the ECDF
# on the already-trimmed data and cuts it down again.
df = retain_lower_ecdf(df)

NameError: name 'df' is not defined

In [6]:
# Step 4: Determine Optimal Clusters
def find_optimal_clusters(df, max_clusters=10):
    """Pick the K-Means cluster count with the best silhouette score.

    K-Means is fitted on ``event_interval`` for every K from 2 up to
    ``max_clusters`` (capped by what the sample size permits), the silhouette
    score of each fit is plotted against K, and the best K is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``event_interval`` column.
    max_clusters : int, default 10
        Largest K to try.

    Returns
    -------
    int
        The K with the highest silhouette score (the smallest such K on ties).

    Raises
    ------
    ValueError
        If no K >= 2 can be evaluated (fewer than 3 rows, or ``max_clusters < 2``).
    """
    features = df[["event_interval"]]  # built once instead of twice per iteration

    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1, so a
    # K beyond that would just crash inside sklearn; cap the search instead.
    largest_k = min(max_clusters, len(features) - 1)
    if largest_k < 2:
        raise ValueError(
            "Need at least 3 rows and max_clusters >= 2 to evaluate silhouette "
            f"scores (got {len(features)} rows, max_clusters={max_clusters})."
        )

    candidate_ks = list(range(2, largest_k + 1))
    scores = []
    for k in candidate_ks:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(features)
        scores.append(silhouette_score(features, labels))

    plt.plot(candidate_ks, scores, marker="o")
    plt.xlabel("Number of Clusters (K)")
    plt.ylabel("Silhouette Score")
    plt.title("Optimal Clusters via Silhouette Score")
    plt.show()

    # list.index returns the first maximum, i.e. the smallest K on ties.
    best_k = candidate_ks[scores.index(max(scores))]
    print(f"✅ Optimal K: {best_k}")
    return best_k

# Searches K = 2..10 (the function's default max_clusters); `best_k` is reused
# by the K-Means cell below.
best_k = find_optimal_clusters(df)

NameError: name 'df' is not defined

In [7]:
# Step 5: Apply K-Means
def apply_kmeans(df, num_clusters):
    """Fit K-Means on ``event_interval`` and store the labels in ``Cluster``.

    The ``Cluster`` column is written onto ``df`` itself (the frame is
    modified in place) and that same frame is returned. The model is seeded
    with ``random_state=42`` so repeated runs give the same labels.
    """
    interval_features = df[["event_interval"]]
    model = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
    model.fit(interval_features)
    df["Cluster"] = model.labels_
    print("✅ K-Means clustering applied.")
    return df

# `best_k` comes from the silhouette search cell; this adds a "Cluster" column to `df`.
df = apply_kmeans(df, best_k)

# Step 6: Apply DBSCAN
def apply_dbscan(df, eps=5, min_samples=3):
    """Run DBSCAN on ``event_interval`` and store the labels in ``DBSCAN_Cluster``.

    The column is written onto ``df`` itself (the frame is modified in place)
    and that same frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``event_interval`` column.
    eps : float, default 5
        Passed to ``DBSCAN`` as ``eps``.
    min_samples : int, default 3
        Passed to ``DBSCAN`` as ``min_samples``.
    """
    interval_features = df[["event_interval"]]
    model = DBSCAN(eps=eps, min_samples=min_samples)
    model.fit(interval_features)
    df["DBSCAN_Cluster"] = model.labels_
    print("✅ DBSCAN clustering applied.")
    return df

# Uses the function defaults (eps=5, min_samples=3); adds a "DBSCAN_Cluster" column to `df`.
df = apply_dbscan(df)

NameError: name 'df' is not defined

In [9]:
# Step 7: Compare Clustering Methods
def compare_clustering(df):
    """Compare the K-Means and DBSCAN labelings by silhouette score.

    Both scores are computed on ``event_interval``. The DBSCAN score is only
    computed when ``DBSCAN_Cluster`` holds more than one distinct label;
    otherwise it is reported as ``None`` / "N/A".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``event_interval``, ``Cluster`` and ``DBSCAN_Cluster``.

    Returns
    -------
    dict
        ``{"K-Means": float, "DBSCAN": float or None}``.
    """
    features = df[["event_interval"]]
    dbscan_labels = df["DBSCAN_Cluster"]

    kmeans_score = silhouette_score(features, df["Cluster"])

    # NOTE: every distinct label value is treated as a cluster here, so a
    # DBSCAN noise label (-1), if present, is scored as one more cluster.
    if len(set(dbscan_labels)) > 1:
        dbscan_score = silhouette_score(features, dbscan_labels)
    else:
        dbscan_score = None

    print(f"Silhouette Score - K-Means: {kmeans_score:.3f}")
    # Compare against None explicitly: a legitimate score of exactly 0.0 is
    # falsy and would otherwise be misreported as "N/A".
    print(f"Silhouette Score - DBSCAN: {dbscan_score if dbscan_score is not None else 'N/A'}")
    return {"K-Means": kmeans_score, "DBSCAN": dbscan_score}

# Last expression of the cell, so the returned score dict is also shown as the cell output.
compare_clustering(df)


NameError: name 'df' is not defined

In [10]:
# Step 8: Visualization
def plot_boxplot(df):
    """Show a boxplot of ``event_interval`` for each value of the ``Cluster`` column.

    Draws on the current matplotlib axes and displays the figure; returns nothing.
    """
    axes = sns.boxplot(data=df, x="Cluster", y="event_interval")
    axes.set_title("Boxplot of Event Intervals per Cluster")
    plt.show()

# Requires the "Cluster" column, so the K-Means cell must have run first.
plot_boxplot(df)

NameError: name 'df' is not defined