In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score

# Seed NumPy's legacy global random generator so that any code drawing from
# np.random produces the same numbers on every run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [None]:
# Read the medication-events CSV into a DataFrame.
# The path is environment-specific; change it to wherever the file lives.
file_path = "/mnt/data/ATC_med_events.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as the cell output
df.head(5)

In [None]:
# Give the columns the names the rest of the notebook relies on.
# This is a positional rename: the file must contain exactly five columns in
# this order (adjust the list if the actual dataset differs).
column_names = ["patient_id", "date", "per_day", "ATC", "duration"]
df.columns = column_names

# Parse the date column into pandas datetimes so date arithmetic works later.
# NOTE(review): no explicit format is given, so pandas infers it — confirm the
# inferred day/month order matches the raw file.
df["date"] = pd.to_datetime(df["date"])

df.head()

In [None]:
# Gap computation between consecutive events of the same patient
def calculate_gaps(df):
    """Return a sorted copy of ``df`` with a ``gap_days`` column added.

    The rows are ordered by ``patient_id`` and then ``date``. ``gap_days`` is
    the number of whole days between a row's ``date`` and the previous row's
    ``date`` for the same patient; the first row of every patient has no
    predecessor and therefore gets NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``patient_id`` and a datetime ``date`` column. The
        caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The sorted frame (original index labels preserved) including
        ``gap_days``.
    """
    ordered = df.sort_values(by=['patient_id', 'date'])
    per_patient_delta = ordered.groupby('patient_id')['date'].diff()
    ordered['gap_days'] = per_patient_delta.dt.days
    return ordered

# Define function for Sessa Empirical Estimator
def sessa_estimator(df, threshold=30):
    """Flag each row as persistent (1) or not (0) from its ``gap_days`` value.

    A row is marked persistent when ``gap_days <= threshold``. Rows whose
    ``gap_days`` is missing (NaN) compare as False and are therefore marked 0.

    NOTE(review): this applies a single fixed day threshold only — confirm
    that this simplification of the estimator is what the analysis intends.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``gap_days`` column.
    threshold : int or float, default 30
        Largest gap, in days, that still counts as persistent.

    Returns
    -------
    pandas.DataFrame
        A new frame with an integer ``persistent`` column added (or replaced).
        The input frame is left untouched, so passing a slice of another
        frame cannot trigger pandas' SettingWithCopyWarning and the caller's
        data is never changed behind its back.
    """
    persistent_flag = (df['gap_days'] <= threshold).astype(int)
    return df.assign(persistent=persistent_flag)

# Run the pipeline: compute per-patient gaps first, then derive the
# persistence flag from those gaps (default threshold).
df = sessa_estimator(calculate_gaps(df))

df.head()

In [None]:
# Prepare data for clustering
features = ['gap_days', 'persistent']

# Drop rows with a missing feature (each patient's first event has no gap).
# Take an explicit copy: later cells add cluster-label columns to df_filtered,
# and assigning into the un-copied result of dropna() makes pandas emit
# SettingWithCopyWarning because it still tracks the frame as a possible slice
# of df.
df_filtered = df.dropna(subset=features).copy()

# Standardize data (zero mean, unit variance per feature).
# The result is a NumPy array whose rows are in the same order as df_filtered
# but are addressed by position, not by df_filtered's index labels.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_filtered[features])

In [None]:
# Fit K-Means with three clusters on the standardized features and store the
# resulting label of every row next to the data.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans_labels = kmeans.fit_predict(df_scaled)
df_filtered['kmeans_cluster'] = kmeans_labels

# Silhouette score of the K-Means partition, computed over all rows
kmeans_silhouette = silhouette_score(df_scaled, df_filtered['kmeans_cluster'])

# Scatter plot of the raw (unscaled) features, coloured by K-Means cluster
fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(
    data=df_filtered,
    x='gap_days',
    y='persistent',
    hue='kmeans_cluster',
    palette="viridis",
    ax=ax,
)
ax.set_title(f"K-Means Clustering of Medication Persistence (Silhouette Score: {kmeans_silhouette:.2f})")
plt.show()

In [None]:
# Apply DBSCAN Clustering
dbscan = DBSCAN(eps=0.5, min_samples=5)
df_filtered['dbscan_cluster'] = dbscan.fit_predict(df_scaled)

# Compute Silhouette Score for DBSCAN (ignoring noise points, labelled -1).
# df_scaled is a NumPy array addressed by row position, while df_filtered
# still carries the index labels of the original frame. Selecting with a
# positional boolean mask keeps both in step; indexing df_scaled with the
# DataFrame's index labels would pick the wrong rows or run out of bounds.
not_noise = (df_filtered['dbscan_cluster'] != -1).to_numpy()
dbscan_clusters = df_filtered[not_noise]
n_dbscan_clusters = dbscan_clusters['dbscan_cluster'].nunique()

# silhouette_score is only defined for 2 <= n_labels <= n_samples - 1, so a
# non-empty selection alone is not a sufficient guard (a single cluster would
# raise ValueError).
if 2 <= n_dbscan_clusters < len(dbscan_clusters):
    dbscan_silhouette = silhouette_score(df_scaled[not_noise], dbscan_clusters['dbscan_cluster'])
    dbscan_silhouette_text = f"{dbscan_silhouette:.2f}"
else:
    dbscan_silhouette = "N/A"
    dbscan_silhouette_text = dbscan_silhouette

# Visualize DBSCAN Clustering (noise points are shown with label -1)
plt.figure(figsize=(8,6))
sns.scatterplot(x=df_filtered['gap_days'], y=df_filtered['persistent'], hue=df_filtered['dbscan_cluster'], palette="coolwarm")
plt.title(f"DBSCAN Clustering of Medication Persistence (Silhouette Score: {dbscan_silhouette_text})")
plt.show()

In [None]:
# Write the clustered rows to CSV; the DataFrame index is not written.
output_path = "/mnt/data/processed_ATC_clusters.csv"
df_filtered.to_csv(output_path, index=False)

# Report how many rows each algorithm assigned to every cluster label
kmeans_counts = df_filtered['kmeans_cluster'].value_counts()
dbscan_counts = df_filtered['dbscan_cluster'].value_counts()
print("K-Means Cluster Counts:\n", kmeans_counts)
print("\nDBSCAN Cluster Counts:\n", dbscan_counts)