In [None]:
!pip install pandas numpy scikit-learn matplotlib seaborn

In [None]:
# version 3 - A
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import SpectralClustering
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import calinski_harabasz_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
import time


def main(data_path="2024-Data-Cleaned/merged_patient_data.csv"):
    """Run a spectral-clustering grid search on the patient data.

    Steps: load the CSV, encode ``Sex`` and ``Smoke`` as 0/1, mean-impute
    missing feature values, standardize the features, fit
    ``SpectralClustering`` for every parameter combination, keep the one with
    the highest Calinski-Harabasz score, and store the labels of a final fit
    with those parameters in ``df["Cluster"]``.

    Parameters
    ----------
    data_path : str or path-like, optional
        CSV file to load. Defaults to the path that was previously hard-coded.

    Returns
    -------
    df : pandas.DataFrame
        The loaded data with ``Sex``/``Smoke`` numerically encoded and a new
        ``Cluster`` column.
    features : list of str
        Names of the columns used for clustering.
    X_scaled : numpy.ndarray
        Imputed and standardized feature matrix, rows aligned with ``df``.

    Raises
    ------
    RuntimeError
        If no parameter combination yields at least two clusters.
    """
    print("Starting script execution...")
    start_time = time.time()

    # Load the data
    print("Loading data...")
    df = pd.read_csv(data_path)

    # Print available columns
    print("Available columns in the DataFrame:")
    print(df.columns)

    # Preprocess the data
    print("Preprocessing data...")
    # Values not listed in the mappings become NaN and are mean-imputed below.
    df["Sex"] = df["Sex"].map({"M": 0, "F": 1})
    df["Smoke"] = df["Smoke"].map({"NS": 0, "ES": 1})

    # Select features for clustering
    features = [
        "Age",
        "Sex",
        "Smoke",
        "Smoke_amount",
        "Height",
        "Weight",
        "BMI",
        "BSA",
        "Morning_PEFR",
        "Afternoon_PEFR",
    ]
    X = df[features]

    # Handle missing values
    print("Handling missing values...")
    imputer = SimpleImputer(strategy="mean")
    X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

    # Normalize the features
    print("Normalizing features...")
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_imputed)

    # Define the parameter grid.
    # `gamma` only affects kernel affinities such as "rbf"; it is ignored for
    # "nearest_neighbors". Using two sub-grids avoids fitting every
    # nearest-neighbors model three times with identical results.
    # The nearest-neighbors grid comes first to keep the original visiting
    # order (affinities were previously iterated alphabetically).
    cluster_counts = [2, 4, 5, 8]
    param_grid = [
        {"n_clusters": cluster_counts, "affinity": ["nearest_neighbors"]},
        {"n_clusters": cluster_counts, "affinity": ["rbf"], "gamma": [0.1, 1, 10]},
    ]

    # Perform manual grid search
    print("Starting grid search...")
    best_score = -np.inf
    best_params = None
    candidates = list(ParameterGrid(param_grid))
    total_combinations = len(candidates)

    for i, params in enumerate(candidates):
        print(f"Testing combination {i+1}/{total_combinations}: {params}")
        spectral = SpectralClustering(**params, random_state=42, n_jobs=-1)
        labels = spectral.fit_predict(X_scaled)

        # The Calinski-Harabasz score is undefined for fewer than two
        # clusters and would raise; skip such degenerate results instead of
        # aborting the whole search.
        if len(np.unique(labels)) < 2:
            print("Skipping combination: fewer than two clusters were found.")
            continue

        score = calinski_harabasz_score(X_scaled, labels)

        if score > best_score:
            best_score = score
            best_params = params

    if best_params is None:
        raise RuntimeError(
            "Grid search failed: no parameter combination produced at least two clusters."
        )

    print("Grid search completed.")
    print("Best parameters:", best_params)
    print("Best Calinski-Harabasz score:", best_score)

    # Use the best parameters to perform the final clustering
    print("Performing final clustering...")
    best_spectral = SpectralClustering(**best_params, random_state=42, n_jobs=-1)
    df["Cluster"] = best_spectral.fit_predict(X_scaled)

    print("Clustering completed.")
    end_time = time.time()
    print(f"Total execution time: {end_time - start_time:.2f} seconds")

    return df, features, X_scaled


if __name__ == "__main__":
    # Bind the results at notebook (module) scope: the analysis cell further
    # down reads `df`, `features` and `X_scaled` as globals.
    df, features, X_scaled = main()

In [None]:
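# # version 2 - A (superseded by version 3 above; kept for reference only).
# # NOTE(review): GridSearchCV with make_scorer scores via estimator.predict(),
# # which SpectralClustering does not appear to provide - confirm before reviving.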
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import StandardScaler
# from sklearn.cluster import SpectralClustering
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import make_scorer, calinski_harabasz_score
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.decomposition import PCA
# from sklearn.impute import SimpleImputer

# # Load the data
# df = pd.read_csv("2024-Data-Cleaned/merged_patient_data.csv")

# # Print available columns
# print("Available columns in the DataFrame:")
# print(df.columns)

# # Preprocess the data
# df["Sex"] = df["Sex"].map({"M": 0, "F": 1})
# df["Smoke"] = df["Smoke"].map({"NS": 0, "ES": 1})

# # Select features for clustering
# features = [
#     "Age",
#     "Sex",
#     "Smoke",
#     "Smoke_amount",
#     "Height",
#     "Weight",
#     "BMI",
#     "BSA",
#     "Morning_PEFR",
#     "Afternoon_PEFR",
# ]
# X = df[features]

# # Handle missing values
# imputer = SimpleImputer(strategy="mean")
# X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# # Normalize the features
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X_imputed)

# # Define the parameter grid
# param_grid = {
#     "n_clusters": [2, 4, 5, 8],
#     "affinity": ["rbf", "nearest_neighbors"],
#     # "n_neighbors": [5, 10, 15],
#     "gamma": [0.1, 1, 10],
# }

# # Define a custom scorer (we'll use the Calinski-Harabasz Index)
# ch_scorer = make_scorer(calinski_harabasz_score)

# # Perform Grid Search
# spectral = SpectralClustering(random_state=42)
# grid_search = GridSearchCV(spectral, param_grid, scoring=ch_scorer, cv=5, n_jobs=-1)
# grid_search.fit(X_scaled)

# # Get the best parameters and score
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

# print("Best parameters:", best_params)
# print("Best Calinski-Harabasz score:", best_score)

# # Use the best parameters to perform the final clustering
# best_spectral = SpectralClustering(**best_params, random_state=42)
# df["Cluster"] = best_spectral.fit_predict(X_scaled)

In [None]:
# Analyze clusters
# This cell reads `df` (including its "Cluster" column), `features` and
# `X_scaled` from the notebook's global scope.
cluster_summary = df.groupby("Cluster")[features].mean()
print("\nCluster Summary:")
print(cluster_summary)

# Iterate clusters in ascending label order so the figures appear in a stable,
# predictable sequence (unique() returns labels in order of first appearance).
cluster_ids = sorted(df["Cluster"].unique())

# Visualize clusters using PCA
# random_state keeps the projection reproducible if the randomized SVD solver
# is selected for this data size.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=df["Cluster"], cmap="viridis")
plt.title("Spectral Clustering Results (PCA)")
plt.xlabel("First Principal Component")
plt.ylabel("Second Principal Component")
plt.colorbar(scatter)
plt.show()

# Visualize feature distributions across clusters
for feature in features:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x="Cluster", y=feature, data=df)
    plt.title(f"{feature} Distribution Across Clusters")
    plt.show()

# Analyze PEFR trends within clusters
df["Date"] = pd.to_datetime(df["Date"])
pefr_columns = ["Morning_PEFR", "Afternoon_PEFR"]
for cluster in cluster_ids:
    cluster_data = df[df["Cluster"] == cluster]
    # Plot the mean per date, in chronological order. Plotting the raw rows
    # connects points in file order, which zig-zags whenever dates are
    # unsorted or several rows share the same date.
    daily_pefr = cluster_data.groupby("Date")[pefr_columns].mean()
    plt.figure(figsize=(12, 6))
    plt.plot(daily_pefr.index, daily_pefr["Morning_PEFR"], label="Morning PEFR")
    plt.plot(daily_pefr.index, daily_pefr["Afternoon_PEFR"], label="Afternoon PEFR")
    plt.title(f"PEFR Trends for Cluster {cluster}")
    plt.xlabel("Date")
    plt.ylabel("PEFR (daily mean)")
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Correlation analysis within clusters
for cluster in cluster_ids:
    cluster_data = df[df["Cluster"] == cluster]
    # Features that are constant within a cluster yield NaN correlations.
    correlation = cluster_data[features].corr()
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation, annot=True, cmap="coolwarm", vmin=-1, vmax=1, center=0)
    plt.title(f"Correlation Heatmap for Cluster {cluster}")
    plt.tight_layout()
    plt.show()