In [3]:
#KMeans

In [1]:
from sklearn.utils import resample
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ParameterGrid
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed
import numpy as np
import pandas as pd

# Load your data
file_path = "data/radiomic78_2824.xlsx"
df = pd.read_excel(file_path)

# Check for missing values
missing_values = df.isnull().sum()

# Use SimpleImputer to fill in missing values with the mean
imputer = SimpleImputer(strategy='mean')
df_imputed = imputer.fit_transform(df)

# Convert the imputed array back to a DataFrame
df_imputed = pd.DataFrame(df_imputed, columns=df.columns)

# Continue with the rest of your code
csv_file_path = "data/data.csv"
df_imputed.to_csv(csv_file_path, encoding='utf-8', index=False)

# Extract the 'patient_id' column (assuming it's not part of the data for scaling)
patient_id = df_imputed['patient_id']

# Drop the 'patient_id' column for scaling
df_imputed = df_imputed.drop(columns=['patient_id'])

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the data
scaled_data = scaler.fit_transform(df_imputed)

# Create a new DataFrame with the scaled data
scaled_df = pd.DataFrame(scaled_data, columns=df_imputed.columns)

# Function to perform clustering stability test with bootstrapping
def cluster_stability_train_test(X_train, X_test, est, n_iter=50, random_state=None):
    rng = np.random.RandomState(random_state)
    labels_test = []

    for i in range(n_iter):
        # Draw bootstrap samples
        sample_indices = resample(np.arange(X_train.shape[0]), random_state=rng)

        # Clone the estimator to ensure a fresh model for each iteration
        est_copy = clone(est)
        
        if hasattr(est_copy, "random_state"):
            # Randomize the estimator if possible
            est_copy.random_state = rng.randint(1e5)
        
        X_bootstrap = X_train[sample_indices]
        est_copy.fit(X_bootstrap)
        
        # Predict on the test data
        relabel_test = est_copy.predict(X_test)
        labels_test.append(relabel_test)

    scores = []

    # Calculate adjusted Rand scores between pairs of labelings
    for i in range(len(labels_test) - 1):
        labels_1 = labels_test[i]
        labels_2 = labels_test[i + 1]
        scores.append(adjusted_rand_score(labels_1, labels_2))

    # Return the mean adjusted Rand score
    return np.mean(scores)

def hypersearch(X_std, kpca_params=None, tsne_params=None, kmeans_params=None, n_jobs=-1):
    # 90% split
    X_train, X_test = train_test_split(X_std, test_size=0.1, random_state=42)

    # Define default hyperparameter values
    default_kpca_params = {
        'kernel': ['sigmoid'],
        'n_components': [10]
    }
    default_tsne_params = {
        'perplexity': [10]
    }
    default_kmeans_params = {
        'n_clusters': [5]
    }

    # Handle None values for hyperparameters and set default values
    kpca_params = kpca_params or default_kpca_params
    kpca_params['kernel'] = kpca_params.get('kernel', default_kpca_params['kernel'])
    kpca_params['n_components'] = kpca_params.get('n_components', default_kpca_params['n_components'])
    
    tsne_params = tsne_params or default_tsne_params
    tsne_params['perplexity'] = tsne_params.get('perplexity', default_tsne_params['perplexity'])

    kmeans_params = kmeans_params or default_kmeans_params
    kmeans_params['n_clusters'] = kmeans_params.get('n_clusters', default_kmeans_params['n_clusters'])

    # Create a pipeline with named steps
    pipeline = Pipeline([
        ('kpca', KernelPCA()),
        ('tsne', TSNE()),
    ])

    param_grid = {}
    for param, values in kpca_params.items():
        param_grid['kpca__' + param] = values
    for param, values in tsne_params.items():
        param_grid['tsne__' + param] = values


    # Get KMeans n_clusters values
    kmeans_n_clusters = kmeans_params.get('n_clusters', [5])

    # Generate all possible parameter combinations
    param_combinations = list(ParameterGrid(param_grid))

    # Create an empty DataFrame to store scores
    columns = list(param_grid.keys()) + ['kmeans__n_clusters', 'score_a', 'score_b']
    scores_df = pd.DataFrame(columns=columns)

    # Iterate over parameter combinations in parallel
    def evaluate_params(params):
        scores_data = []
        for n_clusters in kmeans_n_clusters:
            pipeline_cloned = clone(pipeline)
            pipeline_cloned.set_params(**params)

            X_transformed = pipeline_cloned.fit_transform(X_train)
            X_test_transformed = pipeline_cloned.named_steps['kpca'].transform(X_test)
            X_test_transformed = pipeline_cloned.named_steps['tsne'].fit_transform(X_test_transformed)
            print(X_test_transformed.shape)

            score_a = cluster_stability_train_test(X_transformed, X_test_transformed, KMeans(n_clusters=n_clusters, n_init=10), n_iter=50, random_state=42)

            X_combined = np.vstack((X_transformed, X_test_transformed))
            X_test_tsne_combined = pipeline_cloned.named_steps['tsne'].fit_transform(X_combined)[-X_test.shape[0]:]
            score_b = cluster_stability_train_test(X_transformed, X_test_tsne_combined, KMeans(n_clusters=n_clusters, n_init=10), n_iter=50, random_state=42)

            row = {**params, 'kmeans__n_clusters': n_clusters, 'score_a': score_a, 'score_b': score_b}
            scores_data.append(row)
        return scores_data

    # Parallelize the parameter grid search
    scores_data = Parallel(n_jobs=n_jobs)(delayed(evaluate_params)(params) for params in param_combinations)

    # Create the DataFrame from the collected scores
    scores_df = pd.DataFrame([item for sublist in scores_data for item in sublist])

    # Find the best hyperparameter combination
    best_hyperparameter = scores_df.loc[scores_df[['score_a', 'score_b']].mean(axis=1).idxmax()]

    return scores_df, best_hyperparameter

# Specify the parameter search space
kpca_params = {
    'kernel': ['sigmoid'],
    'n_components': [10, 20, 30, 40]  # Add more values if needed
}

tsne_params = {
    'perplexity': [7]  # Add more values if needed
}

kmeans_params = {
    'n_clusters': [3, 4, 5]  # Add more values if needed
}

# Call the hypersearch function
scores, best_hyperparameter = hypersearch(scaled_df, kpca_params, tsne_params, kmeans_params)

# Print the results
print("Best Hyperparameters:")
print(best_hyperparameter)

Best Hyperparameters:
kpca__kernel           sigmoid
kpca__n_components          30
tsne__perplexity             7
kmeans__n_clusters           5
score_a               0.725367
score_b               0.838268
Name: 8, dtype: object


In [2]:
pd.set_option("display.max_rows", None)
scores

Unnamed: 0,kpca__kernel,kpca__n_components,tsne__perplexity,kmeans__n_clusters,score_a,score_b
0,sigmoid,10,7,3,0.534009,0.589794
1,sigmoid,10,7,4,0.686222,0.646562
2,sigmoid,10,7,5,0.495314,0.516548
3,sigmoid,20,7,3,0.643124,0.511123
4,sigmoid,20,7,4,0.629077,0.588673
5,sigmoid,20,7,5,0.396049,0.688667
6,sigmoid,30,7,3,0.456145,0.816212
7,sigmoid,30,7,4,0.448507,0.839868
8,sigmoid,30,7,5,0.725367,0.838268
9,sigmoid,40,7,3,0.259454,0.35884


In [4]:
#NMF

In [5]:
#Louvain

In [6]:
#Leiden