# Get data

In [1]:
import gzip
import os
import shutil

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.cluster import HDBSCAN
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize

import my_utils

In [None]:
# Source archive holding the count matrix and the plain .rds file to produce.
input_gz_file = '../Output/GSE185948_count_RNA.rds.gz'
output_rds_file = '../Output/data_for_r.rds'

# Stream the decompressed bytes straight to disk; copyfileobj works in
# chunks, so the archive is never held in memory as a whole.
with gzip.open(input_gz_file, 'rb') as compressed_stream:
    with open(output_rds_file, 'wb') as extracted_stream:
        shutil.copyfileobj(compressed_stream, extracted_stream)

print(f'{input_gz_file} has been successfully uncompressed to {output_rds_file}.')

In [None]:
# Source archive holding the cell metadata and the plain CSV to produce.
input_gz_file = '../Output/GSE185948_metadata_RNA.csv.gz'
output_csv_file = '../Output/uncompressed_metadata.csv'

# Stream the decompressed bytes straight to disk in chunks.
with gzip.open(input_gz_file, 'rb') as compressed_stream:
    with open(output_csv_file, 'wb') as extracted_stream:
        shutil.copyfileobj(compressed_stream, extracted_stream)

print(f'{input_gz_file} has been successfully uncompressed to {output_csv_file}.')

In [None]:
# Make sure the output folder exists. exist_ok=True makes the call
# idempotent and avoids the check-then-create race of testing
# os.path.exists() before calling makedirs().
output_path = '../Output'
os.makedirs(output_path, exist_ok=True)

In [4]:
# Load the per-cell metadata table extracted from the gzip archive.
metadata = pd.read_csv(filepath_or_buffer='../Output/uncompressed_metadata.csv')

In [5]:
# Display the metadata frame read from uncompressed_metadata.csv.
metadata

Unnamed: 0,name,barcode,patient,gender,disease,celltype,nCount_RNA,nFeature_RNA,UMAP_1,UMAP_2
0,PKD_ACACGCGGTATCGGTT-1_1,ACACGCGGTATCGGTT-1,PKD1,female,PKD,TAL1,1234.684629,1250,-7.416020,-6.008874
1,PKD_ACACGCGGTTTGGCTA-1_1,ACACGCGGTTTGGCTA-1,PKD1,female,PKD,PT2,1865.542588,1650,2.499409,-6.587287
2,PKD_ACACGCGTCATGTCTT-1_1,ACACGCGTCATGTCTT-1,PKD1,female,PKD,CNT_PC,1812.700523,1419,-2.254505,8.526364
3,PKD_ACACTGAAGACCCTTA-1_1,ACACTGAAGACCCTTA-1,PKD1,female,PKD,FIB,978.772591,1089,9.949437,2.086737
4,PKD_ACACTGAAGCGACAGT-1_1,ACACTGAAGCGACAGT-1,PKD1,female,PKD,PT1,2361.558871,2013,7.886921,-8.587954
...,...,...,...,...,...,...,...,...,...,...
102705,Cont_TTTGTTGCAGTTAAAG-1_5,TTTGTTGCAGTTAAAG-1,control5,female,control,DCT,2177.456940,1537,-8.038441,6.705230
102706,Cont_TTTGTTGCATATTCGG-1_5,TTTGTTGCATATTCGG-1,control5,female,control,PT1,1116.704812,984,5.658295,-6.785034
102707,Cont_TTTGTTGGTACGATTC-1_5,TTTGTTGGTACGATTC-1,control5,female,control,DCT,2156.953541,1421,-8.645567,6.688504
102708,Cont_TTTGTTGGTGCGTTTA-1_5,TTTGTTGGTGCGTTTA-1,control5,female,control,CNT_PC,1553.371592,1209,-0.795698,7.423819


R code below

## Data Preparation

In [None]:
pip install --upgrade --no-deps memory_profiler

In [None]:
# Input files handed to my_utils.load_data.
data_path = '../Output/non_zero.parquet'
row_info_path = '../Output/row_names.csv'
column_info_path = '../Output/col_names.csv'
#sparse_matrix = coo_matrix((data['nonzero_elements'], (data['row_indices'], data['col_indices'])))

# load_data returns four values, unpacked here in the order it yields them.
(
    sparse_matrix,
    column_names,
    row_names,
    row_indices,
) = my_utils.load_data(data_path, row_info_path, column_info_path)

In [None]:
# Single-step pipeline wrapping the project's sparse train/test splitter
# (20% test share, fixed seed).
splitter_step = my_utils.SparseTrainTestSplit(
    test_size=0.2,
    random_state=42,
    row_indices=row_indices,
)
split_pipeline = Pipeline(steps=[
    ('splitter', splitter_step),
    # Add other steps in the pipeline as needed
])

# Fit and transform in one call; the splitter hands back four objects.
split_outputs = split_pipeline.fit_transform(sparse_matrix)
sparse_train, sparse_test, train_indices, test_indices = split_outputs


In [None]:
# Display the training split returned by the split pipeline.
sparse_train

In [None]:
# Cleaning -> scaling -> 2-component truncated SVD.
clean_and_pca_pipeline = Pipeline([
    ('cleaner', my_utils.DataClean()),  # DataClean is performed first
    # with_mean=False skips centering; presumably required because the input
    # is a sparse matrix, which StandardScaler cannot centre - TODO confirm.
    ('scaler', StandardScaler(with_mean=False)),
    # TruncatedSVD's default solver is randomized; pin the seed (same value as
    # the train/test split) so the projection is reproducible across runs.
    ('pca', TruncatedSVD(n_components=2, random_state=42))
])

# Fit the pipeline on the training split only, so nothing from the test
# split leaks into the scaling or the SVD components.
clean_and_pca_pipeline.fit(sparse_train)
# Project both splits with the fitted pipeline
pca_sparse_train = clean_and_pca_pipeline.transform(sparse_train)

pca_sparse_test = clean_and_pca_pipeline.transform(sparse_test)

In [None]:
# Shared settings for labelling the two projected splits.
pc_columns = ["PC1", "PC2"]
reindexer = my_utils.Reindex(
    columns=pc_columns,
    names=row_names,
    output_folder="../Output",
)

# Train first, then test - same call order as before.
train_pca_df = reindexer.transform(
    pca_sparse_train, train_indices, "train"
)
test_pca_df = reindexer.transform(
    pca_sparse_test, test_indices, "test"
)

In [None]:
# Run the project's PCA EDA step on the training projection; it returns
# three objects, unpacked below.
eda_pca = my_utils.DataEDAPCA(
    columns=["PC1", "PC2"],
    trans=False,
)
eda_outputs = eda_pca.fit_transform(train_pca_df)
pca_train_df, metadata_empty, outlier_df = eda_outputs

In [None]:
# Display the test-set projection built by the reindexer above.
# `pca_test_df` is only defined later (when the CSV is re-read in the
# clustering section), so referring to it here fails on a fresh kernel.
test_pca_df

## Clustering on the Train Data

In [2]:
# Reload the saved PCA projections; the first CSV column is used as the index.
train_csv_path = '../Output/pca_train_df_without_outliers.csv'
test_csv_path = '../Output/pca_test_df.csv'

pca_train_df = pd.read_csv(train_csv_path, index_col=0)
pca_test_df = pd.read_csv(test_csv_path, index_col=0)

In [None]:
def silhouette_scorer(estimator, X, y=None):
    """Score a clustering estimator by the silhouette of its own labelling of X.

    Parameters
    ----------
    estimator : object
        Clusterer exposing ``fit_predict``. It is (re)fitted on ``X`` because
        the labels are obtained through ``fit_predict``.
    X : array-like
        Samples to cluster and score.
    y : ignored
        Accepted so the function also matches the ``scorer(estimator, X, y)``
        calling convention; clustering has no targets.

    Returns
    -------
    float or int
        The silhouette score, or 0 when the score is undefined for the
        labelling produced.
    """
    labels = estimator.fit_predict(X)
    n_labels = len(set(labels))
    # The silhouette is only defined for 2 <= n_labels <= n_samples - 1;
    # outside that range silhouette_score raises, so report a neutral 0.
    if n_labels < 2 or n_labels >= len(labels):
        return 0
    return silhouette_score(X, labels)

def _silhouette_profile(data, labels):
    """Return ``(mean_score, per_sample_scores)`` for one labelling of ``data``.

    The silhouette is only defined when the number of distinct labels is
    between 2 and ``n_samples - 1``. HDBSCAN can produce a single label
    (for example every point flagged as noise, ``-1``), so such labellings
    get 0 for every sample instead of raising.
    """
    n_labels = len(set(labels))
    if n_labels < 2 or n_labels >= len(labels):
        per_sample = np.zeros(len(labels))
    else:
        per_sample = silhouette_samples(data, labels)
    # The mean of the per-sample values is what silhouette_score reports.
    return float(per_sample.mean()), per_sample


def optimize_and_compare_hdbscan(data, hdbscan_params, alpha=0.05):
    """Grid-search HDBSCAN and decide whether the tuned model beats the baseline.

    The baseline is ``HDBSCAN(min_cluster_size=20)``. Both models are fitted
    on the full ``data`` and compared by silhouette. The tuned model is
    chosen only if its mean silhouette is higher AND a paired t-test on the
    per-sample silhouette values is significant at ``alpha``.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    hdbscan_params : dict
        Parameter grid passed to ``GridSearchCV``.
    alpha : float, default 0.05
        Significance level for the paired t-test.

    Returns
    -------
    str
        ``"Grid Search Estimator"`` or ``"Default Parameter"``.

    Notes
    -----
    Prints both silhouette scores and the reasoning behind the choice.
    Noise points (label ``-1``) are scored as if they formed a cluster.
    """
    # Perform Grid Search
    grid_search = GridSearchCV(
        estimator=HDBSCAN(min_cluster_size=20),
        param_grid=hdbscan_params,
        scoring=silhouette_scorer,
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(data)
    # GridSearchCV (refit=True by default) has already refitted the winner on
    # the full data, so its labels can be read without fitting again.
    grid_search_labels = grid_search.best_estimator_.labels_

    default_labels = HDBSCAN(min_cluster_size=20).fit(data).labels_

    default_silhouette_score, default_samples = _silhouette_profile(data, default_labels)
    grid_search_silhouette_score, grid_samples = _silhouette_profile(data, grid_search_labels)

    improved = grid_search_silhouette_score > default_silhouette_score
    significant = False
    if improved:
        # Compare clustering quality, not the label integers: cluster ids are
        # arbitrary nominal values, so a t-test on them is meaningless. Both
        # vectors score the same points, hence a paired test.
        _, p_value = stats.ttest_rel(grid_samples, default_samples)
        significant = bool(p_value < alpha)

    choice = "Grid Search Estimator" if significant else "Default Parameter"

    # Output informative print statements
    print("Default HDBSCAN Silhouette Score:", default_silhouette_score)
    print("Grid Search Estimator Silhouette Score:", grid_search_silhouette_score)

    if not improved:
        print("Default Parameter has a higher silhouette score. No t-test performed.")
    elif significant:
        print("The difference between the two groups is statistically significant.")
        print(f"Using {choice} as it performs significantly better.")
    else:
        print("The difference between the two groups is not statistically significant.")
        print(f"Using {choice} as there is no significant improvement.")

    return choice

# Define the parameter grid for HDBSCAN
hdbscan_params = {
    'min_samples': [10, 30, 50, 60, 100],
    'min_cluster_size': [100, 200, 300, 400, 500, 600],
    'cluster_selection_method': ['eom', 'leaf'],
    'metric': ['euclidean', 'manhattan']
}

# Usage example with parameters
result = optimize_and_compare_hdbscan(train_without_outliers, hdbscan_params)

In [None]:
# Display the choice string returned by optimize_and_compare_hdbscan.
result

In [None]:
# Display the test-set PCA frame read from pca_test_df.csv.
pca_test_df

In [3]:
# Hyper-parameter grid handed to my_utils.OptimizeAndCompareKMeans.
# NOTE(review): n_clusters starts at 1, for which the silhouette score is
# undefined - confirm that my_utils handles that candidate.
kmeans_params = {
    'n_clusters': list(range(1, 10)),
    'init': ['random', 'k-means++'],
    'n_init': [1, 5, 10],
    'max_iter': [300],
    'random_state': [0]
}

# Create a pipeline
k_means_pipe = Pipeline([
    ("clusterer", my_utils.OptimizeAndCompareKMeans(kmeans_params)),
])

results = k_means_pipe.fit(pca_train_df)
best_estimator = results.named_steps['clusterer'].best_estimator
predictions_test = best_estimator.predict(pca_test_df)

# silhouette_score comes from the notebook's import cell rather than being
# imported in the middle of this cell.

# For training data: labels_ holds the assignments made while fitting
silhouette_train = silhouette_score(pca_train_df, best_estimator.labels_)

# For test data: reuse the predictions computed above instead of running
# predict() a second time on the same frame
silhouette_test = silhouette_score(pca_test_df, predictions_test)

print(f'Silhouette Score on training data: {silhouette_train}')
print(f'Silhouette Score on test data: {silhouette_test}')

Default KMeans Silhouette Score: 0.8797200918856573
Grid Search Estimator Silhouette Score: 0.8799544859177173
The difference between the two groups is not statistically significant.
Using Default KMeans Estimator as there is no significant improvement using a threshold of alpha = .05.
Silhouette Score on training data: 0.8797200918856573
Silhouette Score on test data: 0.8752799631951433


Silhouette score is robust

https://towardsdatascience.com/tuning-with-hdbscan-149865ac2970

Things to consider


1) Get the clustering to work in a pipeline
2) Are these OK default parameters?
3) What to do with the test data?
4) Should we be using silhouette in the grid search, or should I be optimizing it differently?
5) Are we normalizing it the right way? Since normalize works along rows, is this dealing with out-of-domain samples? Just used StandardScaler because none of the normalizers would work, and some of them operated along rows
6) Make it so it puts things in folders
7) Quality check NAs? How to do this with a sparse matrix?
8) Do something like she did in notebook 3 to label the data set
9) Naive classifier and base?
10) Would k-means be the base? What is naive?


Transposed Data

Task 2

In [None]:
# Same input files as before; this time load_data is asked for the
# transposed matrix.
data_path = '../Output/non_zero.parquet'
row_info_path = '../Output/row_names.csv'
column_info_path = '../Output/col_names.csv'
#sparse_matrix = coo_matrix((data['nonzero_elements'], (data['row_indices'], data['col_indices'])))

(
    sparse_matrix_trans,
    row_names_trans,
    col_names_trans,
    row_indices_trans,
) = my_utils.load_data(
    data_path,
    row_info_path,
    column_info_path,
    transpose=True,
)

In [None]:
# Single-step pipeline wrapping the sparse train/test splitter for the
# transposed matrix (20% test share, fixed seed).
splitter_trans = my_utils.SparseTrainTestSplit(
    test_size=0.2,
    random_state=42,
    row_indices=row_indices_trans,
)
split_pipeline_trans = Pipeline(steps=[
    ('splitter', splitter_trans),
    # Add other steps in the pipeline as needed
])

# Fit and transform in one call, then unpack the four returned objects.
split_outputs_trans = split_pipeline_trans.fit_transform(sparse_matrix_trans)
sparse_train_trans, sparse_test_trans, train_indices_trans, test_indices_trans = split_outputs_trans

In [None]:
# Cleaning -> scaling -> 2-component truncated SVD for the transposed data.
clean_and_pca_pipeline_trans = Pipeline([
    ('cleaner', my_utils.DataClean()),  # DataClean is performed first
    # with_mean=False skips centering; presumably required because the input
    # is a sparse matrix, which StandardScaler cannot centre - TODO confirm.
    ('scaler', StandardScaler(with_mean=False)),
    # TruncatedSVD's default solver is randomized; pin the seed (same value as
    # the train/test split) so the projection is reproducible across runs.
    ('pca', TruncatedSVD(n_components=2, random_state=42))
])

# Fit the pipeline on the training split only, so nothing from the test
# split leaks into the scaling or the SVD components.
clean_and_pca_pipeline_trans.fit(sparse_train_trans)
# Project both splits with the fitted pipeline
pca_sparse_train_trans = clean_and_pca_pipeline_trans.transform(sparse_train_trans)
pca_sparse_test_trans= clean_and_pca_pipeline_trans.transform(sparse_test_trans)

In [None]:
# Shared settings for labelling the two projected splits of the transposed data.
pc_columns_trans = ["PC1", "PC2"]
reindexer_trans = my_utils.Reindex(
    columns=pc_columns_trans,
    names=row_names_trans,
    output_folder="../Output",
)

# Train first, then test - same call order as before.
train_pca_df_trans = reindexer_trans.transform(
    pca_sparse_train_trans, train_indices_trans, "train"
)
test_pca_df_trans = reindexer_trans.transform(
    pca_sparse_test_trans, test_indices_trans, "test"
)

In [None]:
# Display the training projection of the transposed data.
train_pca_df_trans

In [None]:
# Run the PCA EDA step on the transposed training projection, passing the
# metadata frame along; three objects come back and are unpacked below.
eda_pca = my_utils.DataEDAPCA(columns=["PC1", "PC2"])
eda_outputs_trans = eda_pca.fit_transform(
    train_pca_df_trans,
    another_df=metadata,
    trans="trans",
)
updated_train_pca_df_trans, metadata_removed, outlier_df_trans = eda_outputs_trans

Clustering on the Transposed Data

In [None]:
import gzip
import os
import shutil
import pandas as pd
from pyarrow.parquet import read_table
from scipy.sparse import coo_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, TransformerMixin
from memory_profiler import memory_usage
import matplotlib.pyplot as plt
import seaborn as sns
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score
from sklearn.model_selection import GridSearchCV
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV
from scipy import stats
from sklearn.cluster import HDBSCAN
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import silhouette_score
from scipy import stats
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split

# You can also define custom functions, classes, and other code in this module.

import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix

In [None]:
def silhouette_scorer(estimator, X, y=None):
    """Score a clustering estimator by the silhouette of its own labelling of X.

    Parameters
    ----------
    estimator : object
        Clusterer exposing ``fit_predict``. It is (re)fitted on ``X`` because
        the labels are obtained through ``fit_predict``.
    X : array-like
        Samples to cluster and score.
    y : ignored
        Accepted so the function also matches the ``scorer(estimator, X, y)``
        calling convention; clustering has no targets.

    Returns
    -------
    float or int
        The silhouette score, or 0 when the score is undefined for the
        labelling produced.
    """
    labels = estimator.fit_predict(X)
    n_labels = len(set(labels))
    # The silhouette is only defined for 2 <= n_labels <= n_samples - 1;
    # outside that range silhouette_score raises, so report a neutral 0.
    if n_labels < 2 or n_labels >= len(labels):
        return 0
    return silhouette_score(X, labels)

def _silhouette_profile(data, labels):
    """Return ``(mean_score, per_sample_scores)`` for one labelling of ``data``.

    The silhouette is only defined when the number of distinct labels is
    between 2 and ``n_samples - 1``. HDBSCAN can produce a single label
    (for example every point flagged as noise, ``-1``), so such labellings
    get 0 for every sample instead of raising.
    """
    n_labels = len(set(labels))
    if n_labels < 2 or n_labels >= len(labels):
        per_sample = np.zeros(len(labels))
    else:
        per_sample = silhouette_samples(data, labels)
    # The mean of the per-sample values is what silhouette_score reports.
    return float(per_sample.mean()), per_sample


def optimize_and_compare_hdbscan(data, hdbscan_params, alpha=0.05):
    """Grid-search HDBSCAN and decide whether the tuned model beats the baseline.

    The baseline is ``HDBSCAN(min_cluster_size=20)``. Both models are fitted
    on the full ``data`` and compared by silhouette. The tuned model is
    chosen only if its mean silhouette is higher AND a paired t-test on the
    per-sample silhouette values is significant at ``alpha``.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    hdbscan_params : dict
        Parameter grid passed to ``GridSearchCV``.
    alpha : float, default 0.05
        Significance level for the paired t-test.

    Returns
    -------
    str
        ``"Grid Search Estimator"`` or ``"Default Parameter"``.

    Notes
    -----
    Prints both silhouette scores and the reasoning behind the choice.
    Noise points (label ``-1``) are scored as if they formed a cluster.
    """
    # Perform Grid Search
    grid_search = GridSearchCV(
        estimator=HDBSCAN(min_cluster_size=20),
        param_grid=hdbscan_params,
        scoring=silhouette_scorer,
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(data)
    # GridSearchCV (refit=True by default) has already refitted the winner on
    # the full data, so its labels can be read without fitting again.
    grid_search_labels = grid_search.best_estimator_.labels_

    default_labels = HDBSCAN(min_cluster_size=20).fit(data).labels_

    default_silhouette_score, default_samples = _silhouette_profile(data, default_labels)
    grid_search_silhouette_score, grid_samples = _silhouette_profile(data, grid_search_labels)

    improved = grid_search_silhouette_score > default_silhouette_score
    significant = False
    if improved:
        # Compare clustering quality, not the label integers: cluster ids are
        # arbitrary nominal values, so a t-test on them is meaningless. Both
        # vectors score the same points, hence a paired test.
        _, p_value = stats.ttest_rel(grid_samples, default_samples)
        significant = bool(p_value < alpha)

    choice = "Grid Search Estimator" if significant else "Default Parameter"

    # Output informative print statements
    print("Default HDBSCAN Silhouette Score:", default_silhouette_score)
    print("Grid Search Estimator Silhouette Score:", grid_search_silhouette_score)

    if not improved:
        print("Default Parameter has a higher silhouette score. No t-test performed.")
    elif significant:
        print("The difference between the two groups is statistically significant.")
        print(f"Using {choice} as it performs significantly better.")
    else:
        print("The difference between the two groups is not statistically significant.")
        print(f"Using {choice} as there is no significant improvement.")

    return choice

# Define the parameter grid for HDBSCAN
hdbscan_params = {
    'min_samples': [10, 30, 50, 60, 100],
    'min_cluster_size': [100, 200, 300, 400, 500, 600],
    'cluster_selection_method': ['eom', 'leaf'],
    'metric': ['euclidean', 'manhattan']
}

# Usage example with parameters
result = optimize_and_compare_hdbscan(updated_train_pca_df_trans, hdbscan_params)