In [None]:
import gzip
import shutil

In [None]:
input_gz_file = 'GSE185948_count_RNA.rds.gz'
output_rds_file = 'data_for_r.rds'

# Open the compressed file and extract it
with gzip.open(input_gz_file, 'rb') as f_in, open(output_rds_file, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

print(f'{input_gz_file} has been successfully uncompressed to {output_rds_file}.')

R code below

In [11]:
pip install memory_profiler

Collecting memory_profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Installing collected packages: memory_profiler
Successfully installed memory_profiler-0.61.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from pyarrow.parquet import read_table
from scipy.sparse import coo_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, TransformerMixin
from memory_profiler import memory_usage
import matplotlib.pyplot as plt
import seaborn as sns

from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score
from sklearn.model_selection import GridSearchCV

from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV
from scipy import stats
from sklearn.cluster import HDBSCAN
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import silhouette_score
from scipy import stats


In [3]:


def load_data(data_path, row_info_path, column_info_path):
    # Load non_zero parquet data
    table = read_table(data_path)
    nonzero_data = table.to_pandas()
    
    # Adjust column indices to be 0-based
    nonzero_data['col_indices'] = nonzero_data['col_indices'] - 1
    
    # Load row and column index info
    rows = pd.read_csv(row_info_path)
    row_names = rows.iloc[:, 1].to_list()
    
    columns = pd.read_csv(column_info_path)
    column_names = columns.iloc[:, 1].to_list()
    
    # Convert the sparse matrix to a dense DataFrame
    sparse_matrix = coo_matrix(
        (nonzero_data['nonzero_elements'], (nonzero_data['row_indices'], nonzero_data['col_indices'])),
        shape=(len(row_names), len(column_names))
    )
    
    
   

    print('Returning sparse_matrix, column_names, and row_names')
    
    return sparse_matrix, column_names, row_names

In [4]:
data_path = 'non_zero.parquet'
row_info_path = 'row_names.csv'
column_info_path = 'col_names.csv'
#sparse_matrix = coo_matrix((data['nonzero_elements'], (data['row_indices'], data['col_indices'])))
sparse_matrix, row_names, column_names = load_data(data_path, row_info_path, column_info_path)

Returning sparse_matrix, column_names, and row_names


In [5]:
row_indices = np.arange(sparse_matrix.shape[0])

In [6]:
from sklearn.model_selection import train_test_split
sparse_train, sparse_test, train_indices, test_indices = train_test_split(
    sparse_matrix, row_indices, test_size=0.2, random_state=42
)
del sparse_matrix
del row_indices

In [7]:
train_row_names = [row_names[i] for i in train_indices]
test_row_names = [row_names[i] for i in test_indices]


In [10]:
pipeline = Pipeline([
        ('scaler', StandardScaler(with_mean = False)),
        ('pca', TruncatedSVD(n_components=2))
    ])

In [11]:
pipeline.fit(sparse_train)

# Transform the training data
pca_sparse_train = pipeline.transform(sparse_train)

In [12]:
pca_sparse_test = pipeline.transform(sparse_test)

In [13]:
train_pca_df = pd.DataFrame(pca_sparse_train, columns=["PC1", "PC2"], index=train_row_names)
test_pca_df = pd.DataFrame(pca_sparse_test, columns=["PC1", "PC2"], index=test_row_names)

In [14]:
train_pca_df

Unnamed: 0,PC1,PC2
PKD_CATCGCTCACTCAGAT-1_3,3.297208,-0.093384
PKD_GGACGTCGTATGGGAC-1_2,1.092518,-0.574449
PKD_AGTACTGCAATGCAGG-1_3,2.110547,-1.040423
PKD_CTGCCATTCTTCGTAT-1_2,269.334105,-184.999588
PKD_CAGCAATAGTCGGCCT-1_3,0.138904,-0.014087
...,...,...
PKD_ATCCTATGTTCCTAGA-1_3,39.704073,-17.631512
PKD_GGCTGTGAGGAACGAA-1_1,29.823898,-15.172676
PKD_AGGCCACCACAACCGC-1_1,83.125263,-6.222054
PKD_GGAATGGAGCCAAGGT-1_2,18.421691,-10.114604


In [15]:
test_pca_df

Unnamed: 0,PC1,PC2
PKD_CTCATCGGTTACACAC-1_2,69.309507,-36.936357
PKD_GTATTGGTCCCAATAG-1_3,0.051566,-0.035225
PKD_TCCTGCAAGGACGCTA-1_2,81.411556,-31.177147
PKD_GACTGATAGCACACCC-1_2,15.758585,-6.156989
PKD_GGGTCACAGTCATCGT-1_2,1.104429,-0.615846
...,...,...
PKD_GGGAAGTTCAAGCTTG-1_3,0.111826,-0.023407
PKD_CATCGGGGTAGGGAGG-1_3,0.236129,0.025927
PKD_TCCCAGTTCAGTCCGG-1_1,27.622148,-9.551978
PKD_TGTAGACCACCGGCTA-1_1,12.683483,-5.135039


In [16]:
train_pca_df.to_csv('train_pca_df.csv')
test_pca_df.to_csv('test_pca_df.csv')


## Clustering on the Train Data

In [18]:
def optimize_and_compare_kmeans(data, kmeans_params, alpha=0.05):
    # Perform Grid Search
    grid = GridSearchCV(KMeans(), kmeans_params, cv=3, refit=True)
    grid.fit(data)
    grid_search_estimator = grid.best_estimator_

    # Calculate silhouette scores
    default_kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto").fit(data)
    default_silhouette_score = silhouette_score(data, default_kmeans.labels_)
    grid_search_silhouette_score = silhouette_score(data, grid_search_estimator.labels_)

    # Perform a two-sample t-test only if Grid Search performs better
    if grid_search_silhouette_score > default_silhouette_score:
        t_stat, p_value = stats.ttest_ind(default_kmeans.labels_, grid_search_estimator.labels_)

        # Set the default choice to "Grid Search Estimator"
        choice = "Grid Search Estimator"

        # Output informative print statements
        print("Default KMeans Silhouette Score:", default_silhouette_score)
        print("Grid Search Estimator Silhouette Score:", grid_search_silhouette_score)

        if p_value < alpha:
            print("The difference between the two groups is statistically significant.")
            print(f"Using {choice} as it performs significantly better using a threshold of alpha = .05 .")
        else:
            print("The difference between the two groups is not statistically significant.")
            print(f"Using {choice} as there is no significant improvement using a threshold of alpha = .05.")
    else:
        choice = "Default Parameter"
        print("Default Parameter has a higher Silhouette Score.")
        print("Using Default Parameter as it performs better based on Silhouette Score.")

    return choice

# Usage example with parameters
kmeans_params = {
    'n_clusters': list(range(1, 10)),
    'init': ['random', 'k-means++'],
    'n_init': [1, 5, 10],
    'max_iter': [300],
    'random_state': [0]
}
result = optimize_and_compare_kmeans(train_pca_df, kmeans_params)

Default Parameter has a higher Silhouette Score.
Using Default Parameter as it performs better based on Silhouette Score.


In [None]:
from sklearn.cluster import HDBSCAN

In [1]:
def silhouette_scorer(estimator, X):
    labels = estimator.fit_predict(X)
    if len(set(labels)) == 1:
        return 0  # Silhouette score is undefined for a single cluster
    return silhouette_score(X, labels)

def optimize_and_compare_hdbscan(data, hdbscan_params, alpha=0.05):
    # Perform Grid Search
    grid_search = GridSearchCV(
        estimator=HDBSCAN(min_cluster_size=20),
        param_grid=hdbscan_params,
        scoring=silhouette_scorer,
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(data)
    grid_search_estimator = grid_search.best_estimator_

    # Calculate silhouette scores for the default and grid search estimators
    default_hdbscan = HDBSCAN(min_cluster_size=20).fit(data)
    default_labels = default_hdbscan.labels_
    default_silhouette_score = silhouette_score(data, default_labels)

    grid_search_labels = grid_search_estimator.fit_predict(data)
    grid_search_silhouette_score = silhouette_score(data, grid_search_labels)

    # Perform a two-sample t-test
    t_stat, p_value = stats.ttest_ind(default_labels, grid_search_labels)

    # Set the default choice to "Grid Search Estimator"
    choice = "Grid Search Estimator"

    # Check if the p-value is less than the significance level
    if p_value < alpha:
        choice = "Grid Search Estimator"
    else:
        choice = "Default Parameter"

    # Output informative print statements
    print("Default HDBSCAN Silhouette Score:", default_silhouette_score)
    print("Grid Search Estimator Silhouette Score:", grid_search_silhouette_score)

    if p_value < alpha:
        print("The difference between the two groups is statistically significant.")
        print(f"Using {choice} as it performs significantly better.")
    else:
        print("The difference between the two groups is not statistically significant.")
        print(f"Using {choice} as there is no significant improvement.")

    return choice

# Define the parameter grid for HDBSCAN
hdbscan_params = {
    'min_samples': [10, 30, 50, 60, 100],
    'min_cluster_size': [100, 200, 300, 400, 500, 600],
    'cluster_selection_method': ['eom', 'leaf'],
    'metric': ['euclidean', 'manhattan']
}

# Usage example with parameters
result = optimize_and_compare_hdbscan(sparse_train_pca_df, hdbscan_params)

NameError: name 'sparse_train_pca_df' is not defined

https://towardsdatascience.com/tuning-with-hdbscan-149865ac2970

Things to consider


1) Get the clustering to work in a pipeline
2) Are these ok default parameters
3) What to do with the test data
4) Should we be using silhoute on grid search or should i be optimizaing it differently
5) Are we normalizing it the right way? Since normalize is working along rows is this dealing with out of domain sampels?
6) Make it so it put things in folders
7) Quality check NAS? How do to this with a sparse matrix
8) Do soemthing like she did in notebook 3 to label the data set
