# Get data

In [1]:
import os
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize
from sklearn.decomposition import TruncatedSVD
import numpy as np
import my_utils
import gzip
import shutil
from sklearn.pipeline import Pipeline

In [None]:
input_gz_file = '../Output/GSE185948_count_RNA.rds.gz'
output_rds_file = '../Output/data_for_r.rds'

# Open the compressed file and extract it
with gzip.open(input_gz_file, 'rb') as f_in, open(output_rds_file, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

print(f'{input_gz_file} has been successfully uncompressed to {output_rds_file}.')

In [None]:
input_gz_file = '../Output/GSE185948_metadata_RNA.csv.gz'
output_csv_file = '../Output/uncompressed_metadata.csv'

# Open the compressed file and extract it
with gzip.open(input_gz_file, 'rb') as f_in, open(output_csv_file, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

print(f'{input_gz_file} has been successfully uncompressed to {output_csv_file}.')

In [None]:
output_path = '../Output'
if not os.path.exists(output_path):
    os.makedirs(output_path)

In [4]:
metadata = pd.read_csv('../Output/uncompressed_metadata.csv')

R code below

## Data Preparation

In [None]:
pip install --upgrade --no-deps memory_profiler

In [None]:
data_path = '../Output/non_zero.parquet'
row_info_path = '../Output/row_names.csv'
column_info_path = '../Output/col_names.csv'
#sparse_matrix = coo_matrix((data['nonzero_elements'], (data['row_indices'], data['col_indices'])))
sparse_matrix, column_names, row_names, row_indices= my_utils.load_data(data_path, row_info_path, column_info_path)

In [None]:
split_pipeline = Pipeline([
    ('splitter', my_utils.SparseTrainTestSplit(test_size=0.2, random_state=42, row_indices = row_indices)),
    # Add other steps in the pipeline as needed
])

# Fit and transform the pipeline
sparse_train, sparse_test, train_indices, test_indices = split_pipeline.fit_transform(sparse_matrix)


In [None]:
sparse_train

In [None]:
clean_and_pca_pipeline = Pipeline([
    ('cleaner', my_utils.DataClean()),  # CleanData is performed first
    ('scaler', StandardScaler(with_mean=False)),
    ('pca', TruncatedSVD(n_components=2))
])

# Fit the pipeline to training
clean_and_pca_pipeline.fit(sparse_train)
# Transform the training data
pca_sparse_train = clean_and_pca_pipeline.transform(sparse_train)

pca_sparse_test = clean_and_pca_pipeline.transform(sparse_test)

In [None]:
reindexer = my_utils.Reindex(columns=["PC1", "PC2"], names=row_names, output_folder="../Output")
train_pca_df = reindexer.transform(pca_sparse_train, train_indices, "train")
test_pca_df = reindexer.transform(pca_sparse_test, test_indices, "test")

In [None]:
eda_pca = my_utils.DataEDAPCA(columns=["PC1", "PC2"], trans=False, graphs=True)
pca_train_df, metadata_empty, outlier_df = eda_pca.fit_transform(train_pca_df)

## Clustering on the Train Data

In [None]:
pca_train_df = pd.read_csv('../Output/pca_train_df_without_outliers.csv', index_col = 0)
pca_test_df =  pd.read_csv('../Output/pca_test_df.csv', index_col=0)

In [None]:
import time
from sklearn.metrics import silhouette_score

# Start timing
start_time = time.time()

# Your existing code for HDBSCAN clustering
hdbscan_params = {
    'min_samples': [10, 30, 50, 60, 100],
    'min_cluster_size': [100, 200, 300, 400, 500, 600],
    'cluster_selection_method': ['eom', 'leaf'],
    'metric': ['euclidean', 'manhattan']
}
# Create a pipeline
hdbscan_pipe = Pipeline([
    ("clusterer", my_utils.Optimize_and_Compare_Hdbscan(hdbscan_params)),
])

results_hbd = hdbscan_pipe.fit(pca_train_df)
best_estimator_hbd = results_hbd.named_steps['clusterer'].best_estimator

try:
    # Calculate silhouette score for test data
    silhouette_test_hbd = silhouette_score(pca_test_df, best_estimator_hbd.fit_predict(pca_test_df))
    print(f'Silhouette Score on test data: {silhouette_test_hbd}')
except ValueError as e:
    print("Only one cluster for my test data set. HDBSCAN does not work well for this data set")
end_time = time.time()
elapsed_time = end_time - start_time
silhouette_train_hbd = silhouette_score(pca_train_df, best_estimator_hbd.labels_)
print(f'Silhouette Score on training data: {silhouette_train_hbd}')
print(f'Time taken: {elapsed_time / 60:.2f} minutes')

In [None]:
list_predictions = best_estimator_hbd.fit_predict(pca_test_df)

In [None]:
import time
# Start timing
start_time = time.time()
kmeans_params = {
    'n_clusters': list(range(1, 10)),
    'init': ['random', 'k-means++'],
    'n_init': [1, 5, 10],
    'max_iter': [300],
    'random_state': [0]
}

# Create a pipeline
k_means_pipe = Pipeline([
    ("clusterer", my_utils.OptimizeAndCompareKMeans(kmeans_params)),
])

results = k_means_pipe.fit(pca_train_df)
best_estimator = results.named_steps['clusterer'].best_estimator
predictions_test = best_estimator.predict(pca_test_df)

from sklearn.metrics import silhouette_score

# Assuming you have already fitted the pipeline and obtained the best_estimator

# For training data
silhouette_train = silhouette_score(pca_train_df, best_estimator.labels_)

# For test data
silhouette_test = silhouette_score(pca_test_df, best_estimator.predict(pca_test_df))

# Stop timing
end_time = time.time()
elapsed_time = end_time - start_time

print(f'Silhouette Score on training data: {silhouette_train}')
print(f'Silhouette Score on test data: {silhouette_test}')
print(f'Time taken: {elapsed_time / 60:.2f} minutes')

I will use the Kmeans as my best estimator.

In [None]:
scoring_df_untransposed = my_utils.create_labels_and_scoring_df(best_estimator, '../Output/best_estimator_untransposed_data_label_and_score', pca_train_df, pca_test_df)

In [None]:
# Use your best_estimator to predict labels for the data
cluster_labels = best_estimator.predict(pca_train_df)

# Add the cluster_labels to the training data DataFrame
pca_train_df['Cluster'] = cluster_labels

# Create a scatter plot to visualize the clustering
plt.figure(figsize=(8, 6))
sns.scatterplot(x='PC1', y='PC2', data=pca_train_df, hue='Cluster', palette='viridis', s=50, alpha=0.7)

# Add labels and title
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Clustering Visualization')

# Show the plot
plt.grid(True)
plt.legend(title='Cluster', loc='upper right')
plt.show()

https://towardsdatascience.com/tuning-with-hdbscan-149865ac2970

Things to consider


1) Get the clustering to work in a pipeline
2) Are these ok default parameters
3) What to do with the test data
4) Should we be using silhoute on grid search or should i be optimizaing it differently
5) Are we normalizing it the right way? Since normalize is working along rows is this dealing with out of domain sampels? Just used standard scalar because no normalizers could work and some of them did along rows
6) Make it so it put things in folders
7) Quality check NAS? How do to this with a sparse matrix
8) Do soemthing like she did in notebook 3 to label the data set
9) Naive classifier and base? 
10) Would k-means be base what is naive?


Transposed Data

Task 2

In [2]:
data_path = '../Output/non_zero.parquet'
row_info_path = '../Output/row_names.csv'
column_info_path = '../Output/col_names.csv'
#sparse_matrix = coo_matrix((data['nonzero_elements'], (data['row_indices'], data['col_indices'])))
sparse_matrix_trans, row_names_trans, col_names_trans, row_indices_trans= my_utils.load_data(data_path, row_info_path, column_info_path, transpose = True)

Returning Transposed matrix, row_names of the transposed matrix, col_names of the transposed matrix, and row_indices of transposed matrix


In [3]:
split_pipeline_trans = Pipeline([
    ('splitter', my_utils.SparseTrainTestSplit(
        test_size=0.2,
        random_state=42,
        row_indices=row_indices_trans
    )),
    # Add other steps in the pipeline as needed
])

# Fit and transform the pipeline
(
    sparse_train_trans,
    sparse_test_trans,
    train_indices_trans,
    test_indices_trans
) = split_pipeline_trans.fit_transform(sparse_matrix_trans)

## Demographic Cleaning 

In [5]:
cell_count = metadata['patient'].value_counts(normalize=True)
cell_count.columns = ['patient', 'percentage']
cell_count
## Information on how much each patients account for total cells

PKD2        0.106241
PKD6        0.104167
control4    0.099338
control2    0.093009
PKD1        0.090575
PKD3        0.088687
PKD4        0.088112
PKD5        0.077879
control5    0.075679
control3    0.065193
control1    0.062428
PKD7        0.024428
PKD8        0.024262
Name: patient, dtype: float64

In [6]:
def get_training_meta_data(row_names_list, train_indices, metadata):
    # Create a DataFrame from the row_names_list with the 'name' column
    row_names_df = pd.DataFrame(row_names_list, columns=['name'])

    # Subset the row_names_df using the train_indices
    subset_row_names_df = row_names_df.loc[train_indices]

    # Subset the metadata to include only rows with 'name' values from subset_row_names_df
    subset_metadata = metadata[metadata['name'].isin(subset_row_names_df['name'])]

    return subset_metadata

# Example usage of the function
subsetted_metadata = get_training_meta_data(row_names_trans, train_indices_trans, metadata)

In [7]:
import pandas as pd
import numpy as np

# Assuming you have the 'subsetted_metadata' DataFrame
group_counts = subsetted_metadata.groupby(['gender', 'disease']).size().reset_index(name='count')

# Calculate the minimum count
min_count = group_counts['count'].min()

# Create an empty DataFrame to store the even distribution sample
even_distribution_sample = pd.DataFrame(columns=subsetted_metadata.columns)

# Randomly sample rows for each group
for group_name, group_data in group_counts.groupby(['gender', 'disease']):
    group_size = group_data['count'].iloc[0]
    sample_size = min(min_count, group_size)
    
    # Randomly sample rows for the current group
    sampled_rows = subsetted_metadata[subsetted_metadata['gender'] == group_name[0]][subsetted_metadata['disease'] == group_name[1]].sample(n=sample_size, random_state=42)
    
    # Append the sampled rows to the even_distribution_sample DataFrame
    even_distribution_sample = even_distribution_sample.append(sampled_rows)

# Reset the index of the even_distribution_sample
even_distribution_sample = even_distribution_sample.reset_index(drop=True)

# Display the resulting DataFrame with an even distribution



  sampled_rows = subsetted_metadata[subsetted_metadata['gender'] == group_name[0]][subsetted_metadata['disease'] == group_name[1]].sample(n=sample_size, random_state=42)
  even_distribution_sample = even_distribution_sample.append(sampled_rows)
  sampled_rows = subsetted_metadata[subsetted_metadata['gender'] == group_name[0]][subsetted_metadata['disease'] == group_name[1]].sample(n=sample_size, random_state=42)
  even_distribution_sample = even_distribution_sample.append(sampled_rows)
  sampled_rows = subsetted_metadata[subsetted_metadata['gender'] == group_name[0]][subsetted_metadata['disease'] == group_name[1]].sample(n=sample_size, random_state=42)
  even_distribution_sample = even_distribution_sample.append(sampled_rows)
  sampled_rows = subsetted_metadata[subsetted_metadata['gender'] == group_name[0]][subsetted_metadata['disease'] == group_name[1]].sample(n=sample_size, random_state=42)
  even_distribution_sample = even_distribution_sample.append(sampled_rows)


In [8]:
balanced_name_list = even_distribution_sample['name'].to_list()

In [43]:
df_of_balanced_indices

Unnamed: 0,name
47294,PKD_ACGTAACCACTGCACG-1_6
34455,PKD_GCTGCAGCATGTGCTA-1_4
12668,PKD_CATTGAGGTCCAACGC-1_2
75143,Cont_GTTACGACAGCTGGTC-1_2
87188,Cont_CAACAACAGTGGAATT-1_4
...,...
41090,PKD_CCATAAGCATCGGTTA-1_5
60263,PKD_CACTAAGAGAAACCCG-1_8
82386,Cont_GTAGAGGGTTTCACAG-1_3
6265,PKD_GTGTTAGCACGAAGAC-1_1


In [35]:
df_of_balanced_indices.index

Int64Index([47294, 34455, 12668, 75143, 87188, 88489, 97095, 21165, 84510,
            98871,
            ...
            53707, 83104, 64925, 59735,   769, 41090, 60263, 82386,  6265,
            76820],
           dtype='int64', length=46304)

In [10]:
row_names_df = pd.DataFrame(row_names_trans, columns=['name'])
subset_row_names_df = row_names_df.loc[train_indices_trans]

In [14]:
df_of_balanced_indices = subset_row_names_df[subset_row_names_df['name'].isin(balanced_name_list)]

In [37]:
min(df_of_balanced_indices.index)

Int64Index([47294, 34455, 12668, 75143, 87188, 88489, 97095, 21165, 84510,
            98871,
            ...
            53707, 83104, 64925, 59735,   769, 41090, 60263, 82386,  6265,
            76820],
           dtype='int64', length=46304)

In [41]:
max(df_of_balanced_indices.index)

102708

In [42]:
len(sparse_train_trans_csr.indices)

168492710

In [40]:
max(sparse_train_trans_csr.indices)

27969

In [36]:
indices_to_reduce = np.array(df_of_balanced_indices.index)

# Create a boolean mask that selects rows based on the indices
mask = np.isin(sparse_train_trans_csr.indices, df_of_balanced_indices.index)

# Use the mask to filter the rows
filtered_sparse_matrix = sparse_train_trans_csr[mask, :]

IndexError: index (168492708) out of range

In [33]:
indices_to_reduce = np.array(df_of_balanced_indices.index)

# Create a boolean mask to select rows based on the indices
mask = np.isin(sparse_train_trans_csr.indices, indices_to_reduce)

# Use the mask to select the valid rows
reduced_rows = sparse_train_trans_csr[mask]

# Ensure the result is a CSR matrix
reduced_sparse_matrix = reduced_rows.tocsr()


IndexError: index (168492708) out of range

In [None]:
reduced_sparse_matrix

In [24]:
mask = np.isin(set(sparse_train_trans_csr.indices), df_of_balanced_indices.index)

{0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [None]:

# Step 2: Create a boolean mask based on the indices in 'df_of_balanced_indices'
mask = np.isin(sparse_train_trans_csr.indices, df_of_balanced_indices.index)

# Step 3: Use the mask to subset the indices and data efficiently
subset_indices = sparse_train_trans_csr.indices[mask]
subset_data = sparse_train_trans_csr.data[mask]

# Step 4: Create the final balanced sparse matrix
sparse_train_trans_balanced = csr_matrix((subset_data, subset_indices, sparse_train_trans_csr.indptr), shape=sparse_train_trans_csr.shape)

# Now, sparse_train_trans_balanced is your computationally efficient balanced csr_matrix


In [82]:
np.max(df_of_balanced_incices.index)

102708

In [79]:
len(df_of_balanced_incices.index)

46304

In [64]:
sparse_train_csr.indices

array([   14,    51,    55, ..., 25159, 25270, 25347], dtype=int32)

In [55]:
np.arange(sparse_train_csr.shape[0])

array([    0,     1,     2, ..., 82165, 82166, 82167])

In [36]:
subset_sparse_train_coo.shape

(33494, 27970)

In [21]:
clean_and_pca_pipeline_trans = Pipeline([
    ('cleaner', my_utils.DataClean(trans=True)),  # Specify that the data is transposed
    ('scaler', StandardScaler(with_mean=False)),
    ('pca', TruncatedSVD(n_components=2))
])
# Fit the pipeline to training
clean_and_pca_pipeline_trans.fit(sparse_train_trans)
# Transform the training data
pca_sparse_train_trans = clean_and_pca_pipeline_trans.transform(subset_sparse_train_coo)
pca_sparse_test_trans= clean_and_pca_pipeline_trans.transform(sparse_test_trans)

In [26]:
pca_sparse_train_trans

array([[ 42.94591579,  10.32372977],
       [ 36.07949886,   6.48549349],
       [ 90.31747513, -11.33265508],
       ...,
       [ 25.79444845,  -2.92319818],
       [ 76.11579509, -17.21161261],
       [ 14.27471937,  -1.68532862]])

In [25]:
reindexer_trans = my_utils.Reindex(columns=["PC1", "PC2"], names=row_names_trans, output_folder="../Output", trans = True)
train_pca_df_trans = reindexer_trans.transform(pca_sparse_train_trans,  df_of_balanced_incices.index, "train")
test_pca_df_trans = reindexer_trans.transform(pca_sparse_test_trans, test_indices_trans, "test")

ValueError: Shape of passed values is (33494, 2), indices imply (46304, 2)

In [None]:
eda_pca = my_utils.DataEDAPCA(columns=["PC1", "PC2"], trans = True, graphs = True)
updated_train_pca_df_trans, metadata_removed, outlier_df_trans= eda_pca.fit_transform(train_pca_df_trans,another_df=  metadata)

Clustering on the Transposed Data

In [None]:
import gzip
import os
import shutil
import pandas as pd
from pyarrow.parquet import read_table
from scipy.sparse import coo_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, TransformerMixin
from memory_profiler import memory_usage
import matplotlib.pyplot as plt
import seaborn as sns
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score
from sklearn.model_selection import GridSearchCV
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV
from scipy import stats
from sklearn.cluster import HDBSCAN
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import silhouette_score
from scipy import stats
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split

# You can also define custom functions, classes, and other code in this module.

import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix

In [None]:
def silhouette_scorer(estimator, X):
    labels = estimator.fit_predict(X)
    if len(set(labels)) == 1:
        return 0  # Silhouette score is undefined for a single cluster
    return silhouette_score(X, labels)

def optimize_and_compare_hdbscan(data, hdbscan_params, alpha=0.05):
    # Perform Grid Search
    grid_search = GridSearchCV(
        estimator=HDBSCAN(min_cluster_size=20),
        param_grid=hdbscan_params,
        scoring=silhouette_scorer,
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(data)
    grid_search_estimator = grid_search.best_estimator_

    # Calculate silhouette scores for the default and grid search estimators
    default_hdbscan = HDBSCAN(min_cluster_size=20).fit(data)
    default_labels = default_hdbscan.labels_
    default_silhouette_score = silhouette_score(data, default_labels)

    grid_search_labels = grid_search_estimator.fit_predict(data)
    grid_search_silhouette_score = silhouette_score(data, grid_search_labels)

    # Check if the grid search estimator has a higher silhouette score
    if grid_search_silhouette_score > default_silhouette_score:
        # Perform a two-sample t-test
        t_stat, p_value = stats.ttest_ind(default_labels, grid_search_labels)

        # Check if the p-value is less than the significance level
        if p_value < alpha:
            choice = "Grid Search Estimator"
        else:
            choice = "Default Parameter"
    else:
        choice = "Default Parameter"

    # Output informative print statements
    print("Default HDBSCAN Silhouette Score:", default_silhouette_score)
    print("Grid Search Estimator Silhouette Score:", grid_search_silhouette_score)

    if grid_search_silhouette_score > default_silhouette_score:
        if p_value < alpha:
            print("The difference between the two groups is statistically significant.")
            print(f"Using {choice} as it performs significantly better.")
        else:
            print("The difference between the two groups is not statistically significant.")
            print(f"Using {choice} as there is no significant improvement.")
    else:
        print("Default Parameter has a higher silhouette score. No t-test performed.")

    return choice

# Define the parameter grid for HDBSCAN
hdbscan_params = {
    'min_samples': [10, 30, 50, 60, 100],
    'min_cluster_size': [100, 200, 300, 400, 500, 600],
    'cluster_selection_method': ['eom', 'leaf'],
    'metric': ['euclidean', 'manhattan']
}

# Usage example with parameters
result = optimize_and_compare_hdbscan(updated_train_pca_df_trans, hdbscan_params)