In [4]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
from sklearn.metrics.cluster import adjusted_rand_score

In [73]:
# Dataset to process. Must be exactly 'HB' or 'BRCA1' (upper case): the loading
# cell compares this value against those literal strings.
dataset = 'HB'

In [74]:
# Pick the input files, the number of regions (k) and the output path for the
# selected dataset.
if dataset == 'HB':
    anno_df = pd.read_csv('barcode_annotations/hb_barcodes_annotated.csv')
    padded_df = pd.read_csv('barcode_annotations/hb_barcodes_labeled.csv')
    k = 7 # number of regions
    df = pd.read_csv('res/HB/NSFH_L=8_T=7_M=3639_HB_spatial_factors.csv')
    savename = 'res/HB/NSFH_HB_spatial_factors_clustered.csv'
elif dataset == 'BRCA1':
    anno_df = pd.read_csv('barcode_annotations/brca1_barcodes_annotated.csv')
    padded_df = pd.read_csv('barcode_annotations/brca1_barcodes_labeled.csv')
    k = 20 # number of regions
    df = pd.read_csv('res/BRCA1/NSFH_L=21_T=20_M=3789_BRCA1_spatial_factors.csv')
    savename = 'res/BRCA1/NSFH_BRCA1_spatial_factors_clustered.csv'
else:
    # Fail fast with a clear message instead of a NameError on anno_df below.
    raise ValueError("Unknown dataset %r; expected 'HB' or 'BRCA1'" % (dataset,))

anno_labels = list(anno_df['reg_labels'])

# Row positions of spots that carry a region label; a label of -1 marks a
# background spot and is excluded.
val_idx = [i for i, label in enumerate(anno_labels) if label != -1]
anno_arr = np.array(anno_labels)[val_idx]

# Keep only the labeled rows and drop the first three columns.
# NOTE(review): presumably the first three columns are non-factor metadata and
# the rows of df are in the same order as anno_df -- confirm against the files.
X = df.values[val_idx,3:]

# Scale every spatial component (column) to unit Euclidean norm. Work on a
# float64 copy so the result is a numeric array even if df.values was object
# dtype, and leave all-zero columns at zero instead of producing 0/0 = NaN.
X_float = np.double(X)
col_norms = np.linalg.norm(X_float, axis=0)
col_norms[col_norms == 0] = 1.0
X_norm = X_float / col_norms

In [75]:
# Cluster the normalized spatial components into k groups, where k is the
# number of regions. The fixed random_state keeps the run repeatable.
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(X_norm)

In [76]:
cluster_labels = kmeans.labels_

# Scatter the cluster label of every labeled spot back to its original row so
# the result lines up with anno_df for visualization.
# NOTE(review): rows not listed in val_idx keep the initial value 0, which is
# also a valid KMeans label -- downstream code cannot tell them apart.
full_clusters = np.zeros((len(anno_labels), 1))
full_clusters[val_idx, 0] = cluster_labels

In [77]:
# Map the per-spot cluster labels (aligned with anno_df) onto the rows of
# padded_df by matching barcodes. Rows of padded_df whose barcode is missing
# or not present in anno_df keep the initial value 0.
val_barcode_list = anno_df['barcode']
padded_df = padded_df.fillna(' ')  # missing values (incl. barcodes) become ' '
barcode_list = padded_df['barcode']
padded_clusters = np.zeros((len(barcode_list),1))

# Barcode -> row position in anno_df. setdefault keeps the first occurrence,
# matching a linear scan that stops at the first hit, while replacing the
# nested O(N*M) loop with one dictionary lookup per barcode.
barcode_to_row = {}
for j, val_barcode in enumerate(val_barcode_list):
    barcode_to_row.setdefault(val_barcode, j)

for i, barcode in enumerate(barcode_list):
    if barcode == ' ':
        # placeholder row without a barcode: nothing to look up
        continue
    j = barcode_to_row.get(barcode)
    if j is not None:
        padded_clusters[i] = full_clusters[j]

In [78]:
# Adjusted Rand index between the annotated region labels and the KMeans
# cluster labels of the same spots.
ari_score = adjusted_rand_score(anno_arr, cluster_labels)
print(ari_score)

0.24091976178303684


In [70]:
# Attach the barcode-matched cluster labels as a new 'clusters' column;
# padded_clusters has shape (n, 1), so flatten it to one value per row.
padded_df['clusters'] = padded_clusters.ravel()

In [72]:
# Write the table with its 'clusters' column to the dataset-specific output
# path, without the DataFrame index.
padded_df.to_csv(path_or_buf=savename, index=False)