# Hand Drawn Cluster Labeler

This code uses pre-drawn DAPI surface image data to automatically group the surfaces into clusters and return the cluster labels to the original data set.

In [13]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import glob
import numpy as np
from sklearn.preprocessing import StandardScaler
import hdbscan


# 1. Collect the candidate CSVs.
#    glob returns matches in arbitrary (filesystem-dependent) order, so sort
#    them: otherwise "the first CSV" -- which a later cell overwrites -- could
#    differ between runs or machines when the folder holds several files.
csv_files = sorted(glob.glob('csv_for_tools/*.csv'))

# 2. Error if none found
if not csv_files:
    raise FileNotFoundError("No CSV files found in 'csv_for_tools/'")

# 3. Pick the first CSV (alphabetically)
first_csv = csv_files[0]
print(f"Reading: {first_csv}")

# 4. Read into a DataFrame
df = pd.read_csv(first_csv)

# Display the loaded frame (pandas truncates the rendering for large frames)
df

Reading: csv_for_tools/merged_for_tools.csv


Unnamed: 0,POSITION X,POSITION Y,POSITION Z,UNIT,CATEGORY,COLLECTION,TIME,ID,UNNAMED: 8,CD3,...,CLUSTER POSITION,H3CIT CD68- LY6G+,IL10,CD40,LY6G IN CONTACT WITH CD68,PDL1,CD68,H3CIT LY6G CLOSE TO CD68,H3CIT,cell_type
0,65165.043,47718.012,-1,µm,Surface,Position,1,3371,0,0,...,0,0,0,0,0,0,0,0,0,tumor cell / other immune cell
1,65026.387,47843.957,0,µm,Surface,Position,1,3372,0,0,...,0,0,0,0,0,0,0,0,0,tumor cell / other immune cell
2,64990.465,47872.090,-2,µm,Surface,Position,1,3373,0,0,...,0,0,0,0,0,1,0,0,0,tumor cell / other immune cell
3,65164.086,47707.301,-1,µm,Surface,Position,1,3375,0,0,...,0,0,0,0,0,0,0,0,0,tumor cell / other immune cell
4,64875.699,47986.750,-2,µm,Surface,Position,1,3376,0,0,...,0,0,0,0,0,0,0,0,0,tumor cell / other immune cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232424,61669.824,43809.270,-2,µm,Surface,Position,1,291754,0,0,...,0,0,0,0,0,0,1,0,0,macrophage
232425,61405.605,44071.590,-1,µm,Surface,Position,1,291759,0,0,...,0,0,0,0,0,0,0,0,0,tumor cell / other immune cell
232426,61917.965,43565.699,0,µm,Surface,Position,1,291771,0,0,...,0,0,1,0,0,0,0,0,0,tumor cell / other immune cell
232427,61681.199,43803.793,0,µm,Surface,Position,1,291785,0,0,...,0,0,0,0,0,0,0,0,0,tumor cell / other immune cell


In [14]:
# Replace the 0 entries of 'CLUSTER POSITION' with the label 'not in cluster'.
# NOTE: this modifies `df` itself, so the relabelled column is what gets
# written out if `df` is saved later in the notebook.
df['CLUSTER POSITION'] = df['CLUSTER POSITION'].replace(0, 'not in cluster')

# Keep only the rows that were not relabelled above.
# .copy() makes `cluster_df` an independent frame, so adding columns to it
# later does not raise SettingWithCopyWarning or depend on view-vs-copy
# semantics of the boolean-mask slice.
cluster_df = df[df['CLUSTER POSITION'] != 'not in cluster'].copy()
cluster_df

Unnamed: 0,POSITION X,POSITION Y,POSITION Z,UNIT,CATEGORY,COLLECTION,TIME,ID,UNNAMED: 8,CD3,...,CLUSTER POSITION,H3CIT CD68- LY6G+,IL10,CD40,LY6G IN CONTACT WITH CD68,PDL1,CD68,H3CIT LY6G CLOSE TO CD68,H3CIT,cell_type
3662,65161.027,47243.641,0,µm,Surface,Position,1,8858,0,0,...,1,0,0,1,0,0,0,0,0,other myeloid cell
3730,65171.816,47235.633,0,µm,Surface,Position,1,8955,0,0,...,1,0,0,0,0,0,1,0,0,macrophage
3733,65179.609,47229.664,-1,µm,Surface,Position,1,8959,0,0,...,1,0,1,0,0,0,1,0,0,macrophage
3753,65174.027,47236.312,-2,µm,Surface,Position,1,8981,0,0,...,1,0,0,0,0,0,0,0,0,other myeloid cell
3786,65160.281,47252.316,0,µm,Surface,Position,1,9023,0,0,...,1,0,0,0,1,0,1,0,0,macrophage
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227109,63888.074,42128.328,-1,µm,Surface,Position,1,282042,0,0,...,1,0,0,0,0,0,0,0,0,tumor cell / other immune cell
227129,63890.926,42121.480,-1,µm,Surface,Position,1,282072,0,0,...,1,0,0,0,0,0,0,0,0,tumor cell / other immune cell
227131,63882.266,42130.434,-1,µm,Surface,Position,1,282074,0,0,...,1,0,0,0,0,1,0,0,0,tumor cell / other immune cell
227138,63911.227,42106.164,-1,µm,Surface,Position,1,282082,0,0,...,1,0,0,0,0,0,0,0,0,other myeloid cell


In [None]:
# Cluster the rows of `cluster_df` on their X/Y position with HDBSCAN, write
# the resulting labels back to both `cluster_df` and the full `df`, then plot.
#
# TODO: assume future data frames have a column called "cluster already" with
# 0's and 1's, and use it to select the subset for clustering before returning
# the new labels to the full df.

# 1) Guard: scaling / clustering an empty subset fails with an obscure error
if cluster_df.empty:
    raise ValueError("cluster_df is empty: no rows available for clustering")

# 2) Build the feature matrix from the 2-D positions
#    NOTE(review): StandardScaler rescales X and Y independently, so distances
#    are no longer isotropic in µm -- confirm this is intended for spatial data.
coords = cluster_df[['POSITION X', 'POSITION Y']].values
coords_scaled = StandardScaler().fit_transform(coords)

# 3) Run HDBSCAN
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=30,
    min_samples=5,
    cluster_selection_epsilon=0.0
)
labels = clusterer.fit_predict(coords_scaled)

# 4) Attach labels to the clustered subset.
#    `labels` has the same length and order as `cluster_df` (NOT `df`).
#    .assign builds a new frame, so this is safe even if `cluster_df` was
#    created as a slice of `df`.
cluster_df = cluster_df.assign(hdbscan_cluster=labels)

# 5) Return the labels to the full data set, aligned on the row index.
#    Rows that were not part of the clustered subset get <NA>; the nullable
#    'Int64' dtype keeps real labels as integers and keeps <NA> distinct from
#    HDBSCAN's own noise label (-1).
df['hdbscan_cluster'] = pd.Series(labels, index=cluster_df.index).astype('Int64')

# 6) Quick check of cluster counts
unique, counts = np.unique(labels, return_counts=True)
print("Cluster label counts:", dict(zip(unique, counts)))

# 7) Prepare for plotting (noise points, label -1, are omitted)
mask = labels != -1
df_plot = cluster_df.loc[mask, ['POSITION X', 'POSITION Y']].copy()
df_plot['cluster'] = cluster_df.loc[mask, 'hdbscan_cluster'].astype(str)

# 8) Plot
n_clusters = len(unique) - (1 if -1 in unique else 0)
plt.figure(figsize=(10, 8))

# With many clusters use the larger palette and drop the legend to keep the
# figure readable.
palette = 'tab20' if n_clusters > 10 else 'tab10'
legend = False if n_clusters > 10 else 'full'

sns.scatterplot(
    data=df_plot,
    x='POSITION X', y='POSITION Y',
    hue='cluster',
    palette=palette,
    s=30, linewidth=0,
    legend=legend
)
if legend:
    plt.legend(title='cluster', bbox_to_anchor=(1.05,1), loc='upper left')
plt.title(f'HDBSCAN: {n_clusters} clusters')
plt.xlabel('Position X (µm)')
plt.ylabel('Position Y (µm)')
plt.tight_layout()
plt.show()
cluster_df



# Review the output above before continuing

If the graph above looks correct, move on to the next cell below. Running the next cell will overwrite the CSV file in place, adding the cluster labels column. Make sure you are OK with this before running the next cell.

In [9]:
# 5) (Optional) Persist the DataFrame: writes `df` back to the same path it
#    was read from (overwriting that file), without the row index.
df.to_csv(path_or_buf=first_csv, index=False)