In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw connectivity table from the 'clustermap 783' worksheet.
# NOTE(review): the workbook location is an absolute, machine-specific path;
# consider moving it to a configurable data directory.
df = pd.read_excel(
    io=r"D:\Gordon Lab\codex SEZ TPN project\Paper preparation\paper_final raw data\connectivity\ACh interneuron vs 2nd projection\2nd projection_final.xlsx",
    sheet_name='clustermap 783',
)
print(df)

In [None]:
# Clean the sheet: use the 'type' column as the row index, keep the row labels
# as a plain list, and build a copy in which missing entries are replaced by 0.
# NOTE: `df` is rebound here, so executing this cell a second time without
# reloading the sheet raises KeyError ('type' is no longer a column).
df = df.set_index(keys='type')
y = df.index.to_list()
df_a = df.fillna(value=0)

In [None]:
# Extract the zero-filled table as a plain NumPy array for the preprocessing steps
data_x = df_a.to_numpy()

In [None]:
# Row-normalize the data (held as a sparse matrix) and reduce its dimensionality with truncated SVD
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

# Convert to CSR and scale every row (axis=1) by its L2 norm.
sparse_matrix = normalize(csr_matrix(data_x), norm='l2', axis=1)

# Keep 10 SVD components; random_state is pinned so repeated runs agree.
n_components = 10
svd = TruncatedSVD(random_state=42, n_components=n_components)
reduced_data = svd.fit_transform(sparse_matrix)

# Total fraction of variance explained by the retained components.
print(svd.explained_variance_ratio_.sum())

In [None]:
#Perform hierarchical clustering (Ward linkage, Euclidean distance) on the SVD-reduced rows
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Linkage matrix describing the merge tree of the reduced rows.
Z = linkage(reduced_data, method='ward', metric='euclidean')

plt.figure(figsize=(20, 5))
# Pass the leaf labels as a plain list instead of a pandas Index: a list is
# accepted by every SciPy release, whereas an Index can trip the truthiness
# check on `labels` performed by older versions. The tick labels are unchanged.
dendrogram_result = dendrogram(
    Z,
    labels=df_a.index.tolist(),
    leaf_rotation=90,
    leaf_font_size=10,
)
#plt.savefig("GRN vs 2ndTPN dendrogram.pdf", format="pdf", dpi=300)

In [None]:
#Reorder the zero-filled table into dendrogram leaf order for plotting
# 'leaves' holds the original row positions in the order the leaves are drawn.
dendrogram_order = dendrogram_result['leaves']
# Index the already zero-filled `df_a` rather than `df`: the values are the
# same as df.iloc[...].fillna(0), but `df` gets a 'Cluster' column added in
# place by a later cell, so reading it here would leak the cluster labels into
# the reordered matrix whenever this cell is re-run after that one.
data_reordered = df_a.iloc[dendrogram_order]
#data_reordered.to_excel('Ordered GRN vs 2nd TPN clustering.xlsx')

In [None]:
#Perform Silhouette test to determine the number of clusters
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Candidate cluster counts: 2 through 14.
cluster_range = range(2, 15)
silhouette_scores = []

for n_clusters in cluster_range:
    # Average-linkage agglomerative clustering on correlation distance.
    clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='average', metric='correlation')
    cluster_labels = clustering.fit_predict(reduced_data)

    # Score the partition with the SAME distance that produced it.
    # silhouette_score defaults to Euclidean distance, which would judge the
    # correlation-based clusters in a different geometry.
    silhouette_avg = silhouette_score(reduced_data, cluster_labels, metric='correlation')
    silhouette_scores.append(silhouette_avg)

# Mean silhouette score as a function of the number of clusters.
plt.plot(cluster_range, silhouette_scores, marker='o')
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Score")
plt.title("Silhouette Score vs. Number of Clusters")
plt.grid(True)
#plt.savefig('Silhouette score GRN vs 2nd TPN.pdf',dpi=300)

In [None]:
# Run the final clustering (10 clusters, average linkage, correlation distance)
# on the reduced data and attach the resulting label of each row to the table.
from sklearn.cluster import AgglomerativeClustering

cluster = AgglomerativeClustering(linkage='average', metric='correlation', n_clusters=10)
cluster_labels = cluster.fit(reduced_data).labels_

# NOTE: this adds the 'Cluster' column to `df` in place, so any cell that reads
# `df` after this point also sees the labels.
df['Cluster'] = cluster_labels
#df.to_excel('1st TPN raw sheet with clusterlabels.xlsx', index=True)

In [None]:
# Wrap the SVD-reduced data in a DataFrame and put its rows in dendrogram leaf
# order, ready for the scatter plot and the distance matrix.
df_plot = pd.DataFrame(data=reduced_data)
reduceddata_reordered = df_plot.iloc[dendrogram_order, :]

In [None]:
# Heatmap of the pairwise distances between the reordered, SVD-reduced rows
import seaborn as sns
from sklearn.metrics.pairwise import pairwise_distances

# NOTE: despite its name, `corr_matrix` holds Euclidean distances
# (metric='euclidean'), not correlations or correlation distances.
corr_matrix = pairwise_distances(X=reduceddata_reordered, metric='euclidean')
sns.heatmap(data=corr_matrix, square=True, cmap='Reds')
#plt.savefig('GRN vs 2ndTPN correlation distance heatmap.pdf',dpi=300)