In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import Functions as fns

In [None]:
# Load all protein data
#
# Template cell: the two assignments below are placeholders and must be
# completed with real expressions before the notebook can run.
# Later cells select the protein columns with df_all[all_prots], so every
# entry of all_prots has to be a column label of df_all.

df_all = # INPUT File that contains all data of form (# subjects, # proteins + other variables)
all_prots = # INPUT list of names of all proteins to perform t-SNE on 

# Sanity check: report the number of rows in df_all and the number of listed proteins.
print('Uploaded data contains ' + str(len(df_all)) + ' subjects and ' + str(len(all_prots)) + ' proteins.')

In [None]:
# Standardise the protein measurements, then embed the proteins in 2-D with t-SNE

# Restrict to the protein columns and discard every row (subject) that has a
# missing value in any of them; the index is renumbered from 0 afterwards.
df_prot = (
    df_all[all_prots]
    .dropna()
    .reset_index(drop=True)
)

# Column-wise standardisation (StandardScaler defaults); the result is a NumPy array.
df_prot_n = StandardScaler().fit_transform(df_prot)

# The scaled matrix is transposed before fitting, so each embedded point
# corresponds to one protein (one column of df_prot), not to one subject.
tsne = TSNE(
    n_components=2,
    init='pca',
    random_state=1,  # fixed seed for reproducibility
)
comps = tsne.fit_transform(df_prot_n.T)

# Wrap the two embedding coordinates in a DataFrame for the plotting cells.
tsne_df = pd.DataFrame(comps, columns=['comp1', 'comp2'])
print('t-SNE done.')

In [None]:
# Plot t-SNE results (Figure X)

# Global figure settings: 10x6 inch canvas and 14 pt font.
plt.rcParams.update({"figure.figsize": (10, 6), "font.size": 14})

# Single-colour scatter of the embedding, one point per row of tsne_df.
basecol = 'skyblue'
x_coords = tsne_df['comp1']
y_coords = tsne_df['comp2']
plt.scatter(x_coords, y_coords, color=basecol, s=20)
plt.title('t-SNE')

# Hide the tick marks on both axes.
plt.xticks([])
plt.yticks([])

In [None]:
# Color the t-SNE map by a per-protein characteristic. Example: x-axis placement

# INPUT: array with one value per protein, ordered as all_prots.
characteristic = np.array(tsne_df['comp1'])

# Set colormap. plt.get_cmap is used because matplotlib.cm.get_cmap was
# deprecated in Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('viridis')
sep = np.linspace(0,1,len(all_prots))  # not used in this cell; kept in case other cells read it

# Min-max scale the characteristic to [0, 1] so the values can be fed to the colormap.
# MinMaxScaler expects a 2-D column, hence the reshape; the result is flattened
# back to 1-D so that cmap() returns one RGBA row per protein.
cols_scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(np.asarray(characteristic).reshape(-1, 1))
cscaled = cols_scaled.ravel()

# OPTIONAL Sort proteins so that those with the highest values are drawn last
# (on top) in the t-SNE map for visibility. The same ordering is applied to the
# colour values and to the points so that each point keeps its own colour.
idxs = np.argsort(cscaled)
cscaled = cscaled[idxs]

# Plot. idxs holds positions, so positional indexing (.iloc) is used.
plt.scatter(tsne_df['comp1'].iloc[idxs],tsne_df['comp2'].iloc[idxs],color=cmap(cscaled),s=20)
plt.title('t-SNE')
plt.xticks([])
plt.yticks([])

In [None]:
# Cluster the points of the t-SNE embedding with K-means

n_clusters = 20  # number of clusters; also read by the plotting and AUC cells below

kmeans = KMeans(
    n_clusters=n_clusters,
    random_state=1,  # fixed seed for reproducibility
)

# fit() returns the fitted estimator; labels_ holds one cluster id per row of tsne_df.
clusters = kmeans.fit(tsne_df)
cluster_labels = clusters.labels_

print('K-means done.')

In [None]:
# Plot K-means results in t-SNE plot (Figure X)

plt.rcParams["figure.figsize"] = (10,6)

# One colour per cluster, sampled evenly from the tab20 colormap.
colors = cm.tab20(np.linspace(0, 1, n_clusters))

# Draw each cluster as its own scatter series so it gets its own colour and legend label.
for i, col in enumerate(colors):
    clust_idx = cluster_labels == i      # boolean mask over the rows of tsne_df
    members = tsne_df.loc[clust_idx]     # points assigned to cluster i
    plt.scatter(members['comp1'], members['comp2'], label=f'Cluster {i}', color=col, s=20)

#plt.legend()  #Uncomment to check cluster number color
plt.title('t-SNE')
plt.xticks([])
plt.yticks([])

In [None]:
# Select baseline variables and X,y for cross validation of association 1

baseline_vars = ['age','gender_baseline_variable'] # INPUT of form [main predictor, confounders]
#X,y = # INPUT division into X and y of DataFrame depending on model. 
# NOTE: X and y must be defined (line above) before the rest of this cell can run.


# Baseline comparison without reference.
# get_mean_AUC_score comes from the local Functions module (not shown in this notebook).
baseline_AUC = fns.get_mean_AUC_score(X, y, baseline_vars)

# Cluster-wise evaluation: for every K-means cluster, pass the proteins assigned
# to it as candidates to fns.get_best_candidates and collect what it returns.
best_AUCs = []
best_AUCs_names = []
for n in range(n_clusters):
    clust_idx = cluster_labels == n
    # all_prots is ordered like the rows of the t-SNE embedding, so the mask
    # can be paired with the names directly.
    test_cands = [prot for prot, in_cluster in zip(all_prots, clust_idx) if in_cluster]
    best_AUC, names_best_AUC = fns.get_best_candidates(
        X,
        y,
        test_cands,
        baseline_vars,
        amount=len(test_cands),
        n_splits=10,
    )
    best_AUCs.append(best_AUC)
    best_AUCs_names.append(names_best_AUC)

In [None]:
# Plot cluster wise AUC results for association 

plt.rcParams["figure.figsize"] = (12,6)

# Dashed horizontal reference line at the baseline AUC.
plt.axhline(baseline_AUC,color='black',linestyle='dashed',label='Baseline')

# One marker per cluster: mean of the cluster's AUC values, with an error bar of
# one standard deviation drawn in the cluster's colour from the t-SNE plot.
for i, (clust_AUC, col) in enumerate(zip(best_AUCs, colors)):
    plt.errorbar(i,np.mean(clust_AUC),np.std(clust_AUC),fmt = 'o',color = 'black',
                ecolor = col, elinewidth = 5, capsize=10)

plt.title('Association 1')
plt.xlabel('Cluster Number')
# Tick positions follow n_clusters instead of a hard-coded 20, so the axis stays
# correct when the number of clusters is changed in the K-means cell.
plt.xticks(np.arange(n_clusters))
plt.ylabel('AUC')
plt.legend()