# Figure 3. Correlation

In [None]:
import pandas as pd
import numpy as np

# Never truncate long cell contents when a DataFrame is displayed.
# `None` is the supported "no limit" value; the legacy sentinel -1 was
# deprecated in pandas 1.0 and is rejected with a ValueError by pandas 2.x.
pd.set_option('display.max_colwidth', None)

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import squareform

import networkx as nx
import matplotlib.pyplot as plt
from networkx.drawing.nx_pydot import graphviz_layout

## Load Data

In [None]:
# Read the combined TCGA + GTEx expression table.
df = pd.read_csv('./New_Data/TCGA_GTEX_SLC_103.csv')

# Remove the GTEx samples (rows whose label_GTEx_100 equals 100), then remove
# the metadata columns so that only the SLC columns remain.
df = (
    df.loc[df['label_GTEx_100'] != 100]
      .drop(columns=['sample', 'TCGA_GTEX_main_category', 'cancer', 'label', 'label_GTEx_100'])
)

# Pairwise Pearson correlation between the remaining columns, saved for reuse.
df_corr = df.corr(method='pearson')
df_corr.to_csv('New_Data/correlation_103.csv')

## Correlation Matrix

In [None]:
# SLC names (row/column labels of the correlation matrix) and the matrix itself
# as a plain array, which is much cheaper to index in a loop than `.iloc`.
idx = df_corr.index.values
corr_arr = np.array(df_corr)

# Gene pairs binned by their Pearson r:
#   corr_1: moderate positive   0.3 <  r <  0.7
#   corr_2: strong positive            r >= 0.7
#   corr_3: moderate negative  -0.7 <  r < -0.3
#   corr_4: strong negative            r <= -0.7
corr_1, corr_2, corr_3, corr_4 = [], [], [], []

# Visit only the strict upper triangle (j > i): every unordered pair is seen
# exactly once and the diagonal is skipped. A NaN correlation fails every
# comparison below and therefore lands in no bin.
n_genes = corr_arr.shape[0]
for i in range(n_genes):
    for j in range(i + 1, n_genes):
        tmp = corr_arr[i, j]
        pair = (idx[i], idx[j], tmp)
        if 0.3 < tmp < 0.7:
            corr_1.append(pair)
        elif tmp >= 0.7:
            corr_2.append(pair)
        elif -0.7 < tmp < -0.3:
            corr_3.append(pair)
        elif tmp <= -0.7:
            corr_4.append(pair)

# Turn each bin into a DataFrame with one row per gene pair.
pair_columns = ['name1', 'name2', 'corr_value']
df_corr_1 = pd.DataFrame(corr_1, columns=pair_columns)
df_corr_2 = pd.DataFrame(corr_2, columns=pair_columns)
df_corr_3 = pd.DataFrame(corr_3, columns=pair_columns)
df_corr_4 = pd.DataFrame(corr_4, columns=pair_columns)

# Edge list for the correlation network. The strong-negative bin (corr_4) was
# previously collected but never included, so those pairs were silently lost.
# Empty bins are left out of the concat so they cannot influence the resulting
# column dtypes.
non_empty_bins = [
    frame for frame in (df_corr_1, df_corr_2, df_corr_3, df_corr_4)
    if not frame.empty
]
df_corr_ = pd.concat(non_empty_bins) if non_empty_bins else pd.DataFrame(columns=pair_columns)

## Hierarchical Clustering

Reference: https://www.kaggle.com/sgalella/correlation-heatmaps-with-hierarchical-clustering

In [None]:
# ---- Dendrogram -------------------------------------------------------------
plt.figure(figsize=(20,5))

# Convert correlation into a distance: |r| near 1 gives a distance near 0, so
# strongly correlated and strongly anti-correlated genes both count as "close".
dissimilarity = 1 - abs(df_corr)
# squareform condenses the square matrix into the vector form linkage expects.
# 'complete' linkage measures cluster distance by the farthest pair of members.
Z = linkage(squareform(dissimilarity), 'complete')

# Labels are handed over as a plain list; array-like labels have reportedly
# tripped a truthiness check in some SciPy releases, a list is always accepted.
dendrogram(Z, labels=df_corr.columns.tolist(), orientation='top',
           leaf_rotation=90);

#plt.savefig('Figure/dendrogram.png')

# ---- Flat clusters ----------------------------------------------------------
# Cut height chosen by eye: the blue links in the dendrogram merge at about 0.8.
threshold = 0.8
labels = fcluster(Z, threshold, criterion='distance')

# One row per gene with its cluster id. Built directly from the two aligned
# sequences, so it works for any number of genes (the gene count used to be
# hard-coded as 103).
df_match = pd.DataFrame({'SLC': df_corr.columns.to_numpy(), 'cluster': labels})

# For every cluster: its id, its member genes and how many members it has.
match_list = []
for c in df_match['cluster'].unique().tolist():
    members = df_match.loc[df_match['cluster'] == c, 'SLC'].tolist()
    match_list.append([c, members, len(members)])

# Clusters ordered from largest to smallest.
match_group = pd.DataFrame(match_list, columns=['cluster', 'SLC', 'count']).sort_values(by='count', ascending=False).reset_index(drop=True)

# ---- Heatmap of the largest clusters ----------------------------------------
# Expression columns of the (up to) five largest clusters, cluster by cluster.
# head() copes with fewer than five clusters, where indexing 0..4 would raise.
n_top_clusters = 5
top_cluster_genes = [
    gene
    for cluster_genes in match_group['SLC'].head(n_top_clusters)
    for gene in cluster_genes
]
df_5 = df[top_cluster_genes]

plt.figure(figsize=(15,10))
correlations = df_5.corr()
sns.heatmap(round(correlations,2), cmap='RdBu', annot=False,
            annot_kws={"size": 7}, vmin=-1, vmax=1);

#plt.savefig('Figure/cluster_heatmap_top5.png')

In [None]:
# Cluster-ordered correlation heatmap.
# Sorting the flat-cluster ids yields a column order in which genes that share
# a cluster sit next to each other.
labels_order = np.argsort(labels)

# Rebuild the expression frame with its columns in that order. `labels` follows
# the column order of df_corr (the matrix that was clustered), so the names are
# taken from df_corr. A single selection replaces the former column-by-column
# concat loop, whose loop variable also overwrote the SLC-name array `idx`.
clustered = df[df_corr.columns[labels_order]]

plt.figure(figsize=(15,10))
correlations = clustered.corr()
sns.heatmap(round(correlations,2), cmap='RdBu', annot=False,
            annot_kws={"size": 7}, vmin=-1, vmax=1);

plt.savefig('Figure/cluster_heatmap.png')

## Clustered SLC
Reference: https://choiseokwon.tistory.com/165

In [None]:
# Network analysis means correlation between SLC biomarkers.
# Network view of the correlations between SLC biomarkers: every row of
# df_corr_ (name1, name2, corr_value) becomes one edge name1 -> name2.
# NOTE(review): correlation is symmetric, so an undirected nx.Graph may be the
# better fit; DiGraph is kept so the figure keeps its current look.
G = nx.from_pandas_edgelist(df_corr_, 'name1' , 'name2', create_using = nx.DiGraph())
plt.figure(figsize=(25,25))

# Without an explicit `pos`, nx.draw computes a randomly initialised spring
# layout, so the saved figure differed on every run. A fixed seed makes it
# reproducible.
pos = nx.spring_layout(G, seed=0)

# Node area scales with degree, i.e. with the number of correlated partners.
nx.draw(G, pos=pos, node_size=[degree*300 for _, degree in G.degree()], edge_color='#a7aaad',node_color='#72aee6', with_labels=True, font_size=20)
plt.savefig('./Figure/103_corr_graph_2.png')

In [None]:
# Hand-picked SLC genes to inspect more closely
# (presumably the members of the largest clusters found earlier; confirm).
SLC_cluster_list = [
    'SLC17A4', 'SLC16A4', 'SLC28A1', 'SLC15A4', 'SLC6A3', 'SLC22A11', 'SLC22A2', 'SLC34A1',
    'SLC12A5', 'SLC10A4', 'SV2C', 'SLC30A3', 'SLC32A1', 'SLCO1A2', 'SFXN5',
    'SLC30A6', 'SLC30A7', 'SLC25A43', 'NIPA2', 'SLC25A24', 'FLVCR1',
]

# Square block of the correlation matrix restricted to those genes
# (select the columns, transpose, select the columns again).
df_cor_clu = df_corr[SLC_cluster_list].T[SLC_cluster_list]

names_clu = list(df_cor_clu.columns)

# One [from, to, value] record per unordered gene pair, walking the strict
# upper triangle row by row.
gene_count = len(df_cor_clu)
ch_clu = [
    [names_clu[row], names_clu[col], df_cor_clu.iloc[row, col]]
    for row in range(gene_count)
    for col in range(row + 1, gene_count)
]

ch_df1 = pd.DataFrame(ch_clu, columns=['from','to','value'])

# Keep only the SLC pairs with a meaningful correlation, i.e. |value| >= 0.3.
ch_df2 = ch_df1[ch_df1['value'].abs() >= 0.3]
ch_df2.to_csv('New_Data/correlation_21_from_to.csv',index=False)

In [None]:
# Network for the clusters with more than 5 SLC biomarkers: every row of
# ch_df2 (from, to, value) becomes one edge from -> to.
# NOTE(review): correlation is symmetric, so an undirected nx.Graph may be the
# better fit; DiGraph is kept so the figure keeps its current look.
G = nx.from_pandas_edgelist(ch_df2, 'from' , 'to', create_using = nx.DiGraph())
plt.figure(figsize=(25,25))

# Without an explicit `pos`, nx.draw computes a randomly initialised spring
# layout, so the saved figure differed on every run. A fixed seed makes it
# reproducible.
pos = nx.spring_layout(G, seed=0)

# Node area scales with degree, i.e. with the number of correlated partners.
nx.draw(G, pos=pos, node_size=[degree*300 for _, degree in G.degree()], edge_color='#a7aaad', node_color='#72aee6', with_labels=True, font_size=20)

plt.savefig('./Figure/corr_clusterd_3_graph_2.png')