In [28]:
import pandas as pd
import numpy as np
from dtg_utils import *

In [29]:
def cal_pair_distance(row, embedding):
    """Return the distance between the two embedding rows referenced by ``row``.

    The last two items of ``row`` are treated as labels into
    ``embedding.index``. When both labels are present, the corresponding rows
    of ``embedding`` are passed to ``euclidean`` and its result is returned;
    otherwise ``np.nan`` is returned.

    NOTE(review): ``euclidean`` is not defined in this block; presumably it is
    provided by ``from dtg_utils import *`` -- confirm.
    """
    first_label, second_label = row[-2], row[-1]
    known_labels = embedding.index

    # Guard clause: a label missing from the embedding means no distance.
    if first_label not in known_labels or second_label not in known_labels:
        return np.nan

    return euclidean(embedding.loc[first_label], embedding.loc[second_label])
    
def cal_batch_pair_distance(embedding, embedding_index, cluster_df, base_index_col = 1):
    """Annotate every row of ``cluster_df`` with a ``distance_change`` value.

    For each row, ``int(row['gene'])`` is compared against the column of
    ``embedding_index`` at position ``base_index_col``. The first matching row
    of ``embedding_index`` is handed to ``cal_pair_distance``, which uses the
    last two values of that row as labels into ``embedding``.

    Parameters
    ----------
    embedding : pandas.DataFrame
        Embedding table, forwarded unchanged to ``cal_pair_distance``.
    embedding_index : pandas.DataFrame
        Lookup table whose column at position ``base_index_col`` holds the
        identifiers that ``cluster_df['gene']`` refers to.
    cluster_df : pandas.DataFrame
        Table with a ``gene`` column. It is not modified.
    base_index_col : int, default 1
        Positional index of the lookup column in ``embedding_index``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``cluster_df`` with an extra ``distance_change`` column.
        The value is NaN when the gene has no match in ``embedding_index``
        (a diagnostic line is printed in that case) or when
        ``cal_pair_distance`` itself returns NaN.
    """
    # Work on a copy so the caller's frame is left untouched. This also avoids
    # SettingWithCopyWarning when a filtered selection is passed in.
    annotated = cluster_df.copy()
    annotated['distance_change'] = np.nan

    # Loop-invariant: select the lookup column once instead of once per row.
    base_values = embedding_index.iloc[:, base_index_col]

    for index, row in annotated.iterrows():
        based_index = int(row['gene'])
        matches = embedding_index[base_values == based_index]

        # Test for "no match" before taking the first row; indexing an empty
        # result would raise IndexError and the diagnostic would never print.
        if matches.empty:
            print('Is there anything wrong with your input data?')
            continue

        match_row = matches.values.tolist()[0]
        annotated.at[index, 'distance_change'] = cal_pair_distance(match_row, embedding)
    return annotated

### Human glioblastoma

Load the embedding and the gene index tracker (one row per gene, with its node id at 0h and at 12h).

In [3]:
# Embedding file: space-separated, no header row; the first line of the file
# is skipped and the first column becomes the row index.
# NOTE(review): the skipped line is presumably a "<n_nodes> <n_dims>" header
# as written by word2vec-style tools -- confirm.
glb_embedding = pd.read_csv(
    '../../results/data/singleCell/glioblastoma/0_12_hours.emb',
    sep=' ',
    skiprows=1,
    header=None,
    index_col=0,
)

# Index tracker: tab-separated table with a header row.
glb_index = pd.read_csv(
    '../../results/data/singleCell/glioblastoma/splitMatrix/index_tracker.tsv',
    sep='\t',
)

In [4]:
# Preview the first five rows (five is the default for head()).
glb_index.head()

Unnamed: 0.1,Unnamed: 0,0h,t12
0,AL627309.1,1,1983
1,LINC00115,2,1984
2,LINC02593,3,1985
3,SAMD11,4,1986
4,HES4,5,1987


#### 0h as anchor cell type

In [5]:
# Cluster assignments for the 0h network (tab-separated despite the .csv suffix).
glb_lovain_0 = pd.read_csv(
    '../../results/data/Figure4Result3Clusterinfo/0h_edgelist_lovain.csv',
    sep='\t',
)

# Number of rows per community.
glb_lovain_0['community'].value_counts()

1    284
3    179
0    153
6     62
4     29
5     13
2      4
Name: community, dtype: int64

In [8]:
# Per-gene distance, looked up through the default index column (position 1).
glb_lovain_distance_0 = cal_batch_pair_distance(glb_embedding, glb_index, glb_lovain_0)

# Per-community summary. 'count' ignores NaN while 'size' does not, so their
# difference is the number of genes for which no distance was obtained.
glb_distance_info_0 = (
    glb_lovain_distance_0
    .groupby('community')['distance_change']
    .agg(['mean', 'std', 'count', 'size'])
    .assign(nan_ratio=lambda stats: (stats['size'] - stats['count']) / stats['size'])
)
glb_distance_info_0.head(5)

Unnamed: 0_level_0,mean,std,count,size,nan_ratio
community,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3.104335,0.944496,147,153,0.039216
1,3.527871,0.867707,214,284,0.246479
2,2.903058,1.01066,3,4,0.25
3,3.66829,0.657179,117,179,0.346369
4,3.722782,0.536447,16,29,0.448276


In [9]:
# Persist the 0h per-community summary as a tab-separated file.
# `quoting` is left at its default (csv.QUOTE_MINIMAL). A boolean is not a
# valid way to switch quoting off: False is the same value as QUOTE_MINIMAL,
# and csv.QUOTE_NONE would be required for that.
glb_distance_info_0.to_csv(
    '../../results/single_cell/glioblastoma/glb_distance_0.csv',
    sep='\t',
)

#### 12h as anchor cell type

In [11]:
# Cluster assignments for the 12h network (tab-separated despite the .csv suffix).
glb_lovain_12 = pd.read_csv(
    '../../results/data/Figure4Result3Clusterinfo/12h_edgelist_lovain.csv',
    sep='\t',
)

# Number of rows per community.
glb_lovain_12['community'].value_counts()

6     241
1     186
9     153
4      21
11     10
10      8
8       3
7       2
5       2
3       2
0       1
2       1
Name: community, dtype: int64

In [13]:
# Communities with fewer rows than this threshold are dropped.
value_counts_thresholds = 10
value_counts = glb_lovain_12['community'].value_counts()

values_to_keep = value_counts[value_counts >= value_counts_thresholds].index

# Take an explicit copy of the boolean-mask selection so that downstream code
# can add columns to it without pandas raising SettingWithCopyWarning.
glb_lovain_12_filtered = glb_lovain_12[glb_lovain_12['community'].isin(values_to_keep)].copy()
glb_lovain_12_filtered['community'].value_counts()

6     241
1     186
9     153
4      21
11     10
Name: community, dtype: int64

In [17]:
# Per-gene distance, looked up through index column position 2, then reduced
# to a per-community summary. 'count' ignores NaN while 'size' does not, so
# their difference is the number of genes for which no distance was obtained.
glb_lovain_distance_12 = (
    cal_batch_pair_distance(
        glb_embedding,
        glb_index,
        glb_lovain_12_filtered,
        base_index_col=2,
    )
    .groupby('community')['distance_change']
    .agg(['mean', 'std', 'count', 'size'])
    .assign(nan_ratio=lambda stats: (stats['size'] - stats['count']) / stats['size'])
)
glb_lovain_distance_12.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,mean,std,count,size,nan_ratio
community,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3.13804,0.961324,178,186,0.043011
4,2.850786,1.040902,20,21,0.047619
6,3.576741,0.848456,192,241,0.20332
9,3.611699,0.715716,115,153,0.248366
11,3.826419,0.283214,10,10,0.0


In [18]:
# Persist the 12h per-community summary as a tab-separated file.
# `quoting` is left at its default (csv.QUOTE_MINIMAL). A boolean is not a
# valid way to switch quoting off: False is the same value as QUOTE_MINIMAL,
# and csv.QUOTE_NONE would be required for that.
glb_lovain_distance_12.to_csv(
    '../../results/single_cell/glioblastoma/glb_distance_12.csv',
    sep='\t',
)

### Multi-omics dataset

In [30]:
# Embedding file: space-separated, no header row; the first line of the file
# is skipped and the first column becomes the row index.
multiomics_embedding = pd.read_csv(
    '../../results/data/mult-omics/multi_omics.emb',
    sep=' ',
    skiprows=1,
    header=None,
    index_col=0,
)

# Index tracker: tab-separated table with a header row.
multiomics_index = pd.read_csv(
    '../../results/data/mult-omics/index_tracker.tsv',
    sep='\t',
)

#### MEP

In [31]:
# Cluster assignments for the Ery_0 network (tab-separated despite the .csv suffix).
multiomic_lovain = pd.read_csv(
    '../../results/data/Figure4Result3Clusterinfo/Ery_0_edgelist_Lovain.csv',
    sep='\t',
)

# Number of rows per community.
multiomic_lovain['community'].value_counts()

3     82
7     75
2     57
8     55
9     42
0     39
1     29
10    25
4     22
5     21
6     20
11    17
Name: community, dtype: int64

In [14]:
# Keep the gene name plus the two network columns to compare; the lookup uses
# position 1 ('Ery_0_network') and the distance uses the last two columns.
multiomics_index_0_3 = multiomics_index[['Gene', 'Ery_0_network', 'Ery_3_network']]
multiomics_index_0_3_distance = cal_batch_pair_distance(
    multiomics_embedding,
    multiomics_index_0_3,
    multiomic_lovain,
)

# Per-community summary. 'count' ignores NaN while 'size' does not, so their
# difference is the number of genes for which no distance was obtained.
multiomics_0_3_distance_info = (
    multiomics_index_0_3_distance
    .groupby('community')['distance_change']
    .agg(['mean', 'std', 'count', 'size'])
    .assign(nan_ratio=lambda stats: (stats['size'] - stats['count']) / stats['size'])
)
multiomics_0_3_distance_info

Unnamed: 0_level_0,mean,std,count,size,nan_ratio
community,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,4.429615,0.700981,27,39,0.307692
1,4.256416,0.843568,22,29,0.241379
2,4.366771,0.6368,50,57,0.122807
3,4.406599,0.957368,67,82,0.182927
4,4.033147,0.914355,19,22,0.136364
5,4.458499,0.394812,18,21,0.142857
6,4.168061,0.822999,17,20,0.15
7,4.351471,0.716839,46,75,0.386667
8,4.190746,0.84499,45,55,0.181818
9,4.006987,0.840436,35,42,0.166667


In [17]:
# Persist the Ery_0 vs Ery_3 per-community summary as a tab-separated file.
# `quoting` is left at its default (csv.QUOTE_MINIMAL). A boolean is not a
# valid way to switch quoting off: False is the same value as QUOTE_MINIMAL,
# and csv.QUOTE_NONE would be required for that.
multiomics_0_3_distance_info.to_csv(
    '../../results/result3/multiomics_0_3_distance_info.csv',
    sep='\t',
)

In [15]:
# Keep the gene name plus the two network columns to compare; the lookup uses
# position 1 ('Ery_0_network') and the distance uses the last two columns.
multiomics_index_0_6 = multiomics_index[['Gene', 'Ery_0_network', 'Ery_6_network']]
multiomics_index_0_6_distance = cal_batch_pair_distance(
    multiomics_embedding,
    multiomics_index_0_6,
    multiomic_lovain,
)

# Per-community summary. 'count' ignores NaN while 'size' does not, so their
# difference is the number of genes for which no distance was obtained.
multiomics_0_6_distance_info = (
    multiomics_index_0_6_distance
    .groupby('community')['distance_change']
    .agg(['mean', 'std', 'count', 'size'])
    .assign(nan_ratio=lambda stats: (stats['size'] - stats['count']) / stats['size'])
)
multiomics_0_6_distance_info

Unnamed: 0_level_0,mean,std,count,size,nan_ratio
community,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,4.420519,0.599307,26,39,0.333333
1,4.230154,1.050728,20,29,0.310345
2,4.191214,0.931622,51,57,0.105263
3,4.48489,0.781157,69,82,0.158537
4,4.436066,0.44986,19,22,0.136364
5,4.587875,0.368715,16,21,0.238095
6,4.279184,0.49622,17,20,0.15
7,4.193217,0.91449,48,75,0.36
8,4.295531,0.700304,47,55,0.145455
9,4.059267,0.896153,35,42,0.166667


In [18]:
# Persist the Ery_0 vs Ery_6 per-community summary as a tab-separated file.
# `quoting` is left at its default (csv.QUOTE_MINIMAL). A boolean is not a
# valid way to switch quoting off: False is the same value as QUOTE_MINIMAL,
# and csv.QUOTE_NONE would be required for that.
multiomics_0_6_distance_info.to_csv(
    '../../results/result3/multiomics_0_6_distance_info.csv',
    sep='\t',
)

In [16]:
# Keep the gene name plus the two network columns to compare; the lookup uses
# position 1 ('Ery_0_network') and the distance uses the last two columns.
multiomics_index_0_9 = multiomics_index[['Gene', 'Ery_0_network', 'Ery_9_network']]
multiomics_index_0_9_distance = cal_batch_pair_distance(
    multiomics_embedding,
    multiomics_index_0_9,
    multiomic_lovain,
)

# Per-community summary. 'count' ignores NaN while 'size' does not, so their
# difference is the number of genes for which no distance was obtained.
multiomics_0_9_distance_info = (
    multiomics_index_0_9_distance
    .groupby('community')['distance_change']
    .agg(['mean', 'std', 'count', 'size'])
    .assign(nan_ratio=lambda stats: (stats['size'] - stats['count']) / stats['size'])
)
multiomics_0_9_distance_info

Unnamed: 0_level_0,mean,std,count,size,nan_ratio
community,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,4.121339,1.008851,25,39,0.358974
1,4.212237,0.912998,18,29,0.37931
2,4.310476,0.782654,45,57,0.210526
3,4.262274,0.905497,69,82,0.158537
4,4.488605,0.481358,16,22,0.272727
5,4.521773,0.394994,14,21,0.333333
6,3.767436,1.205426,12,20,0.4
7,4.511615,0.786192,36,75,0.52
8,4.177875,0.974204,43,55,0.218182
9,4.282897,0.715964,34,42,0.190476


In [19]:
# Persist the Ery_0 vs Ery_9 per-community summary as a tab-separated file.
# `quoting` is left at its default (csv.QUOTE_MINIMAL). A boolean is not a
# valid way to switch quoting off: False is the same value as QUOTE_MINIMAL,
# and csv.QUOTE_NONE would be required for that.
multiomics_0_9_distance_info.to_csv(
    '../../results/result3/multiomics_0_9_distance_info.csv',
    sep='\t',
)

#### GMP

In [32]:
# Cluster assignments for the GMP_0 network (tab-separated despite the .csv suffix).
multiomic_lovain_gmp = pd.read_csv(
    '../../results/data/Figure4Result3Clusterinfo/GMP_0_edgelist_Lovain.csv',
    sep='\t',
)

# Number of rows per community.
multiomic_lovain_gmp['community'].value_counts()

4     92
2     72
3     54
1     49
9     48
7     44
10    27
5     24
8     21
0     19
6     18
Name: community, dtype: int64

In [33]:
# NOTE(review): the variable names say "mep" but the selected columns are
# GMP_0 / Gran_0 -- the naming looks like a leftover from the MEP section; confirm.
# The lookup uses position 1 ('GMP_0_network'); the distance uses the last two columns.
multiomics_index_mep_0_4 = multiomics_index[['Gene', 'GMP_0_network', 'Gran_0_network']]

# Per-gene distance reduced straight to a per-community summary. 'count'
# ignores NaN while 'size' does not, so their difference is the number of
# genes for which no distance was obtained.
multiomics_index_mep_0_4_distance = (
    cal_batch_pair_distance(
        multiomics_embedding,
        multiomics_index_mep_0_4,
        multiomic_lovain_gmp,
    )
    .groupby('community')['distance_change']
    .agg(['mean', 'std', 'count', 'size'])
    .assign(nan_ratio=lambda stats: (stats['size'] - stats['count']) / stats['size'])
)
multiomics_index_mep_0_4_distance

Unnamed: 0_level_0,mean,std,count,size,nan_ratio
community,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,4.052883,1.05465,14,19,0.263158
1,4.443724,0.665581,41,49,0.163265
2,4.226568,0.94402,57,72,0.208333
3,4.425091,0.669444,49,54,0.092593
4,4.216215,1.114386,66,92,0.282609
5,3.764402,1.076214,17,24,0.291667
6,4.140955,0.712947,16,18,0.111111
7,4.281633,0.854954,40,44,0.090909
8,4.593728,0.420449,19,21,0.095238
9,4.216265,0.697854,41,48,0.145833


In [34]:
# Persist the GMP_0 vs Gran_0 per-community summary as a tab-separated file.
# `quoting` is left at its default (csv.QUOTE_MINIMAL). A boolean is not a
# valid way to switch quoting off: False is the same value as QUOTE_MINIMAL,
# and csv.QUOTE_NONE would be required for that.
multiomics_index_mep_0_4_distance.to_csv(
    '../../results/result3/multiomics_index_mep_0_4_distance.csv',
    sep='\t',
)

In [27]:
# NOTE(review): the variable names say "mep" but the selected columns are
# GMP_0 / Gran_3 -- the naming looks like a leftover from the MEP section; confirm.
# The lookup uses position 1 ('GMP_0_network'); the distance uses the last two columns.
multiomics_index_mep_0_7 = multiomics_index[['Gene', 'GMP_0_network', 'Gran_3_network']]
multiomics_index_mep_0_7_distance = cal_batch_pair_distance(
    multiomics_embedding,
    multiomics_index_mep_0_7,
    multiomic_lovain_gmp,
)

# The name `multiomics_index_mep_0_7` is rebound here from the index subset to
# the per-community summary. 'count' ignores NaN while 'size' does not, so
# their difference is the number of genes for which no distance was obtained.
multiomics_index_mep_0_7 = (
    multiomics_index_mep_0_7_distance
    .groupby('community')['distance_change']
    .agg(['mean', 'std', 'count', 'size'])
    .assign(nan_ratio=lambda stats: (stats['size'] - stats['count']) / stats['size'])
)
multiomics_index_mep_0_7

Unnamed: 0_level_0,mean,std,count,size,nan_ratio
community,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,4.275115,0.984888,14,19,0.263158
1,4.367729,0.828342,42,49,0.142857
2,4.476584,0.701095,45,72,0.375
3,4.368496,0.610918,48,54,0.111111
4,4.371868,0.771117,53,92,0.423913
5,4.242985,0.706779,19,24,0.208333
6,4.196605,0.669839,16,18,0.111111
7,4.522737,0.550935,38,44,0.136364
8,4.615096,0.796453,17,21,0.190476
9,4.413247,0.812464,42,48,0.125


In [35]:
# Persist the GMP_0 vs Gran_3 per-community summary as a tab-separated file.
# `quoting` is left at its default (csv.QUOTE_MINIMAL). A boolean is not a
# valid way to switch quoting off: False is the same value as QUOTE_MINIMAL,
# and csv.QUOTE_NONE would be required for that.
multiomics_index_mep_0_7.to_csv(
    '../../results/result3/multiomics_index_mep_0_7_distance.csv',
    sep='\t',
)