In [149]:
import pandas as pd
import numpy as np
from functions import organotropism_pairs, utility_functions
from tqdm.notebook import tqdm

In [150]:
raw_data_dir = '../data/raw/'
utility_functions.check_dir(raw_data_dir)
org_pairs_dir = '../data/processed/organotropism_pairs/'
utility_functions.check_dir(org_pairs_dir)

In [151]:
tissue_datasets = ['gtex', 'consensus', ]
metastasis_datasets = ['autopsy', 'hcmdb']

# Determine organotropism pairs
## Rational
-------------
In the first step of our work, we will define organotropism pairs for all the available cancer and metastasis organs. We will have two strategies:

* Use frequency data found in databases and papers

* Find organotropism pairs described in the literature - we compiled all information we found for each cancer in a document

## Notebook workflow
------------------------
* **[HCMDB](https://hcmdb.i-sanger.com/index):** the Human Cancer Metastasis Database) is a database that contains 29 cancer types derived from more than 455 experiments collected from Gene Expression Omnibus (GEO) and The Cancer Genome Atlas (TCGA). to store and analyze large scale expression data of cancer metastasis.
* **[diSibio et al 2008](https://meridian.allenpress.com/aplm/article/132/6/931/460506/Metastatic-Patterns-of-Cancers-Results-From-a):** Paper containing a review of quantitive data from 3827 autopsies, performed between 1914 and 1943 on patients from 5 affiliated medical centers, comprising 41 different primary cancers and 30 different metastatic sites.

## All tissues from GTEx and Consensus
* all tissues in both datasets are matched to the tissue labels we defined based on the data gathered from the metastasis databases and the autopsy study
* This will be used to filter the count data and remove absent tissues
* Since some tissues are only present in GTEx or in Consensus we'll split our analysis

In [152]:
tissues = pd.read_csv(raw_data_dir+'tissue_match.csv', index_col='tissue')
tissues.head(2)

Unnamed: 0_level_0,gtex,consensus
tissue,Unnamed: 1_level_1,Unnamed: 2_level_1
adipose_tissue,Adipose - Subcutaneous,adipose tissue
adipose_tissue,Adipose - Visceral (Omentum),


# Selecting and filtering data

Create dictionary with count data from all data sources

In [153]:
count_data = {}

## HCMDB

In [154]:
data = pd.read_excel(raw_data_dir+'HCMDB.xlsx')
data.head(2)

Unnamed: 0,Experiment_id,Pubmed_id,Dataset_id,Platform_id,Standard,Class_id,Class_name,Sample_id,Cancer_type,Cancer_subtype,Metastasis_status,Primary_site,Metastasis_site,Sample_label
0,EXP00001,22048815.0,GSE10893,GPL1390,[metastasis tumors comparision] of breast canc...,1,metastasis tumor of breast cancer brain metast...,GSM34453,breast cancer,,YES,breast,brain,Metastasis Tumor
1,EXP00001,22048815.0,GSE10893,GPL1390,[metastasis tumors comparision] of breast canc...,1,metastasis tumor of breast cancer brain metast...,GSM34472,breast cancer,,YES,breast,brain,Metastasis Tumor


### Data exploration

In [155]:
data.columns

Index(['Experiment_id', 'Pubmed_id', 'Dataset_id', 'Platform_id', 'Standard',
       'Class_id', 'Class_name', 'Sample_id', 'Cancer_type', 'Cancer_subtype',
       'Metastasis_status', 'Primary_site', 'Metastasis_site', 'Sample_label'],
      dtype='object')

#### There are only 4 Class_ids...

In [156]:
data[['Class_id','Class_name']].drop_duplicates('Class_id')

Unnamed: 0,Class_id,Class_name
0,1,metastasis tumor of breast cancer brain metast...
5,2,metastasis tumor of breast cancer lymph node m...
620,3,primary tumor of breast cancer breast metastasis
639,4,primary tumor of breast cancer other metastasis


#### But 147 Class_names

In [157]:
data[['Class_id','Class_name']].drop_duplicates('Class_name').set_index('Class_id').size

147

#### There are 4 Sample_labels...

In [158]:
data['Sample_label'].unique()

array(['Metastasis Tumor', 'Primary Tumor', 'Primary Normal',
       'Metastasis Normal'], dtype=object)

#### But each sample_label can be in more than 1 class

In [159]:
for label in sorted(data['Sample_label'].unique()):
    print(label, '->', sorted(data[data['Sample_label']==label]['Class_id'].unique()))

Metastasis Normal -> [1]
Metastasis Tumor -> [1, 2, 3, 4]
Primary Normal -> [1]
Primary Tumor -> [1, 2, 3, 4]


#### Samples are duplicated since they can be part of severel experiments defined by the authors and belong to different datasets

In [160]:
data.loc[data['Sample_id']=='GSM34453'][['Experiment_id',
            'Dataset_id', 'Sample_id', ]]

Unnamed: 0,Experiment_id,Dataset_id,Sample_id
0,EXP00001,GSE10893,GSM34453
8,EXP00002,GSE10893,GSM34453
3340,EXP00105,GSE3521,GSM34453
3348,EXP00106,GSE3521,GSM34453


In [161]:
x = data.dropna(subset=['Primary_site', 'Metastasis_site'])
samples = x.Sample_id.unique()

for s in samples:
    met_sites = x[x.Sample_id==s].Metastasis_site.unique()
    if met_sites.shape[0]>1:
        display(x[x.Sample_id==s])

In [162]:
y = x.loc[x.Sample_label.isin(['Primary Tumor', 'Metastasis Tumor']), ['Sample_id', 'Sample_label', 'Cancer_type', 'Metastasis_site']].drop_duplicates()
y.sort_values('Sample_id').tail(20)

Unnamed: 0,Sample_id,Sample_label,Cancer_type,Metastasis_site
12599,TCGA-XF-A8HI-01A,Primary Tumor,bladder cancer,"lung,lymph node"
12439,TCGA-XF-A9SK-01A,Primary Tumor,bladder cancer,bone
12442,TCGA-XF-A9SU-01A,Primary Tumor,bladder cancer,"bone,liver"
12491,TCGA-XF-A9SV-01A,Primary Tumor,bladder cancer,lung
12492,TCGA-XF-A9SX-01A,Primary Tumor,bladder cancer,lung
12476,TCGA-XF-A9SZ-01A,Primary Tumor,bladder cancer,"liver,lung"
12493,TCGA-XF-A9T4-01A,Primary Tumor,bladder cancer,lung
12477,TCGA-XF-AAME-01A,Primary Tumor,bladder cancer,"liver,lung"
12494,TCGA-XF-AAMY-01A,Primary Tumor,bladder cancer,lung
17181,TCGA-XN-A8T3-01A,Primary Tumor,pancreatic cancer,lung


#### Primary site info does not necessarily correspond to a cancer type. For example:

In [163]:
data.loc[data['Cancer_type']=='osteosarcoma', 'Primary_site'].unique()

array(['bone', 'femur', 'humerus', 'fibula', 'tibia', 'costa', 'pelvis'],
      dtype=object)

### Data filtering and selection

Based on what we found exploring how data is organized in the database, we will
filter data taking into account:

* **Sample_labels** - We will exclude normal tissue samples (`Primary Normal` &
`Metastasis Normal`). In some samples `Primary Tumor` & `Metastasis Tumor` labels might overlap and refer to the same pairs of primary - metastasis tumour. But at the same time, deleting entries related with 1 of these labels might result in lost of non-duplicated data.

* **Sample_id** - many samples can be part of several experiments defined by the authors and belong to different datasets. *We will drop duplicated samples*

* **Primary site** - Primary site info does not necessarily correspond to a particular cancer type. So, we will consider only the `Cancer_type`, and `Metastasis_site` columns in our final table.

#### We will exclude normal tissue samples, remove duplicated sample ids and exclude entries without metastasis site information

In [164]:
filtered_data = data[data['Sample_label'].isin(['Primary Tumor', 'Metastasis Tumor'])]\
    .drop_duplicates('Sample_id')[['Cancer_type', 'Metastasis_site']].dropna()

filtered_data

Unnamed: 0,Cancer_type,Metastasis_site
0,breast cancer,brain
1,breast cancer,brain
2,breast cancer,brain
3,breast cancer,brain
4,breast cancer,brain
...,...,...
21951,eye cancer,liver
21952,eye cancer,"liver,skin"
21953,eye cancer,liver
21954,eye cancer,skin


#### Issues with tissue names:
* some metastasis site entries have more than 1 organ separated by commas
* some organs are misspelled (e.g. 'kindey', 'subcutanious soft tissue'),
* others are not organs (e.g. 'other', 'unknown') and there's also some
* terms that appear redundant (e.g. 'kidney' vs 'renal')

In [165]:
for organ in filtered_data['Metastasis_site'].unique():
    print(organ)

brain
lymph node
skin
adrenal gland
lung
caudaequina
spinal cord
liver
other
unknown
brain,other
breast
ovary
bone
subcutanious soft tissue
chest wall
colorectum
subcutaneous
adrenal gland,bone,lymph node
lung,liver
lung,soft tissue
adrenal gland,lung,renal
adrenal gland,bone,lung
lung,lymph node
adrenal gland,renal,pancreas,parotis
lung,mediastinal
lung,skin
liver,lymph node
omentum
spleen
skeleton
lung,skeleton
soft tissue
fat
muscle
pancreas
liver,ovary
brain,lymph node
lymph node,skin
liver,skin
peritoneum
small intestine
pleura
viscera
posterior peritoneum
kindey
ovary,peritoneum
liver,lung
peritoneal surfaces
lung,other
bone,liver
bone,lung
bone,liver,other
bone,liver,lung,other
bone,liver,lymph node
liver,lung,lymph node
liver,other
head & neck
brain,liver,lung
brain,liver
liver,lung,other
non-regional / distant lymph nodes
pleura/pleural effusion
bone,brain,liver,other
brain,lung,lymph node,other
pelvis


#### We will create a count table for all the pairs (tumour cell line, metastatic site) using get_dummies() to distribute and count all metastatic sites into separate columns

In [166]:
counts = filtered_data.set_index('Cancer_type')['Metastasis_site'].str.get_dummies(',')
counts.head()

Unnamed: 0_level_0,adrenal gland,bone,brain,breast,caudaequina,chest wall,colorectum,fat,head & neck,kindey,...,skeleton,skin,small intestine,soft tissue,spinal cord,spleen,subcutaneous,subcutanious soft tissue,unknown,viscera
Cancer_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
breast cancer,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
breast cancer,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
breast cancer,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
breast cancer,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
breast cancer,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [167]:
# group count data by cancer type 
counts = counts.groupby('Cancer_type').sum().rename_axis(index='cancer_organ')
counts.head()

Unnamed: 0_level_0,adrenal gland,bone,brain,breast,caudaequina,chest wall,colorectum,fat,head & neck,kindey,...,skeleton,skin,small intestine,soft tissue,spinal cord,spleen,subcutaneous,subcutanious soft tissue,unknown,viscera
cancer_organ,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bladder cancer,0,16,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
brain cancer,0,0,22,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
breast cancer,2,40,227,49,1,2,0,0,0,0,...,0,47,0,0,1,2,0,0,83,0
cervical cancer,0,3,1,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
colorectal cancer,0,0,2,0,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,147,0


In [168]:
# add to counts dictionary
count_data['hcmdb'] = counts

## Autopsy paper

In [169]:
data = pd.read_excel(raw_data_dir+'autopsy.ods', sheet_name='table', index_col='cancer_organ')
data.tail()

Unnamed: 0_level_0,cases,adrenal,bone,bladder,brain,breast,diaphragm,gallbladder,heart,kidney,...,skeletal muscle,skin,small intestine,spleen,stomach,testis,thyroid,uterus,vagina,total
cancer_organ,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
unknown,44,6,10,1,4,0,1,0,1,5,...,1,4,1,2,1,1,4,1,0,118
uterus,120,14,8,5,0,1,2,2,1,8,...,0,4,5,1,2,0,5,0,7,289
vagina,11,2,1,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,16
vulva,30,0,1,1,0,0,1,0,2,0,...,0,2,0,1,0,0,0,0,0,43
total,3827,482,617,75,100,62,149,86,125,228,...,16,233,114,201,55,12,117,77,32,9484


#### The column cases has the number of autopsy patients for each cancer. We will remove this and the total column and row

In [170]:
counts = data.drop(index='total', columns=['total', 'cases'])
counts.head()

Unnamed: 0_level_0,adrenal,bone,bladder,brain,breast,diaphragm,gallbladder,heart,kidney,lung,...,prostate,skeletal muscle,skin,small intestine,spleen,stomach,testis,thyroid,uterus,vagina
cancer_organ,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
adrenal,1,3,0,2,0,1,0,0,1,2,...,0,0,3,0,2,1,0,1,0,0
anus,3,4,1,1,0,1,2,2,2,8,...,0,0,2,1,3,0,0,1,0,0
appendix,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
bile duct,3,3,0,0,0,0,2,0,0,4,...,0,0,0,0,2,1,0,0,0,0
bladder,11,20,0,0,0,2,0,2,9,30,...,3,2,1,4,3,0,0,1,0,0


In [171]:
counts.columns

Index(['adrenal', 'bone', 'bladder', 'brain', 'breast', 'diaphragm',
       'gallbladder', 'heart', 'kidney', 'lung', 'colon', 'liver',
       'lymph node (reg)', 'lymph node (dist)', 'omentum', 'ovary', 'pancreas',
       'pericardium', 'peritoneum', 'pleura', 'prostate', 'skeletal muscle',
       'skin', 'small intestine', 'spleen', 'stomach', 'testis', 'thyroid',
       'uterus', 'vagina'],
      dtype='object')

In [172]:
# add to count dictionary
count_data['autopsy'] = counts

# Analysis of count data
To find statistically significant organotropism pairs in our data we will:
1. Match tissues/organs in each dataset to the tissue ids we implemented;
2. compute the hypergeometric test for each pair. Set a FDR threshold to define a statistically significant organotropism pair;

In [173]:
for label, met_dataset in count_data.items():
    
    # import lists that match tissue labels to tissue ids
    metastasis_list = pd.read_excel(
        raw_data_dir+label+'.ods', 
        usecols=['organ', 'name'],
        sheet_name='metastasis_organs',
        dtype='str'
    )
    metastasis_list = metastasis_list[
        metastasis_list.organ.isin(met_dataset.columns)].name.to_list()
    
    cancer_list = pd.read_excel(
        raw_data_dir+label+'.ods',
        usecols=['organ', 'name'], 
        sheet_name='cancer_types',
        dtype='str'
    )
    cancer_list = cancer_list[
        cancer_list.organ.isin(met_dataset.index)].name.to_list()

    # filter tissues with tissue match lists
    # we'll keep all tissues/organs/anatomical locations
    # and we'll rename all tissues
    met_dataset_filtered = organotropism_pairs.filter_df(
        met_dataset,
        metastasis_list,
        cancer_list,
    )
    met_dataset_filtered.to_csv(org_pairs_dir+f'{label}_frequencies.csv')

# Compute organotropism pairs

## Hypergeometric distribution
---------------------------
The hypergeometric random variable with parameters `(N,n,M)` counts the number of “good “objects in a sample of size `N` chosen without replacement from a population of `M` objects where `n` is the number of “good “objects in the total population.
The probability mass function is defined as:
$$
p(k;N,n,M)= \frac{
    \begin{pmatrix} n \\ k \end{pmatrix} 
    \begin{pmatrix} M-n \\ N-k \end{pmatrix}
    }{
    \begin{pmatrix} M \\ N \end{pmatrix}
    }; k \in [ \max(0, M - M + n), \min(n,N)]
$$

Sometimes, it is useful to study the opposite question and ask how often the random variable is above a particular level. This is called the complementary cumulative distribution function (ccdf) or simply the tail distribution or exceedance, and is defined as:

$$
\overline{F}(x)=P(X>x)=1-F_X(x)
$$

This has applications in statistical hypothesis testing, for example, because the one-sided p-value is the probability of observing a test statistic at least as extreme as the one observed. Thus, provided that the test statistic, T, has a continuous distribution, the one-sided p-value is simply given by the ccdf: for an observed value t of the test statistic:

$$
S(x)=P(X\ge{x})=P(X>x)=1-F(x)
$$

In survival analysis, S(x) is the survival function and denoted, while the term reliability function is common in engineering. 

## Hypergeometric test (n controls)

In [174]:

# set the number of controls
controls = 1000

pairs_records = []
for met_dataset in metastasis_datasets:
    
    # import frequency table
    frequencies = pd.read_csv(org_pairs_dir+f'{met_dataset}_frequencies.csv', index_col='cancer_organ') 
    
    # compute pairs for gtex and consensus
    for tissues_label in tissue_datasets:

        tissue_dataset = tissues[tissues_label].dropna().index.unique().to_list()
        
        # compute organotropism pairs 
        org_pairs, match = organotropism_pairs.organotropism_pairs_hyper_test(
            frequencies,
            tissue_dataset,
            fdr_corr=True
        )

        # Compute control pairs
        for c in tqdm(range(1, controls+1)):
            all_pairs = organotropism_pairs.control_pairs(
                org_pairs,
                match,
                random_start=True,
                max_iterations=70000,
                random_state=42
            )
            
            tissue_pairs = organotropism_pairs.compute_pairs(
                                all_pairs,
                                tissues,
                                tissues_label,
                                extra_labels=[
                                    ('metastasis_dataset', met_dataset),
                                    ('tissue_dataset', tissues_label),
                                    ('control', c)
                                ]
            )
            
            pairs_records.extend(tissue_pairs)
            
pairs_records = pd.DataFrame(pairs_records)
pairs_records.head()

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Unnamed: 0,cancer,metastasis,cancer_tissue,metastasis_tissue,type,metastasis_dataset,tissue_dataset,control
0,breast,adrenal_gland,Breast - Mammary Tissue,Adrenal Gland,organotropism,autopsy,gtex,1
1,breast,brain,Breast - Mammary Tissue,Brain - Amygdala,organotropism,autopsy,gtex,1
2,breast,brain,Breast - Mammary Tissue,Brain - Anterior cingulate cortex (BA24),organotropism,autopsy,gtex,1
3,breast,brain,Breast - Mammary Tissue,Brain - Caudate (basal ganglia),organotropism,autopsy,gtex,1
4,breast,brain,Breast - Mammary Tissue,Brain - Cerebellar Hemisphere,organotropism,autopsy,gtex,1


In [175]:
# Since only the control pairs have more than one "control" the organotropism pairs are repeated in the stats

# drop repeated organotropism rows
org = pairs_records[pairs_records.type=='organotropism']\
    .drop_duplicates(subset=pairs_records.columns.drop('control')).copy()
org['control'] = ['organotropism' for i in range(org.shape[0])]

# rename controls
cont = pairs_records[pairs_records.type=='control'].copy()
cont['control'] = cont['type'] + '_' + cont['control'].astype('str')

pairs_records = pd.concat([org, cont])
display(pairs_records.info())
pairs_records.head(2)

<class 'pandas.core.frame.DataFrame'>
Index: 238236 entries, 0 to 473999
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   cancer              238236 non-null  object
 1   metastasis          238236 non-null  object
 2   cancer_tissue       238236 non-null  object
 3   metastasis_tissue   238236 non-null  object
 4   type                238236 non-null  object
 5   metastasis_dataset  238236 non-null  object
 6   tissue_dataset      238236 non-null  object
 7   control             238236 non-null  object
dtypes: object(8)
memory usage: 16.4+ MB


None

Unnamed: 0,cancer,metastasis,cancer_tissue,metastasis_tissue,type,metastasis_dataset,tissue_dataset,control
0,breast,adrenal_gland,Breast - Mammary Tissue,Adrenal Gland,organotropism,autopsy,gtex,organotropism
1,breast,brain,Breast - Mammary Tissue,Brain - Amygdala,organotropism,autopsy,gtex,organotropism


In [176]:
# export pairs
pairs_records.to_csv(org_pairs_dir+'pairs_records.csv', index=False)

## Frequency pairs

In [177]:
freq_pairs = []
for met_dataset in metastasis_datasets:
    
    # import lists with tissue match
    frequencies = pd.read_csv(org_pairs_dir+f'{met_dataset}_frequencies.csv', index_col='cancer_organ')
    
    for t_dataset in tissue_datasets:
        tissue_dataset = tissues[t_dataset].dropna().index.unique().to_list()

        org_pairs, match = organotropism_pairs.organotropism_pairs_frequency(
            frequencies,
            tissue_dataset,
            method='outlier_detection',
            k=1.5,
            filter_tissues='last',
            drop_met=False
        )
        
        tissue_pairs = organotropism_pairs.compute_pairs(
            org_pairs,
            tissues,
            t_dataset,
            extra_labels=[
                ('tissue_dataset', t_dataset),
                ('metastasis_dataset', met_dataset)
            ]
        )
        freq_pairs.extend(tissue_pairs)

freq_pairs = pd.DataFrame(freq_pairs)
freq_pairs.head()

Unnamed: 0,cancer,metastasis,cancer_tissue,metastasis_tissue,type,tissue_dataset,metastasis_dataset
0,adrenal_gland,liver,Adrenal Gland,Liver,organotropism,gtex,autopsy
1,adrenal_gland,skin,Adrenal Gland,Skin - Not Sun Exposed (Suprapubic),organotropism,gtex,autopsy
2,adrenal_gland,skin,Adrenal Gland,Skin - Sun Exposed (Lower leg),organotropism,gtex,autopsy
3,adrenal_gland,bladder,Adrenal Gland,Bladder,control,gtex,autopsy
4,adrenal_gland,brain,Adrenal Gland,Brain - Amygdala,control,gtex,autopsy


In [178]:
display(freq_pairs.head())
freq_pairs.info()

Unnamed: 0,cancer,metastasis,cancer_tissue,metastasis_tissue,type,tissue_dataset,metastasis_dataset
0,adrenal_gland,liver,Adrenal Gland,Liver,organotropism,gtex,autopsy
1,adrenal_gland,skin,Adrenal Gland,Skin - Not Sun Exposed (Suprapubic),organotropism,gtex,autopsy
2,adrenal_gland,skin,Adrenal Gland,Skin - Sun Exposed (Lower leg),organotropism,gtex,autopsy
3,adrenal_gland,bladder,Adrenal Gland,Bladder,control,gtex,autopsy
4,adrenal_gland,brain,Adrenal Gland,Brain - Amygdala,control,gtex,autopsy


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3113 entries, 0 to 3112
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   cancer              3113 non-null   object
 1   metastasis          3113 non-null   object
 2   cancer_tissue       3113 non-null   object
 3   metastasis_tissue   3113 non-null   object
 4   type                3113 non-null   object
 5   tissue_dataset      3113 non-null   object
 6   metastasis_dataset  3113 non-null   object
dtypes: object(7)
memory usage: 170.4+ KB


In [179]:
freq_pairs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3113 entries, 0 to 3112
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   cancer              3113 non-null   object
 1   metastasis          3113 non-null   object
 2   cancer_tissue       3113 non-null   object
 3   metastasis_tissue   3113 non-null   object
 4   type                3113 non-null   object
 5   tissue_dataset      3113 non-null   object
 6   metastasis_dataset  3113 non-null   object
dtypes: object(7)
memory usage: 170.4+ KB


In [180]:
# export pairs
freq_pairs.to_csv(org_pairs_dir+'freq_pairs_records.csv', index=False)

## Literature pairs

In [181]:
lit_pairs = pd.read_excel(raw_data_dir+'literature_organotropism_pairs.ods', index_col='cancer', usecols=['cancer', 'metastasis'])
display(lit_pairs.head(2))
lit_pairs.info()

Unnamed: 0_level_0,metastasis
cancer,Unnamed: 1_level_1
adrenal_gland,"liver,lung,lymph_node,bone"
bladder,"lymph_node,bone,lung,liver"


<class 'pandas.core.frame.DataFrame'>
Index: 25 entries, adrenal_gland to vagina
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   metastasis  22 non-null     object
dtypes: object(1)
memory usage: 400.0+ bytes


In [182]:
pairs = lit_pairs.metastasis.str.get_dummies(',')
pairs.head(2)

Unnamed: 0_level_0,adrenal_gland,bone,brain,fallopian_tube,liver,lung,lymph_node,omentum,peritoneum,skin,vagina
cancer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
adrenal_gland,0,1,0,0,1,1,1,0,0,0,0
bladder,0,1,0,0,1,1,1,0,0,0,0


In [183]:
tiss_labels = tissues.index.unique().to_list()
shape = (len(tiss_labels), len(tiss_labels))
all_pairs = pd.DataFrame(np.zeros(shape, dtype='int'), index=tiss_labels, columns=tiss_labels)
all_pairs.head(2)

Unnamed: 0,adipose_tissue,adrenal_gland,appendix,artery,bladder,blood,bone,brain,breast,cervix,...,spinal_cord,spleen,stomach,testis,thymus,thyroid,tongue,tonsil,uterus,vagina
adipose_tissue,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
adrenal_gland,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [184]:
# all_pairs is empty
# update all_pairs with literature pairs
# only tissues present in the tissue datasets will be kept
all_pairs.update(pairs)
all_pairs = all_pairs.astype('int64')
all_pairs.head(2)

Unnamed: 0,adipose_tissue,adrenal_gland,appendix,artery,bladder,blood,bone,brain,breast,cervix,...,spinal_cord,spleen,stomach,testis,thymus,thyroid,tongue,tonsil,uterus,vagina
adipose_tissue,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
adrenal_gland,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [185]:
# remove organotropism pairs with the same tissue
all_pairs, match = organotropism_pairs.match_tissues(all_pairs, drop_met=False)
all_pairs.head(2)

Unnamed: 0,adipose_tissue,adrenal_gland,appendix,artery,bladder,blood,bone,brain,breast,cervix,...,spinal_cord,spleen,stomach,testis,thymus,thyroid,tongue,tonsil,uterus,vagina
adrenal_gland,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bladder,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [186]:
# remove cancers without pairs
all_pairs = all_pairs[all_pairs.sum(axis=1)>0]
all_pairs = all_pairs.where(all_pairs==1, -1)
all_pairs.head(2)

Unnamed: 0,adipose_tissue,adrenal_gland,appendix,artery,bladder,blood,bone,brain,breast,cervix,...,spinal_cord,spleen,stomach,testis,thymus,thyroid,tongue,tonsil,uterus,vagina
adrenal_gland,-1,-1,-1,-1,-1,-1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
bladder,-1,-1,-1,-1,-1,-1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [187]:
# remove control pairs with the same tissue
all_pairs = all_pairs*match
all_pairs.head(2)

Unnamed: 0,adipose_tissue,adrenal_gland,appendix,artery,bladder,blood,bone,brain,breast,cervix,...,spinal_cord,spleen,stomach,testis,thymus,thyroid,tongue,tonsil,uterus,vagina
adrenal_gland,-1,0,-1,-1,-1,-1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
bladder,-1,-1,-1,-1,0,-1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [188]:
literature_pairs = []
for d in tissues:
    tissue_list = tissues[d].dropna().index.unique()
    dataset_pairs = all_pairs.loc[all_pairs.index.isin(tissue_list), all_pairs.columns.isin(tissue_list)]
    p = organotropism_pairs.compute_pairs(dataset_pairs, tissues, d, [('tissue_dataset', d)])
    literature_pairs.extend(p)
    
literature_pairs = pd.DataFrame(literature_pairs)
literature_pairs.head(2)

Unnamed: 0,cancer,metastasis,cancer_tissue,metastasis_tissue,type,tissue_dataset
0,adrenal_gland,liver,Adrenal Gland,Liver,organotropism,gtex
1,adrenal_gland,lung,Adrenal Gland,Lung,organotropism,gtex


In [189]:
literature_pairs.to_csv(org_pairs_dir+'literature_pairs_records.csv', index=False)