In [None]:
# This script will perform weighted mean eQTL prediction based on GTEX v8 expression data.

In [None]:
###### Module Loading #######

In [1]:
import pandas as pd
import os
from scipy.stats import ttest_ind
import numpy as np
from scipy import stats
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
###### Move to working directory #######

In [2]:
os.chdir('/nfs/research1/zerbino/jhidalgo/inteql_GTEX_v8/data/original-data/RNA-seq/')

In [None]:
###### Load GTEX sample metadata and sample names #######

In [3]:
samples_GTEX=pd.read_csv('../GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt',sep='\t',usecols=['SAMPID','SMTS','SMTSD'])

In [4]:
samples_RNASEQ=list(pd.read_csv('../GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz',header=2,sep='\t',nrows=0).columns[2:])

In [5]:
print(len(samples_GTEX),len(samples_RNASEQ))

22951 17382


In [6]:
samples=pd.DataFrame({'SAMPID':samples_RNASEQ}).merge(samples_GTEX)

In [33]:
samples

Unnamed: 0,SAMPID,SMTS,SMTSD
0,GTEX-1117F-0226-SM-5GZZ7,Adipose Tissue,Adipose - Subcutaneous
1,GTEX-1117F-0426-SM-5EGHI,Muscle,Muscle - Skeletal
2,GTEX-1117F-0526-SM-5EGHJ,Blood Vessel,Artery - Tibial
3,GTEX-1117F-0626-SM-5N9CS,Blood Vessel,Artery - Coronary
4,GTEX-1117F-0726-SM-5GIEN,Heart,Heart - Atrial Appendage
...,...,...,...
17377,GTEX-ZZPU-2126-SM-5EGIU,Ovary,Ovary
17378,GTEX-ZZPU-2226-SM-5EGIV,Vagina,Vagina
17379,GTEX-ZZPU-2426-SM-5E44I,Blood Vessel,Artery - Tibial
17380,GTEX-ZZPU-2626-SM-5E45Y,Muscle,Muscle - Skeletal


In [7]:
unique_tissues=list(samples['SMTSD'].drop_duplicates())

In [None]:
###### Filter TPM file for transcripts with very low T test value for the different tissues #######
"""
GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz is read chunk by chunk, for each line in the chunk
a t-test is performed for each unique tissue. Transcripts with 1 to 3 transcript-tissue t-test values of 0 are
forwarded to two new files:
1) Top_T-test_GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct: Contains the raw TPM values as read.
2) Binary_top_T-test_GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct: Contains the T-test values for 
    each tissue converted to 0 or 1 according to the values being under or over a threshold (0.0001 by default).
"""

In [7]:
chunksize=1000

In [9]:
cols={}
notcols={}
for a in unique_tissues:
    cols[a] = list(samples[samples['SMTSD'] == a]['SAMPID'].values)
    notcols[a] = list(samples[samples['SMTSD'] != a]['SAMPID'].values)

count=0
added=0
write_header=True

In [17]:
for chunk in pd.read_csv('../GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz',header=2,sep='\t',chunksize=chunksize,index_col='transcript_id',usecols=lambda x: x not in ['gene_id']):
    output=pd.DataFrame()
    bin_output=pd.DataFrame()
    for i in chunk.index:
        ttest=[]
        for a in unique_tissues:
            ttest=np.append(ttest,float(ttest_ind(chunk.loc[i][cols[a]], chunk.loc[i][notcols[a]]).pvalue))
            if (ttest == 0).sum() > 3: continue
        if 0 < (ttest == 0).sum() <= 3:
            output=output.append(chunk.loc[i])
            bin_output=bin_output.append(pd.DataFrame(columns=unique_tissues,data=[np.where((ttest < 0.0001) & (ttest > 0),1,0)],index = [i]))
    if len(output) != 0: 
        output.to_csv('Top_T-test_GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct',chunksize=len(output),mode='a',index=True,header=write_header)
        bin_output.to_csv('Binary_top_T-test_GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct',chunksize=len(bin_output),mode='a',index=True,header=write_header)
    write_header=False
    count=count+chunksize
    added=added+len(output)
    print(count,"transcripts processed,",added,'transcripts added.',end="\r",flush=True)

44000 transcripts processed, 24144 transcripts added.

In [None]:
###### Load previous 2) output #######

In [9]:
bin_ttest=pd.read_csv('Binary_top_T-test_GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct',header=0).rename(columns={'Unnamed: 0':'transcript_id'}).set_index('transcript_id')

In [138]:
bin_ttest

Unnamed: 0_level_0,Adipose - Subcutaneous,Muscle - Skeletal,Artery - Tibial,Artery - Coronary,Heart - Atrial Appendage,Adipose - Visceral (Omentum),Uterus,Vagina,Breast - Mammary Tissue,Skin - Not Sun Exposed (Suprapubic),...,Brain - Cerebellar Hemisphere,Liver,Brain - Substantia nigra,Kidney - Cortex,Brain - Amygdala,Cervix - Ectocervix,Fallopian Tube,Cervix - Endocervix,Bladder,Kidney - Medulla
transcript_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENST00000373020.8,1,1,1,1,1,1,1,1,1,1,...,1,1,1,0,1,0,0,0,0,0
ENST00000494424.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENST00000496771.5,0,1,1,0,1,0,1,0,0,1,...,1,1,0,0,0,0,0,0,0,0
ENST00000612152.4,1,1,1,1,1,1,1,1,1,1,...,1,0,0,0,0,1,1,1,0,0
ENST00000614008.4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENST00000638486.1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENST00000381568.9,1,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
ENST00000622217.1,0,1,1,0,1,0,0,1,0,0,...,1,1,1,0,1,0,0,0,0,0
ENST00000639802.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
###### Transcript selection #######
"""
100 transcripts are selected for each tissue, an iterative selection is performed increasing the threshold
(number of tissue with significant T-test value for the transcript). Transcripts with fewer total associations
are selected first to maximise the tissue information each transcript can provide.
"""

In [10]:
data_selection={}
threshold = 1
for tissue in bin_ttest.columns[1:]:
    while tissue not in data_selection:
        data=bin_ttest[(bin_ttest.sum(axis=1) <= threshold)&(bin_ttest.sum(axis=1) > 0)]
        if len(data[data[tissue] <= threshold]) >= 100:
            data_selection[tissue]= data[data[tissue] <= threshold][:100].index
        if threshold == len(bin_ttest.columns[1:]):
            break
        threshold=threshold+1

In [11]:
index_selection=set([value for values in data_selection.values() for value in values])

In [178]:
index_selection

{'ENST00000002165.10',
 'ENST00000002596.5',
 'ENST00000002829.7',
 'ENST00000003084.10',
 'ENST00000003583.12',
 'ENST00000003912.7',
 'ENST00000004103.7',
 'ENST00000004531.14',
 'ENST00000005995.7',
 'ENST00000006724.7',
 'ENST00000007699.9',
 'ENST00000008391.3',
 'ENST00000023064.8',
 'ENST00000046087.6',
 'ENST00000161006.7',
 'ENST00000217885.5',
 'ENST00000222381.7',
 'ENST00000222462.2',
 'ENST00000225275.3',
 'ENST00000226299.8',
 'ENST00000231749.7',
 'ENST00000234701.7',
 'ENST00000247452.3',
 'ENST00000257621.4',
 'ENST00000261377.10',
 'ENST00000262407.5',
 'ENST00000262418.10',
 'ENST00000262630.7',
 'ENST00000262820.7',
 'ENST00000264276.10',
 'ENST00000265723.8',
 'ENST00000265742.7',
 'ENST00000309032.7',
 'ENST00000314669.9',
 'ENST00000321037.4',
 'ENST00000327906.7',
 'ENST00000330550.8',
 'ENST00000330692.11',
 'ENST00000337248.8',
 'ENST00000339809.10',
 'ENST00000340022.6',
 'ENST00000341376.10',
 'ENST00000342002.6',
 'ENST00000348433.10',
 'ENST00000353205.5',

In [None]:
###### Read raw data for selected transcripts #######
"""
The complete list of transcripts is read from the previously exported 1) file, then index of the transcripts
matching the selection is used to read only the raw values of the selected transcripts.
"""

In [12]:
transcripts=pd.read_csv('Top_T-test_GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct',header=0,sep=',',usecols=[0]).rename(columns={'Unnamed: 0':'transcript_id'})


In [187]:
transcripts

Unnamed: 0,transcript_id
0,ENST00000373020.8
1,ENST00000494424.1
2,ENST00000496771.5
3,ENST00000612152.4
4,ENST00000614008.4
...,...
116972,ENST00000638486.1
116973,ENST00000381568.9
116974,ENST00000622217.1
116975,ENST00000639802.1


In [13]:
rows=[0]+list(transcripts[transcripts['transcript_id'].isin(index_selection)].index.values+1)

In [14]:
data=pd.read_csv('Top_T-test_GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct',header=0,sep=',',index_col=0,usecols=lambda x: x not in ['gene_id'],skiprows=lambda x: x not in rows).T


In [422]:
data

Unnamed: 0,ENST00000373020.8,ENST00000496771.5,ENST00000612152.4,ENST00000614008.4,ENST00000373031.4,ENST00000485971.1,ENST00000371582.8,ENST00000371584.8,ENST00000371588.9,ENST00000466152.5,...,ENST00000419417.5,ENST00000522523.5,ENST00000460412.5,ENST00000469954.5,ENST00000372581.1,ENST00000247452.3,ENST00000522159.5,ENST00000538154.5,ENST00000441542.6,ENST00000523867.5
GTEX-1117F-0226-SM-5GZZ7,26.32,2.53,0.47,0.23,8.73,2.84,0.91,0.00,32.33,2.12,...,0.00,0.00,0.0,1.87,0.13,0.03,0.00,0.0,0.0,0.0
GTEX-1117F-0426-SM-5EGHI,3.95,0.48,0.21,0.24,0.44,0.15,1.73,0.96,56.37,2.47,...,0.00,0.00,0.0,1.98,0.00,0.02,0.00,0.0,0.0,0.0
GTEX-1117F-0526-SM-5EGHJ,13.23,1.98,0.15,1.32,1.49,0.26,0.00,0.30,29.72,2.49,...,0.00,0.00,0.0,2.36,0.00,0.00,1.03,0.0,0.0,0.0
GTEX-1117F-0626-SM-5N9CS,30.15,4.18,1.23,1.15,4.70,1.48,0.52,0.30,34.21,2.91,...,0.00,0.00,0.0,1.53,0.00,0.03,0.00,0.0,0.0,0.0
GTEX-1117F-0726-SM-5GIEN,6.60,0.38,0.31,0.74,0.03,0.00,0.00,0.00,17.18,1.78,...,0.00,0.00,0.0,0.59,0.09,0.00,0.00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTEX-ZZPU-2126-SM-5EGIU,68.85,28.48,5.59,4.23,1.72,0.28,2.02,3.70,30.97,1.19,...,0.11,0.00,0.0,0.70,0.12,0.03,0.00,0.0,0.0,0.0
GTEX-ZZPU-2226-SM-5EGIV,30.73,0.89,0.20,0.00,0.04,0.00,1.18,2.16,25.62,0.61,...,0.00,0.00,0.0,2.14,0.12,0.03,0.00,0.0,0.0,0.0
GTEX-ZZPU-2426-SM-5E44I,5.05,0.56,0.20,0.42,0.13,0.00,0.29,0.74,31.52,0.68,...,0.00,0.00,0.0,1.49,0.00,0.02,0.00,0.0,0.0,0.0
GTEX-ZZPU-2626-SM-5E45Y,1.67,0.50,0.14,0.00,4.73,0.13,1.05,0.00,23.54,0.89,...,0.00,0.01,0.0,6.13,0.08,0.00,0.00,0.0,0.0,0.0


In [None]:
###### HKG raw data #######
"""
REVIEW:
Raw data for a published list of Housekeeping Genes is extracted from GTEX data. Transcripts with at least 1 TPM
are filtered and the top 150 with less relative variation are selected.
Publication:
Wang, Z., Lyu, Z., Pan, L. et al. Defining housekeeping genes suitable for RNA-seq analysis of the human allograft kidney biopsy tissue. BMC Med Genomics 12, 86 (2019). https://doi.org/10.1186/s12920-019-0538-z
"""

In [250]:
HKG={'ANAPC5':'ENSG00000089053','ANAPC15':'ENSG00000110200','ARID3B':'ENSG00000179361','ARL10':'ENSG00000175414','ATXN2':'ENSG00000204842','C3orf49':'ENSG00000163632','CCAR1':'ENSG00000060339','CCDC125':'ENSG00000277868','CCDC90B':'ENSG00000137500','CHFR':'ENSG00000072609','DHRSX':'ENSG00000169084','FRMD8':'ENSG00000126391','GGA1':'ENSG00000100083','HERC4':'ENSG00000148634','MKNK1':'ENSG00000079277','NASP':'ENSG00000132780','NME4':'ENSG00000103202','OTUB1':'ENSG00000167770','POLR2B':'ENSG00000047315','POLR3A':'ENSG00000148606','POMK':'ENSG00000185900','PSMA3-AS1':'ENSG00000257621','PTPN14':'ENSG00000152104','RAPGEF6':'ENSG00000158987','REL':'ENSG00000162924','RRP1':'ENSG00000160214','RUNDC1':'ENSG00000198863','SAMD4B':'ENSG00000179134','SLC4A1AP':'ENSG00000163798','SLMAP':'ENSG00000163681','SMARCAL1':'ENSG00000138375','SNAP29':'ENSG00000099940','SNRNP200':'ENSG00000144028','SUPT4H1':'ENSG00000213246','TBC1D22A':'ENSG00000054611','THUMPD3-AS1':'ENSG00000206573','TSPOAP1-AS1':'ENSG00000265148','TUBGCP2':'ENSG00000130640','WDTC1':'ENSG00000142784','ZNF544':'ENSG00000198131'}

In [243]:
# HKG=['ANAPC5','ANAPC15','ARID3B','ARL10','ATXN2','C16orf62','C3orf49','CCAR1','CCDC125','CCDC90B','CHFR','DHRSX','FRMD8','GGA1','HERC4','MKNK1','NASP','NME4','OTUB1','PMF1/PMF1-BGLAP','POLR2B','POLR3A','POMK','PSMA3-AS1','PTPN14','RAPGEF6','REL','RRP1','RUNDC1','SAMD4B','SLC4A1AP','SLMAP','SMARCAL1','SNAP29','SNRNP200','SUPT4H1','TBC1D22A','THUMPD3-AS1','TSPOAP1-AS1','TUBGCP2','WDTC1','ZNF544']

In [301]:
count=0
added=0
write_header=True

In [302]:
for chunk in pd.read_csv('../GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz',header=2,sep='\t',chunksize=chunksize,index_col='transcript_id'):
    output=chunk[chunk['gene_id'].str.split('.',1).str[0].isin(list(HKG.values()))]
    if not output.empty: 
        output.to_csv('HKG_GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct',chunksize=len(output),mode='a',index=True,header=write_header)
        write_header=False
    count=count+chunksize
    added=added+len(output)
    print(count,"transcripts processed,",added,'transcripts added.',end="\r",flush=True)

200000 transcripts processed, 494 transcripts added.

In [20]:
HKG_raw=pd.read_csv('HKG_GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct',index_col='transcript_id')#,usecols=lambda x: x not in ['gene_id'])

In [21]:
HKG_top=HKG_raw[HKG_raw.mean(axis=1) >= 1]

In [22]:
HKG_top_rel_data=HKG_top.drop(columns='gene_id')/HKG_top.mean()

In [23]:
HKG_transc=HKG_top_rel_data.loc[HKG_top_rel_data.std(axis=1).nsmallest(150).index].index.tolist()

In [445]:
HKG_raw.loc[HKG_transc].drop(columns='gene_id').mean()

GTEX-1117F-0226-SM-5GZZ7    4.257200
GTEX-1117F-0426-SM-5EGHI    2.160800
GTEX-1117F-0526-SM-5EGHJ    3.955667
GTEX-1117F-0626-SM-5N9CS    4.651267
GTEX-1117F-0726-SM-5GIEN    2.100333
                              ...   
GTEX-ZZPU-2126-SM-5EGIU     4.355200
GTEX-ZZPU-2226-SM-5EGIV     2.884800
GTEX-ZZPU-2426-SM-5E44I     3.225267
GTEX-ZZPU-2626-SM-5E45Y     2.067067
GTEX-ZZPU-2726-SM-5NQ8O     2.991467
Length: 17382, dtype: float64

In [None]:
###### Normalize GTEX data #######
"""
Raw data for transcripts selected before are normalized by dividing raw TPM by the mean expression 
of the Housekeeping Genes filtered above for each sample.
"""

In [24]:
rel_data=data.T/HKG_raw.loc[HKG_transc].drop(columns='gene_id').mean()

In [459]:
rel_data

Unnamed: 0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,GTEX-1117F-2926-SM-5GZYI,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
ENST00000373020.8,6.182467,1.828027,3.344569,6.482105,3.142358,13.015987,4.754024,4.268734,13.090899,2.478849,...,2.741102,4.474821,3.775171,3.507979,1.144950,15.808688,10.652385,1.565762,0.807908,11.001293
ENST00000496771.5,0.594287,0.222140,0.500548,0.898680,0.180924,0.995538,1.817238,0.277191,0.407915,0.093063,...,0.382702,0.238513,0.212436,0.436379,0.211064,6.539309,0.308514,0.173629,0.241889,0.715368
ENST00000612152.4,0.110401,0.097186,0.037920,0.264444,0.147596,0.423776,0.228777,0.124181,0.410808,0.104343,...,0.110027,0.089442,0.130560,0.190651,0.069391,1.283523,0.069329,0.062010,0.067729,0.330941
ENST00000614008.4,0.054026,0.111070,0.333698,0.247244,0.352325,0.000000,0.890771,0.000000,0.000000,0.045121,...,0.143513,0.000000,0.000000,0.076260,0.130108,0.971253,0.000000,0.130222,0.000000,0.130371
ENST00000373031.4,2.050644,0.203628,0.376675,1.010477,0.014283,3.766901,0.035696,0.070961,8.473645,0.290468,...,0.095675,0.000000,0.026555,0.038130,0.011565,0.394930,0.013866,0.040307,2.288267,5.987030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENST00000247452.3,0.007047,0.009256,0.000000,0.006450,0.000000,0.000000,0.000000,0.004435,0.000000,0.005640,...,0.009568,0.005421,0.000000,0.000000,0.000000,0.006888,0.010399,0.006201,0.000000,0.000000
ENST00000522159.5,0.000000,0.000000,0.260386,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.176174,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ENST00000538154.5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ENST00000441542.6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [25]:
data=samples.merge(rel_data.T,right_index=True,left_on='SAMPID').rename(columns={'SMTSD':'Tissue'}).set_index('Tissue').drop(columns=['SAMPID','SMTS'])

In [26]:
data.to_csv('Normalised_data_for_prediction.csv.gz',compression='gzip',index=True)

In [None]:
data=pd.read_csv('Normalised_data_for_prediction.csv.gz',compression='gzip',index_col=0)

In [462]:
data

Unnamed: 0_level_0,ENST00000373020.8,ENST00000496771.5,ENST00000612152.4,ENST00000614008.4,ENST00000373031.4,ENST00000485971.1,ENST00000371582.8,ENST00000371584.8,ENST00000371588.9,ENST00000466152.5,...,ENST00000419417.5,ENST00000522523.5,ENST00000460412.5,ENST00000469954.5,ENST00000372581.1,ENST00000247452.3,ENST00000522159.5,ENST00000538154.5,ENST00000441542.6,ENST00000523867.5
Tissue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Adipose - Subcutaneous,6.182467,0.594287,0.110401,0.054026,2.050644,0.667105,0.213756,0.000000,7.594193,0.497980,...,0.000000,0.000000,0.0,0.439256,0.030537,0.007047,0.000000,0.0,0.0,0.0
Muscle - Skeletal,1.828027,0.222140,0.097186,0.111070,0.203628,0.069419,0.800629,0.444280,26.087560,1.143095,...,0.000000,0.000000,0.0,0.916327,0.000000,0.009256,0.000000,0.0,0.0,0.0
Artery - Tibial,3.344569,0.500548,0.037920,0.333698,0.376675,0.065728,0.000000,0.075841,7.513272,0.629477,...,0.000000,0.000000,0.0,0.596612,0.000000,0.000000,0.260386,0.0,0.0,0.0
Artery - Coronary,6.482105,0.898680,0.264444,0.247244,1.010477,0.318193,0.111798,0.064499,7.354986,0.625636,...,0.000000,0.000000,0.0,0.328943,0.000000,0.006450,0.000000,0.0,0.0,0.0
Heart - Atrial Appendage,3.142358,0.180924,0.147596,0.352325,0.014283,0.000000,0.000000,0.000000,8.179654,0.847485,...,0.000000,0.000000,0.0,0.280908,0.042850,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ovary,15.808688,6.539309,1.283523,0.971253,0.394930,0.064291,0.463813,0.849559,7.111040,0.273237,...,0.025257,0.000000,0.0,0.160727,0.027553,0.006888,0.000000,0.0,0.0,0.0
Vagina,10.652385,0.308514,0.069329,0.000000,0.013866,0.000000,0.409040,0.748752,8.881032,0.211453,...,0.000000,0.000000,0.0,0.741819,0.041597,0.010399,0.000000,0.0,0.0,0.0
Artery - Tibial,1.565762,0.173629,0.062010,0.130222,0.040307,0.000000,0.089915,0.229438,9.772835,0.210835,...,0.000000,0.000000,0.0,0.461977,0.000000,0.006201,0.000000,0.0,0.0,0.0
Muscle - Skeletal,0.807908,0.241889,0.067729,0.000000,2.288267,0.062891,0.507966,0.000000,11.388118,0.430562,...,0.000000,0.004838,0.0,2.965555,0.038702,0.000000,0.000000,0.0,0.0,0.0


In [27]:
reindex_data=data.reset_index()

In [28]:
reindex_data

Unnamed: 0,Tissue,ENST00000373020.8,ENST00000496771.5,ENST00000612152.4,ENST00000614008.4,ENST00000373031.4,ENST00000485971.1,ENST00000371582.8,ENST00000371584.8,ENST00000371588.9,...,ENST00000419417.5,ENST00000522523.5,ENST00000460412.5,ENST00000469954.5,ENST00000372581.1,ENST00000247452.3,ENST00000522159.5,ENST00000538154.5,ENST00000441542.6,ENST00000523867.5
0,Adipose - Subcutaneous,6.182467,0.594287,0.110401,0.054026,2.050644,0.667105,0.213756,0.000000,7.594193,...,0.000000,0.000000,0.0,0.439256,0.030537,0.007047,0.000000,0.0,0.0,0.0
1,Muscle - Skeletal,1.828027,0.222140,0.097186,0.111070,0.203628,0.069419,0.800629,0.444280,26.087560,...,0.000000,0.000000,0.0,0.916327,0.000000,0.009256,0.000000,0.0,0.0,0.0
2,Artery - Tibial,3.344569,0.500548,0.037920,0.333698,0.376675,0.065728,0.000000,0.075841,7.513272,...,0.000000,0.000000,0.0,0.596612,0.000000,0.000000,0.260386,0.0,0.0,0.0
3,Artery - Coronary,6.482105,0.898680,0.264444,0.247244,1.010477,0.318193,0.111798,0.064499,7.354986,...,0.000000,0.000000,0.0,0.328943,0.000000,0.006450,0.000000,0.0,0.0,0.0
4,Heart - Atrial Appendage,3.142358,0.180924,0.147596,0.352325,0.014283,0.000000,0.000000,0.000000,8.179654,...,0.000000,0.000000,0.0,0.280908,0.042850,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17377,Ovary,15.808688,6.539309,1.283523,0.971253,0.394930,0.064291,0.463813,0.849559,7.111040,...,0.025257,0.000000,0.0,0.160727,0.027553,0.006888,0.000000,0.0,0.0,0.0
17378,Vagina,10.652385,0.308514,0.069329,0.000000,0.013866,0.000000,0.409040,0.748752,8.881032,...,0.000000,0.000000,0.0,0.741819,0.041597,0.010399,0.000000,0.0,0.0,0.0
17379,Artery - Tibial,1.565762,0.173629,0.062010,0.130222,0.040307,0.000000,0.089915,0.229438,9.772835,...,0.000000,0.000000,0.0,0.461977,0.000000,0.006201,0.000000,0.0,0.0,0.0
17380,Muscle - Skeletal,0.807908,0.241889,0.067729,0.000000,2.288267,0.062891,0.507966,0.000000,11.388118,...,0.000000,0.004838,0.0,2.965555,0.038702,0.000000,0.000000,0.0,0.0,0.0


In [493]:
###### Identify significant (DE) transcripts for each tissue #######
"""
A T-test is used to assess the p-value of the expression of every transcript of being differentially expressed
for every tissue. If the p-value falls under a certain threshold, the transcripts is considered to have a DE
for such tissue. This section is currently not necessary to run the whole script!
"""

In [31]:
significant={}

In [495]:
for Tissue in data.index.unique(): significant[Tissue]=[]

In [496]:
for Transc in data.columns:
    for Tissue in data.index.unique():
        ttest=ttest_ind(data[Transc][Tissue],data[Transc][data.index != Tissue])
        if ttest.pvalue <= 0.00001:
            significant[Tissue].append(Transc)

In [497]:
pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in significant.items() ]))

Unnamed: 0,Adipose - Subcutaneous,Muscle - Skeletal,Artery - Tibial,Artery - Coronary,Heart - Atrial Appendage,Adipose - Visceral (Omentum),Uterus,Vagina,Breast - Mammary Tissue,Skin - Not Sun Exposed (Suprapubic),...,Brain - Cerebellar Hemisphere,Liver,Brain - Substantia nigra,Kidney - Cortex,Brain - Amygdala,Cervix - Ectocervix,Fallopian Tube,Cervix - Endocervix,Bladder,Kidney - Medulla
0,ENST00000373020.8,ENST00000373020.8,ENST00000373020.8,ENST00000373020.8,ENST00000373020.8,ENST00000373020.8,ENST00000496771.5,ENST00000373020.8,ENST00000373020.8,ENST00000373020.8,...,ENST00000373020.8,ENST00000373020.8,ENST00000612152.4,ENST00000373020.8,ENST00000612152.4,ENST00000222462.2,ENST00000431869.1,ENST00000358495.7,ENST00000464944.1,ENST00000475760.1
1,ENST00000612152.4,ENST00000496771.5,ENST00000496771.5,ENST00000612152.4,ENST00000612152.4,ENST00000612152.4,ENST00000612152.4,ENST00000359326.8,ENST00000496771.5,ENST00000496771.5,...,ENST00000496771.5,ENST00000612152.4,ENST00000371588.9,ENST00000612152.4,ENST00000371588.9,ENST00000361301.6,ENST00000475688.1,ENST00000540167.5,ENST00000493057.1,ENST00000480582.1
2,ENST00000373031.4,ENST00000612152.4,ENST00000612152.4,ENST00000367429.8,ENST00000373031.4,ENST00000373031.4,ENST00000459772.5,ENST00000459772.5,ENST00000612152.4,ENST00000612152.4,...,ENST00000612152.4,ENST00000371588.9,ENST00000496973.5,ENST00000413811.3,ENST00000413811.3,ENST00000540167.5,ENST00000478269.5,,ENST00000540167.5,ENST00000481031.5
3,ENST00000485971.1,ENST00000614008.4,ENST00000485971.1,ENST00000466229.5,ENST00000485971.1,ENST00000485971.1,ENST00000367429.8,ENST00000367429.8,ENST00000373031.4,ENST00000371582.8,...,ENST00000371582.8,ENST00000359326.8,ENST00000367429.8,ENST00000496973.5,ENST00000459772.5,ENST00000572061.1,ENST00000501740.6,,ENST00000573483.1,ENST00000539668.1
4,ENST00000371588.9,ENST00000485971.1,ENST00000371582.8,ENST00000470918.1,ENST00000371582.8,ENST00000371582.8,ENST00000466229.5,ENST00000435187.1,ENST00000485971.1,ENST00000371584.8,...,ENST00000371584.8,ENST00000413811.3,ENST00000496761.1,ENST00000630130.2,ENST00000496973.5,ENST00000416767.8,ENST00000564701.1,,,ENST00000617670.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,,,,,,,,,,,...,,,,,,,,,,
284,,,,,,,,,,,...,,,,,,,,,,
285,,,,,,,,,,,...,,,,,,,,,,
286,,,,,,,,,,,...,,,,,,,,,,


In [505]:
# Number of transcripts with significant DE for each tissue
for i in significant:
    print(i,len(significant[i]))

Adipose - Subcutaneous 131
Muscle - Skeletal 233
Artery - Tibial 185
Artery - Coronary 51
Heart - Atrial Appendage 111
Adipose - Visceral (Omentum) 99
Uterus 43
Vagina 39
Breast - Mammary Tissue 75
Skin - Not Sun Exposed (Suprapubic) 129
Minor Salivary Gland 70
Brain - Cortex 109
Adrenal Gland 90
Thyroid 135
Lung 114
Spleen 103
Pancreas 124
Esophagus - Muscularis 114
Esophagus - Mucosa 162
Esophagus - Gastroesophageal Junction 75
Stomach 79
Colon - Sigmoid 91
Small Intestine - Terminal Ileum 55
Colon - Transverse 102
Prostate 77
Testis 288
Nerve - Tibial 152
Skin - Sun Exposed (Lower leg) 152
Heart - Left Ventricle 122
Brain - Cerebellum 155
Cells - Cultured fibroblasts 197
Whole Blood 272
Artery - Aorta 106
Cells - EBV-transformed lymphocytes 142
Pituitary 108
Brain - Frontal Cortex (BA9) 93
Brain - Caudate (basal ganglia) 107
Brain - Nucleus accumbens (basal ganglia) 115
Brain - Putamen (basal ganglia) 91
Brain - Hypothalamus 103
Brain - Spinal cord (cervical c-1) 69
Brain - Hippocam

In [506]:
####### Calculate ratio of success for each transcript for each tissue #######
"""
For each tissue type a T-test is used to assess the p-value of that transcript of being differentially 
expressed in that tissue. If so, for each sample of the same tissue a two-sided t-test is used to obtain
the p-value of the sample expression of belonging to a normal distribution of the whole tissue-transcript
expression. If the p-value falls under a certain threshold, that sample is considered to be successfully
associated to the rest of samples of the same tissue, otherwise if the p-value is above the threshold
that sample is considered to fail to associate to the rest of samples of the same tissue. After testing 
all samples, sucesses and fails are used as a score of how likely a DE expressed transcript would have a
significant expression in one sample of a specific tissue.
"""

In [507]:
fails={}
success={}
ratio={}

In [514]:
len_tran=len(data.columns)
for Tissue in data.index.unique().sort_values():
    print(Tissue)
    fails[Tissue]=[]
    success[Tissue]=[]
    ratio[Tissue]={}
    for Transc in data.columns:
        print_cum=('Transcript '+str(data.columns.get_loc(Transc)+1)+' of '+str(len_tran)+": ")  
        ttest_tissue=ttest_ind(data[Transc][Tissue],data[Transc][data.index != Tissue])
        if ttest_tissue.pvalue <= 0.00001:
            print_cum=print_cum+'Tissue expression significant:'
            for sample in list((reindex_data[reindex_data['Tissue'] == Tissue]).index):
                print_cum_fin=print_cum+'Sample '+str((reindex_data[reindex_data['Tissue'] == Tissue]).index.get_loc(sample))+' of '+str(len(list((reindex_data[reindex_data['Tissue'] == Tissue]).index)))
                print(print_cum_fin,end="\r",flush=True)
                ttest_sample=stats.ttest_1samp(data[Transc][Tissue],data.iloc[sample][Transc])
                if ttest_sample.pvalue <= 0.0001: success[Tissue].append(Transc)
                elif ttest_sample.pvalue >= 0.0001: fails[Tissue].append(Transc)
                if (data[Transc][Tissue]).mean() == 0 and data.iloc[sample][Transc] == 0:
                    success[Tissue].append(Transc)
                if (data[Transc][Tissue]).mean() == 0 and data.iloc[sample][Transc] != 0:
                    fails[Tissue].append(Transc)
            ratio[Tissue][Transc]=success[Tissue].count(Transc)/(success[Tissue].count(Transc)+fails[Tissue].count(Transc))
        else: ratio[Tissue][Transc]= np.nan
    print(len_tran,"transcripts processed,",len(success[Tissue]),"successful associations",len(fails[Tissue]),"failed. Mean ratio of",pd.Series(ratio[Tissue]).mean(),end="\r",flush=True)
    print("")
    print('###########')

Adipose - Subcutaneous
448 transcripts processed, 75956 successful associations 10897 failed. Mean ratio of 0.8745351340771188
###########
Adipose - Visceral (Omentum)
448 transcripts processed, 46546 successful associations 7013 failed. Mean ratio of 0.8690602886536343
###########
Adrenal Gland
448 transcripts processed, 18492 successful associations 4728 failed. Mean ratio of 0.7963824289405685
###########
Artery - Aorta
448 transcripts processed, 39580 successful associations 6212 failed. Mean ratio of 0.8643431167016074
###########
Artery - Coronary
448 transcripts processed, 9527 successful associations 2713 failed. Mean ratio of 0.7783496732026144
###########
Artery - Tibial
448 transcripts processed, 108288 successful associations 14367 failed. Mean ratio of 0.8828665769842241
###########
Bladder
448 transcripts processed, 18 successful associations 66 failed. Mean ratio of 0.21428571428571427
###########
Brain - Amygdala
448 transcripts processed, 8761 successful associations 4

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 77 of 448: Tissue expression significant:Sample 236 of 701

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 98 of 448: Tissue expression significant:Sample 665 of 701

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 124 of 448: Tissue expression significant:Sample 395 of 701

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 140 of 448: Tissue expression significant:Sample 105 of 701

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 153 of 448: Tissue expression significant:Sample 540 of 701

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 162 of 448: Tissue expression significant:Sample 36 of 7011

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 169 of 448: Tissue expression significant:Sample 208 of 701

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 184 of 448: Tissue expression significant:Sample 394 of 701

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 194 of 448: Tissue expression significant:Sample 570 of 701

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 205 of 448: Tissue expression significant:Sample 68 of 7011

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 211 of 448: Tissue expression significant:Sample 224 of 701

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 240 of 448: Tissue expression significant:Sample 412 of 701

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 253 of 448: Tissue expression significant:Sample 374 of 701

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 286 of 448: Tissue expression significant:Sample 691 of 701

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 301 of 448: Tissue expression significant:Sample 656 of 701

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 440 of 448: Tissue expression significant:Sample 262 of 701

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 54 of 448: Tissue expression significant:Sample 87 of 1877

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 152 of 448: Tissue expression significant:Sample 117 of 187

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 387 of 448: Tissue expression significant:Sample 141 of 187

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 31 of 448: Tissue expression significant:Sample 47 of 2411

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 47 of 448: Tissue expression significant:Sample 215 of 241

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 95 of 448: Tissue expression significant:Sample 214 of 241

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 153 of 448: Tissue expression significant:Sample 44 of 2411

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 178 of 448: Tissue expression significant:Sample 80 of 2411

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 286 of 448: Tissue expression significant:Sample 129 of 241

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 23 of 448: Tissue expression significant:Sample 193 of 359

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 54 of 448: Tissue expression significant:Sample 161 of 359

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 77 of 448: Tissue expression significant:Sample 1 of 35959

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 96 of 448: Tissue expression significant:Sample 329 of 359

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 120 of 448: Tissue expression significant:Sample 173 of 359

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 187 of 448: Tissue expression significant:Sample 138 of 359

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 214 of 448: Tissue expression significant:Sample 337 of 359

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



448 transcripts processed, 23419 successful associations 4942 failed. Mean ratio of 0.8257466238849125
###########
Testis
Transcript 2 of 448: Tissue expression significant:Sample 295 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 12 of 448: Tissue expression significant:Sample 159 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 30 of 448: Tissue expression significant:Sample 86 of 3611

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 37 of 448: Tissue expression significant:Sample 310 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 52 of 448: Tissue expression significant:Sample 221 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 62 of 448: Tissue expression significant:Sample 103 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 76 of 448: Tissue expression significant:Sample 19 of 3611

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 83 of 448: Tissue expression significant:Sample 243 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 96 of 448: Tissue expression significant:Sample 122 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 112 of 448: Tissue expression significant:Sample 2 of 36161

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 132 of 448: Tissue expression significant:Sample 244 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 140 of 448: Tissue expression significant:Sample 130 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 151 of 448: Tissue expression significant:Sample 163 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 161 of 448: Tissue expression significant:Sample 161 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 174 of 448: Tissue expression significant:Sample 267 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 186 of 448: Tissue expression significant:Sample 264 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 200 of 448: Tissue expression significant:Sample 5 of 36161

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 207 of 448: Tissue expression significant:Sample 26 of 3611

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 234 of 448: Tissue expression significant:Sample 139 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 245 of 448: Tissue expression significant:Sample 148 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 277 of 448: Tissue expression significant:Sample 262 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 285 of 448: Tissue expression significant:Sample 264 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 310 of 448: Tissue expression significant:Sample 18 of 3611

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 317 of 448: Tissue expression significant:Sample 34 of 3611

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 342 of 448: Tissue expression significant:Sample 140 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 375 of 448: Tissue expression significant:Sample 157 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 399 of 448: Tissue expression significant:Sample 266 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 420 of 448: Tissue expression significant:Sample 277 of 361

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 442 of 448: Tissue expression significant:Sample 16 of 3611

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



448 transcripts processed, 86423 successful associations 17545 failed. Mean ratio of 0.8312461526623577
###########
Thyroid
Transcript 2 of 448: Tissue expression significant:Sample 392 of 653

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 10 of 448: Tissue expression significant:Sample 632 of 653

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 17 of 448: Tissue expression significant:Sample 559 of 653

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 35 of 448: Tissue expression significant:Sample 128 of 653

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 42 of 448: Tissue expression significant:Sample 66 of 6533

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 54 of 448: Tissue expression significant:Sample 294 of 653

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 60 of 448: Tissue expression significant:Sample 228 of 653

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 70 of 448: Tissue expression significant:Sample 447 of 653

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 75 of 448: Tissue expression significant:Sample 379 of 653

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 85 of 448: Tissue expression significant:Sample 561 of 653

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 93 of 448: Tissue expression significant:Sample 528 of 653

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 103 of 448: Tissue expression significant:Sample 461 of 653

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 131 of 448: Tissue expression significant:Sample 611 of 653

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 147 of 448: Tissue expression significant:Sample 138 of 653

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 179 of 448: Tissue expression significant:Sample 285 of 653

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 211 of 448: Tissue expression significant:Sample 459 of 653

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 247 of 448: Tissue expression significant:Sample 631 of 653

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 421 of 448: Tissue expression significant:Sample 98 of 6533

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 241 of 448: Tissue expression significant:Sample 94 of 1422

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



448 transcripts processed, 4606 successful associations 1478 failed. Mean ratio of 0.7570677186061802
###########
Whole Blood
Transcript 2 of 448: Tissue expression significant:Sample 389 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 12 of 448: Tissue expression significant:Sample 145 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 22 of 448: Tissue expression significant:Sample 622 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 32 of 448: Tissue expression significant:Sample 397 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 42 of 448: Tissue expression significant:Sample 142 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 51 of 448: Tissue expression significant:Sample 628 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 62 of 448: Tissue expression significant:Sample 407 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 73 of 448: Tissue expression significant:Sample 184 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 82 of 448: Tissue expression significant:Sample 674 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 96 of 448: Tissue expression significant:Sample 443 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 102 of 448: Tissue expression significant:Sample 401 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 118 of 448: Tissue expression significant:Sample 182 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 130 of 448: Tissue expression significant:Sample 702 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 145 of 448: Tissue expression significant:Sample 464 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 156 of 448: Tissue expression significant:Sample 241 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 166 of 448: Tissue expression significant:Sample 29 of 7555

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 178 of 448: Tissue expression significant:Sample 581 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 190 of 448: Tissue expression significant:Sample 366 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 203 of 448: Tissue expression significant:Sample 103 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 220 of 448: Tissue expression significant:Sample 621 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 235 of 448: Tissue expression significant:Sample 337 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 248 of 448: Tissue expression significant:Sample 97 of 7555

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 257 of 448: Tissue expression significant:Sample 607 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 278 of 448: Tissue expression significant:Sample 372 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 293 of 448: Tissue expression significant:Sample 140 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 317 of 448: Tissue expression significant:Sample 627 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 367 of 448: Tissue expression significant:Sample 438 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 394 of 448: Tissue expression significant:Sample 193 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Transcript 427 of 448: Tissue expression significant:Sample 678 of 755

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [516]:
ratiodata=pd.DataFrame(ratio).T

In [518]:
ratiodata.to_csv('Ratiodata_v8.csv',index=True)

In [32]:
ratiodata=pd.read_csv('Ratiodata_v8.csv',index_col=0)

In [33]:
# Only transcripts with a high score are used.
for i in ratiodata.columns:
    for b in ratiodata[i].index:
        if ratiodata[i][b]<=0.7:ratiodata[i][b]=np.nan

In [34]:
# To avoid transcripts associated to multiple tissues being over-informative, a relative score of each 
# transcript-tissue association can be used insted.
rel_ratiodata=ratiodata/ratiodata.sum()

In [None]:
####### Transcript-tissue scoring function ######
"""
A simple sample relative expression set is passed to this function. For each transcript selected previously, 
a T-test is performed to obtain the p-value of the expression of such transcript in the sample to belong to 
the expression pattern of each tissue. If the p-value falls under a certain threshold, the combination 
transcript-tissue gets a scoring equal to the ratio of success of such transcript for that tissue as calculated 
above. The list of transcript-tissue scores is returned to calculate the weights.
"""

In [298]:
def t_test_ratio_external (i):
    value={}
    for Transc in data.columns:
        value[Transc]={}
        for Tissue in data.index.unique():
            ttest_sample=stats.ttest_1samp(data[Transc][Tissue],i[Transc])
            value[Transc][Tissue]=np.nan
            if ttest_sample.pvalue >= 0.0001: value[Transc][Tissue] = ratiodata[Transc][Tissue]## 
    return {'value':value}

In [None]:
##### LOAD eQTLs #######
"""
Pre-finemapped eQTLs from GTEX database are loaded to make predictions based on attributed weights for each tissue.
Only CaVEMaN dataset is being used here although a consensus dataset is recommended.
"""

In [16]:
eqtl_file = '/nfs/research1/zerbino/jhidalgo/inteql_GTEX_v8/data/original-data/pre-finemapped/GTEx_v8_finemapping_CaVEMaN/GTEx_v8_finemapping_CaVEMaN.txt.gz'

In [17]:
eQTLs=pd.read_csv(eqtl_file,sep='\t',compression='gzip')

In [629]:
eQTLs

Unnamed: 0,TISSUE,GENE,eQTL,CHROM,POS,REF,ALT,Probability
0,Adipose_Subcutaneous,ENSG00000000003.14,chrX_100649875_A_G,chrX,100649875,A,G,0.986724
1,Adipose_Subcutaneous,ENSG00000000457.13,chr1_169699715_T_C,chr1,169699715,T,C,0.174525
2,Adipose_Subcutaneous,ENSG00000000457.13,chr1_169891332_G_A,chr1,169891332,G,A,0.266151
3,Adipose_Subcutaneous,ENSG00000000460.16,chr1_169787407_G_C,chr1,169829604,C,G,0.176874
4,Adipose_Subcutaneous,ENSG00000000938.12,chr1_27634321_C_G,chr1,27634281,G,A,0.226577
...,...,...,...,...,...,...,...,...
1314538,Whole_Blood,ENSG00000284167.1,chr2_143097724_G_A,chr2,143097724,G,A,0.264387
1314539,Whole_Blood,ENSG00000284526.1,chr17_76585863_A_G,chr17,76585863,A,G,0.231189
1314540,Whole_Blood,ENSG00000284526.1,chr17_76585863_A_G,chr17,76585864,A,C,0.231189
1314541,Whole_Blood,ENSG00000284526.1,chr17_76585863_A_G,chr17,76585943,G,C,0.231189


In [18]:
eQTLs=eQTLs.pivot_table(index=['eQTL','GENE'],columns='TISSUE',values='Probability')

In [None]:
##### Tissue name link #######
"""
Since tissue names differ, specially in format, in eQTL and expression data, a dictionary of corresponding names
is used to match both names when scoring.
"""

In [29]:
tissues_common={}
for b in data.index.unique():
    if b in eQTLs.columns:
        tissues_common[b]=b
    else:
        if b.replace(' - ','_').replace(' ','_').replace('(','').replace(')','') in (eQTLs.columns):
            #print(b.replace(' - ','_').replace(' ','_').replace('(','').replace(')',''),'matches.')
            tissues_common[b]=b.replace(' - ','_').replace(' ','_').replace('(','').replace(')','')
        else:
            print(b.replace(' - ','_').replace(' ','_').replace('(','').replace(')',''),'doesnt match!!!!!!!!!!!!!!')

Cervix_Ectocervix doesnt match!!!!!!!!!!!!!!
Fallopian_Tube doesnt match!!!!!!!!!!!!!!
Cervix_Endocervix doesnt match!!!!!!!!!!!!!!
Bladder doesnt match!!!!!!!!!!!!!!
Kidney_Medulla doesnt match!!!!!!!!!!!!!!


In [30]:
tissues_common_inv = {v: k for k, v in tissues_common.items()}

In [552]:
###### Prediction on the mean RNA-seq value per tissue #######
"""
To test the scoring system, optimal-conditions samples are obtain with the mean transcript expression for each
tissue. This mean-sample should align much better to eQTL values than single samples randomly obtain from the dataset.
"""

In [35]:
tissue_means=data.groupby('Tissue').mean()

In [554]:
tissue_means

Unnamed: 0_level_0,ENST00000373020.8,ENST00000496771.5,ENST00000612152.4,ENST00000614008.4,ENST00000373031.4,ENST00000485971.1,ENST00000371582.8,ENST00000371584.8,ENST00000371588.9,ENST00000466152.5,...,ENST00000419417.5,ENST00000522523.5,ENST00000460412.5,ENST00000469954.5,ENST00000372581.1,ENST00000247452.3,ENST00000522159.5,ENST00000538154.5,ENST00000441542.6,ENST00000523867.5
Tissue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Adipose - Subcutaneous,7.651107,0.559521,0.312459,0.269514,5.901195,1.400464,0.417175,0.279636,8.778546,0.327058,...,0.003184,0.004096,0.006503,0.370096,0.004628,0.001631,0.006984,0.002462,0.0,0.000467
Adipose - Visceral (Omentum),8.003169,0.602775,0.299905,0.264194,3.83325,1.509557,0.558681,0.384057,9.62301,0.324878,...,0.003577,0.0054,0.004237,0.309954,0.015306,0.003876,0.007303,0.001771,7e-06,0.000546
Adrenal Gland,4.328808,0.172714,0.14348,0.073832,0.017504,0.007835,0.477154,0.337843,10.561037,0.286399,...,0.005365,0.002988,0.000831,0.188243,0.004455,0.002036,0.012706,0.000699,0.0,8.9e-05
Artery - Aorta,2.113147,0.225462,0.113634,0.093456,0.056983,0.014618,0.44216,0.317682,9.214403,0.311579,...,0.002071,0.004917,0.00184,0.336784,0.003466,0.00138,0.009087,0.001014,0.0,0.001235
Artery - Coronary,2.378499,0.265408,0.115467,0.113097,0.252642,0.057965,0.429183,0.340227,8.668406,0.279399,...,0.001615,0.004431,0.003036,0.327164,0.003016,0.001907,0.009833,0.000533,7.5e-05,0.001545
Artery - Tibial,1.778353,0.199982,0.102855,0.079631,0.404135,0.032121,0.394867,0.266236,9.4403,0.388161,...,0.001752,0.006173,0.002867,0.389944,0.003388,0.001519,0.009969,0.001006,0.0,0.000353
Bladder,5.97656,0.551584,0.215201,0.104386,0.061835,0.014078,0.364176,0.321342,7.49685,0.41936,...,0.006521,0.013173,0.007618,0.478627,0.002538,0.000701,0.007672,0.0,0.0,0.0
Brain - Amygdala,3.924991,0.570902,0.293247,0.04733,0.060324,0.004611,0.440178,0.321178,6.052392,0.300997,...,0.005452,0.000788,0.006217,0.13551,0.005878,0.003882,0.046179,0.001999,0.0,0.000251
Brain - Anterior cingulate cortex (BA24),2.80921,0.340139,0.229862,0.032461,0.069775,0.008747,0.446006,0.26618,6.546011,0.318361,...,0.002721,0.003217,0.004393,0.121468,0.009079,0.003679,0.038595,0.001678,0.0,7.5e-05
Brain - Caudate (basal ganglia),3.681646,0.331679,0.254638,0.080496,0.020659,0.005637,0.455149,0.276148,6.318172,0.307481,...,0.002401,0.005563,0.009237,0.159008,0.006197,0.001658,0.029375,0.001844,8.4e-05,0.0


In [555]:
####### Prediction and eQTL comparison for each mean RNA-seq value ######
"""
Each mean-expression sample is used as a target for weighted means prediction and compared to simple mean
prediction and to real eQTL value for that tissue.
"""

In [321]:
prediction={}
real={}
mean_prediction={}
### a = renamed eQTLs to match RNAseq names
a=eQTLs[tissues_common.values()].rename(columns=tissues_common_inv)
l=list(tissues_common.keys())
for i in l:
    print(l.index(i),'out of',len(l),':',i,end='\r',flush=True)
    test=t_test_ratio_external(tissue_means.loc[i])
    ### b = tissue scores according to T test
    b=pd.DataFrame(test['value']).sum(axis=1)
    ### b_where_a = scores for tissues with real eQTL value for such transcript
    b_where_a=b.multiply(~ np.isnan(a))
    b_where_a_sum=b_where_a.sum(axis=1)
    prediction[i]=a.multiply(b_where_a.div(b_where_a_sum,axis=0)).sum(axis=1)#.replace(0,np.nan)
    real[i]=eQTLs[tissues_common[i]]
    mean_prediction[i]=a.mean(axis=1)
    print('                                                                       ',end='\r',flush=True)

                                          rertex (BA24))

In [323]:
Predicted=pd.DataFrame(prediction)
Real=pd.DataFrame(real)
Mean_predicted=pd.DataFrame(mean_prediction)

In [324]:
Predicted

Unnamed: 0_level_0,Unnamed: 1_level_0,Adipose - Subcutaneous,Muscle - Skeletal,Artery - Tibial,Artery - Coronary,Heart - Atrial Appendage,Adipose - Visceral (Omentum),Uterus,Vagina,Breast - Mammary Tissue,Skin - Not Sun Exposed (Suprapubic),...,Brain - Hypothalamus,Brain - Spinal cord (cervical c-1),Brain - Hippocampus,Brain - Anterior cingulate cortex (BA24),Ovary,Brain - Cerebellar Hemisphere,Liver,Brain - Substantia nigra,Kidney - Cortex,Brain - Amygdala
eQTL,GENE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
chr10_100007241_C_T,ENSG00000107554.16,0.201558,0.201558,0.201558,0.201558,0.201558,0.201558,0.201558,0.201558,0.201558,0.201558,...,0.201558,0.201558,0.201558,0.201558,0.201558,0.201558,0.201558,0.201558,0.201558,0.201558
chr10_100008640_A_G,ENSG00000107554.16,0.292908,0.292908,0.292908,0.292908,0.292908,0.292908,0.292908,0.292908,0.292908,0.292908,...,0.292908,0.292908,0.292908,0.292908,0.292908,0.292908,0.292908,0.292908,0.292908,0.292908
chr10_100009635_T_G,ENSG00000107554.16,0.182330,0.182330,0.182330,0.182330,0.182330,0.182330,0.182330,0.182330,0.182330,0.182330,...,0.182330,0.182330,0.182330,0.182330,0.182330,0.182330,0.182330,0.182330,0.182330,0.182330
chr10_100014923_C_T,ENSG00000119929.12,0.189001,0.189001,0.189001,0.189001,0.189001,0.189001,0.189001,0.189001,0.189001,0.189001,...,0.189001,0.189001,0.189001,0.189001,0.189001,0.189001,0.189001,0.189001,0.189001,0.189001
chr10_100017063_G_A,ENSG00000107554.16,0.253726,0.253726,0.253726,0.253726,0.253726,0.253726,0.253726,0.253726,0.253726,0.253726,...,0.253726,0.253726,0.253726,0.253726,0.253726,0.253726,0.253726,0.253726,0.253726,0.253726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrX_99899260_C_G,ENSG00000233680.4,0.231066,0.231066,0.231066,0.231066,0.231066,0.231066,0.231066,0.231066,0.231066,0.231066,...,0.231066,0.231066,0.231066,0.231066,0.231066,0.231066,0.231066,0.231066,0.231066,0.231066
chrX_99953318_T_G,ENSG00000102362.15,0.186377,0.186377,0.186377,0.186377,0.186377,0.186377,0.186377,0.186377,0.186377,0.186377,...,0.186377,0.186377,0.186377,0.186377,0.186377,0.186377,0.186377,0.186377,0.186377,0.186377
chrX_99959994_A_G,ENSG00000101811.13,0.261396,0.261396,0.261396,0.261396,0.261396,0.261396,0.261396,0.261396,0.261396,0.261396,...,0.261396,0.261396,0.261396,0.261396,0.261396,0.261396,0.261396,0.261396,0.261396,0.261396
chrX_99960274_C_T,ENSG00000179031.8,0.178828,0.178828,0.178828,0.178828,0.178828,0.178828,0.178828,0.178828,0.178828,0.178828,...,0.178828,0.178828,0.178828,0.178828,0.178828,0.178828,0.178828,0.178828,0.178828,0.178828


In [325]:
pd.DataFrame({'Mean':Mean_predicted[Real.notna()].corrwith(Real),'Weighted':Predicted[Real.notna()].corrwith(Real)})

Unnamed: 0,Mean,Weighted
Adipose - Subcutaneous,0.899644,0.973799
Muscle - Skeletal,0.89826,0.983496
Artery - Tibial,0.896339,0.977602
Artery - Coronary,0.838258,0.857714
Heart - Atrial Appendage,0.884101,0.952529
Adipose - Visceral (Omentum),0.881495,0.963358
Uterus,0.817804,0.811646
Vagina,0.811308,0.813896
Breast - Mammary Tissue,0.875703,0.943005
Skin - Not Sun Exposed (Suprapubic),0.899655,0.965178


In [326]:
rmse_mean=[]
rmse_pred=[]
rmse_tissue=[]
for i in Real.columns:
    test=pd.DataFrame({'Real':Real[i],'Mean':Mean_predicted[i],'Weighted':Predicted[i]}).dropna(how='any')
    #print(i,'Mean:',sqrt(mean_squared_error(test['Real'], test['Mean'])),'Weighted:',sqrt(mean_squared_error(test['Real'], test['Weighted'])))
    rmse_mean.append(sqrt(mean_squared_error(test['Real'], test['Mean'])))
    rmse_pred.append(sqrt(mean_squared_error(test['Real'], test['Weighted'])))
    rmse_tissue.append(i)
rmse_merged=pd.DataFrame({'RMSE Mean':rmse_mean,'RMSE Weighted':rmse_pred},index=rmse_tissue)


In [279]:
rmse_merged=pd.DataFrame({'RMSE Mean':rmse_mean,'RMSE Weighted':rmse_pred},index=rmse_tissue)

In [327]:
rmse_merged['Diff']=rmse_merged['RMSE Weighted']-rmse_merged['RMSE Mean']

In [328]:
rmse_merged.mean()

RMSE Mean        0.105031
RMSE Weighted    0.075330
Diff            -0.029701
dtype: float64

In [43]:
# Number of transcripts with eQTL value for N different tissues
for i in range(0,len(Real.columns)):
    print(i,(Real.notna().sum(axis=1) > i).sum())

0 471726
1 77075
2 35811
3 21615
4 14950
5 11134
6 8681
7 7074
8 5861
9 4947
10 4257
11 3696
12 3231
13 2870
14 2510
15 2214
16 1947
17 1760
18 1553
19 1403
20 1255
21 1133
22 1042
23 930
24 827
25 751
26 681
27 617
28 547
29 479
30 437
31 391
32 344
33 306
34 271
35 240
36 218
37 193
38 165
39 147
40 120
41 107
42 90
43 78
44 67
45 46
46 34
47 20
48 8


In [329]:
Real_melt=Real.reset_index().melt(var_name='Value',value_vars=Real.columns,id_vars=['eQTL','GENE'])#.set_index(['eQTL','GENE'])

In [330]:
Real_melt=Real_melt[Real_melt['value'].notna()]

In [331]:
Real_melt

Unnamed: 0,eQTL,GENE,Value,value
46,chr10_100153963_AT_A,ENSG00000107566.13,Adipose - Subcutaneous,0.232764
59,chr10_100186276_G_A,ENSG00000196072.11,Adipose - Subcutaneous,0.147162
65,chr10_100201209_C_T,ENSG00000230224.1,Adipose - Subcutaneous,0.223630
67,chr10_100207653_G_A,ENSG00000095485.16,Adipose - Subcutaneous,0.331564
104,chr10_100278884_G_A,ENSG00000196072.11,Adipose - Subcutaneous,0.714275
...,...,...,...,...
23113837,chrX_7842759_C_T,ENSG00000182583.12,Brain - Amygdala,0.538380
23114185,chrX_8473788_T_C,ENSG00000205642.9,Brain - Amygdala,0.517315
23114197,chrX_84926459_C_T,ENSG00000184788.12,Brain - Amygdala,0.196668
23114244,chrX_85275311_A_C,ENSG00000184788.12,Brain - Amygdala,0.137840


In [332]:
Mean_melt=Mean_predicted.reset_index().melt(var_name='Value',value_vars=Real.columns,id_vars=['eQTL','GENE'])#.set_index(['eQTL','GENE'])

In [333]:
Mean_melt=Mean_melt.loc[Real_melt['value'].notna().index]

In [334]:
Predicted_melt=Predicted.reset_index().melt(var_name='Value',value_vars=Real.columns,id_vars=['eQTL','GENE'])#.set_index(['eQTL','GENE'])

In [335]:
Predicted_melt=Predicted_melt.loc[Real_melt['value'].notna().index]

In [336]:
Melted_results=pd.concat([Real_melt.rename(columns={'value':'Real','Value':'Tissue'}),Mean_melt['value'].rename('Mean'),Predicted_melt['value'].rename('Weighted')],axis=1)

In [338]:
Melted_results.to_csv('Results_to_plot_2807.csv',index=False)

In [585]:
###### Perform predictions on random RNA-seq samples and check performance per tissue #######

In [135]:
a=eQTLs[tissues_common.values()].rename(columns=tissues_common_inv)
export=False
slope=[]
weighted_rmse=[]
weighted_corr=[]
mean_rmse=[]
mean_corr=[]
tissue=[]
count=0
sample=0
while count < 1000:
    if data.iloc[sample].name in tissues_common:
            print(data.iloc[sample].name)
            test=t_test_ratio_external(data.iloc[sample])
            ### b = tissue scores according to T test
            b=pd.DataFrame(test['value']).sum(axis=1)[tissues_common.keys()]
            ### b_where_a = scores for tissues with eQTL value for such transcript
            b_where_a=b.multiply(~ np.isnan(a))
            b_where_a_sum=b_where_a.sum(axis=1)
            prediction=a.multiply(b_where_a.div(b_where_a_sum,axis=0)).sum(axis=1)[eQTLs[tissues_common[data.iloc[sample].name]].notna()]
            real=eQTLs[tissues_common[data.iloc[sample].name]][eQTLs[tissues_common[data.iloc[sample].name]].notna()]
            mean_prediction=eQTLs.mean(axis=1)[eQTLs[tissues_common[data.iloc[sample].name]].notna()]
            tissue.append(data.iloc[sample].name)
            weighted_rmse.append(sqrt(mean_squared_error(real, prediction)))
            mean_rmse.append(sqrt(mean_squared_error(real, mean_prediction)))
            weighted_corr.append(real.corr(prediction))
            mean_corr.append(real.corr(mean_prediction))
            if export: pd.DataFrame({'Real':real,'Weighted':prediction,'Mean':mean_prediction}).to_csv(str(count)+'_'+str(data.iloc[sample].name)+'.csv')
            count=count+1
            print('###############')
    sample=sample+1

Adipose - Subcutaneous
###############
Muscle - Skeletal
###############
Artery - Tibial
###############
Artery - Coronary
###############
Heart - Atrial Appendage
###############
Adipose - Visceral (Omentum)
###############
Uterus
###############
Vagina
###############
Breast - Mammary Tissue
###############
Skin - Not Sun Exposed (Suprapubic)
###############


In [136]:
pd.DataFrame({'Weighted RMSE':weighted_rmse,'Mean RMSE':mean_rmse,'Weighted Corr':weighted_corr,'Mean Corr':mean_corr},index=tissue)

Unnamed: 0,Weighted RMSE,Mean RMSE,Weighted Corr,Mean Corr
Adipose - Subcutaneous,0.096337,0.106729,0.918013,0.899644
Muscle - Skeletal,0.090631,0.104004,0.924661,0.89826
Artery - Tibial,0.098182,0.107787,0.914833,0.896339
Artery - Coronary,0.127904,0.117652,0.819943,0.838258
Heart - Atrial Appendage,0.09148,0.101605,0.90751,0.884101
Adipose - Visceral (Omentum),0.098843,0.107335,0.900115,0.881495
Uterus,0.126893,0.127394,0.824621,0.817804
Vagina,0.13352,0.124559,0.795738,0.811308
Breast - Mammary Tissue,0.102521,0.106277,0.885922,0.875703
Skin - Not Sun Exposed (Suprapubic),0.087815,0.098711,0.921291,0.899655


In [339]:
####### Testing with ENCODE #######

In [664]:
def t_test_ratio_encode(i):
    value={}
    for Transc in data.columns:
        if Transc in i.index:
            value[Transc]={}
            for Tissue in data.index.unique():
                ttest_sample=stats.ttest_1samp(data[Transc][Tissue],i[Transc])
                value[Transc][Tissue]=np.nan
                if ttest_sample.pvalue >= 0.0001: value[Transc][Tissue] = ratiodata[Transc][Tissue]## 
    return {'value':value}

In [672]:
def t_test_ratio_encode_relative(i):
    value={}
    for Transc in data.columns:
        if Transc in i.index:
            value[Transc]={}
            for Tissue in data.index.unique():
                ttest_sample=stats.ttest_1samp(data[Transc][Tissue],i[Transc])
                value[Transc][Tissue]=np.nan
                if ttest_sample.pvalue >= 0.0001: value[Transc][Tissue] = rel_ratiodata[Transc][Tissue]## 
    return {'value':value}

In [671]:
encode_dir='/nfs/research1/zerbino/jhidalgo/inteql_GTEX_v8/data/original-data/RNA-seq/ENCODE/'
a=eQTLs[tissues_common.values()].rename(columns=tissues_common_inv)

for encode_sample in os.listdir(encode_dir):
    if encode_sample.endswith('.tsv'):
        print(encode_sample,':',encode_sample.split('_',1)[0])
        if ~ a.columns.str.startswith(encode_sample.split('_',1)[0].capitalize()).any(): print('Similar tissues:',list(a.columns[a.columns.str.startswith(encode_sample.split('_',1)[0].capitalize())]))
        encode=pd.read_csv(encode_dir+encode_sample,sep='\t',index_col='transcript_id(s)',usecols=['TPM','transcript_id(s)'])
        encode=encode[encode.index.str.startswith('ENST')]
        encode_data=pd.DataFrame().rename_axis('transcript_id')
        for i in encode.index:
            for o in i.split(','):
                if o in HKG_transc or o in data.columns:
                    encode_data=encode_data.append(pd.Series(encode.loc[i]).rename(o))
        encode_data['TPM']=encode_data['TPM']/encode_data[encode_data.index.isin(HKG_transc)]['TPM'].mean()
        #encode.index=encode.index.str.replace('\.[0-9]','')
        #encode=encode.set_index('transcript_id')
        test=t_test_ratio_encode(encode_data['TPM'])
        b=pd.DataFrame(test['value']).sum(axis=1)[tissues_common.keys()]
        b_where_a=b.multiply(~ np.isnan(a))
        b_where_a_sum=b_where_a.sum(axis=1)
        prediction=a.multiply(b_where_a.div(b_where_a_sum,axis=0)).sum(axis=1)
        if ~ a.columns.str.startswith(encode_sample.split('_',1)[0].capitalize()).any(): print('Similar tissues corr:\n',a[a.columns[a.columns.str.startswith(encode_sample.split('_',1)[0].capitalize())]].corrwith(prediction),sep='')
        print('Dissimilar tissues corr:',a[a.columns[~ a.columns.str.startswith('Heart')]].corrwith(prediction).mean())
        #if ~ a.columns.str.startswith(encode_sample.split('_',1)[0].capitalize()).any():
        print('All tissues corr:',a.corrwith(prediction).sort_values(ascending=False)[:10])
        print('######################')

pancreas_ENCFF390JAT.tsv : pancreas
Dissimilar tissues corr: 0.8537214434952402
All tissues corr: TISSUE
Testis                                 0.949154
Muscle - Skeletal                      0.943205
Whole Blood                            0.937687
Cells - Cultured fibroblasts           0.927223
Esophagus - Mucosa                     0.920273
Skin - Sun Exposed (Lower leg)         0.919019
Skin - Not Sun Exposed (Suprapubic)    0.910446
Esophagus - Muscularis                 0.899592
Adipose - Subcutaneous                 0.898567
Artery - Tibial                        0.896788
dtype: float64
######################
small_intestine_ENCFF080HMB.tsv : small
Dissimilar tissues corr: 0.85522050438979
All tissues corr: TISSUE
Testis                            0.942250
Whole Blood                       0.930614
Artery - Tibial                   0.923595
Esophagus - Muscularis            0.922527
Skin - Sun Exposed (Lower leg)    0.920199
Muscle - Skeletal                 0.917111
Esophagus - 

Dissimilar tissues corr: 0.8568458002872319
All tissues corr: TISSUE
Whole Blood                            0.941425
Muscle - Skeletal                      0.924571
Artery - Tibial                        0.923423
Testis                                 0.922383
Cells - Cultured fibroblasts           0.919516
Nerve - Tibial                         0.905999
Skin - Not Sun Exposed (Suprapubic)    0.904840
Skin - Sun Exposed (Lower leg)         0.903935
Adipose - Subcutaneous                 0.903862
Artery - Aorta                         0.903860
dtype: float64
######################
liver_ENCFF203UGC.tsv : liver
Dissimilar tissues corr: 0.8554686939616938
All tissues corr: TISSUE
Testis                            0.956352
Muscle - Skeletal                 0.934631
Whole Blood                       0.933811
Heart - Left Ventricle            0.919514
Cells - Cultured fibroblasts      0.917352
Artery - Tibial                   0.910363
Pancreas                          0.906700
Brain - Cereb

KeyboardInterrupt: 

In [673]:
encode_dir='/nfs/research1/zerbino/jhidalgo/inteql_GTEX_v8/data/original-data/RNA-seq/ENCODE/'
a=eQTLs[tissues_common.values()].rename(columns=tissues_common_inv)

for encode_sample in os.listdir(encode_dir):
    if encode_sample.endswith('.tsv'):
        print(encode_sample,':',encode_sample.split('_',1)[0])
        if ~ a.columns.str.startswith(encode_sample.split('_',1)[0].capitalize()).any(): print('Similar tissues:',list(a.columns[a.columns.str.startswith(encode_sample.split('_',1)[0].capitalize())]))
        encode=pd.read_csv(encode_dir+encode_sample,sep='\t',index_col='transcript_id(s)',usecols=['TPM','transcript_id(s)'])
        encode=encode[encode.index.str.startswith('ENST')]
        encode_data=pd.DataFrame().rename_axis('transcript_id')
        for i in encode.index:
            for o in i.split(','):
                if o in HKG_transc or o in data.columns:
                    encode_data=encode_data.append(pd.Series(encode.loc[i]).rename(o))
        encode_data['TPM']=encode_data['TPM']/encode_data[encode_data.index.isin(HKG_transc)]['TPM'].mean()
        #encode.index=encode.index.str.replace('\.[0-9]','')
        #encode=encode.set_index('transcript_id')
        test=t_test_ratio_encode_relative(encode_data['TPM'])
        b=pd.DataFrame(test['value']).sum(axis=1)[tissues_common.keys()]
        b_where_a=b.multiply(~ np.isnan(a))
        b_where_a_sum=b_where_a.sum(axis=1)
        prediction=a.multiply(b_where_a.div(b_where_a_sum,axis=0)).sum(axis=1)
        if ~ a.columns.str.startswith(encode_sample.split('_',1)[0].capitalize()).any(): print('Similar tissues corr:\n',a[a.columns[a.columns.str.startswith(encode_sample.split('_',1)[0].capitalize())]].corrwith(prediction),sep='')
        print('Dissimilar tissues corr:',a[a.columns[~ a.columns.str.startswith('Heart')]].corrwith(prediction).mean())
        #if ~ a.columns.str.startswith(encode_sample.split('_',1)[0].capitalize()).any():
        print('All tissues corr:',a.corrwith(prediction).sort_values(ascending=False)[:10])
        print('######################')

pancreas_ENCFF390JAT.tsv : pancreas
Dissimilar tissues corr: 0.8408272229242262
All tissues corr: TISSUE
Testis                                 0.978292
Cells - Cultured fibroblasts           0.958763
Muscle - Skeletal                      0.940888
Skin - Sun Exposed (Lower leg)         0.930926
Whole Blood                            0.929164
Lung                                   0.926352
Thyroid                                0.899372
Esophagus - Mucosa                     0.899084
Skin - Not Sun Exposed (Suprapubic)    0.886811
Adipose - Subcutaneous                 0.885652
dtype: float64
######################
small_intestine_ENCFF080HMB.tsv : small
Dissimilar tissues corr: 0.8457332590254601
All tissues corr: TISSUE
Testis                                 0.974031
Whole Blood                            0.961710
Muscle - Skeletal                      0.938867
Skin - Sun Exposed (Lower leg)         0.933083
Heart - Left Ventricle                 0.909876
Artery - Tibial             

KeyboardInterrupt: 