In [None]:
###### Module Loading #######

In [1]:
import pandas as pd
import os
from scipy.stats import ttest_ind
import numpy as np

In [None]:
###### Move to working directory #######

In [2]:
os.chdir('/nfs/research1/zerbino/jhidalgo/inteql_GTEX_v8/data/original-data/RNA-seq/')

In [None]:
###### Load GTEX sample metadata and sample names #######

In [3]:
samples_GTEX=pd.read_csv('../GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt',sep='\t',usecols=['SAMPID','SMTS','SMTSD'])

In [4]:
samples_RNASEQ=list(pd.read_csv('../GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz',header=2,sep='\t',nrows=0).columns[2:])

In [22]:
print(len(samples_GTEX),len(samples_RNASEQ))

22951 17382


In [5]:
samples=pd.DataFrame({'SAMPID':samples_RNASEQ}).merge(samples_GTEX)

In [33]:
samples

Unnamed: 0,SAMPID,SMTS,SMTSD
0,GTEX-1117F-0226-SM-5GZZ7,Adipose Tissue,Adipose - Subcutaneous
1,GTEX-1117F-0426-SM-5EGHI,Muscle,Muscle - Skeletal
2,GTEX-1117F-0526-SM-5EGHJ,Blood Vessel,Artery - Tibial
3,GTEX-1117F-0626-SM-5N9CS,Blood Vessel,Artery - Coronary
4,GTEX-1117F-0726-SM-5GIEN,Heart,Heart - Atrial Appendage
...,...,...,...
17377,GTEX-ZZPU-2126-SM-5EGIU,Ovary,Ovary
17378,GTEX-ZZPU-2226-SM-5EGIV,Vagina,Vagina
17379,GTEX-ZZPU-2426-SM-5E44I,Blood Vessel,Artery - Tibial
17380,GTEX-ZZPU-2626-SM-5E45Y,Muscle,Muscle - Skeletal


In [6]:
unique_tissues=list(samples['SMTSD'].drop_duplicates())

In [None]:
###### Filter TPM file for transcripts with very low T test value for the different tissues #######
"""
GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz is read chunk by chunk, for each line in the chunk
a t-test is performed for each unique tissue. Transcripts with 1 to 3 transcript-tissue t-test values of 0 are
forwarded to two new files:
1) Top_T-test_GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct: Contains the raw TPM values as read.
2) Binary_top_T-test_GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct: Contains the T-test values for 
    each tissue converted to 0 or 1 according to the values being under or over a threshold (0.0001 by default).
"""

In [7]:
chunksize=1000

In [9]:
cols={}
notcols={}
for a in unique_tissues:
    cols[a] = list(samples[samples['SMTSD'] == a]['SAMPID'].values)
    notcols[a] = list(samples[samples['SMTSD'] != a]['SAMPID'].values)

count=0
added=0
write_header=True

In [17]:
for chunk in pd.read_csv('../GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz',header=2,sep='\t',chunksize=chunksize,index_col='transcript_id',usecols=lambda x: x not in ['gene_id']):
    output=pd.DataFrame()
    bin_output=pd.DataFrame()
    for i in chunk.index:
        ttest=[]
        for a in unique_tissues:
            ttest=np.append(ttest,float(ttest_ind(chunk.loc[i][cols[a]], chunk.loc[i][notcols[a]]).pvalue))
            if (ttest == 0).sum() > 3: continue
        if 0 < (ttest == 0).sum() <= 3:
            output=output.append(chunk.loc[i])
            bin_output=bin_output.append(pd.DataFrame(columns=unique_tissues,data=[np.where((ttest < 0.0001) & (ttest > 0),1,0)],index = [i]))
    if len(output) != 0: 
        output.to_csv('Top_T-test_GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct',chunksize=len(output),mode='a',index=True,header=write_header)
        bin_output.to_csv('Binary_top_T-test_GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct',chunksize=len(bin_output),mode='a',index=True,header=write_header)
    write_header=False
    count=count+chunksize
    added=added+len(output)
    print(count,"transcripts processed,",added,'transcripts added.',end="\r",flush=True)

44000 transcripts processed, 24144 transcripts added.

In [None]:
###### Load previous 2) output #######

In [137]:
bin_ttest=pd.read_csv('Binary_top_T-test_GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct',header=0).rename(columns={'Unnamed: 0':'transcript_id'}).set_index('transcript_id')

In [138]:
bin_ttest

Unnamed: 0_level_0,Adipose - Subcutaneous,Muscle - Skeletal,Artery - Tibial,Artery - Coronary,Heart - Atrial Appendage,Adipose - Visceral (Omentum),Uterus,Vagina,Breast - Mammary Tissue,Skin - Not Sun Exposed (Suprapubic),...,Brain - Cerebellar Hemisphere,Liver,Brain - Substantia nigra,Kidney - Cortex,Brain - Amygdala,Cervix - Ectocervix,Fallopian Tube,Cervix - Endocervix,Bladder,Kidney - Medulla
transcript_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENST00000373020.8,1,1,1,1,1,1,1,1,1,1,...,1,1,1,0,1,0,0,0,0,0
ENST00000494424.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENST00000496771.5,0,1,1,0,1,0,1,0,0,1,...,1,1,0,0,0,0,0,0,0,0
ENST00000612152.4,1,1,1,1,1,1,1,1,1,1,...,1,0,0,0,0,1,1,1,0,0
ENST00000614008.4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENST00000638486.1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENST00000381568.9,1,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
ENST00000622217.1,0,1,1,0,1,0,0,1,0,0,...,1,1,1,0,1,0,0,0,0,0
ENST00000639802.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
###### Transcript selection #######
"""
100 transcripts are selected for each tissue, an iterative selection is performed increasing the threshold
(number of tissue with significant T-test value for the transcript). Transcripts with fewer total associations
are selected first to maximise the tissue information each transcript can provide.
"""

In [141]:
data_selection={}
threshold = 1
for tissue in bin_ttest.columns[1:]:
    while tissue not in data_selection:
        data=bin_ttest[(bin_ttest.sum(axis=1) <= threshold)&(bin_ttest.sum(axis=1) > 0)]
        if len(data[data[tissue] <= threshold]) >= 100:
            data_selection[tissue]= data[data[tissue] <= threshold][:100].index
        if threshold == len(bin_ttest.columns[1:]):
            break
        threshold=threshold+1

In [142]:
index_selection=set([value for values in data_selection.values() for value in values])

In [178]:
index_selection

{'ENST00000002165.10',
 'ENST00000002596.5',
 'ENST00000002829.7',
 'ENST00000003084.10',
 'ENST00000003583.12',
 'ENST00000003912.7',
 'ENST00000004103.7',
 'ENST00000004531.14',
 'ENST00000005995.7',
 'ENST00000006724.7',
 'ENST00000007699.9',
 'ENST00000008391.3',
 'ENST00000023064.8',
 'ENST00000046087.6',
 'ENST00000161006.7',
 'ENST00000217885.5',
 'ENST00000222381.7',
 'ENST00000222462.2',
 'ENST00000225275.3',
 'ENST00000226299.8',
 'ENST00000231749.7',
 'ENST00000234701.7',
 'ENST00000247452.3',
 'ENST00000257621.4',
 'ENST00000261377.10',
 'ENST00000262407.5',
 'ENST00000262418.10',
 'ENST00000262630.7',
 'ENST00000262820.7',
 'ENST00000264276.10',
 'ENST00000265723.8',
 'ENST00000265742.7',
 'ENST00000309032.7',
 'ENST00000314669.9',
 'ENST00000321037.4',
 'ENST00000327906.7',
 'ENST00000330550.8',
 'ENST00000330692.11',
 'ENST00000337248.8',
 'ENST00000339809.10',
 'ENST00000340022.6',
 'ENST00000341376.10',
 'ENST00000342002.6',
 'ENST00000348433.10',
 'ENST00000353205.5',

In [None]:
###### Read raw data for selected transcripts #######
"""
The complete list of transcripts is read from the previously exported 1) file, then index of the transcripts
matching the selection is used to read only the raw values of the selected transcripts.
"""

In [183]:
transcripts=pd.read_csv('Top_T-test_GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct',header=0,sep=',',usecols=[0]).rename(columns={'Unnamed: 0':'transcript_id'})


In [187]:
transcripts

Unnamed: 0,transcript_id
0,ENST00000373020.8
1,ENST00000494424.1
2,ENST00000496771.5
3,ENST00000612152.4
4,ENST00000614008.4
...,...
116972,ENST00000638486.1
116973,ENST00000381568.9
116974,ENST00000622217.1
116975,ENST00000639802.1


In [236]:
rows=[0]+list(transcripts[transcripts['transcript_id'].isin(index_selection)].index.values+1)

In [421]:
data=pd.read_csv('Top_T-test_GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct',header=0,sep=',',index_col=0,usecols=lambda x: x not in ['gene_id'],skiprows=lambda x: x not in rows).T


In [422]:
data

Unnamed: 0,ENST00000373020.8,ENST00000496771.5,ENST00000612152.4,ENST00000614008.4,ENST00000373031.4,ENST00000485971.1,ENST00000371582.8,ENST00000371584.8,ENST00000371588.9,ENST00000466152.5,...,ENST00000419417.5,ENST00000522523.5,ENST00000460412.5,ENST00000469954.5,ENST00000372581.1,ENST00000247452.3,ENST00000522159.5,ENST00000538154.5,ENST00000441542.6,ENST00000523867.5
GTEX-1117F-0226-SM-5GZZ7,26.32,2.53,0.47,0.23,8.73,2.84,0.91,0.00,32.33,2.12,...,0.00,0.00,0.0,1.87,0.13,0.03,0.00,0.0,0.0,0.0
GTEX-1117F-0426-SM-5EGHI,3.95,0.48,0.21,0.24,0.44,0.15,1.73,0.96,56.37,2.47,...,0.00,0.00,0.0,1.98,0.00,0.02,0.00,0.0,0.0,0.0
GTEX-1117F-0526-SM-5EGHJ,13.23,1.98,0.15,1.32,1.49,0.26,0.00,0.30,29.72,2.49,...,0.00,0.00,0.0,2.36,0.00,0.00,1.03,0.0,0.0,0.0
GTEX-1117F-0626-SM-5N9CS,30.15,4.18,1.23,1.15,4.70,1.48,0.52,0.30,34.21,2.91,...,0.00,0.00,0.0,1.53,0.00,0.03,0.00,0.0,0.0,0.0
GTEX-1117F-0726-SM-5GIEN,6.60,0.38,0.31,0.74,0.03,0.00,0.00,0.00,17.18,1.78,...,0.00,0.00,0.0,0.59,0.09,0.00,0.00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTEX-ZZPU-2126-SM-5EGIU,68.85,28.48,5.59,4.23,1.72,0.28,2.02,3.70,30.97,1.19,...,0.11,0.00,0.0,0.70,0.12,0.03,0.00,0.0,0.0,0.0
GTEX-ZZPU-2226-SM-5EGIV,30.73,0.89,0.20,0.00,0.04,0.00,1.18,2.16,25.62,0.61,...,0.00,0.00,0.0,2.14,0.12,0.03,0.00,0.0,0.0,0.0
GTEX-ZZPU-2426-SM-5E44I,5.05,0.56,0.20,0.42,0.13,0.00,0.29,0.74,31.52,0.68,...,0.00,0.00,0.0,1.49,0.00,0.02,0.00,0.0,0.0,0.0
GTEX-ZZPU-2626-SM-5E45Y,1.67,0.50,0.14,0.00,4.73,0.13,1.05,0.00,23.54,0.89,...,0.00,0.01,0.0,6.13,0.08,0.00,0.00,0.0,0.0,0.0


In [None]:
###### HKG raw data #######
"""
REVIEW:
Raw data for a published list of Housekeeping Genes is extracted from GTEX data. Transcripts with at least 1 TPM
are filtered and the top 150 with less relative variation are selected.
Publication:
Wang, Z., Lyu, Z., Pan, L. et al. Defining housekeeping genes suitable for RNA-seq analysis of the human allograft kidney biopsy tissue. BMC Med Genomics 12, 86 (2019). https://doi.org/10.1186/s12920-019-0538-z
"""

In [250]:
HKG={'ANAPC5':'ENSG00000089053','ANAPC15':'ENSG00000110200','ARID3B':'ENSG00000179361','ARL10':'ENSG00000175414','ATXN2':'ENSG00000204842','C3orf49':'ENSG00000163632','CCAR1':'ENSG00000060339','CCDC125':'ENSG00000277868','CCDC90B':'ENSG00000137500','CHFR':'ENSG00000072609','DHRSX':'ENSG00000169084','FRMD8':'ENSG00000126391','GGA1':'ENSG00000100083','HERC4':'ENSG00000148634','MKNK1':'ENSG00000079277','NASP':'ENSG00000132780','NME4':'ENSG00000103202','OTUB1':'ENSG00000167770','POLR2B':'ENSG00000047315','POLR3A':'ENSG00000148606','POMK':'ENSG00000185900','PSMA3-AS1':'ENSG00000257621','PTPN14':'ENSG00000152104','RAPGEF6':'ENSG00000158987','REL':'ENSG00000162924','RRP1':'ENSG00000160214','RUNDC1':'ENSG00000198863','SAMD4B':'ENSG00000179134','SLC4A1AP':'ENSG00000163798','SLMAP':'ENSG00000163681','SMARCAL1':'ENSG00000138375','SNAP29':'ENSG00000099940','SNRNP200':'ENSG00000144028','SUPT4H1':'ENSG00000213246','TBC1D22A':'ENSG00000054611','THUMPD3-AS1':'ENSG00000206573','TSPOAP1-AS1':'ENSG00000265148','TUBGCP2':'ENSG00000130640','WDTC1':'ENSG00000142784','ZNF544':'ENSG00000198131'}

In [243]:
# HKG=['ANAPC5','ANAPC15','ARID3B','ARL10','ATXN2','C16orf62','C3orf49','CCAR1','CCDC125','CCDC90B','CHFR','DHRSX','FRMD8','GGA1','HERC4','MKNK1','NASP','NME4','OTUB1','PMF1/PMF1-BGLAP','POLR2B','POLR3A','POMK','PSMA3-AS1','PTPN14','RAPGEF6','REL','RRP1','RUNDC1','SAMD4B','SLC4A1AP','SLMAP','SMARCAL1','SNAP29','SNRNP200','SUPT4H1','TBC1D22A','THUMPD3-AS1','TSPOAP1-AS1','TUBGCP2','WDTC1','ZNF544']

In [301]:
count=0
added=0
write_header=True

In [302]:
for chunk in pd.read_csv('../GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz',header=2,sep='\t',chunksize=chunksize,index_col='transcript_id'):
    output=chunk[chunk['gene_id'].str.split('.',1).str[0].isin(list(HKG.values()))]
    if not output.empty: 
        output.to_csv('HKG_GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct',chunksize=len(output),mode='a',index=True,header=write_header)
        write_header=False
    count=count+chunksize
    added=added+len(output)
    print(count,"transcripts processed,",added,'transcripts added.',end="\r",flush=True)

200000 transcripts processed, 494 transcripts added.

In [320]:
HKG_raw=pd.read_csv('HKG_GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct',index_col='transcript_id')#,usecols=lambda x: x not in ['gene_id'])

In [415]:
HKG_top=HKG_raw[HKG_raw.mean(axis=1) >= 1]

In [416]:
HKG_top_rel_data=HKG_top.drop(columns='gene_id')/HKG_top.mean()

In [417]:
HKG_transc=HKG_top_rel_data.loc[HKG_top_rel_data.std(axis=1).nsmallest(150).index].index.tolist()

In [445]:
HKG_raw.loc[HKG_transc].drop(columns='gene_id').mean()

GTEX-1117F-0226-SM-5GZZ7    4.257200
GTEX-1117F-0426-SM-5EGHI    2.160800
GTEX-1117F-0526-SM-5EGHJ    3.955667
GTEX-1117F-0626-SM-5N9CS    4.651267
GTEX-1117F-0726-SM-5GIEN    2.100333
                              ...   
GTEX-ZZPU-2126-SM-5EGIU     4.355200
GTEX-ZZPU-2226-SM-5EGIV     2.884800
GTEX-ZZPU-2426-SM-5E44I     3.225267
GTEX-ZZPU-2626-SM-5E45Y     2.067067
GTEX-ZZPU-2726-SM-5NQ8O     2.991467
Length: 17382, dtype: float64

In [None]:
###### Normalize GTEX data #######
"""
Raw data for transcripts selected before are normalized by dividing raw TPM by the mean expression 
of the Housekeeping Genes filtered above.
"""

In [455]:
rel_data=data.T/HKG_raw.loc[HKG_transc].drop(columns='gene_id').mean()

In [459]:
rel_data

Unnamed: 0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,GTEX-1117F-2926-SM-5GZYI,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
ENST00000373020.8,6.182467,1.828027,3.344569,6.482105,3.142358,13.015987,4.754024,4.268734,13.090899,2.478849,...,2.741102,4.474821,3.775171,3.507979,1.144950,15.808688,10.652385,1.565762,0.807908,11.001293
ENST00000496771.5,0.594287,0.222140,0.500548,0.898680,0.180924,0.995538,1.817238,0.277191,0.407915,0.093063,...,0.382702,0.238513,0.212436,0.436379,0.211064,6.539309,0.308514,0.173629,0.241889,0.715368
ENST00000612152.4,0.110401,0.097186,0.037920,0.264444,0.147596,0.423776,0.228777,0.124181,0.410808,0.104343,...,0.110027,0.089442,0.130560,0.190651,0.069391,1.283523,0.069329,0.062010,0.067729,0.330941
ENST00000614008.4,0.054026,0.111070,0.333698,0.247244,0.352325,0.000000,0.890771,0.000000,0.000000,0.045121,...,0.143513,0.000000,0.000000,0.076260,0.130108,0.971253,0.000000,0.130222,0.000000,0.130371
ENST00000373031.4,2.050644,0.203628,0.376675,1.010477,0.014283,3.766901,0.035696,0.070961,8.473645,0.290468,...,0.095675,0.000000,0.026555,0.038130,0.011565,0.394930,0.013866,0.040307,2.288267,5.987030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENST00000247452.3,0.007047,0.009256,0.000000,0.006450,0.000000,0.000000,0.000000,0.004435,0.000000,0.005640,...,0.009568,0.005421,0.000000,0.000000,0.000000,0.006888,0.010399,0.006201,0.000000,0.000000
ENST00000522159.5,0.000000,0.000000,0.260386,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.176174,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ENST00000538154.5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ENST00000441542.6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [461]:
data=samples.merge(rel_data.T,right_index=True,left_on='SAMPID').rename(columns={'SMTSD':'Tissue'}).set_index('Tissue').drop(columns=['SAMPID','SMTS'])

In [462]:
data

Unnamed: 0_level_0,ENST00000373020.8,ENST00000496771.5,ENST00000612152.4,ENST00000614008.4,ENST00000373031.4,ENST00000485971.1,ENST00000371582.8,ENST00000371584.8,ENST00000371588.9,ENST00000466152.5,...,ENST00000419417.5,ENST00000522523.5,ENST00000460412.5,ENST00000469954.5,ENST00000372581.1,ENST00000247452.3,ENST00000522159.5,ENST00000538154.5,ENST00000441542.6,ENST00000523867.5
Tissue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Adipose - Subcutaneous,6.182467,0.594287,0.110401,0.054026,2.050644,0.667105,0.213756,0.000000,7.594193,0.497980,...,0.000000,0.000000,0.0,0.439256,0.030537,0.007047,0.000000,0.0,0.0,0.0
Muscle - Skeletal,1.828027,0.222140,0.097186,0.111070,0.203628,0.069419,0.800629,0.444280,26.087560,1.143095,...,0.000000,0.000000,0.0,0.916327,0.000000,0.009256,0.000000,0.0,0.0,0.0
Artery - Tibial,3.344569,0.500548,0.037920,0.333698,0.376675,0.065728,0.000000,0.075841,7.513272,0.629477,...,0.000000,0.000000,0.0,0.596612,0.000000,0.000000,0.260386,0.0,0.0,0.0
Artery - Coronary,6.482105,0.898680,0.264444,0.247244,1.010477,0.318193,0.111798,0.064499,7.354986,0.625636,...,0.000000,0.000000,0.0,0.328943,0.000000,0.006450,0.000000,0.0,0.0,0.0
Heart - Atrial Appendage,3.142358,0.180924,0.147596,0.352325,0.014283,0.000000,0.000000,0.000000,8.179654,0.847485,...,0.000000,0.000000,0.0,0.280908,0.042850,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ovary,15.808688,6.539309,1.283523,0.971253,0.394930,0.064291,0.463813,0.849559,7.111040,0.273237,...,0.025257,0.000000,0.0,0.160727,0.027553,0.006888,0.000000,0.0,0.0,0.0
Vagina,10.652385,0.308514,0.069329,0.000000,0.013866,0.000000,0.409040,0.748752,8.881032,0.211453,...,0.000000,0.000000,0.0,0.741819,0.041597,0.010399,0.000000,0.0,0.0,0.0
Artery - Tibial,1.565762,0.173629,0.062010,0.130222,0.040307,0.000000,0.089915,0.229438,9.772835,0.210835,...,0.000000,0.000000,0.0,0.461977,0.000000,0.006201,0.000000,0.0,0.0,0.0
Muscle - Skeletal,0.807908,0.241889,0.067729,0.000000,2.288267,0.062891,0.507966,0.000000,11.388118,0.430562,...,0.000000,0.004838,0.0,2.965555,0.038702,0.000000,0.000000,0.0,0.0,0.0


In [510]:
reindex_data=data.reset_index()

In [511]:
reindex_data

Unnamed: 0,Tissue,ENST00000373020.8,ENST00000496771.5,ENST00000612152.4,ENST00000614008.4,ENST00000373031.4,ENST00000485971.1,ENST00000371582.8,ENST00000371584.8,ENST00000371588.9,...,ENST00000419417.5,ENST00000522523.5,ENST00000460412.5,ENST00000469954.5,ENST00000372581.1,ENST00000247452.3,ENST00000522159.5,ENST00000538154.5,ENST00000441542.6,ENST00000523867.5
0,Adipose - Subcutaneous,6.182467,0.594287,0.110401,0.054026,2.050644,0.667105,0.213756,0.000000,7.594193,...,0.000000,0.000000,0.0,0.439256,0.030537,0.007047,0.000000,0.0,0.0,0.0
1,Muscle - Skeletal,1.828027,0.222140,0.097186,0.111070,0.203628,0.069419,0.800629,0.444280,26.087560,...,0.000000,0.000000,0.0,0.916327,0.000000,0.009256,0.000000,0.0,0.0,0.0
2,Artery - Tibial,3.344569,0.500548,0.037920,0.333698,0.376675,0.065728,0.000000,0.075841,7.513272,...,0.000000,0.000000,0.0,0.596612,0.000000,0.000000,0.260386,0.0,0.0,0.0
3,Artery - Coronary,6.482105,0.898680,0.264444,0.247244,1.010477,0.318193,0.111798,0.064499,7.354986,...,0.000000,0.000000,0.0,0.328943,0.000000,0.006450,0.000000,0.0,0.0,0.0
4,Heart - Atrial Appendage,3.142358,0.180924,0.147596,0.352325,0.014283,0.000000,0.000000,0.000000,8.179654,...,0.000000,0.000000,0.0,0.280908,0.042850,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17377,Ovary,15.808688,6.539309,1.283523,0.971253,0.394930,0.064291,0.463813,0.849559,7.111040,...,0.025257,0.000000,0.0,0.160727,0.027553,0.006888,0.000000,0.0,0.0,0.0
17378,Vagina,10.652385,0.308514,0.069329,0.000000,0.013866,0.000000,0.409040,0.748752,8.881032,...,0.000000,0.000000,0.0,0.741819,0.041597,0.010399,0.000000,0.0,0.0,0.0
17379,Artery - Tibial,1.565762,0.173629,0.062010,0.130222,0.040307,0.000000,0.089915,0.229438,9.772835,...,0.000000,0.000000,0.0,0.461977,0.000000,0.006201,0.000000,0.0,0.0,0.0
17380,Muscle - Skeletal,0.807908,0.241889,0.067729,0.000000,2.288267,0.062891,0.507966,0.000000,11.388118,...,0.000000,0.004838,0.0,2.965555,0.038702,0.000000,0.000000,0.0,0.0,0.0


In [None]:
####### Prediction function ######

In [464]:
def t_test_ratio_external (i):
    value={}
    for Transc in data.columns:
        value[Transc]={}
        for Tissue in data.index.unique():
            ttest_sample=stats.ttest_1samp(data[Transc][Tissue],i[Transc])
            value[Transc][Tissue]=np.nan
            if ttest_sample.pvalue >= 0.0001: value[Transc][Tissue] = ratiodata[Transc][Tissue]## 
    return {'value':value}

In [None]:
##### LOAD eQTLs #######

In [467]:
eqtl_file = '/nfs/research1/zerbino/jhidalgo/inteql_GTEX_v8/data/original-data/pre-finemapped/GTEx_v8_finemapping_CaVEMaN/GTEx_v8_finemapping_CaVEMaN.txt.gz'

In [468]:
eQTLs=pd.read_csv(eqtl_file,sep='\t',compression='gzip')

In [471]:
eQTLs

Unnamed: 0,TISSUE,GENE,eQTL,CHROM,POS,REF,ALT,Probability
0,Adipose_Subcutaneous,ENSG00000000003.14,chrX_100649875_A_G,chrX,100649875,A,G,0.986724
1,Adipose_Subcutaneous,ENSG00000000457.13,chr1_169699715_T_C,chr1,169699715,T,C,0.174525
2,Adipose_Subcutaneous,ENSG00000000457.13,chr1_169891332_G_A,chr1,169891332,G,A,0.266151
3,Adipose_Subcutaneous,ENSG00000000460.16,chr1_169787407_G_C,chr1,169829604,C,G,0.176874
4,Adipose_Subcutaneous,ENSG00000000938.12,chr1_27634321_C_G,chr1,27634281,G,A,0.226577
...,...,...,...,...,...,...,...,...
1314538,Whole_Blood,ENSG00000284167.1,chr2_143097724_G_A,chr2,143097724,G,A,0.264387
1314539,Whole_Blood,ENSG00000284526.1,chr17_76585863_A_G,chr17,76585863,A,G,0.231189
1314540,Whole_Blood,ENSG00000284526.1,chr17_76585863_A_G,chr17,76585864,A,C,0.231189
1314541,Whole_Blood,ENSG00000284526.1,chr17_76585863_A_G,chr17,76585943,G,C,0.231189


In [474]:
eQTLs=eQTLs.pivot_table(index='eQTL',columns='TISSUE',values='Probability')

In [479]:
eQTLs.columns.isin(data.index.unique())

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True,  True, False, False, False,  True,
        True,  True,  True, False, False, False,  True,  True,  True,
        True,  True,  True, False])

In [487]:
tissues_common={}
for b in data.index.unique():
    if b in eQTLs.columns:
        tissues_common[b]=b
    else:
        if b.replace(' - ','_').replace(' ','_').replace('(','').replace(')','') in (eQTLs.columns):
            #print(b.replace(' - ','_').replace(' ','_').replace('(','').replace(')',''),'matches.')
            tissues_common[b]=b.replace(' - ','_').replace(' ','_').replace('(','').replace(')','')
        else:
            print(b.replace(' - ','_').replace(' ','_').replace('(','').replace(')',''),'doesnt match!!!!!!!!!!!!!!')

Cervix_Ectocervix doesnt match!!!!!!!!!!!!!!
Fallopian_Tube doesnt match!!!!!!!!!!!!!!
Cervix_Endocervix doesnt match!!!!!!!!!!!!!!
Bladder doesnt match!!!!!!!!!!!!!!
Kidney_Medulla doesnt match!!!!!!!!!!!!!!


In [490]:
tissues_common_inv = {v: k for k, v in tissues_common.items()}

In [493]:
###### Identify significant (DE) transcripts for each tissue #######

In [494]:
significant={}

In [495]:
for Tissue in data.index.unique(): significant[Tissue]=[]

In [496]:
for Transc in data.columns:
    for Tissue in data.index.unique():
        ttest=ttest_ind(data[Transc][Tissue],data[Transc][data.index != Tissue])
        if ttest.pvalue <= 0.00001:
            significant[Tissue].append(Transc)
            #print(Transc,Tissue,ttest)

In [497]:
pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in significant.items() ]))

Unnamed: 0,Adipose - Subcutaneous,Muscle - Skeletal,Artery - Tibial,Artery - Coronary,Heart - Atrial Appendage,Adipose - Visceral (Omentum),Uterus,Vagina,Breast - Mammary Tissue,Skin - Not Sun Exposed (Suprapubic),...,Brain - Cerebellar Hemisphere,Liver,Brain - Substantia nigra,Kidney - Cortex,Brain - Amygdala,Cervix - Ectocervix,Fallopian Tube,Cervix - Endocervix,Bladder,Kidney - Medulla
0,ENST00000373020.8,ENST00000373020.8,ENST00000373020.8,ENST00000373020.8,ENST00000373020.8,ENST00000373020.8,ENST00000496771.5,ENST00000373020.8,ENST00000373020.8,ENST00000373020.8,...,ENST00000373020.8,ENST00000373020.8,ENST00000612152.4,ENST00000373020.8,ENST00000612152.4,ENST00000222462.2,ENST00000431869.1,ENST00000358495.7,ENST00000464944.1,ENST00000475760.1
1,ENST00000612152.4,ENST00000496771.5,ENST00000496771.5,ENST00000612152.4,ENST00000612152.4,ENST00000612152.4,ENST00000612152.4,ENST00000359326.8,ENST00000496771.5,ENST00000496771.5,...,ENST00000496771.5,ENST00000612152.4,ENST00000371588.9,ENST00000612152.4,ENST00000371588.9,ENST00000361301.6,ENST00000475688.1,ENST00000540167.5,ENST00000493057.1,ENST00000480582.1
2,ENST00000373031.4,ENST00000612152.4,ENST00000612152.4,ENST00000367429.8,ENST00000373031.4,ENST00000373031.4,ENST00000459772.5,ENST00000459772.5,ENST00000612152.4,ENST00000612152.4,...,ENST00000612152.4,ENST00000371588.9,ENST00000496973.5,ENST00000413811.3,ENST00000413811.3,ENST00000540167.5,ENST00000478269.5,,ENST00000540167.5,ENST00000481031.5
3,ENST00000485971.1,ENST00000614008.4,ENST00000485971.1,ENST00000466229.5,ENST00000485971.1,ENST00000485971.1,ENST00000367429.8,ENST00000367429.8,ENST00000373031.4,ENST00000371582.8,...,ENST00000371582.8,ENST00000359326.8,ENST00000367429.8,ENST00000496973.5,ENST00000459772.5,ENST00000572061.1,ENST00000501740.6,,ENST00000573483.1,ENST00000539668.1
4,ENST00000371588.9,ENST00000485971.1,ENST00000371582.8,ENST00000470918.1,ENST00000371582.8,ENST00000371582.8,ENST00000466229.5,ENST00000435187.1,ENST00000485971.1,ENST00000371584.8,...,ENST00000371584.8,ENST00000413811.3,ENST00000496761.1,ENST00000630130.2,ENST00000496973.5,ENST00000416767.8,ENST00000564701.1,,,ENST00000617670.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,,,,,,,,,,,...,,,,,,,,,,
284,,,,,,,,,,,...,,,,,,,,,,
285,,,,,,,,,,,...,,,,,,,,,,
286,,,,,,,,,,,...,,,,,,,,,,


In [505]:
for i in significant:
    print(i,len(significant[i]))

Adipose - Subcutaneous 131
Muscle - Skeletal 233
Artery - Tibial 185
Artery - Coronary 51
Heart - Atrial Appendage 111
Adipose - Visceral (Omentum) 99
Uterus 43
Vagina 39
Breast - Mammary Tissue 75
Skin - Not Sun Exposed (Suprapubic) 129
Minor Salivary Gland 70
Brain - Cortex 109
Adrenal Gland 90
Thyroid 135
Lung 114
Spleen 103
Pancreas 124
Esophagus - Muscularis 114
Esophagus - Mucosa 162
Esophagus - Gastroesophageal Junction 75
Stomach 79
Colon - Sigmoid 91
Small Intestine - Terminal Ileum 55
Colon - Transverse 102
Prostate 77
Testis 288
Nerve - Tibial 152
Skin - Sun Exposed (Lower leg) 152
Heart - Left Ventricle 122
Brain - Cerebellum 155
Cells - Cultured fibroblasts 197
Whole Blood 272
Artery - Aorta 106
Cells - EBV-transformed lymphocytes 142
Pituitary 108
Brain - Frontal Cortex (BA9) 93
Brain - Caudate (basal ganglia) 107
Brain - Nucleus accumbens (basal ganglia) 115
Brain - Putamen (basal ganglia) 91
Brain - Hypothalamus 103
Brain - Spinal cord (cervical c-1) 69
Brain - Hippocam

In [506]:
####### Calculate ratio of success for each transcript for each tissue #######

In [507]:
fails={}
success={}
ratio={}

In [512]:
len_tran=len(data.columns)
for Tissue in data.index.unique().sort_values():
    print(Tissue)
    fails[Tissue]=[]
    success[Tissue]=[]
    ratio[Tissue]={}
    for Transc in data.columns:
        print_cum=('Transcript '+str(data.columns.get_loc(Transc)+1)+' of '+str(len_tran)+": ")  
        ttest_tissue=ttest_ind(data[Transc][Tissue],data[Transc][data.index != Tissue])
        if ttest_tissue.pvalue <= 0.00001:
            print_cum=print_cum+'Tissue expression significant:'
            for sample in list((reindex_data[reindex_data['Tissue'] == Tissue]).index):
                print_cum_fin=print_cum+'Sample '+str((reindex_data[reindex_data['Tissue'] == Tissue]).index.get_loc(sample))+' of '+str(len(list((reindex_data[reindex_data['Tissue'] == Tissue]).index)))
                print(print_cum_fin,end="\r",flush=True)
                ttest_sample=stats.ttest_1samp(data[Transc][Tissue],data.iloc[sample][Transc])
                if ttest_sample.pvalue <= 0.0001: success[Tissue].append(Transc)
                elif ttest_sample.pvalue >= 0.0001: fails[Tissue].append(Transc)
                if (data[Transc][Tissue]).mean() == 0 and data.iloc[sample][Transc] == 0:
                    success[Tissue].append(Transc)
                if (data[Transc][Tissue]).mean() == 0 and data.iloc[sample][Transc] != 0:
                    fails[Tissue].append(Transc)
            ratio[Tissue][Transc]=success[Tissue].count(Transc)/(success[Tissue].count(Transc)+fails[Tissue].count(Transc))
        else: ratio[Tissue][Transc]= np.nan
    print(len_tran,"transcripts processed,",len(success[Tissue]),"successful associations",len(fails[Tissue]),"failed. Mean ratio of",pd.Series(ratio[Tissue]).mean(),end="\r",flush=True)
    print("")
    print('###########')

Adipose - Subcutaneous
Transcript 1 of 448: Tissue expression significant:Sample 0 of 663

NameError: name 'stats' is not defined