# Assemble the final Dataframe from tarpmir bindingsites

## Read in bindingsite & Ensembl data

In [1]:
%load_ext autoreload
%autoreload 2
from helper_fcts import *
import sqlite3
import os
import pandas as pd
from pathlib import Path

path = Path('data')
tcga_path = Path(path/'PANCAN')
path_tarp = Path(path/'tarp-bs')
#ann_path = Path('C:/Users/Lena/Documents/Master big files/manual_GDC_download')

In [2]:
#read in df_gene: gene_id, g_id_v, transcript_id, t_id_v, chromosome, strand, start, end (1-N)
df_gene = pd.read_csv(path/'gene_infos.csv', dtype={'ensembl_gene_id':str,'ensembl_gene_id_version':str,'ensembl_transcript_id':str, 'ensembl_transcript_id_version':str, 'chromosome_name':'category', 'strand':'int8', 'start_position':int, 'end_position':int})
del df_gene['Unnamed: 0']
df_gene[['strand']] = df_gene[['strand']].astype('int').astype('Int64')

#executed once, read in from pickled files
ids = read_in_ids(path/'all_mapping_ids.fasta') # mapping of transcript ids to exon ids
chrom_exon_starts, exon_starts, exon_ends, df_exon = calc_exon_data(path) # exon starts and ends per transcript + exon id

df_exon = df_exon.merge(df_gene, left_on = 'transcript_id', right_on = 'ensembl_transcript_id', how='left')
df_exon[['chrom_exon_start','chrom_exon_end','start_position','end_position']] = df_exon[['chrom_exon_start','chrom_exon_end','start_position','end_position']].astype('Int64')
df_exon[['strand']] = df_exon[['strand']].astype('str').astype('category')

#len(df_exon[df_exon.ensembl_transcript_id.isna()].transcript_id.unique())
#371 transcripts are not in df_gene, TODO, why? which transcripts are in tarpmir predictions
#TODO instead of joining df_gene, exon_info -> directly read in from R feather file & also use that for calculating chrom pos

In [3]:
#read in bs
seperate_bs = []
for filename in os.listdir(path_tarp):
    binding_sites = parse_tarp_bs(path_tarp/filename)
    seperate_bs.append(binding_sites)
bs = pd.concat(seperate_bs, axis=0, ignore_index=True) #14 388 + 10 326 = 24 714 rows
bs.to_feather(path/'bs.feather')
print('All predicted bindingsites were read into pandas.')

# alternative: set bs from feather
#bs = pd.read_feather(path/'bs.feather')

All predicted bindingsites were read into pandas.


In [4]:
#parse miRNA and mRNA seed from sequence
miRNA_sequences = parse_seq(path/'input_miRNA.fasta')#2661 miRNAs 
bs['miRNA_seed'] = bs.apply(lambda row: miRNA_sequences[row['miRNA']], axis=1)
mRNA_sequences = parse_seq(path/'cdna6247.fasta') #TODO change to bigger file with all mRNAs or iterate over all files
bs['mRNA_bs_seq'] = bs.apply(lambda row: mRNA_sequences[row['mRNA']][row['bs_start']:row['bs_end']], axis=1)

In [5]:
%load_ext Cython

In [6]:
%%cython
def pos_to_chrom(pos, tid, ids, exon_starts, exon_ends, chrom_exon_starts):
    for j, eid in enumerate(ids[tid]):
        if tid in exon_starts and eid in exon_starts[tid] and tid in exon_ends and eid in exon_ends[tid]:
            if pos >= exon_starts[tid][eid] and pos <= exon_ends[tid][eid]:
                diff = pos - exon_starts[tid][eid]
                chrom_pos = chrom_exon_starts[tid][eid] + diff
                return chrom_pos
        else: print('Error: Either',tid,'or',eid,'not in exon_starts or exon_ends')
    return None

In [7]:
%%cython
def get_eid(start, end, tid, ids, exon_starts, exon_ends, chrom_exon_starts):
    for j, eid in enumerate(ids[tid]):
        if tid in exon_starts and eid in exon_starts[tid] and tid in exon_ends and eid in exon_ends[tid]:
            if (start >= exon_starts[tid][eid]) and (start <= exon_ends[tid][eid]) and (end >= exon_starts[tid][eid]) and (end <= exon_ends[tid][eid]):
                return eid
        else: print('Error: Either',tid,'or',eid,'not in exon_starts or exon_ends')
    return None

In [8]:
#translate genome position relative to transcript to chromosome
#fastest, always puts bs start and end even if bs spans several exons, but only puts exonid if only 1 exon
bs['chrom_bs_start'] = bs.apply(lambda row: pos_to_chrom(row.bs_start, row.mRNA, ids, exon_starts, exon_ends, chrom_exon_starts), axis=1)
bs['chrom_bs_end'] = bs.apply(lambda row: pos_to_chrom(row.bs_end, row.mRNA, ids, exon_starts, exon_ends, chrom_exon_starts), axis=1)
bs['exon_id'] = bs.apply(lambda row: get_eid(row.bs_start, row.bs_end, row.mRNA, ids, exon_starts, exon_ends, chrom_exon_starts), axis=1)
bs['bs_id'] = bs.index
bs.head()

Unnamed: 0,miRNA,mRNA,binding_probability,energy,seed,accessibility,AU_content,PhyloP_Stem,PyloP_Flanking,m/e,...,pairings_in_3prime_end,difference_of_pairings_between_seed_and_3prime_end,bs_start,bs_end,miRNA_seed,mRNA_bs_seq,chrom_bs_start,chrom_bs_end,exon_id,bs_id
0,hsa-let-7a-2-3p,ENST00000576537,1.0,-25.9,0,0.000156,0.338,0.005183,-0.035634,-11.206441,...,7,1,309,335,CTGTACAGCCTCCTAGCTTTCC,,1578440,1578466,ENSE00002650258,0
1,hsa-let-7a-2-3p,ENST00000576537,0.74359,-21.6,1,2e-06,0.338,-0.161796,0.055277,-3.793325,...,3,4,430,450,CTGTACAGCCTCCTAGCTTTCC,,1578561,1578581,ENSE00002650258,1
2,hsa-let-7b-5p,ENST00000576537,0.846154,-18.4,1,2.6e-05,0.441,3.547385,2.945605,-6.324962,...,8,1,105,130,TGAGGTAGTAGGTTGTGTGGTT,,1576707,1578261,,2
3,hsa-let-7b-3p,ENST00000576537,0.615385,-20.6,0,0.000143,0.338,-0.021616,-0.047693,-18.306612,...,7,3,315,336,CTATACAACCTACTGCCTTCCC,,1578446,1578467,ENSE00002650258,3
4,hsa-let-7b-3p,ENST00000576537,0.538462,-15.3,0,2e-05,0.456,3.7732,3.161275,-9.685015,...,5,1,23,41,CTATACAACCTACTGCCTTCCC,GGTGGCGTGGGCCTGTAA,1576625,1576643,ENSE00002671000,4


## Read in TCGA data (processed by Xena) from file

In [187]:
#TODO option 1: read in exon_id annotation file (compare)
#import gffutils
#fn = gffutils.example_filename(ann_path/"gencode.v22.annotation.gtf")
#exon_db = gffutils.create_db(fn, dbfn='exon.db', disable_infer_genes=True, disable_infer_transcripts=True, force=True)
#gene = 'ENSG00000174231.15' #todo version wichtig!
#for i in exon_db.children(gene, featuretype='exon'):#, order_by='start'):
 #   print(i)
#for exon in exon_db.region("chr17:1578446-1578633", strand="-", featuretype='exon'):#, completely_within=True):
    #print(exon)

chr17	HAVANA	gene	1650629	1684882	.	-	.	gene_id "ENSG00000174231.15"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "PRPF8"; level "1"; havana_gene "OTTHUMG00000090553.5";


In [None]:
#download PANCAN data from web, once
#wget https://pancanatlas.xenahubs.net/download/pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz
#wget https://tcga.xenahubs.net/download/TCGA.PANCAN.sampleMap/HiSeqV2_exon.gz
#wget https://tcga.xenahubs.net/download/unc_v2_exon_hg19_probe_TCGA

mirna_counts = pd.read_csv(tcga_path/'pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena', delimiter='	')
exon_counts = pd.read_csv(tcga_path/'HiSeqV2_exon', delimiter='	')
tran_exon = pd.read_csv(tcga_path/'unc_v2_exon_hg19_probe_TCGA', delimiter='	')

In [13]:
exon_counts = exon_counts.merge(tran_exon, left_on='Sample', right_on='id')
exon_counts.drop(['Sample', 'id'], axis=1, inplace=True)
#transform strand {-,+} to {-1,+1}
exon_counts['strand'] = exon_counts.apply(lambda row: -1 if row.strand == '-' else 1, axis=1)
exon_counts['chrom'] = exon_counts['chrom'].str[3:]
exon_counts[['chrom','strand']] = exon_counts[['chrom','strand']].astype('str').astype('category')
exon_counts[['chromStart','chromEnd']] = exon_counts[['chromStart','chromEnd']].astype('Int64')
#TODO option 2: get exonid per exon count from Ensembl (compare)
exon_counts2 = exon_counts.merge(df_exon[['exon_id','chromosome_name','strand','chrom_exon_start','chrom_exon_end']], left_on = ['chrom','strand','chromStart','chromEnd'], right_on = ['chromosome_name','strand','chrom_exon_start','chrom_exon_end'], how='left')

caseids = list(set(mirna_counts.columns).intersection(list(exon_counts.columns)))
#TODO to feather & read in 
#exon_counts2.to_feather(path/'exon_counts_kidney.feather')
#exon_counts2 = pd.read_feather(path/'exon_counts_kidney.feather')

In [14]:
exon_counts

Unnamed: 0,TCGA-S9-A7J2-01,TCGA-G3-A3CH-11,TCGA-EK-A2RE-01,TCGA-44-6778-01,TCGA-VM-A8C8-01,TCGA-AB-2863-03,TCGA-C8-A1HL-01,TCGA-EE-A17X-06,TCGA-YB-A89D-11,TCGA-05-4420-01,...,TCGA-95-7947-01,TCGA-VQ-AA6F-01,TCGA-55-6985-11,TCGA-DD-A115-01,TCGA-FV-A3I0-11,gene,chrom,chromStart,chromEnd,strand
0,1.4367,2.1578,0.8145,1.2139,0.5548,1.1198,1.4589,2.1414,1.8688,1.3562,...,1.2963,1.2239,1.4330,1.6064,2.4094,"ABHD14B,RP11-155D18.14,RP11-155D18.12",3,52007981,52008646,-1
1,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.6338,0.0000,USH2A,1,215901372,215901726,-1
2,1.0947,0.0000,2.2705,2.5067,1.6157,3.7026,1.3928,0.0000,1.4322,1.9901,...,2.7503,2.5152,2.7521,2.0291,1.2827,SLC10A7,4,147214081,147214132,-1
3,7.6697,2.0744,3.6112,3.5746,4.9626,4.6708,3.0232,4.9309,3.8545,3.0587,...,3.8308,3.4310,3.2923,1.8304,1.5921,TNK2,3,195599147,195599341,-1
4,3.6420,0.1360,1.0323,1.3883,2.9340,5.0756,1.6862,1.7695,0.6393,0.7787,...,0.9110,1.8473,1.4798,0.1682,0.0000,LRRC37B,17,30351730,30351801,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208965,5.4664,3.7032,1.7697,3.3787,5.4812,3.4796,4.9258,4.3847,4.2477,5.6621,...,3.2680,3.5055,4.4544,4.4662,3.4955,"ABHD14A,ACY1,ABHD14A-ACY1",3,52012275,52012390,1
208966,3.0332,0.4627,1.7947,2.3325,0.9258,3.0244,1.3866,2.1483,1.4093,2.4386,...,2.6390,1.7642,1.6986,1.8316,0.5412,ZNF318,6,43324864,43325503,-1
208967,2.4818,0.2817,1.2538,1.0861,3.4473,2.7324,0.7754,0.1212,2.2907,1.3198,...,1.8832,1.4991,1.0195,0.0000,0.6923,TBC1D3B,17,34495988,34496053,-1
208968,7.0346,2.5697,4.7081,5.3368,6.2845,8.2727,4.6313,3.6474,5.4170,3.9304,...,5.8426,3.9124,5.3203,3.5739,2.6522,NBPF14,1,148010884,148011056,-1


In [11]:
mirna_counts

Unnamed: 0,sample,TCGA-C4-A0F6-01,TCGA-CU-A0YO-01,TCGA-BT-A0S7-01,TCGA-CU-A0YR-01,TCGA-BL-A0C8-01,TCGA-C4-A0F0-01,TCGA-BL-A13J-01,TCGA-BT-A0YX-01,TCGA-CU-A0YN-01,...,TCGA-AG-A020-01,TCGA-AG-A01Y-01,TCGA-AG-A01W-01,TCGA-AG-3726-01,TCGA-AG-3605-01,TCGA-AG-3584-01,TCGA-AG-3599-01,TCGA-AG-3583-01,TCGA-AG-3598-01,TCGA-AG-3586-01
0,hsa-let-7a-2-3p,0.99,1.91,3.02,0.85,0.85,2.70,2.50,1.22,1.57,...,-0.91,1.18,1.29,1.02,0.16,2.09,1.59,0.74,1.56,1.35
1,hsa-let-7a-3p,5.08,5.99,6.03,4.23,5.06,5.05,5.43,4.64,5.09,...,5.91,5.75,5.40,4.45,4.61,4.54,4.91,4.38,4.75,4.73
2,hsa-let-7a-5p,14.12,14.35,15.97,14.96,15.16,15.33,14.95,14.67,14.53,...,14.47,14.83,14.70,14.39,14.73,14.50,14.13,14.04,14.50,13.98
3,hsa-let-7b-3p,2.95,3.68,5.40,3.92,4.26,4.38,4.33,3.42,4.87,...,4.52,3.24,4.08,4.73,3.58,4.43,4.89,4.55,5.42,3.96
4,hsa-let-7b-5p,12.75,12.55,15.08,12.90,12.54,14.03,13.23,12.83,12.89,...,11.67,11.78,12.56,12.09,12.38,12.22,12.47,12.89,12.58,12.72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
738,hsa-miR-888-5p,0.00,0.00,0.36,0.00,0.00,0.00,0.00,0.25,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
739,hsa-miR-890,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
740,hsa-miR-891b,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
741,hsa-miR-892b,0.00,0.00,0.00,0.00,0.00,0.26,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [17]:
exon_counts2

Unnamed: 0,TCGA-S9-A7J2-01,TCGA-G3-A3CH-11,TCGA-EK-A2RE-01,TCGA-44-6778-01,TCGA-VM-A8C8-01,TCGA-AB-2863-03,TCGA-C8-A1HL-01,TCGA-EE-A17X-06,TCGA-YB-A89D-11,TCGA-05-4420-01,...,TCGA-FV-A3I0-11,gene,chrom,chromStart,chromEnd,strand,exon_id,chromosome_name,chrom_exon_start,chrom_exon_end
0,1.4367,2.1578,0.8145,1.2139,0.5548,1.1198,1.4589,2.1414,1.8688,1.3562,...,2.4094,"ABHD14B,RP11-155D18.14,RP11-155D18.12",3,52007981,52008646,-1,,,,
1,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,USH2A,1,215901372,215901726,-1,,,,
2,1.0947,0.0000,2.2705,2.5067,1.6157,3.7026,1.3928,0.0000,1.4322,1.9901,...,1.2827,SLC10A7,4,147214081,147214132,-1,,,,
3,7.6697,2.0744,3.6112,3.5746,4.9626,4.6708,3.0232,4.9309,3.8545,3.0587,...,1.5921,TNK2,3,195599147,195599341,-1,,,,
4,3.6420,0.1360,1.0323,1.3883,2.9340,5.0756,1.6862,1.7695,0.6393,0.7787,...,0.0000,LRRC37B,17,30351730,30351801,1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209584,5.4664,3.7032,1.7697,3.3787,5.4812,3.4796,4.9258,4.3847,4.2477,5.6621,...,3.4955,"ABHD14A,ACY1,ABHD14A-ACY1",3,52012275,52012390,1,,,,
209585,3.0332,0.4627,1.7947,2.3325,0.9258,3.0244,1.3866,2.1483,1.4093,2.4386,...,0.5412,ZNF318,6,43324864,43325503,-1,,,,
209586,2.4818,0.2817,1.2538,1.0861,3.4473,2.7324,0.7754,0.1212,2.2907,1.3198,...,0.6923,TBC1D3B,17,34495988,34496053,-1,,,,
209587,7.0346,2.5697,4.7081,5.3368,6.2845,8.2727,4.6313,3.6474,5.4170,3.9304,...,2.6522,NBPF14,1,148010884,148011056,-1,,,,


In [46]:
exon_counts2[(exon_counts2.chrom =='17') & (exon_counts2.gene == 'PRPF8')]

Unnamed: 0,TCGA-S9-A7J2-01,TCGA-G3-A3CH-11,TCGA-EK-A2RE-01,TCGA-44-6778-01,TCGA-VM-A8C8-01,TCGA-AB-2863-03,TCGA-C8-A1HL-01,TCGA-EE-A17X-06,TCGA-YB-A89D-11,TCGA-05-4420-01,...,TCGA-FV-A3I0-11,gene,chrom,chromStart,chromEnd,strand,exon_id,chromosome_name,chrom_exon_start,chrom_exon_end
248,6.2424,3.7619,5.2324,6.0556,5.5296,6.9092,4.5639,6.8845,4.821,4.9702,...,3.6236,PRPF8,17,1562651,1562842,-1,,,,
1827,5.2108,2.0311,3.5119,5.2997,2.7258,6.514,2.7636,5.3613,2.8978,3.0417,...,1.6623,PRPF8,17,1585114,1585332,-1,,,,
3647,4.1398,1.1659,2.4005,3.7702,1.6251,3.7283,1.128,3.8327,2.1725,1.9854,...,0.8393,PRPF8,17,1588074,1588176,-1,,,,
6987,6.1047,3.7512,5.0742,6.0695,5.3212,7.471,4.49,6.7893,4.6632,4.8475,...,3.5469,PRPF8,17,1563135,1563295,-1,,,,
12232,6.6567,4.9697,6.1152,6.3514,6.7634,6.9182,6.0376,7.4532,5.9299,5.6154,...,4.718,PRPF8,17,1554942,1555082,-1,,,,
12768,6.0817,3.3411,4.9479,6.0837,4.865,6.7285,4.2203,6.7074,4.4395,4.6103,...,3.243,PRPF8,17,1564905,1565084,-1,,,,
18292,7.0901,5.6442,6.5329,6.4448,7.2739,6.5134,6.6935,7.7643,6.666,6.0434,...,5.3615,PRPF8,17,1554402,1554604,-1,,,,
18340,6.5507,4.384,5.9145,6.3703,6.3466,7.2695,5.6114,7.3956,5.382,5.4154,...,4.319,PRPF8,17,1559686,1559859,-1,,,,
20102,6.8365,4.8428,6.2537,6.6045,6.66,7.1285,5.9692,7.5176,5.9422,5.7407,...,4.6499,PRPF8,17,1557071,1557310,-1,,,,
20281,5.3664,2.3886,3.9104,5.3955,3.1286,7.5675,3.2364,5.6481,2.8442,3.398,...,1.8402,PRPF8,17,1584223,1584348,-1,,,,


In [45]:
exon_counts2[(exon_counts2.chrom =='17') & (exon_counts2.chromEnd >= ) & (exon_counts2.chromStart <=) & (exon_counts2.strand ==-1)]

SyntaxError: invalid syntax (<ipython-input-45-ecfa89ea4191>, line 1)

In [16]:
len(exon_counts2[exon_counts2.chromosome_name.isna()])

207797

## Map miRNA to counts

In [9]:
#get subset counts of bs and merge with df_gene
counts = bs[['bs_id', 'miRNA', 'mRNA', 'chrom_bs_start', 'chrom_bs_end']].copy()
counts = counts.merge(df_gene[['ensembl_transcript_id','chromosome_name','strand']], left_on='mRNA', right_on='ensembl_transcript_id', how='left')

#mapping mature ID to miRNA family
trans = pd.read_feather(path/'mature2families.feather')
counts = counts.merge(trans, left_on='miRNA',right_on='mature_name', how='left')

#alternative 0: dont join with mirnaids
#counts.drop(['ensembl_transcript_id', 'mature_name'], axis=1, inplace=True)

#! alternative 1: for all caseids at the same time !
counts = counts.merge(mirna_counts, left_on='mature_acc', right_on='sample', how='left')
counts.drop(['ensembl_transcript_id', 'mature_name', 'sample'], axis=1, inplace=True)

# alternative 2: for 1 caseids at a time
for caseid in []:#[caseids[0]]:
    counts = counts.merge(mirna_counts[['sample',caseid]], left_on='mature_acc', right_on='sample', how='left')
    counts.drop(['ensembl_transcript_id', 'mature_name', 'sample'], axis=1, inplace=True)

#counts[['chromosome_name','strand','miRNA','mRNA']] = counts[['chromosome_name','strand','miRNA','mRNA']].astype('str')#.astype('category')
counts.head()

Unnamed: 0,bs_id,miRNA,mRNA,chrom_bs_start,chrom_bs_end,chromosome_name,strand,mature_acc,mirna_family,TCGA-KN-8426-01,...,TCGA-KO-8408-01,TCGA-KM-8438-01,TCGA-KL-8323-01,TCGA-KN-8429-11,TCGA-KO-8413-01,TCGA-KO-8411-01,TCGA-KL-8332-11,TCGA-KL-8335-01,TCGA-KL-8339-11,TCGA-KO-8415-01
0,0,hsa-let-7a-2-3p,ENST00000576537,1578440,1578466,17,-1,MIMAT0010195,let-7,2.893871,...,2.829471,2.015161,2.594806,2.644861,1.786991,1.854876,2.824365,2.395032,2.791374,2.819371
1,1,hsa-let-7a-2-3p,ENST00000576537,1578561,1578581,17,-1,MIMAT0010195,let-7,2.893871,...,2.829471,2.015161,2.594806,2.644861,1.786991,1.854876,2.824365,2.395032,2.791374,2.819371
2,2,hsa-let-7b-5p,ENST00000576537,1576707,1578261,17,-1,MIMAT0000063,let-7,12.820202,...,12.829037,12.38224,13.638682,14.190721,14.086433,13.729646,13.115391,13.531265,13.56459,14.094524
3,3,hsa-let-7b-3p,ENST00000576537,1578446,1578467,17,-1,MIMAT0004482,let-7,4.104356,...,4.210218,3.770706,4.64693,4.29571,5.294043,5.520265,4.097683,4.664511,4.351934,4.063494
4,4,hsa-let-7b-3p,ENST00000576537,1576625,1576643,17,-1,MIMAT0004482,let-7,4.104356,...,4.210218,3.770706,4.64693,4.29571,5.294043,5.520265,4.097683,4.664511,4.351934,4.063494


In [246]:
#TODO put in presentation, analysis mirna, how many in xena data
#TODO find out : if na should i use it as 0????
counts['TCGA-KN-8426-01'].isna().sum() #for caseid TCGA-KN-8426-01: 56035 bs are thrown away 
len(list(counts[counts['TCGA-KN-8426-01'].isna()].mature_acc.unique())) #we don't have the counts for 1948 mirnas in bs
len(list(counts[~ counts['TCGA-KN-8426-01'].isna()].mature_acc.unique())) #we have counts for 658 mirnas

len(list(mirna_counts['sample'].unique())) #1917 = absolute nr mirnas from TCGA-KN-8426-01
len(list(mirna_counts[~ mirna_counts['TCGA-KN-8426-01'].isna()]['sample'].unique())) #1236 = not na nr mirnas from TCGA-KN-8426-01

658

## Map exons to counts

In [12]:
#for a large data set, you will likely get a significant speed increase by creating an index for any column(s) used in the join condition. 
#https://stackoverflow.com/questions/30627968/merge-pandas-dataframes-where-one-value-is-between-two-others
#instead use SQL
conn = sqlite3.connect('db.db')#':memory:') #Make the database in memory
c = conn.cursor()

#write the tables
counts.to_sql('counts', conn, index=False, dtype={"bs_id": 'INTEGER'})
exon_counts.to_sql('exon_counts', conn, index=True, index_label='exon_id', dtype={caseid : 'INTEGER' for caseid in caseids}.update({"chromStart": 'INTEGER', "chromEnd": 'INTEGER'}))

In [258]:
#print sqlite3 table 
qry = '''
    select * from counts
    '''
c.execute(qry)
conn.commit()
c.fetchall()
pd.read_sql_query(qry,conn)

Unnamed: 0,bs_id,miRNA,mRNA,chrom_bs_start,chrom_bs_end,chromosome_name,strand,mature_acc,mirna_family,TCGA-KN-8426-01,...,TCGA-KO-8408-01,TCGA-KM-8438-01,TCGA-KL-8323-01,TCGA-KN-8429-11,TCGA-KO-8413-01,TCGA-KO-8411-01,TCGA-KL-8332-11,TCGA-KL-8335-01,TCGA-KL-8339-11,TCGA-KO-8415-01
0,0,hsa-let-7a-2-3p,ENST00000576537,1578440,1578466,17,-1,MIMAT0010195,let-7,2.893871,...,2.829471,2.015161,2.594806,2.644861,1.786991,1.854876,2.824365,2.395032,2.791374,2.819371
1,1,hsa-let-7a-2-3p,ENST00000576537,1578561,1578581,17,-1,MIMAT0010195,let-7,2.893871,...,2.829471,2.015161,2.594806,2.644861,1.786991,1.854876,2.824365,2.395032,2.791374,2.819371
2,2,hsa-let-7b-5p,ENST00000576537,1576707,1578261,17,-1,MIMAT0000063,let-7,12.820202,...,12.829037,12.382240,13.638682,14.190721,14.086433,13.729646,13.115391,13.531265,13.564590,14.094524
3,3,hsa-let-7b-3p,ENST00000576537,1578446,1578467,17,-1,MIMAT0004482,let-7,4.104356,...,4.210218,3.770706,4.646930,4.295710,5.294043,5.520265,4.097683,4.664511,4.351934,4.063494
4,4,hsa-let-7b-3p,ENST00000576537,1576625,1576643,17,-1,MIMAT0004482,let-7,4.104356,...,4.210218,3.770706,4.646930,4.295710,5.294043,5.520265,4.097683,4.664511,4.351934,4.063494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73188,73188,hsa-miR-12128,ENST00000216014,38479688,38479728,22,1,MIMAT0049022,hsa-miR-12128,,...,,,,,,,,,,
73189,73189,hsa-miR-12129,ENST00000216014,38481326,38481371,22,1,MIMAT0049023,hsa-miR-12129,,...,,,,,,,,,,
73190,73190,hsa-miR-12129,ENST00000216014,38479605,38479631,22,1,MIMAT0049023,hsa-miR-12129,,...,,,,,,,,,,
73191,73191,hsa-miR-12129,ENST00000216014,38479693,38479708,22,1,MIMAT0049023,hsa-miR-12129,,...,,,,,,,,,,


In [11]:
#DELETE table from sqlite3 DB
c.execute("DROP TABLE counts;")
conn.commit()
c.execute("DROP TABLE exon_counts;")
conn.commit()

In [181]:
counts[(counts.chrom_bs_start > 1578445) & (counts.chrom_bs_end < 1578634)]

Unnamed: 0,bs_id,miRNA,mRNA,chrom_bs_start,chrom_bs_end,chromosome_name,strand,mature_acc,mirna_family,TCGA-KN-8426-01,...,TCGA-KO-8408-01,TCGA-KM-8438-01,TCGA-KL-8323-01,TCGA-KN-8429-11,TCGA-KO-8413-01,TCGA-KO-8411-01,TCGA-KL-8332-11,TCGA-KL-8335-01,TCGA-KL-8339-11,TCGA-KO-8415-01
1,1,hsa-let-7a-2-3p,ENST00000576537,1578561,1578581,17,-1,MIMAT0010195,let-7,2.893871,...,2.829471,2.015161,2.594806,2.644861,1.786991,1.854876,2.824365,2.395032,2.791374,2.819371
3,3,hsa-let-7b-3p,ENST00000576537,1578446,1578467,17,-1,MIMAT0004482,let-7,4.104356,...,4.210218,3.770706,4.646930,4.295710,5.294043,5.520265,4.097683,4.664511,4.351934,4.063494
6,6,hsa-let-7c-3p,ENST00000576537,1578453,1578471,17,-1,MIMAT0026472,let-7,2.695914,...,2.714853,2.455017,2.155998,3.673442,2.676288,3.043974,4.407776,2.500585,3.844001,2.977714
9,9,hsa-let-7d-3p,ENST00000576537,1578446,1578473,17,-1,MIMAT0004484,let-7,7.990090,...,8.056507,8.939957,8.391837,7.818937,7.722212,8.106506,7.446075,8.217851,8.154153,8.335863
12,12,hsa-let-7e-3p,ENST00000576537,1578561,1578579,17,-1,MIMAT0004485,let-7,3.147645,...,2.523778,3.836892,2.846723,4.056371,2.882109,3.414638,3.685095,2.885547,3.698584,3.274872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3384,3384,hsa-miR-12124,ENST00000576537,1578599,1578616,17,-1,MIMAT0049018,hsa-miR-12124,,...,,,,,,,,,,
3385,3385,hsa-miR-12124,ENST00000576537,1578488,1578508,17,-1,MIMAT0049018,hsa-miR-12124,,...,,,,,,,,,,
3388,3388,hsa-miR-12126,ENST00000576537,1578447,1578480,17,-1,MIMAT0049020,hsa-miR-12126,,...,,,,,,,,,,
3390,3390,hsa-miR-12128,ENST00000576537,1578517,1578566,17,-1,MIMAT0049022,hsa-miR-12128,,...,,,,,,,,,,


In [325]:
exon_counts[(exon_counts.chromStart > 1578445) & (exon_counts.chromEnd < 1578634)]

Unnamed: 0,TCGA-KN-8419-01,TCGA-KL-8346-01,TCGA-KN-8422-01,TCGA-KN-8431-11,TCGA-KN-8430-11,TCGA-KM-8440-01,TCGA-KO-8414-01,TCGA-KL-8323-01,TCGA-KM-8639-01,TCGA-KO-8415-11,...,TCGA-KO-8408-01,TCGA-KM-8443-01,TCGA-KM-8442-01,TCGA-KL-8332-11,TCGA-KL-8327-01,gene,chrom,chromStart,chromEnd,strand
45710,4.922,2.6972,2.0686,4.7578,4.705,4.5074,3.3096,4.5863,4.6372,4.7683,...,2.5759,4.1899,4.4041,3.6642,1.3794,PRPF8,chr17,1578446,1578633,-1


In [None]:
#TODO change column name case_id to column name case_id_mirna so it doesnt overlap with exon
#instead of joining like i do & repeating the exon info -> only add exon_id + pivot / addup per exon + add exon counts

In [None]:
#TODO right now I hardcoded TCGA-KN-8419-01!!
#exon_counts.chrom, exon_counts.chromStart, exon_counts.chromEnd, exon_counts.strand, exon_counts.'TCGA-KN-8419-01', exon_counts.exon_id, counts.bs_id, counts.miRNA, counts.mRNA, counts.chrom_bs_start, counts.chrom_bs_end

qry = '''
    CREATE TABLE n_counts AS
    select
        *
    from
        counts inner join (
        select exon_counts.chrom, exon_counts.chromStart, exon_counts.chromEnd, exon_counts.strand, exon_counts.exon_id
        from exon_counts )
        on
        counts.chromosome_name = exon_counts.chrom and counts.strand = exon_counts.strand and counts.chrom_bs_start >= exon_counts.chromStart and counts.chrom_bs_end <= exon_counts.chromEnd
    '''
c.execute(qry)
conn.commit()
print('n_counts done.')
qry = '''
    select * from n_counts
    '''
c.execute(qry)
conn.commit()
c.fetchall()
#n_counts = pd.read_sql_query(qry,conn) #TODO somehow empty idk why

#todo google pandas 3D
#look at old code idk why

#only returns bs with minimum 1 exon_id cause inner join (else left join)
#automatically filters out all binding sites that have more than 1 exon_id
#if u dont want that use outer join -> actually wrong, it uses the first instead of leaving it out!

In [327]:
n_counts

Unnamed: 0,bs_id,miRNA,mRNA,chrom_bs_start,chrom_bs_end,chromosome_name,strand,mature_acc,mirna_family,TCGA-KN-8426-01,...,TCGA-KO-8408-01,TCGA-KM-8443-01,TCGA-KM-8442-01,TCGA-KL-8332-11,TCGA-KL-8327-01,gene,chrom,chromStart,chromEnd,strand.1
0,0,hsa-let-7a-2-3p,ENST00000576537,1578440,1578466,17,-1,MIMAT0010195,let-7,2.893871,...,,,,,,,,,,
1,1,hsa-let-7a-2-3p,ENST00000576537,1578561,1578581,17,-1,MIMAT0010195,let-7,2.893871,...,,,,,,,,,,
2,2,hsa-let-7b-5p,ENST00000576537,1576707,1578261,17,-1,MIMAT0000063,let-7,12.820202,...,,,,,,,,,,
3,3,hsa-let-7b-3p,ENST00000576537,1578446,1578467,17,-1,MIMAT0004482,let-7,4.104356,...,,,,,,,,,,
4,4,hsa-let-7b-3p,ENST00000576537,1576625,1576643,17,-1,MIMAT0004482,let-7,4.104356,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73188,73188,hsa-miR-12128,ENST00000216014,38479688,38479728,22,1,MIMAT0049022,hsa-miR-12128,,...,,,,,,,,,,
73189,73189,hsa-miR-12129,ENST00000216014,38481326,38481371,22,1,MIMAT0049023,hsa-miR-12129,,...,,,,,,,,,,
73190,73190,hsa-miR-12129,ENST00000216014,38479605,38479631,22,1,MIMAT0049023,hsa-miR-12129,,...,,,,,,,,,,
73191,73191,hsa-miR-12129,ENST00000216014,38479693,38479708,22,1,MIMAT0049023,hsa-miR-12129,,...,,,,,,,,,,


In [293]:
#old
#TODO takes fuuuuuckin long, instead join on strand, chromosome_name, start, end
#merge bs with exon counts, real slow, TODO cython, 22.34 - lookup when
def is_in_exon(row, exon_counts):
    bs_start = row.chrom_bs_start
    bs_end = row.chrom_bs_end
    here = exon_counts.loc[(exon_counts.chrom == row.chromosome_name) & (exon_counts['strand'] == row.strand) & (exon_counts.chromStart == bs_start) & (exon_counts.chromEnd == bs_end)]
    #print(here)
    count_list = str(list(here['TCGA-KN-8419-01']))[1:-1]
    return count_list if not count_list == '' else None

#takes long! for 1 case id: 2 min 
#exon_counts = all_exon_dfs['aba26e6b-f11c-4ae6-a0d6-85bdd2060e8f'] #df#all_exon_dfs[case_id]
counts['exon'] = counts.apply(lambda row: is_in_exon(row,exon_counts), axis=1)
bs_exons = pd.DataFrame(counts.exons.str.split(',').tolist(),index=counts.bs_id)
bs_exons = bs_exons.rename(columns={0: "exon_raw_count"})
bs_exons.head()

KeyboardInterrupt: 

In [158]:
#visualize amount exons by bindingsite | amount bindingsites by exon
import sqlite3
conn = sqlite3.connect(':memory:') #Make the database in memory

#write the tables
important_counts.to_sql('bs', conn, index=False, dtype={"bs_id": 'INTEGER'})
new_exon_counts.to_sql('counts', conn, index=True, index_label='exon_id', dtype={"exon_id": 'INTEGER'})
qry = '''
    select
        counts.chromosome_name, counts.chrom_exon_start, counts.chrom_exon_end, counts.strand, counts.raw_counts, counts.exon_id, bs.bs_id, bs.miRNA, bs.mRNA, bs.chrom_bs_start, bs.chrom_bs_end
    from
        bs left join counts on
        bs.chromosome_name = counts.chromosome_name and bs.strand = counts.strand and bs.chrom_bs_start >= counts.chrom_exon_start and bs.chrom_bs_end <= counts.chrom_exon_end
    '''
bs_exons2 = pd.read_sql_query(qry, conn)
bs_exons2.head()
#bs_exons2.groupby('bs_id').count().raw_counts.hist()
#bs_exons2.groupby('exon_id').count()

In [287]:
n_counts

Unnamed: 0,chrom,chromStart,chromEnd,strand,TCGA-KN-8419-01,exon_id,bs_id,miRNA,mRNA,chrom_bs_start,chrom_bs_end


## Put all counts into one & pivot table

In [248]:
mirna_counts.head()

Unnamed: 0,sample,TCGA-KN-8426-01,TCGA-KN-8419-11,TCGA-KL-8332-01,TCGA-KN-8423-01,TCGA-KO-8405-01,TCGA-KL-8337-01,TCGA-KN-8424-11,TCGA-KN-8422-11,TCGA-KL-8334-01,...,TCGA-KO-8408-01,TCGA-KM-8438-01,TCGA-KL-8323-01,TCGA-KN-8429-11,TCGA-KO-8413-01,TCGA-KO-8411-01,TCGA-KL-8332-11,TCGA-KL-8335-01,TCGA-KL-8339-11,TCGA-KO-8415-01
0,MIMAT0019868,,,,,,,,,,...,,,,,,,,,,
1,MIMAT0019869,,0.1644,,0.1544,,,,0.301479,,...,,,,,,,,,,
2,MIMAT0019860,,,,,,,,,,...,,,,,,,,,,0.4087
3,MIMAT0019862,,,,0.1544,,,,,,...,,,,,,,0.2042,,,
4,MIMAT0019864,,0.1644,0.1774,,,0.1638,0.4246,,0.2075,...,,0.2143,,,,,,0.1803,,


In [279]:
exon_counts.head()

Unnamed: 0,TCGA-KN-8419-01,TCGA-KL-8346-01,TCGA-KN-8422-01,TCGA-KN-8431-11,TCGA-KN-8430-11,TCGA-KM-8440-01,TCGA-KO-8414-01,TCGA-KL-8323-01,TCGA-KM-8639-01,TCGA-KO-8415-11,...,TCGA-KO-8408-01,TCGA-KM-8443-01,TCGA-KM-8442-01,TCGA-KL-8332-11,TCGA-KL-8327-01,gene,chrom,chromStart,chromEnd,strand
0,2.1156,1.7396,2.0641,3.1457,3.0436,2.2209,1.8213,1.9092,2.9139,3.1049,...,2.2369,2.4951,2.101,3.0617,0.7424,"ABHD14B,RP11-155D18.14,RP11-155D18.12",chr3,52007981,52008646,-1
1,0.0,0.0,0.0,0.0251,0.0233,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,USH2A,chr1,215901372,215901726,-1
2,2.529,0.7196,1.3094,2.3074,1.6894,1.5829,1.0835,1.9041,1.2131,1.7214,...,0.6381,1.5227,1.7786,1.4157,0.0,SLC10A7,chr4,147214081,147214132,-1
3,2.3839,3.8702,2.5373,2.3464,2.2976,3.0639,2.5666,2.7963,3.0875,2.0227,...,3.0412,3.5838,3.7551,1.4157,1.8708,TNK2,chr3,195599147,195599341,-1
4,1.9974,0.6984,0.9011,0.6827,1.2425,0.9715,1.6701,1.3011,1.3712,1.5433,...,1.141,1.2345,1.7403,0.0,0.0,LRRC37B,chr17,30351730,30351801,1


In [None]:
#TODO put all different caseids sqlite tables together into one
#INSERT INTO artists_backup 
#SELECT ArtistId, Name
#FROM artists;

In [None]:
#output: for each case_id for each bs_id: interesting miRNA expression + exon expression
#unstack family counts from 2 columns to several columns (mirna_family=name,mirna_read_count=value)
pivoted = bs_mirnas.copy()
pivoted = pivoted.pivot_table(values='mirna_read_count', index=['exon_id', 'exon_raw_counts'], columns='mirna_family', aggfunc='sum', fill_value=0)
pivoted#.head()
#TODO debug does this do it right OR use sqllite3

In [None]:
#TODO pivot in sqlite3 
#https://modern-sql.com/use-case/pivot
#https://stackoverflow.com/questions/1237068/how-to-pivot-in-sqlite-or-i-e-select-in-wide-format-a-table-stored-in-long-form

In [227]:
bs_mirnas[bs_mirnas.exon_id == 216595]

Unnamed: 0,exon_id,exon_raw_counts,miRNA,miRNA_ID,mirna_family,mirna_read_count
7,216595,0,hsa-let-7b-5p,hsa-let-7b,let-7,55647
14,216595,0,hsa-let-7d-5p,hsa-let-7d,let-7,3767
25,216595,0,hsa-let-7e-3p,hsa-let-7e,let-7,11424
33,216595,0,hsa-let-7e-3p,hsa-let-7e,let-7,11424
46,216595,0,hsa-miR-15a-3p,hsa-mir-15a,mir-15,2258
...,...,...,...,...,...,...
12562,216595,0,hsa-miR-5692a,hsa-mir-5692a-2,mir-5692,0
12563,216595,0,hsa-miR-548az-5p,hsa-mir-548az,mir-548,0
12566,216595,0,hsa-miR-8062,hsa-mir-8062,hsa-miR-8062,0
12571,216595,0,hsa-miR-8064,hsa-mir-8064,hsa-miR-8064,0


In [226]:
bs_mirnas[bs_mirnas.mirna_family =='hsa-miR-1229-5p']

# 1 right? count is added for exonid 216595, but do i already add it before ?
# 2 also : exon_id + mirna double (3368 & 3370) wrong!!

Unnamed: 0,exon_id,exon_raw_counts,miRNA,miRNA_ID,mirna_family,mirna_read_count
3365,216603,0,hsa-miR-1229-5p,hsa-mir-1229,hsa-miR-1229-5p,4
3368,216595,0,hsa-miR-1229-5p,hsa-mir-1229,hsa-miR-1229-5p,4
3370,216595,0,hsa-miR-1229-5p,hsa-mir-1229,hsa-miR-1229-5p,4
3371,216596,0,hsa-miR-1229-5p,hsa-mir-1229,hsa-miR-1229-5p,4


In [None]:
#visualize amount exons by bindingsite
#bs_exons.groupby('bs_id').count().raw_counts.hist()

In [274]:
#TODO check - is nr mirna downloaded also here? if not: mistake
print(X.shape,y[0].shape)

(8, 1680) (8,)


# Elastic Net Regression

In [None]:
import sys
sys.getsizeof(pivoted)#TODO

In [276]:
#regression btw miRNA expression, exon expression
import sklearn
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression
#faster if np.array(x, order='F')
#If you are interested in controlling the L1 and L2 penalty separately, keep in mind that this is equivalent to:
#a * L1 + b * L2 where: alpha = a + b and l1_ratio = a / (a + b)
X = np.array(pivoted)
y = np.array(pivoted.index.get_level_values('exon_raw_counts'))

regr = ElasticNet(random_state=0)
regr.fit(X, y)
print(regr.coef_)
print(regr.intercept_)

[0. 0. 0. ... 0. 0. 0.]
0.0


  model = cd_fast.enet_coordinate_descent(
