# Assemble the final Dataframe from tarpmir bindingsites

## Read in bindingsite & Ensembl data

In [3]:
%load_ext autoreload
%autoreload 2
from helper_fcts import *
import sqlite3
import os
import pandas as pd
from pathlib import Path

path = Path('data')
tcga_path = Path(path/'KICH')
path_tarp = Path(path/'tarp-bs')
#ann_path = Path('C:/Users/Lena/Documents/Master big files/manual_GDC_download')

In [7]:
#read in df_gene: gene_id, g_id_v, transcript_id, t_id_v, chromosome, strand, start, end (1-N)
df_gene = pd.read_csv(path/'gene_infos.csv', dtype={'ensembl_gene_id':str,'ensembl_gene_id_version':str,'ensembl_transcript_id':str, 'ensembl_transcript_id_version':str, 'chromosome_name':'category', 'strand':'int8', 'start_position':int, 'end_position':int})
del df_gene['Unnamed: 0']
df_gene[['strand']] = df_gene[['strand']].astype('int').astype('Int64')

#executed once, read in from pickled files
ids = read_in_ids(path/'all_mapping_ids.fasta') # mapping of transcript ids to exon ids
chrom_exon_starts, exon_starts, exon_ends, df_exon = calc_exon_data(path) # exon starts and ends per transcript + exon id

df_exon = df_exon.merge(df_gene, left_on = 'transcript_id', right_on = 'ensembl_transcript_id', how='left')
df_exon[['chrom_exon_start','chrom_exon_end','start_position','end_position']] = df_exon[['chrom_exon_start','chrom_exon_end','start_position','end_position']].astype('Int64')
df_exon[['strand']] = df_exon[['strand']].astype('str').astype('category')

#len(df_exon[df_exon.ensembl_transcript_id.isna()].transcript_id.unique())
#371 transcripts are not in df_gene, TODO, why? which transcripts are in tarpmir predictions
#TODO instead of joining df_gene, exon_info -> directly read in from R feather file & also use that for calculating chrom pos

In [8]:
#read in bs
seperate_bs = []
for filename in os.listdir(path_tarp):
    binding_sites = parse_tarp_bs(path_tarp/filename)
    seperate_bs.append(binding_sites)
bs = pd.concat(seperate_bs, axis=0, ignore_index=True) #14 388 + 10 326 = 24 714 rows
bs.to_feather(path/'bs.feather')
print('All predicted bindingsites were read into pandas.')

# alternative: set bs from feather
#bs = pd.read_feather(path/'bs.feather')

All predicted bindingsites were read into pandas.


In [9]:
#parse miRNA and mRNA seed from sequence
miRNA_sequences = parse_seq(path/'input_miRNA.fasta')#2661 miRNAs 
bs['miRNA_seed'] = bs.apply(lambda row: miRNA_sequences[row['miRNA']], axis=1)
mRNA_sequences = parse_seq(path/'cdna6247.fasta') #TODO change to bigger file with all mRNAs or iterate over all files
bs['mRNA_bs_seq'] = bs.apply(lambda row: mRNA_sequences[row['mRNA']][row['bs_start']:row['bs_end']], axis=1)

In [10]:
%load_ext Cython

In [4]:
%%cython
def pos_to_chrom(pos, tid, ids, exon_starts, exon_ends, chrom_exon_starts):
    for j, eid in enumerate(ids[tid]):
        if tid in exon_starts and eid in exon_starts[tid] and tid in exon_ends and eid in exon_ends[tid]:
            if pos >= exon_starts[tid][eid] and pos <= exon_ends[tid][eid]:
                diff = pos - exon_starts[tid][eid]
                chrom_pos = chrom_exon_starts[tid][eid] + diff
                return chrom_pos
        else: print('Error: Either',tid,'or',eid,'not in exon_starts or exon_ends')
    return None

In [12]:
%%cython
def get_eid(start, end, tid, ids, exon_starts, exon_ends, chrom_exon_starts):
    for j, eid in enumerate(ids[tid]):
        if tid in exon_starts and eid in exon_starts[tid] and tid in exon_ends and eid in exon_ends[tid]:
            if (start >= exon_starts[tid][eid]) and (start <= exon_ends[tid][eid]) and (end >= exon_starts[tid][eid]) and (end <= exon_ends[tid][eid]):
                return eid
        else: print('Error: Either',tid,'or',eid,'not in exon_starts or exon_ends')
    return None

In [13]:
#translate genome position relative to transcript to chromosome
#fastest, always puts bs start and end even if bs spans several exons, but only puts exonid if only 1 exon
bs['chrom_bs_start'] = bs.apply(lambda row: pos_to_chrom(row.bs_start, row.mRNA, ids, exon_starts, exon_ends, chrom_exon_starts), axis=1)
bs['chrom_bs_end'] = bs.apply(lambda row: pos_to_chrom(row.bs_end, row.mRNA, ids, exon_starts, exon_ends, chrom_exon_starts), axis=1)
bs['exon_id'] = bs.apply(lambda row: get_eid(row.bs_start, row.bs_end, row.mRNA, ids, exon_starts, exon_ends, chrom_exon_starts), axis=1)
bs['bs_id'] = bs.index
bs.head()

Unnamed: 0,miRNA,mRNA,binding_probability,energy,seed,accessibility,AU_content,PhyloP_Stem,PyloP_Flanking,m/e,...,pairings_in_3prime_end,difference_of_pairings_between_seed_and_3prime_end,bs_start,bs_end,miRNA_seed,mRNA_bs_seq,chrom_bs_start,chrom_bs_end,exon_id,bs_id
0,hsa-let-7a-2-3p,ENST00000576537,1.0,-25.9,0,0.000156,0.338,0.005183,-0.035634,-11.206441,...,7,1,309,335,CTGTACAGCCTCCTAGCTTTCC,,1578440,1578466,ENSE00002650258,0
1,hsa-let-7a-2-3p,ENST00000576537,0.74359,-21.6,1,2e-06,0.338,-0.161796,0.055277,-3.793325,...,3,4,430,450,CTGTACAGCCTCCTAGCTTTCC,,1578561,1578581,ENSE00002650258,1
2,hsa-let-7b-5p,ENST00000576537,0.846154,-18.4,1,2.6e-05,0.441,3.547385,2.945605,-6.324962,...,8,1,105,130,TGAGGTAGTAGGTTGTGTGGTT,,1576707,1578261,,2
3,hsa-let-7b-3p,ENST00000576537,0.615385,-20.6,0,0.000143,0.338,-0.021616,-0.047693,-18.306612,...,7,3,315,336,CTATACAACCTACTGCCTTCCC,,1578446,1578467,ENSE00002650258,3
4,hsa-let-7b-3p,ENST00000576537,0.538462,-15.3,0,2e-05,0.456,3.7732,3.161275,-9.685015,...,5,1,23,41,CTATACAACCTACTGCCTTCCC,GGTGGCGTGGGCCTGTAA,1576625,1576643,ENSE00002671000,4


## Read in TCGA data (processed by Xena) from file

In [187]:
#TODO option 1: read in exon_id annotation file (compare)
#import gffutils
#fn = gffutils.example_filename(ann_path/"gencode.v22.annotation.gtf")
#exon_db = gffutils.create_db(fn, dbfn='exon.db', disable_infer_genes=True, disable_infer_transcripts=True, force=True)
#gene = 'ENSG00000174231.15' #todo version wichtig!
#for i in exon_db.children(gene, featuretype='exon'):#, order_by='start'):
 #   print(i)
#for exon in exon_db.region("chr17:1578446-1578633", strand="-", featuretype='exon'):#, completely_within=True):
    #print(exon)

chr17	HAVANA	gene	1650629	1684882	.	-	.	gene_id "ENSG00000174231.15"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "PRPF8"; level "1"; havana_gene "OTTHUMG00000090553.5";


In [3]:
#download GDC ChGR37 data from web, once, TCGA Kidney Chromophobe (KICH)
#https://xenabrowser.net/datapages/?hub=https://tcga.xenahubs.net:443
'''
#!wget https://tcga.xenahubs.net/download/unc_v2_exon_hg19_probe_TCGA #unnecessary cause some exons missing
!wget https://tcga.xenahubs.net/download/TCGA.KICH.sampleMap/HiSeqV2_exon.gz
!wget https://tcga.xenahubs.net/download/TCGA.KICH.sampleMap/miRNA_HiSeq_gene.gz
!gunzip HiSeqV2_exon.gz
!gunzip miRNA_HiSeq_gene.gz
!mv HiSeqV2_exon data/KICH
!mv miRNA_HiSeq_gene data/KICH
!mv unc_v2_exon_hg19_probe_TCGA data/KICH'''

--2020-10-31 15:05:57--  https://tcga.xenahubs.net/download/unc_v2_exon_hg19_probe_TCGA
Resolving tcga.xenahubs.net (tcga.xenahubs.net)... 54.237.213.246, 52.73.198.193
Connecting to tcga.xenahubs.net (tcga.xenahubs.net)|54.237.213.246|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12431049 (12M) [application/octet-stream]
Saving to: ‘unc_v2_exon_hg19_probe_TCGA’


2020-10-31 15:06:15 (715 KB/s) - ‘unc_v2_exon_hg19_probe_TCGA’ saved [12431049/12431049]

--2020-10-31 15:06:15--  https://tcga.xenahubs.net/download/TCGA.KICH.sampleMap/HiSeqV2_exon.gz
Resolving tcga.xenahubs.net (tcga.xenahubs.net)... 52.73.198.193, 54.237.213.246
Connecting to tcga.xenahubs.net (tcga.xenahubs.net)|52.73.198.193|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 54181940 (52M) [application/gzip]
Saving to: ‘HiSeqV2_exon.gz’


2020-10-31 15:07:59 (510 KB/s) - ‘HiSeqV2_exon.gz’ saved [54181940/54181940]

--2020-10-31 15:08:00--  https://tcga.xenahubs.net/down

NameError: name 'pd' is not defined

In [3]:
mirna_counts = pd.read_csv(tcga_path/'miRNA_HiSeq_gene', delimiter='	')

In [8]:
f = Path(path/'exon_counts_kidney.feather')
if f.is_file() and False: 
    exon_counts = pd.read_feather(f)
else:
    print('translate exon count positions from anew')
    exon_counts = pd.read_csv(tcga_path/'HiSeqV2_exon', delimiter='	')
    #exon_counts[['chromosome_name','strand','chrom_exon_start','chrom_exon_end']] = exon_counts.id.apply(lambda x: pd.Series(c_split_chrom_column(str(x))))
    print(len(exon_counts))
    tran_exon = pd.read_csv(tcga_path/'unc_v2_exon_hg19_probe_TCGA', delimiter='	')
    tran_exon = tran_exon.drop_duplicates(keep='first') # 1884 duplicates -> 948 rows less
    #save positions to BED input file
    tran_exon['score'] = 0
    #tran_exon.to_csv('data/exon_hg19_pos.bed', sep='\t', columns=['chrom','chromStart','chromEnd','id','score','strand'], header=False, index=False)
    #translate positions from GRCh37 to GRCh38
    !../liftOver data/exon_hg19_pos.bed data/hg19ToHg38.over.chain.gz data/exon_hg38_pos.bed data/unlifted.bed
    #read in translated positions from liftOver output BED file
    tran_exon_new = pd.read_csv('data/exon_hg38_pos.bed', delimiter='	', names=['chrom','chromStart','chromEnd','id','score','strand'])
    exon_counts = exon_counts.merge(tran_exon[['gene','id']], left_on='sample', right_on='id', how='left',validate='1:1')
    exon_counts = exon_counts.merge(tran_exon_new, left_on='sample', right_on='id', how='inner',validate='1:m')
    print(len(exon_counts))
   # exon_counts.drop(['sample', 'id_x','id_y','score'], axis=1, inplace=True)
    #transform strand {-,+} to {-1,+1}
    exon_counts['strand'] = exon_counts.apply(lambda row: -1 if row.strand == '-' else 1, axis=1)
    exon_counts['chrom'] = exon_counts['chrom'].str[3:]
    exon_counts[['chrom','strand']] = exon_counts[['chrom','strand']].astype('str').astype('category')
    exon_counts[['chromStart','chromEnd']] = exon_counts[['chromStart','chromEnd']].astype('Int64')
        
    #TODO option 2: get exonid per exon count from Ensembl (compare)
    #exon_counts2 = exon_counts.merge(df_exon[['exon_id','chromosome_name','strand','chrom_exon_start','chrom_exon_end']], left_on = ['chrom','strand','chromStart','chromEnd'], right_on = ['chromosome_name','strand','chrom_exon_start','chrom_exon_end'], how='left')

    caseids = list(set(mirna_counts.columns).intersection(list(exon_counts.columns)))
    exon_counts.to_feather(f)

translate exon count positions from anew
239322
Reading liftover chains
Mapping coordinates
207695


NameError: name 'mirna_counts' is not defined

In [119]:
print(len(tran_exon))
print(len(tran_exon_new))

208022
207695


In [149]:
tran_exon

Unnamed: 0,id,gene,chrom,chromStart,chromEnd,strand
0,chr1:12776118-12776347:+,AADACL3,chr1,12776118,12776347,+
1,chr1:12779477-12779693:+,AADACL3,chr1,12779477,12779693,+
2,chr1:12780885-12780948:+,AADACL3,chr1,12780885,12780948,+
3,chr1:12785189-12788726:+,AADACL3,chr1,12785189,12788726,+
4,chr1:12704566-12704733:+,AADACL4,chr1,12704566,12704733,+
...,...,...,...,...,...,...
208965,chrY:2829115-2829687:+,ZFY,chrY,2829115,2829687,+
208966,chrY:2843136-2843285:+,ZFY,chrY,2843136,2843285,+
208967,chrY:2843552-2843695:+,ZFY,chrY,2843552,2843695,+
208968,chrY:2844711-2845221:+,ZFY,chrY,2844711,2845221,+


In [133]:
exon_counts

Unnamed: 0,sample,TCGA-KN-8419-01,TCGA-KL-8346-01,TCGA-KN-8422-01,TCGA-KN-8431-11,TCGA-KN-8430-11,TCGA-KM-8440-01,TCGA-KO-8414-01,TCGA-KL-8323-01,TCGA-KM-8639-01,...,TCGA-KL-8332-11,TCGA-KL-8327-01,gene,id_x,chrom,chromStart,chromEnd,id_y,score,strand
0,chr3:52007981-52008646:-,2.1156,1.7396,2.0641,3.1457,3.0436,2.2209,1.8213,1.9092,2.9139,...,3.0617,0.7424,"ABHD14B,RP11-155D18.14,RP11-155D18.12",chr3:52007981-52008646:-,3,51973965,51974630,chr3:52007981-52008646:-,0,-1
1,chr1:215901372-215901726:-,0.0000,0.0000,0.0000,0.0251,0.0233,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,USH2A,chr1:215901372-215901726:-,1,215728030,215728384,chr1:215901372-215901726:-,0,-1
2,chr4:147214081-147214132:-,2.5290,0.7196,1.3094,2.3074,1.6894,1.5829,1.0835,1.9041,1.2131,...,1.4157,0.0000,SLC10A7,chr4:147214081-147214132:-,4,146292929,146292980,chr4:147214081-147214132:-,0,-1
3,chr3:195599147-195599341:-,2.3839,3.8702,2.5373,2.3464,2.2976,3.0639,2.5666,2.7963,3.0875,...,1.4157,1.8708,TNK2,chr3:195599147-195599341:-,3,195872276,195872470,chr3:195599147-195599341:-,0,-1
4,chr17:30351730-30351801:+,1.9974,0.6984,0.9011,0.6827,1.2425,0.9715,1.6701,1.3011,1.3712,...,0.0000,0.0000,LRRC37B,chr17:30351730-30351801:+,17,32024711,32024782,chr17:30351730-30351801:+,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207690,chr3:52012275-52012390:+,2.4455,5.1078,2.9468,5.4212,5.1299,4.5371,3.5954,4.4245,4.4484,...,7.3459,4.1126,"ABHD14A,ACY1,ABHD14A-ACY1",chr3:52012275-52012390:+,3,51978259,51978374,chr3:52012275-52012390:+,0,1
207691,chr6:43324864-43325503:-,1.5701,0.5500,0.3451,2.2783,1.7366,0.6813,0.5851,1.4460,2.4188,...,1.2410,0.2725,ZNF318,chr6:43324864-43325503:-,6,43357126,43357765,chr6:43324864-43325503:-,0,-1
207692,chr17:34495988-34496053:-,0.4810,0.4219,0.3687,0.7313,0.7666,0.7646,2.4586,0.9557,1.1452,...,0.0000,1.9102,TBC1D3B,chr17:34495988-34496053:-,17,36168630,36168695,chr17:34495988-34496053:-,0,-1
207693,chr1:148010884-148011056:-,2.6944,2.2891,1.8186,4.0291,4.0015,3.7968,2.3984,2.6564,4.2824,...,2.3518,1.1468,NBPF14,chr1:148010884-148011056:-,1,120833602,120833774,chr1:148010884-148011056:-,0,1


In [148]:
#exon_counts_old 239.322, exon_counts 207.695
#all rows different in old and mergedwith2: 31.627
#samples only in old 
missing = exon_counts_old.merge(exon_counts['sample'], how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='left_only']
missing

Unnamed: 0,sample,TCGA-KN-8419-01,TCGA-KL-8346-01,TCGA-KN-8422-01,TCGA-KN-8431-11,TCGA-KN-8430-11,TCGA-KM-8440-01,TCGA-KO-8414-01,TCGA-KL-8323-01,TCGA-KM-8639-01,...,TCGA-KO-8417-01,TCGA-KM-8438-01,TCGA-KL-8340-01,TCGA-KO-8406-01,TCGA-KO-8408-01,TCGA-KM-8443-01,TCGA-KM-8442-01,TCGA-KL-8332-11,TCGA-KL-8327-01,_merge
2,chr17:49268962-49270377:-,0.3884,0.2219,0.2050,0.9168,0.6069,0.4547,0.1571,0.3857,0.7535,...,0.4021,0.1127,0.3816,0.5932,0.1602,0.3507,0.7154,0.4353,0.0966,left_only
4,chr1:16074041-16074475:+,0.0000,0.0000,0.0000,1.4721,0.1284,0.0000,0.0000,0.0000,0.0374,...,0.0397,0.0000,0.0000,0.0000,0.0000,0.0189,0.1334,4.8648,0.0900,left_only
7,chr14:106521424-106521433:-,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,left_only
20,chr11:65620753-65621170:+,0.0000,0.2804,0.0000,0.0630,0.3368,0.0000,0.1703,0.0000,0.4093,...,0.0615,0.0614,0.0000,0.0000,0.0491,0.0390,0.1386,0.1424,0.0240,left_only
44,chr15:31008518-31008545:+,1.1151,0.4858,0.2285,0.0000,0.0000,1.0212,0.2724,0.7350,0.4937,...,0.2830,0.2826,0.0000,0.2700,0.6008,1.7890,0.8953,0.2553,0.0000,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239233,chr1:15798485-15798585:+,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.2261,...,0.0000,0.0000,0.0849,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,left_only
239271,chr14:69555924-69557873:-,0.3462,0.4367,0.0106,0.1936,0.3415,0.4876,0.2449,0.5533,0.2155,...,0.0311,0.5334,0.4010,0.4566,0.0474,0.5502,0.4836,0.2087,0.1523,left_only
239279,chr1:45249259-45250155:-,0.2589,0.5108,0.1468,0.2811,0.4449,0.0456,0.6869,0.0198,0.1730,...,0.1036,0.6545,0.2105,0.2620,0.4969,0.1978,0.4603,0.2030,0.0660,left_only
239304,chr11:93211641-93212450:-,4.0361,3.7048,2.1906,4.6267,4.3716,1.5919,3.8414,4.4238,4.2922,...,3.6441,2.7758,4.0947,4.4172,3.2479,2.9461,4.0862,4.1306,1.3293,left_only


In [136]:
exon_counts_old = pd.read_csv(tcga_path/'HiSeqV2_exon', delimiter='	')
#exon_counts[exon_counts.gene.isna()]#5059 , 208970 - 208643 = 327
exon_counts_old#[exon_counts.duplicated(subset=['sample'],keep=False)]#[exon_counts.id.isna()] #left 240270 -> 31627 NA
#counts after duplicates: inner:207695 

Unnamed: 0,sample,TCGA-KN-8419-01,TCGA-KL-8346-01,TCGA-KN-8422-01,TCGA-KN-8431-11,TCGA-KN-8430-11,TCGA-KM-8440-01,TCGA-KO-8414-01,TCGA-KL-8323-01,TCGA-KM-8639-01,...,TCGA-KN-8421-01,TCGA-KO-8417-01,TCGA-KM-8438-01,TCGA-KL-8340-01,TCGA-KO-8406-01,TCGA-KO-8408-01,TCGA-KM-8443-01,TCGA-KM-8442-01,TCGA-KL-8332-11,TCGA-KL-8327-01
0,chr3:52007981-52008646:-,2.1156,1.7396,2.0641,3.1457,3.0436,2.2209,1.8213,1.9092,2.9139,...,1.8941,2.8727,2.1821,2.2543,2.2040,2.2369,2.4951,2.1010,3.0617,0.7424
1,chr1:215901372-215901726:-,0.0000,0.0000,0.0000,0.0251,0.0233,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
2,chr17:49268962-49270377:-,0.3884,0.2219,0.2050,0.9168,0.6069,0.4547,0.1571,0.3857,0.7535,...,0.2138,0.4021,0.1127,0.3816,0.5932,0.1602,0.3507,0.7154,0.4353,0.0966
3,chr4:147214081-147214132:-,2.5290,0.7196,1.3094,2.3074,1.6894,1.5829,1.0835,1.9041,1.2131,...,2.1785,1.6327,0.0000,1.5862,1.7814,0.6381,1.5227,1.7786,1.4157,0.0000
4,chr1:16074041-16074475:+,0.0000,0.0000,0.0000,1.4721,0.1284,0.0000,0.0000,0.0000,0.0374,...,0.0000,0.0397,0.0000,0.0000,0.0000,0.0000,0.0189,0.1334,4.8648,0.0900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239317,chr3:52012275-52012390:+,2.4455,5.1078,2.9468,5.4212,5.1299,4.5371,3.5954,4.4245,4.4484,...,4.3723,4.6508,4.2172,4.4777,4.4666,5.0518,4.9067,3.8206,7.3459,4.1126
239318,chr6:43324864-43325503:-,1.5701,0.5500,0.3451,2.2783,1.7366,0.6813,0.5851,1.4460,2.4188,...,0.4057,1.2708,0.7898,0.8588,1.3135,0.8498,1.0774,0.8774,1.2410,0.2725
239319,chr17:34495988-34496053:-,0.4810,0.4219,0.3687,0.7313,0.7666,0.7646,2.4586,0.9557,1.1452,...,1.0783,0.7170,0.7943,0.3544,0.6077,1.3415,1.8842,0.8646,0.0000,1.9102
239320,chr1:148010884-148011056:-,2.6944,2.2891,1.8186,4.0291,4.0015,3.7968,2.3984,2.6564,4.2824,...,2.9737,2.2911,1.5489,2.4209,3.6234,3.0631,1.8938,2.9015,2.3518,1.1468


In [11]:
mirna_counts

Unnamed: 0,sample,TCGA-C4-A0F6-01,TCGA-CU-A0YO-01,TCGA-BT-A0S7-01,TCGA-CU-A0YR-01,TCGA-BL-A0C8-01,TCGA-C4-A0F0-01,TCGA-BL-A13J-01,TCGA-BT-A0YX-01,TCGA-CU-A0YN-01,...,TCGA-AG-A020-01,TCGA-AG-A01Y-01,TCGA-AG-A01W-01,TCGA-AG-3726-01,TCGA-AG-3605-01,TCGA-AG-3584-01,TCGA-AG-3599-01,TCGA-AG-3583-01,TCGA-AG-3598-01,TCGA-AG-3586-01
0,hsa-let-7a-2-3p,0.99,1.91,3.02,0.85,0.85,2.70,2.50,1.22,1.57,...,-0.91,1.18,1.29,1.02,0.16,2.09,1.59,0.74,1.56,1.35
1,hsa-let-7a-3p,5.08,5.99,6.03,4.23,5.06,5.05,5.43,4.64,5.09,...,5.91,5.75,5.40,4.45,4.61,4.54,4.91,4.38,4.75,4.73
2,hsa-let-7a-5p,14.12,14.35,15.97,14.96,15.16,15.33,14.95,14.67,14.53,...,14.47,14.83,14.70,14.39,14.73,14.50,14.13,14.04,14.50,13.98
3,hsa-let-7b-3p,2.95,3.68,5.40,3.92,4.26,4.38,4.33,3.42,4.87,...,4.52,3.24,4.08,4.73,3.58,4.43,4.89,4.55,5.42,3.96
4,hsa-let-7b-5p,12.75,12.55,15.08,12.90,12.54,14.03,13.23,12.83,12.89,...,11.67,11.78,12.56,12.09,12.38,12.22,12.47,12.89,12.58,12.72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
738,hsa-miR-888-5p,0.00,0.00,0.36,0.00,0.00,0.00,0.00,0.25,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
739,hsa-miR-890,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
740,hsa-miR-891b,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
741,hsa-miR-892b,0.00,0.00,0.00,0.00,0.00,0.26,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [48]:
exon_counts[(exon_counts.chrom =='17') & (exon_counts.gene == 'PRPF8')]

AttributeError: 'DataFrame' object has no attribute 'gene'

In [45]:
exon_counts[(exon_counts.chrom =='17') & (exon_counts.chromEnd >= ) & (exon_counts.chromStart <=) & (exon_counts.strand ==-1)]

SyntaxError: invalid syntax (<ipython-input-45-9c13fa3f8aff>, line 1)

In [51]:
len(exon_counts[exon_counts.chromosome_name.isna()])

AttributeError: 'DataFrame' object has no attribute 'chromosome_name'

## Map miRNA to counts

In [9]:
#get subset counts of bs and merge with df_gene
counts = bs[['bs_id', 'miRNA', 'mRNA', 'chrom_bs_start', 'chrom_bs_end']].copy()
counts = counts.merge(df_gene[['ensembl_transcript_id','chromosome_name','strand']], left_on='mRNA', right_on='ensembl_transcript_id', how='left')

#mapping mature ID to miRNA family
trans = pd.read_feather(path/'mature2families.feather')
counts = counts.merge(trans, left_on='miRNA',right_on='mature_name', how='left')

#alternative 0: dont join with mirnaids
#counts.drop(['ensembl_transcript_id', 'mature_name'], axis=1, inplace=True)

#! alternative 1: for all caseids at the same time !
counts = counts.merge(mirna_counts, left_on='mature_acc', right_on='sample', how='left')
counts.drop(['ensembl_transcript_id', 'mature_name', 'sample'], axis=1, inplace=True)

# alternative 2: for 1 caseids at a time
for caseid in []:#[caseids[0]]:
    counts = counts.merge(mirna_counts[['sample',caseid]], left_on='mature_acc', right_on='sample', how='left')
    counts.drop(['ensembl_transcript_id', 'mature_name', 'sample'], axis=1, inplace=True)

#counts[['chromosome_name','strand','miRNA','mRNA']] = counts[['chromosome_name','strand','miRNA','mRNA']].astype('str')#.astype('category')
counts.head()

Unnamed: 0,bs_id,miRNA,mRNA,chrom_bs_start,chrom_bs_end,chromosome_name,strand,mature_acc,mirna_family,TCGA-KN-8426-01,...,TCGA-KO-8408-01,TCGA-KM-8438-01,TCGA-KL-8323-01,TCGA-KN-8429-11,TCGA-KO-8413-01,TCGA-KO-8411-01,TCGA-KL-8332-11,TCGA-KL-8335-01,TCGA-KL-8339-11,TCGA-KO-8415-01
0,0,hsa-let-7a-2-3p,ENST00000576537,1578440,1578466,17,-1,MIMAT0010195,let-7,2.893871,...,2.829471,2.015161,2.594806,2.644861,1.786991,1.854876,2.824365,2.395032,2.791374,2.819371
1,1,hsa-let-7a-2-3p,ENST00000576537,1578561,1578581,17,-1,MIMAT0010195,let-7,2.893871,...,2.829471,2.015161,2.594806,2.644861,1.786991,1.854876,2.824365,2.395032,2.791374,2.819371
2,2,hsa-let-7b-5p,ENST00000576537,1576707,1578261,17,-1,MIMAT0000063,let-7,12.820202,...,12.829037,12.38224,13.638682,14.190721,14.086433,13.729646,13.115391,13.531265,13.56459,14.094524
3,3,hsa-let-7b-3p,ENST00000576537,1578446,1578467,17,-1,MIMAT0004482,let-7,4.104356,...,4.210218,3.770706,4.64693,4.29571,5.294043,5.520265,4.097683,4.664511,4.351934,4.063494
4,4,hsa-let-7b-3p,ENST00000576537,1576625,1576643,17,-1,MIMAT0004482,let-7,4.104356,...,4.210218,3.770706,4.64693,4.29571,5.294043,5.520265,4.097683,4.664511,4.351934,4.063494


In [246]:
#TODO put in presentation, analysis mirna, how many in xena data
#TODO find out : if na should i use it as 0????
counts['TCGA-KN-8426-01'].isna().sum() #for caseid TCGA-KN-8426-01: 56035 bs are thrown away 
len(list(counts[counts['TCGA-KN-8426-01'].isna()].mature_acc.unique())) #we don't have the counts for 1948 mirnas in bs
len(list(counts[~ counts['TCGA-KN-8426-01'].isna()].mature_acc.unique())) #we have counts for 658 mirnas

len(list(mirna_counts['sample'].unique())) #1917 = absolute nr mirnas from TCGA-KN-8426-01
len(list(mirna_counts[~ mirna_counts['TCGA-KN-8426-01'].isna()]['sample'].unique())) #1236 = not na nr mirnas from TCGA-KN-8426-01

658

## Map exons to counts

In [12]:
#for a large data set, you will likely get a significant speed increase by creating an index for any column(s) used in the join condition. 
#https://stackoverflow.com/questions/30627968/merge-pandas-dataframes-where-one-value-is-between-two-others
#instead use SQL
conn = sqlite3.connect('db.db')#':memory:') #Make the database in memory
c = conn.cursor()

#write the tables
counts.to_sql('counts', conn, index=False, dtype={"bs_id": 'INTEGER'})
exon_counts.to_sql('exon_counts', conn, index=True, index_label='exon_id', dtype={caseid : 'INTEGER' for caseid in caseids}.update({"chromStart": 'INTEGER', "chromEnd": 'INTEGER'}))

In [258]:
#print sqlite3 table 
qry = '''
    select * from counts
    '''
c.execute(qry)
conn.commit()
c.fetchall()
pd.read_sql_query(qry,conn)

Unnamed: 0,bs_id,miRNA,mRNA,chrom_bs_start,chrom_bs_end,chromosome_name,strand,mature_acc,mirna_family,TCGA-KN-8426-01,...,TCGA-KO-8408-01,TCGA-KM-8438-01,TCGA-KL-8323-01,TCGA-KN-8429-11,TCGA-KO-8413-01,TCGA-KO-8411-01,TCGA-KL-8332-11,TCGA-KL-8335-01,TCGA-KL-8339-11,TCGA-KO-8415-01
0,0,hsa-let-7a-2-3p,ENST00000576537,1578440,1578466,17,-1,MIMAT0010195,let-7,2.893871,...,2.829471,2.015161,2.594806,2.644861,1.786991,1.854876,2.824365,2.395032,2.791374,2.819371
1,1,hsa-let-7a-2-3p,ENST00000576537,1578561,1578581,17,-1,MIMAT0010195,let-7,2.893871,...,2.829471,2.015161,2.594806,2.644861,1.786991,1.854876,2.824365,2.395032,2.791374,2.819371
2,2,hsa-let-7b-5p,ENST00000576537,1576707,1578261,17,-1,MIMAT0000063,let-7,12.820202,...,12.829037,12.382240,13.638682,14.190721,14.086433,13.729646,13.115391,13.531265,13.564590,14.094524
3,3,hsa-let-7b-3p,ENST00000576537,1578446,1578467,17,-1,MIMAT0004482,let-7,4.104356,...,4.210218,3.770706,4.646930,4.295710,5.294043,5.520265,4.097683,4.664511,4.351934,4.063494
4,4,hsa-let-7b-3p,ENST00000576537,1576625,1576643,17,-1,MIMAT0004482,let-7,4.104356,...,4.210218,3.770706,4.646930,4.295710,5.294043,5.520265,4.097683,4.664511,4.351934,4.063494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73188,73188,hsa-miR-12128,ENST00000216014,38479688,38479728,22,1,MIMAT0049022,hsa-miR-12128,,...,,,,,,,,,,
73189,73189,hsa-miR-12129,ENST00000216014,38481326,38481371,22,1,MIMAT0049023,hsa-miR-12129,,...,,,,,,,,,,
73190,73190,hsa-miR-12129,ENST00000216014,38479605,38479631,22,1,MIMAT0049023,hsa-miR-12129,,...,,,,,,,,,,
73191,73191,hsa-miR-12129,ENST00000216014,38479693,38479708,22,1,MIMAT0049023,hsa-miR-12129,,...,,,,,,,,,,


In [11]:
#DELETE table from sqlite3 DB
c.execute("DROP TABLE counts;")
conn.commit()
c.execute("DROP TABLE exon_counts;")
conn.commit()

In [181]:
counts[(counts.chrom_bs_start > 1578445) & (counts.chrom_bs_end < 1578634)]

Unnamed: 0,bs_id,miRNA,mRNA,chrom_bs_start,chrom_bs_end,chromosome_name,strand,mature_acc,mirna_family,TCGA-KN-8426-01,...,TCGA-KO-8408-01,TCGA-KM-8438-01,TCGA-KL-8323-01,TCGA-KN-8429-11,TCGA-KO-8413-01,TCGA-KO-8411-01,TCGA-KL-8332-11,TCGA-KL-8335-01,TCGA-KL-8339-11,TCGA-KO-8415-01
1,1,hsa-let-7a-2-3p,ENST00000576537,1578561,1578581,17,-1,MIMAT0010195,let-7,2.893871,...,2.829471,2.015161,2.594806,2.644861,1.786991,1.854876,2.824365,2.395032,2.791374,2.819371
3,3,hsa-let-7b-3p,ENST00000576537,1578446,1578467,17,-1,MIMAT0004482,let-7,4.104356,...,4.210218,3.770706,4.646930,4.295710,5.294043,5.520265,4.097683,4.664511,4.351934,4.063494
6,6,hsa-let-7c-3p,ENST00000576537,1578453,1578471,17,-1,MIMAT0026472,let-7,2.695914,...,2.714853,2.455017,2.155998,3.673442,2.676288,3.043974,4.407776,2.500585,3.844001,2.977714
9,9,hsa-let-7d-3p,ENST00000576537,1578446,1578473,17,-1,MIMAT0004484,let-7,7.990090,...,8.056507,8.939957,8.391837,7.818937,7.722212,8.106506,7.446075,8.217851,8.154153,8.335863
12,12,hsa-let-7e-3p,ENST00000576537,1578561,1578579,17,-1,MIMAT0004485,let-7,3.147645,...,2.523778,3.836892,2.846723,4.056371,2.882109,3.414638,3.685095,2.885547,3.698584,3.274872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3384,3384,hsa-miR-12124,ENST00000576537,1578599,1578616,17,-1,MIMAT0049018,hsa-miR-12124,,...,,,,,,,,,,
3385,3385,hsa-miR-12124,ENST00000576537,1578488,1578508,17,-1,MIMAT0049018,hsa-miR-12124,,...,,,,,,,,,,
3388,3388,hsa-miR-12126,ENST00000576537,1578447,1578480,17,-1,MIMAT0049020,hsa-miR-12126,,...,,,,,,,,,,
3390,3390,hsa-miR-12128,ENST00000576537,1578517,1578566,17,-1,MIMAT0049022,hsa-miR-12128,,...,,,,,,,,,,


In [325]:
exon_counts[(exon_counts.chromStart > 1578445) & (exon_counts.chromEnd < 1578634)]

Unnamed: 0,TCGA-KN-8419-01,TCGA-KL-8346-01,TCGA-KN-8422-01,TCGA-KN-8431-11,TCGA-KN-8430-11,TCGA-KM-8440-01,TCGA-KO-8414-01,TCGA-KL-8323-01,TCGA-KM-8639-01,TCGA-KO-8415-11,...,TCGA-KO-8408-01,TCGA-KM-8443-01,TCGA-KM-8442-01,TCGA-KL-8332-11,TCGA-KL-8327-01,gene,chrom,chromStart,chromEnd,strand
45710,4.922,2.6972,2.0686,4.7578,4.705,4.5074,3.3096,4.5863,4.6372,4.7683,...,2.5759,4.1899,4.4041,3.6642,1.3794,PRPF8,chr17,1578446,1578633,-1


In [None]:
#TODO change column name case_id to column name case_id_mirna so it doesnt overlap with exon
#instead of joining like i do & repeating the exon info -> only add exon_id + pivot / addup per exon + add exon counts

In [None]:
#TODO right now I hardcoded TCGA-KN-8419-01!!
#exon_counts.chrom, exon_counts.chromStart, exon_counts.chromEnd, exon_counts.strand, exon_counts.'TCGA-KN-8419-01', exon_counts.exon_id, counts.bs_id, counts.miRNA, counts.mRNA, counts.chrom_bs_start, counts.chrom_bs_end

qry = '''
    CREATE TABLE n_counts AS
    select
        *
    from
        counts inner join (
        select exon_counts.chrom, exon_counts.chromStart, exon_counts.chromEnd, exon_counts.strand, exon_counts.exon_id
        from exon_counts )
        on
        counts.chromosome_name = exon_counts.chrom and counts.strand = exon_counts.strand and counts.chrom_bs_start >= exon_counts.chromStart and counts.chrom_bs_end <= exon_counts.chromEnd
    '''
c.execute(qry)
conn.commit()
print('n_counts done.')
qry = '''
    select * from n_counts
    '''
c.execute(qry)
conn.commit()
c.fetchall()
#n_counts = pd.read_sql_query(qry,conn) #TODO somehow empty idk why

#todo google pandas 3D
#look at old code idk why

#only returns bs with minimum 1 exon_id cause inner join (else left join)
#automatically filters out all binding sites that have more than 1 exon_id
#if u dont want that use outer join -> actually wrong, it uses the first instead of leaving it out!

In [327]:
n_counts

Unnamed: 0,bs_id,miRNA,mRNA,chrom_bs_start,chrom_bs_end,chromosome_name,strand,mature_acc,mirna_family,TCGA-KN-8426-01,...,TCGA-KO-8408-01,TCGA-KM-8443-01,TCGA-KM-8442-01,TCGA-KL-8332-11,TCGA-KL-8327-01,gene,chrom,chromStart,chromEnd,strand.1
0,0,hsa-let-7a-2-3p,ENST00000576537,1578440,1578466,17,-1,MIMAT0010195,let-7,2.893871,...,,,,,,,,,,
1,1,hsa-let-7a-2-3p,ENST00000576537,1578561,1578581,17,-1,MIMAT0010195,let-7,2.893871,...,,,,,,,,,,
2,2,hsa-let-7b-5p,ENST00000576537,1576707,1578261,17,-1,MIMAT0000063,let-7,12.820202,...,,,,,,,,,,
3,3,hsa-let-7b-3p,ENST00000576537,1578446,1578467,17,-1,MIMAT0004482,let-7,4.104356,...,,,,,,,,,,
4,4,hsa-let-7b-3p,ENST00000576537,1576625,1576643,17,-1,MIMAT0004482,let-7,4.104356,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73188,73188,hsa-miR-12128,ENST00000216014,38479688,38479728,22,1,MIMAT0049022,hsa-miR-12128,,...,,,,,,,,,,
73189,73189,hsa-miR-12129,ENST00000216014,38481326,38481371,22,1,MIMAT0049023,hsa-miR-12129,,...,,,,,,,,,,
73190,73190,hsa-miR-12129,ENST00000216014,38479605,38479631,22,1,MIMAT0049023,hsa-miR-12129,,...,,,,,,,,,,
73191,73191,hsa-miR-12129,ENST00000216014,38479693,38479708,22,1,MIMAT0049023,hsa-miR-12129,,...,,,,,,,,,,


In [293]:
#old
#TODO takes fuuuuuckin long, instead join on strand, chromosome_name, start, end
#merge bs with exon counts, real slow, TODO cython, 22.34 - lookup when
def is_in_exon(row, exon_counts):
    bs_start = row.chrom_bs_start
    bs_end = row.chrom_bs_end
    here = exon_counts.loc[(exon_counts.chrom == row.chromosome_name) & (exon_counts['strand'] == row.strand) & (exon_counts.chromStart == bs_start) & (exon_counts.chromEnd == bs_end)]
    #print(here)
    count_list = str(list(here['TCGA-KN-8419-01']))[1:-1]
    return count_list if not count_list == '' else None

#takes long! for 1 case id: 2 min 
#exon_counts = all_exon_dfs['aba26e6b-f11c-4ae6-a0d6-85bdd2060e8f'] #df#all_exon_dfs[case_id]
counts['exon'] = counts.apply(lambda row: is_in_exon(row,exon_counts), axis=1)
bs_exons = pd.DataFrame(counts.exons.str.split(',').tolist(),index=counts.bs_id)
bs_exons = bs_exons.rename(columns={0: "exon_raw_count"})
bs_exons.head()

KeyboardInterrupt: 

In [158]:
#visualize amount exons by bindingsite | amount bindingsites by exon
import sqlite3
conn = sqlite3.connect(':memory:') #Make the database in memory

#write the tables
important_counts.to_sql('bs', conn, index=False, dtype={"bs_id": 'INTEGER'})
new_exon_counts.to_sql('counts', conn, index=True, index_label='exon_id', dtype={"exon_id": 'INTEGER'})
qry = '''
    select
        counts.chromosome_name, counts.chrom_exon_start, counts.chrom_exon_end, counts.strand, counts.raw_counts, counts.exon_id, bs.bs_id, bs.miRNA, bs.mRNA, bs.chrom_bs_start, bs.chrom_bs_end
    from
        bs left join counts on
        bs.chromosome_name = counts.chromosome_name and bs.strand = counts.strand and bs.chrom_bs_start >= counts.chrom_exon_start and bs.chrom_bs_end <= counts.chrom_exon_end
    '''
bs_exons2 = pd.read_sql_query(qry, conn)
bs_exons2.head()
#bs_exons2.groupby('bs_id').count().raw_counts.hist()
#bs_exons2.groupby('exon_id').count()

In [287]:
n_counts

Unnamed: 0,chrom,chromStart,chromEnd,strand,TCGA-KN-8419-01,exon_id,bs_id,miRNA,mRNA,chrom_bs_start,chrom_bs_end


## Put all counts into one & pivot table

In [248]:
mirna_counts.head()

Unnamed: 0,sample,TCGA-KN-8426-01,TCGA-KN-8419-11,TCGA-KL-8332-01,TCGA-KN-8423-01,TCGA-KO-8405-01,TCGA-KL-8337-01,TCGA-KN-8424-11,TCGA-KN-8422-11,TCGA-KL-8334-01,...,TCGA-KO-8408-01,TCGA-KM-8438-01,TCGA-KL-8323-01,TCGA-KN-8429-11,TCGA-KO-8413-01,TCGA-KO-8411-01,TCGA-KL-8332-11,TCGA-KL-8335-01,TCGA-KL-8339-11,TCGA-KO-8415-01
0,MIMAT0019868,,,,,,,,,,...,,,,,,,,,,
1,MIMAT0019869,,0.1644,,0.1544,,,,0.301479,,...,,,,,,,,,,
2,MIMAT0019860,,,,,,,,,,...,,,,,,,,,,0.4087
3,MIMAT0019862,,,,0.1544,,,,,,...,,,,,,,0.2042,,,
4,MIMAT0019864,,0.1644,0.1774,,,0.1638,0.4246,,0.2075,...,,0.2143,,,,,,0.1803,,


In [279]:
exon_counts.head()

Unnamed: 0,TCGA-KN-8419-01,TCGA-KL-8346-01,TCGA-KN-8422-01,TCGA-KN-8431-11,TCGA-KN-8430-11,TCGA-KM-8440-01,TCGA-KO-8414-01,TCGA-KL-8323-01,TCGA-KM-8639-01,TCGA-KO-8415-11,...,TCGA-KO-8408-01,TCGA-KM-8443-01,TCGA-KM-8442-01,TCGA-KL-8332-11,TCGA-KL-8327-01,gene,chrom,chromStart,chromEnd,strand
0,2.1156,1.7396,2.0641,3.1457,3.0436,2.2209,1.8213,1.9092,2.9139,3.1049,...,2.2369,2.4951,2.101,3.0617,0.7424,"ABHD14B,RP11-155D18.14,RP11-155D18.12",chr3,52007981,52008646,-1
1,0.0,0.0,0.0,0.0251,0.0233,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,USH2A,chr1,215901372,215901726,-1
2,2.529,0.7196,1.3094,2.3074,1.6894,1.5829,1.0835,1.9041,1.2131,1.7214,...,0.6381,1.5227,1.7786,1.4157,0.0,SLC10A7,chr4,147214081,147214132,-1
3,2.3839,3.8702,2.5373,2.3464,2.2976,3.0639,2.5666,2.7963,3.0875,2.0227,...,3.0412,3.5838,3.7551,1.4157,1.8708,TNK2,chr3,195599147,195599341,-1
4,1.9974,0.6984,0.9011,0.6827,1.2425,0.9715,1.6701,1.3011,1.3712,1.5433,...,1.141,1.2345,1.7403,0.0,0.0,LRRC37B,chr17,30351730,30351801,1


In [None]:
#TODO put all different caseids sqlite tables together into one
#INSERT INTO artists_backup 
#SELECT ArtistId, Name
#FROM artists;

In [None]:
#output: for each case_id for each bs_id: interesting miRNA expression + exon expression
#unstack family counts from 2 columns to several columns (mirna_family=name,mirna_read_count=value)
pivoted = bs_mirnas.copy()
pivoted = pivoted.pivot_table(values='mirna_read_count', index=['exon_id', 'exon_raw_counts'], columns='mirna_family', aggfunc='sum', fill_value=0)
pivoted#.head()
#TODO debug does this do it right OR use sqllite3

In [None]:
#TODO pivot in sqlite3 
#https://modern-sql.com/use-case/pivot
#https://stackoverflow.com/questions/1237068/how-to-pivot-in-sqlite-or-i-e-select-in-wide-format-a-table-stored-in-long-form

In [227]:
bs_mirnas[bs_mirnas.exon_id == 216595]

Unnamed: 0,exon_id,exon_raw_counts,miRNA,miRNA_ID,mirna_family,mirna_read_count
7,216595,0,hsa-let-7b-5p,hsa-let-7b,let-7,55647
14,216595,0,hsa-let-7d-5p,hsa-let-7d,let-7,3767
25,216595,0,hsa-let-7e-3p,hsa-let-7e,let-7,11424
33,216595,0,hsa-let-7e-3p,hsa-let-7e,let-7,11424
46,216595,0,hsa-miR-15a-3p,hsa-mir-15a,mir-15,2258
...,...,...,...,...,...,...
12562,216595,0,hsa-miR-5692a,hsa-mir-5692a-2,mir-5692,0
12563,216595,0,hsa-miR-548az-5p,hsa-mir-548az,mir-548,0
12566,216595,0,hsa-miR-8062,hsa-mir-8062,hsa-miR-8062,0
12571,216595,0,hsa-miR-8064,hsa-mir-8064,hsa-miR-8064,0


In [226]:
bs_mirnas[bs_mirnas.mirna_family =='hsa-miR-1229-5p']

# 1 right? count is added for exonid 216595, but do i already add it before ?
# 2 also : exon_id + mirna double (3368 & 3370) wrong!!

Unnamed: 0,exon_id,exon_raw_counts,miRNA,miRNA_ID,mirna_family,mirna_read_count
3365,216603,0,hsa-miR-1229-5p,hsa-mir-1229,hsa-miR-1229-5p,4
3368,216595,0,hsa-miR-1229-5p,hsa-mir-1229,hsa-miR-1229-5p,4
3370,216595,0,hsa-miR-1229-5p,hsa-mir-1229,hsa-miR-1229-5p,4
3371,216596,0,hsa-miR-1229-5p,hsa-mir-1229,hsa-miR-1229-5p,4


In [None]:
#visualize amount exons by bindingsite
#bs_exons.groupby('bs_id').count().raw_counts.hist()

In [274]:
#TODO check - is nr mirna downloaded also here? if not: mistake
print(X.shape,y[0].shape)

(8, 1680) (8,)


# Elastic Net Regression

In [None]:
import sys
sys.getsizeof(pivoted)#TODO

In [276]:
#regression btw miRNA expression, exon expression
import sklearn
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression
#faster if np.array(x, order='F')
#If you are interested in controlling the L1 and L2 penalty separately, keep in mind that this is equivalent to:
#a * L1 + b * L2 where: alpha = a + b and l1_ratio = a / (a + b)
X = np.array(pivoted)
y = np.array(pivoted.index.get_level_values('exon_raw_counts'))

regr = ElasticNet(random_state=0)
regr.fit(X, y)
print(regr.coef_)
print(regr.intercept_)

[0. 0. 0. ... 0. 0. 0.]
0.0


  model = cd_fast.enet_coordinate_descent(
