In [2]:
from typing import (
    List,
    Tuple,
    NamedTuple
)
import pprint

import pandas as pd

import conftest
import libnano.ensemblrest as er

In [3]:
GENE: str = 'SLC17A8'
GENE_EID: str = 'ENSG00000179520'
TRANSCRIPT: str = 'ENST00000323346.9'
THREE_P_EXON: str = 'ENSE00001244923'
PROBE_NAME: str = 'ILMN_1767842'
SPECIES: str = 'human'

In [4]:
GeneInfo = NamedTuple('GeneInfo', [
    ('symbol', str),
    ('barcode', str),
    ('gene_id', str),
    ('transcript_id', str), # canonical id
    ('utr_id', str)  #
    ]
)

In [5]:
GENES_AND_BARCODES: List[GeneInfo] = [
    GeneInfo('SLC17A8', 'ACAGC', 'ENSG00000179520', 'ENST00000323346', 'ENSE00001244923'),
    GeneInfo('GFAP',    'TACAT', 'ENSG00000131095', 'ENST00000638281', 'ENSE00003806990'),
    GeneInfo('FOXO1',   'TTTGC', 'ENSG00000150907', 'ENST00000379561', 'ENSE00001481591'),
    GeneInfo('PSEN2',   'CATTA', 'ENSG00000143801', 'ENST00000366783', 'ENSE00001380688'),
    GeneInfo('DAXX',    'AACCG', 'ENSG00000204209', 'ENST00000374542', 'ENSE00001815438'),
    GeneInfo('CDK5R1',  'CGAGA', 'ENSG00000176749', 'ENST00000313401', 'ENSE00001271015')
]

In [6]:
out_columns: List[str] = [
'symbol',
'gene_id',
'exon_id'
'probe_name',
'probe_seq',
'probe_start',
'probe_end',
'probe_strand'
]

In [19]:
df_out: pd.DataFrame = pd.DataFrame(columns=out_columns)
todfdict = lambda x: {a: b for a, b in zip(out_columns, x)}
for item in GENES_AND_BARCODES:
    three_p_exon_id: str = item.utr_id
    filtered_probes: pd.DataFrame = er.getProbesForID(three_p_exon_id, keep_n=5)
    for i in range(len(filtered_probes)):
        probe = filtered_probes.iloc[i]
#         print(probe)
        p_start: int =  probe['start']
        p_end: int =    probe['end']
        p_strand: int = probe['strand']
        seq: str = er.getRegionSequence(SPECIES,
                                        chromosome=probe['seq_region_name'],
                                        start_idx=p_start,
                                        end_idx=p_end,
                                        strand=probe['strand']
                                        )
#         print(seq)
        row: list = [
            item.symbol,
            item.gene_id,
            item.utr_id,
            probe['probe_name'],
            seq,
            p_start,
            p_end,
            p_strand
        ]
        df_out = df_out.append(todfdict(row), ignore_index=True)

In [20]:
df_out

Unnamed: 0,symbol,gene_id,exon_idprobe_name,probe_seq,probe_start,probe_end,probe_strand
0,SLC17A8,ENSG00000179520,ENSE00001244923,A_24_P124647,TCCTGTTCATTGATTTTAAACATTTTATTCCTACTTTCAGAAGAAA...,100421560,100421619
1,SLC17A8,ENSG00000179520,ENSE00001244923,ILMN_1767842,ATCCATGCAAGCCCCATAAAACAGTTCCTAGCATGCAGAAAATGCC...,100421854,100421903
2,SLC17A8,ENSG00000179520,ENSE00001244923,0004120551,ATCCATGCAAGCCCCATAAAACAGTTCCTAGCATGCAGAAAATGCC...,100421854,100421903
3,SLC17A8,ENSG00000179520,ENSE00001244923,A_23_P159076,CCCACGTAAATAGCTGTCATCATCATTATCTTTTAACATTTTGGGG...,100421898,100421957
4,SLC17A8,ENSG00000179520,ENSE00001244923,100:691;,GTTGTGCTGTTTTCTAAGTAAAATA,100422012,100422036
5,GFAP,ENSG00000131095,ENSE00003806990,A_24_P59786,GTGGTACAGAGTAACTGTACATTAAACTGGCAGAGCTTGTTAGTGG...,44909918,44909977
6,GFAP,ENSG00000131095,ENSE00003806990,A_33_P3335895,AAAGCCTCACAATACGAGTTATACCAATACAGGCTCACCAGATTGT...,44910150,44910209
7,GFAP,ENSG00000131095,ENSE00003806990,2620230,GGAGAAGGTCTGCACGGGAATGGTG,44910631,44910655
8,GFAP,ENSG00000131095,ENSE00003806990,1280138,GAGAAGGTCTGCACGGGAATGGTGA,44910632,44910656
9,GFAP,ENSG00000131095,ENSE00003806990,2117594,AGAAGGTCTGCACGGGAATGGTGAT,44910633,44910657
