In [5]:
from typing import (
    List,
    Tuple,
    NamedTuple
)
import pprint

import pandas as pd

import conftest
import libnano.ensemblrest as er

In [6]:
GENE: str = 'SLC17A8'
GENE_EID: str = 'ENSG00000179520'
TRANSCRIPT: str = 'ENST00000323346.9'
THREE_P_EXON: str = 'ENSE00001244923'
PROBE_NAME: str = 'ILMN_1767842'
SPECIES: str = 'human'

In [7]:
GeneInfo = NamedTuple('GeneInfo', [
    ('symbol', str),
    ('barcode', str),
    ('gene_id', str),
    ('transcript_id', str), # canonical id
    ('utr_id', str)  #
    ]
)

In [8]:
GENES_AND_BARCODES: List[GeneInfo] = [
    GeneInfo('SLC17A8', 'ACAGC', 'ENSG00000179520', 'ENST00000323346', 'ENSE00001244923'),
    GeneInfo('GFAP',    'TACAT', 'ENSG00000131095', 'ENST00000638281', 'ENSE00003806990'),
    GeneInfo('FOXO1',   'TTTGC', 'ENSG00000150907', 'ENST00000379561', 'ENSE00001481591'),
    GeneInfo('PSEN2',   'CATTA', 'ENSG00000143801', 'ENST00000366783', 'ENSE00001380688'),
    GeneInfo('DAXX',    'AACCG', 'ENSG00000204209', 'ENST00000374542', 'ENSE00001815438'),
    GeneInfo('CDK5R1',  'CGAGA', 'ENSG00000176749', 'ENST00000313401', 'ENSE00001271015')
]

In [120]:
out_columns: List[str] = [
'symbol',
'gene_id',
'exon_id',
'probe_name',
'probe_seq',
'probe_start',
'probe_end',
'probe_strand',
'probe_length',
'barcode'
]

In [121]:
df_out: pd.DataFrame = pd.DataFrame(columns=out_columns)
todfdict = lambda x: {a: b for a, b in zip(out_columns, x)}
for item in GENES_AND_BARCODES:
    three_p_exon_id: str = item.utr_id
    filtered_probes: pd.DataFrame = er.getProbesForID(three_p_exon_id, keep_n=5)
    barcode: str = item.barcode
    for i in range(len(filtered_probes)):
        probe = filtered_probes.iloc[i]
#         print(probe)
        p_start: int =  probe['start']
        p_end: int =    probe['end']
        p_strand: int = probe['strand']
        seq: str = er.getRegionSequence(SPECIES,
                                        chromosome=probe['seq_region_name'],
                                        start_idx=p_start,
                                        end_idx=p_end,
                                        strand=probe['strand']
                                        )
#         print(seq)
        row: list = [
            item.symbol,
            item.gene_id,
            item.utr_id,
            probe['probe_name'],
            seq,
            p_start,
            p_end,
            p_strand,
            probe['probe_length'],
            barcode
        ]
        df_out = df_out.append(todfdict(row), ignore_index=True)

In [122]:
df_out

Unnamed: 0,symbol,gene_id,exon_id,probe_name,probe_seq,probe_start,probe_end,probe_strand,probe_length,barcode
0,SLC17A8,ENSG00000179520,ENSE00001244923,A_24_P124647,TCCTGTTCATTGATTTTAAACATTTTATTCCTACTTTCAGAAGAAA...,100421560,100421619,1,60,ACAGC
1,SLC17A8,ENSG00000179520,ENSE00001244923,ILMN_1767842,ATCCATGCAAGCCCCATAAAACAGTTCCTAGCATGCAGAAAATGCC...,100421854,100421903,1,50,ACAGC
2,SLC17A8,ENSG00000179520,ENSE00001244923,0004120551,ATCCATGCAAGCCCCATAAAACAGTTCCTAGCATGCAGAAAATGCC...,100421854,100421903,1,50,ACAGC
3,SLC17A8,ENSG00000179520,ENSE00001244923,A_23_P159076,CCCACGTAAATAGCTGTCATCATCATTATCTTTTAACATTTTGGGG...,100421898,100421957,1,60,ACAGC
4,SLC17A8,ENSG00000179520,ENSE00001244923,100:691;,GTTGTGCTGTTTTCTAAGTAAAATA,100422012,100422036,1,25,ACAGC
5,GFAP,ENSG00000131095,ENSE00003806990,A_24_P59786,GTGGTACAGAGTAACTGTACATTAAACTGGCAGAGCTTGTTAGTGG...,44909918,44909977,-1,60,TACAT
6,GFAP,ENSG00000131095,ENSE00003806990,A_33_P3335895,AAAGCCTCACAATACGAGTTATACCAATACAGGCTCACCAGATTGT...,44910150,44910209,-1,60,TACAT
7,GFAP,ENSG00000131095,ENSE00003806990,2620230,GGAGAAGGTCTGCACGGGAATGGTG,44910631,44910655,1,25,TACAT
8,GFAP,ENSG00000131095,ENSE00003806990,1280138,GAGAAGGTCTGCACGGGAATGGTGA,44910632,44910656,1,25,TACAT
9,GFAP,ENSG00000131095,ENSE00003806990,2117594,AGAAGGTCTGCACGGGAATGGTGAT,44910633,44910657,1,25,TACAT


Padlock structure reminder, left and right are in terms of the hybridized sequence

LINEAR VERSION:

5'    Right Arm       Scaffold Seq (aka Loop)      Left Arm     3'
+------------------>+-----------~-----------++------------------>

HYBRIDIZED VERSION

                 Scaffold Seq (aka Loop)
        -------------------~--------------------
        |                                      |
        <     Left Arm    3' 5'   Right Arm    +
3'      +------------------>+------------------>     5'
<----------------------------------------------------+
              copied RT'd cDNA reverse strand
              
The scaffold seq looks like this:

5'+--TTCCTTT-\[barcode_solid\]---\[T2S_sequence\]--TCTT->3'

We will use 5 mer barcodes

In [123]:
T2S_SEQ: str =    'ACTTCAGCTGCCCCGGGTGAAGA'
RIGHT_LOOP: str = 'TTCCTTT'
LEFT_LOOP: str =  'TCTT'

In [124]:
len(T2S_SEQ)

23

the padlocks will be:

In [125]:
df_padlocks: pd.DataFrame = df_out.copy()
def probe_generator(x) -> str:
    seq: str = x.probe_seq
    len_x: int = len(seq)
    right_len: int = (len_x // 2) + 1
    return ''.join([seq[:right_len], RIGHT_LOOP, x.barcode, LEFT_LOOP, x.barcode, seq[right_len:]])

padlocks: List[str] = [probe_generator(df_padlocks.loc[i]) for i in range(len(df_padlocks)) ]
df_padlocks = df_padlocks.assign(padlock=padlocks)

In [126]:

df_padlocks

Unnamed: 0,symbol,gene_id,exon_id,probe_name,probe_seq,probe_start,probe_end,probe_strand,probe_length,barcode,padlock
0,SLC17A8,ENSG00000179520,ENSE00001244923,A_24_P124647,TCCTGTTCATTGATTTTAAACATTTTATTCCTACTTTCAGAAGAAA...,100421560,100421619,1,60,ACAGC,TCCTGTTCATTGATTTTAAACATTTTATTCCTTCCTTTACAGCTCT...
1,SLC17A8,ENSG00000179520,ENSE00001244923,ILMN_1767842,ATCCATGCAAGCCCCATAAAACAGTTCCTAGCATGCAGAAAATGCC...,100421854,100421903,1,50,ACAGC,ATCCATGCAAGCCCCATAAAACAGTTTTCCTTTACAGCTCTTACAG...
2,SLC17A8,ENSG00000179520,ENSE00001244923,0004120551,ATCCATGCAAGCCCCATAAAACAGTTCCTAGCATGCAGAAAATGCC...,100421854,100421903,1,50,ACAGC,ATCCATGCAAGCCCCATAAAACAGTTTTCCTTTACAGCTCTTACAG...
3,SLC17A8,ENSG00000179520,ENSE00001244923,A_23_P159076,CCCACGTAAATAGCTGTCATCATCATTATCTTTTAACATTTTGGGG...,100421898,100421957,1,60,ACAGC,CCCACGTAAATAGCTGTCATCATCATTATCTTTCCTTTACAGCTCT...
4,SLC17A8,ENSG00000179520,ENSE00001244923,100:691;,GTTGTGCTGTTTTCTAAGTAAAATA,100422012,100422036,1,25,ACAGC,GTTGTGCTGTTTTTTCCTTTACAGCTCTTACAGCCTAAGTAAAATA
5,GFAP,ENSG00000131095,ENSE00003806990,A_24_P59786,GTGGTACAGAGTAACTGTACATTAAACTGGCAGAGCTTGTTAGTGG...,44909918,44909977,-1,60,TACAT,GTGGTACAGAGTAACTGTACATTAAACTGGCTTCCTTTTACATTCT...
6,GFAP,ENSG00000131095,ENSE00003806990,A_33_P3335895,AAAGCCTCACAATACGAGTTATACCAATACAGGCTCACCAGATTGT...,44910150,44910209,-1,60,TACAT,AAAGCCTCACAATACGAGTTATACCAATACATTCCTTTTACATTCT...
7,GFAP,ENSG00000131095,ENSE00003806990,2620230,GGAGAAGGTCTGCACGGGAATGGTG,44910631,44910655,1,25,TACAT,GGAGAAGGTCTGCTTCCTTTTACATTCTTTACATACGGGAATGGTG
8,GFAP,ENSG00000131095,ENSE00003806990,1280138,GAGAAGGTCTGCACGGGAATGGTGA,44910632,44910656,1,25,TACAT,GAGAAGGTCTGCATTCCTTTTACATTCTTTACATCGGGAATGGTGA
9,GFAP,ENSG00000131095,ENSE00003806990,2117594,AGAAGGTCTGCACGGGAATGGTGAT,44910633,44910657,1,25,TACAT,AGAAGGTCTGCACTTCCTTTTACATTCTTTACATGGGAATGGTGAT
