In [1]:
from typing import (
    List,
    Tuple,
    NamedTuple
)
import pprint

import pandas as pd

import conftest
import libnano.ensemblrest as er

In [2]:
# Species name passed as the first argument to `er.getRegionSequence` when
# fetching each probe's sequence.
SPECIES: str = 'human'

In [3]:
class GeneInfo(NamedTuple):
    """One target gene together with the identifiers used to look up its probes."""

    symbol: str         # gene symbol, e.g. 'GFAP'
    barcode: str        # barcode sequence embedded in the padlock scaffold
    gene_id: str        # gene identifier
    transcript_id: str  # canonical id
    utr_id: str         # exon identifier used for the probe lookup

In [4]:
# Target genes, each paired with its padlock barcode and the gene, transcript
# and exon identifiers used for the probe lookup.
GENES_AND_BARCODES: List[GeneInfo] = [
    GeneInfo(
        symbol='SLC17A8',
        barcode='ACAGC',
        gene_id='ENSG00000179520',
        transcript_id='ENST00000323346',
        utr_id='ENSE00001244923',
    ),
    GeneInfo(
        symbol='GFAP',
        barcode='TACAT',
        gene_id='ENSG00000131095',
        transcript_id='ENST00000638281',
        utr_id='ENSE00003806990',
    ),
    GeneInfo(
        symbol='FOXO1',
        barcode='TTTGC',
        gene_id='ENSG00000150907',
        transcript_id='ENST00000379561',
        utr_id='ENSE00001481591',
    ),
    GeneInfo(
        symbol='PSEN2',
        barcode='CATTA',
        gene_id='ENSG00000143801',
        transcript_id='ENST00000366783',
        utr_id='ENSE00001380688',
    ),
    GeneInfo(
        symbol='DAXX',
        barcode='AACCG',
        gene_id='ENSG00000204209',
        transcript_id='ENST00000374542',
        utr_id='ENSE00001815438',
    ),
    GeneInfo(
        symbol='CDK5R1',
        barcode='CGAGA',
        gene_id='ENSG00000176749',
        transcript_id='ENST00000313401',
        utr_id='ENSE00001271015',
    ),
]

In [5]:
# Column order of the probe table; each row built in the probe loop is zipped
# against this list, so the order here must match the order of the row values.
out_columns: List[str] = [
    # gene / exon identity
    'symbol', 'gene_id', 'exon_id',
    # probe identity and sequence
    'probe_name', 'probe_seq',
    # probe coordinates
    'probe_start', 'probe_end', 'probe_strand', 'probe_length',
    # padlock barcode assigned to the gene
    'barcode',
]

# NOTE

Get all of the probe sequences in terms of the transcript RNA strand direction.

Therefore, if the transcript is on the forward strand and the probe is on the reverse strand, we find the probe and reverse-complement (RC) it. `er.filterRegionSequence` accomplishes this.

In [6]:
def todfdict(x: list) -> dict:
    """Pair each value in ``x`` with the column name at the same position in ``out_columns``."""
    return dict(zip(out_columns, x))


# Accumulate one record per accepted probe and build the DataFrame once after
# the loop.  Growing a frame with ``DataFrame.append`` copies the whole frame
# on every call (quadratic) and the method was removed in pandas 2.0.
probe_records: List[dict] = []
for item in GENES_AND_BARCODES:
    three_p_exon_id: str = item.utr_id
    transcript_id: str = item.transcript_id
    barcode: str = item.barcode
    filtered_probes: pd.DataFrame = er.getProbesForID(three_p_exon_id, keep_n=5)

    # Iterate positionally over the probe rows; the index labels are not used.
    for _, probe in filtered_probes.iterrows():
        p_start: int = probe['start']
        p_end: int = probe['end']
        p_strand: int = probe['strand']
        p_length: int = probe['probe_length']

        # Sometimes p_length doesn't match the start and end indices, so skip
        # those probes (printing the name so the skip is visible).
        # NOTE(review): only spans LONGER than probe_length are skipped; a span
        # shorter than probe_length is still accepted -- confirm this is intended.
        if (p_end - p_start + 1) > p_length:
            print(probe['probe_name'])
            continue

        # NOTE: add code here if you want probes that are all the same length.
        # For instance, we could extend a probe to be longer and check that it
        # is still in the transcript or exon, or we could trim probes that are
        # too long.
        seq: str = er.getRegionSequence(
            SPECIES,
            chromosome=probe['seq_region_name'],
            start_idx=p_start,
            end_idx=p_end,
            strand=p_strand
        )
        try:
            # Second return value (whether the sequence was reverse-complemented)
            # is not needed here.
            seq, _was_rc = er.filterRegionSequence(
                seq,
                p_strand,
                transcript_id
            )
        except Exception:
            # Show the offending probe before propagating the error.
            print(probe)
            raise

        row: list = [
            item.symbol,
            item.gene_id,
            item.utr_id,
            probe['probe_name'],
            seq,
            p_start,
            p_end,
            p_strand,
            p_length,
            barcode
        ]
        probe_records.append(todfdict(row))

# ``columns=`` keeps the expected column order and still yields the right
# (empty) schema when no probe was accepted.
df_out: pd.DataFrame = pd.DataFrame(probe_records, columns=out_columns)

In [7]:
# Display the assembled probe table (one row per accepted probe).
df_out

Unnamed: 0,symbol,gene_id,exon_id,probe_name,probe_seq,probe_start,probe_end,probe_strand,probe_length,barcode
0,SLC17A8,ENSG00000179520,ENSE00001244923,A_24_P124647,TCCTGTTCATTGATTTTAAACATTTTATTCCTACTTTCAGAAGAAA...,100421560,100421619,1,60,ACAGC
1,SLC17A8,ENSG00000179520,ENSE00001244923,ILMN_1767842,ATCCATGCAAGCCCCATAAAACAGTTCCTAGCATGCAGAAAATGCC...,100421854,100421903,1,50,ACAGC
2,SLC17A8,ENSG00000179520,ENSE00001244923,0004120551,ATCCATGCAAGCCCCATAAAACAGTTCCTAGCATGCAGAAAATGCC...,100421854,100421903,1,50,ACAGC
3,SLC17A8,ENSG00000179520,ENSE00001244923,A_23_P159076,CCCACGTAAATAGCTGTCATCATCATTATCTTTTAACATTTTGGGG...,100421898,100421957,1,60,ACAGC
4,SLC17A8,ENSG00000179520,ENSE00001244923,100:691;,GTTGTGCTGTTTTCTAAGTAAAATA,100422012,100422036,1,25,ACAGC
5,GFAP,ENSG00000131095,ENSE00003806990,A_24_P59786,GTGGTACAGAGTAACTGTACATTAAACTGGCAGAGCTTGTTAGTGG...,44909918,44909977,-1,60,TACAT
6,GFAP,ENSG00000131095,ENSE00003806990,A_33_P3335895,AAAGCCTCACAATACGAGTTATACCAATACAGGCTCACCAGATTGT...,44910150,44910209,-1,60,TACAT
7,GFAP,ENSG00000131095,ENSE00003806990,2620230,CACCATTCCCGTGCAGACCTTCTCC,44910631,44910655,1,25,TACAT
8,GFAP,ENSG00000131095,ENSE00003806990,1280138,TCACCATTCCCGTGCAGACCTTCTC,44910632,44910656,1,25,TACAT
9,GFAP,ENSG00000131095,ENSE00003806990,2117594,ATCACCATTCCCGTGCAGACCTTCT,44910633,44910657,1,25,TACAT


# Padlock structure reminder

"Left" and "right" are defined in terms of the hybridized sequence.

LINEAR VERSION:

    5'    Right Arm       Scaffold Seq (aka Loop)      Left Arm     3'
    +------------------>+-----------~-----------++------------------>

HYBRIDIZED VERSION:

                     Scaffold Seq (aka Loop)
            -------------------~--------------------
            |                                      |
            <     Left Arm    3' 5'   Right Arm    +
    3'      +------------------>+------------------>     5'
    <----------------------------------------------------+
              copied RT'd cDNA reverse strand
              
The scaffold seq looks like this:

    5'+--TTCCTTT-\[barcode_solid\]---\[T2S_sequence\]--TCTT->3'

We will use 5-mer barcodes.

In [8]:
# Fixed pieces of the padlock scaffold (loop).  `probe_generator` joins them as
# RIGHT_LOOP + barcode + T2S_SEQ + LEFT_LOOP between the two probe arms.
T2S_SEQ: str =    'ACTTCAGCTGCCCCGGGTGAAGA'
RIGHT_LOOP: str = 'TTCCTTT'
LEFT_LOOP: str =  'TCTT'

In [9]:
# Sanity check: length of the T2S sequence in bases.
len(T2S_SEQ)

23

The padlocks will be:

In [10]:
df_padlocks: pd.DataFrame = df_out.copy()

# Translation table mapping each DNA base to its complement (both cases).
# Characters outside this table are passed through unchanged by `str.translate`.
_DNA_COMPLEMENT = str.maketrans('ACGTNacgtn', 'TGCANtgcan')


def _reverse_complement(seq: str) -> str:
    """Return the reverse complement of the DNA string ``seq``."""
    return seq.translate(_DNA_COMPLEMENT)[::-1]


def probe_generator(x, do_rt: bool = True) -> str:
    """Build the linear padlock sequence for one probe row.

    The probe sequence is split into a right arm (the first
    ``len(seq) // 2 + 1`` bases) and a left arm (the remainder), and the
    scaffold ``RIGHT_LOOP + barcode + T2S_SEQ + LEFT_LOOP`` is inserted
    between them, giving ``right arm + scaffold + left arm``.

    Args:
        x: Row-like object exposing ``probe_seq`` and ``barcode`` attributes
            (e.g. a row of ``df_padlocks``).
        do_rt: When ``False`` the probe sequence is reverse-complemented
            before it is split.  Presumably this refers to whether a reverse
            transcription step is used -- TODO confirm.

    Returns:
        The padlock sequence as a single string.
    """
    seq: str = x.probe_seq
    if not do_rt:
        # The original called an undefined `reverseComplement`, which raised
        # NameError; use the local helper instead.
        seq = _reverse_complement(seq)
    right_len: int = (len(seq) // 2) + 1
    return ''.join([seq[:right_len], RIGHT_LOOP, x.barcode, T2S_SEQ, LEFT_LOOP, seq[right_len:]])


# Iterate over rows positionally so this works whatever the index labels are
# (label-based `.loc[i]` only works on a default 0..n-1 index).
padlocks: List[str] = [probe_generator(row) for _, row in df_padlocks.iterrows()]
df_padlocks = df_padlocks.assign(padlock=padlocks)

In [11]:

# Show only the columns needed to inspect the generated padlocks.
df_padlocks[['symbol', 'probe_name', 'barcode', 'probe_length', 'padlock']]

Unnamed: 0,symbol,probe_name,barcode,probe_length,padlock
0,SLC17A8,A_24_P124647,ACAGC,60,TCCTGTTCATTGATTTTAAACATTTTATTCCTTCCTTTACAGCACT...
1,SLC17A8,ILMN_1767842,ACAGC,50,ATCCATGCAAGCCCCATAAAACAGTTTTCCTTTACAGCACTTCAGC...
2,SLC17A8,0004120551,ACAGC,50,ATCCATGCAAGCCCCATAAAACAGTTTTCCTTTACAGCACTTCAGC...
3,SLC17A8,A_23_P159076,ACAGC,60,CCCACGTAAATAGCTGTCATCATCATTATCTTTCCTTTACAGCACT...
4,SLC17A8,100:691;,ACAGC,25,GTTGTGCTGTTTTTTCCTTTACAGCACTTCAGCTGCCCCGGGTGAA...
5,GFAP,A_24_P59786,TACAT,60,GTGGTACAGAGTAACTGTACATTAAACTGGCTTCCTTTTACATACT...
6,GFAP,A_33_P3335895,TACAT,60,AAAGCCTCACAATACGAGTTATACCAATACATTCCTTTTACATACT...
7,GFAP,2620230,TACAT,25,CACCATTCCCGTGTTCCTTTTACATACTTCAGCTGCCCCGGGTGAA...
8,GFAP,1280138,TACAT,25,TCACCATTCCCGTTTCCTTTTACATACTTCAGCTGCCCCGGGTGAA...
9,GFAP,2117594,TACAT,25,ATCACCATTCCCGTTCCTTTTACATACTTCAGCTGCCCCGGGTGAA...
