#### Imports:

In [1]:
# Path to the environment file handed to set_env_vars below.
ENV_FILE = './6896_model.env'

# set_env_vars is called before the project modules below are imported.
# NOTE(review): presumably `constants` / `server` read these environment
# variables at import time, which would make this ordering significant -- confirm.
from utils.notebook import set_env_vars
set_env_vars(ENV_FILE)

# NOTE(review): the wildcard import hides which names this notebook takes from
# `constants`; consider importing the needed names explicitly.
from constants import *
from server import find_orf

from Bio import SeqIO
import pandas as pd
import numpy as np

#### Load test samples:

In [2]:
# Input files: a FASTA file of mRNA sequences and a tab-separated table of
# CDS coordinates.
mRNA_data_path = '../classification_playground/data/mRNAs.fasta'
mRNA_coordinates_path = '../classification_playground/data/mRNA_CDS_coordinates.txt'

# Read the FASTA records inside a context manager so the file handle is closed
# as soon as parsing finishes (a bare open() leaves it dangling).
with open(mRNA_data_path) as fasta_handle:
    mRNA_sequences = [
        {'transcription_id': str(sequence.id), 'sequence': str(sequence.seq)}
        for sequence in SeqIO.parse(fasta_handle, 'fasta')
    ]

# One row per FASTA record, indexed by record id. Passing `columns` explicitly
# keeps set_index working even when the FASTA file contains no records.
mRNA = pd.DataFrame(
    mRNA_sequences, columns=['transcription_id', 'sequence']
).set_index('transcription_id')
mRNA['sequence'] = mRNA['sequence'].str.lower()

# First row of the table is the header; first column becomes the index.
mRNA_coordinates = pd.read_csv(mRNA_coordinates_path, sep='\t', header=0, index_col=0)

In [3]:
# NOTE: this cell rebinds `mRNA_coordinates` and converts its columns to int,
# so it must be run exactly once after the loading cell (the `.str` accessor
# below fails on the already-converted integer columns).

# Rows whose start or end coordinate is prefixed with '<' or '>'
# (presumably open-ended / partial CDS boundaries -- confirm against the data source).
has_open_boundary = (
    mRNA_coordinates.start.str.startswith('<') |
    mRNA_coordinates.start.str.startswith('>') |
    mRNA_coordinates.end.str.startswith('<') |
    mRNA_coordinates.end.str.startswith('>')
)

# Take an explicit copy of the filtered rows: assigning columns on a
# boolean-mask slice triggers pandas' SettingWithCopyWarning.
mRNA_coordinates = mRNA_coordinates[~has_open_boundary].copy()

mRNA_coordinates['start'] = mRNA_coordinates.start.astype(int) - 1  # python style start position
mRNA_coordinates['end'] = mRNA_coordinates.end.astype(int)

# Keep only records whose zero-based start is above this threshold, i.e. with
# more than `min_upstream_length` positions preceding the annotated start.
min_upstream_length = 20
mRNA_coordinates = mRNA_coordinates[mRNA_coordinates.start > min_upstream_length]

In [4]:
# Attach each coordinate row to its sequence. The right join keeps every row of
# mRNA_coordinates, so ids missing from the FASTA data end up with a NaN sequence.
mRNA_full = mRNA.join(other=mRNA_coordinates, how='right')

# Draw the subsample from a seeded generator so a fresh "Restart & Run All"
# reproduces the same rows, and without replacement so no id is picked twice
# (np.random.choice samples with replacement by default).
sample_rng = np.random.RandomState(0)
sample_size = min(10, len(mRNA_full.index))  # never ask for more ids than exist
subsample = sample_rng.choice(mRNA_full.index, size=sample_size, replace=False)

mRNA_subsample = mRNA_full.loc[subsample]

In [5]:
# Show the sampled rows (sequence plus start/end coordinates) as the cell output.
mRNA_subsample

Unnamed: 0_level_0,sequence,start,end
transcript_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NM_001014342.2,accctgcaagctgcatcaggctttatcctacttgttcctttggtga...,73,7249
NM_001291447.1,agctcccgcctccctccccagcagctcctccccataaactcccctc...,281,3002
NM_173477.4,agcgtttcagatgtcttggtagtcgcggctctggcgctccgcaccc...,190,1576
XM_006710938.4,atcgcttgaactcaggaggcagaggttgcagtgagccaagagcgct...,693,1608
NM_001256335.1,agagagaaccgcaacacctggtgccgggtcgggtcgtttccggggc...,332,893
NM_201264.1,cagagatcgcgagcgaggcaccagcctgcagccggcccccagcaca...,791,2459
NM_207577.1,aagattatagagacttgctttagaaccacaagaagaaagaggaggc...,65,1385
XM_006712960.3,accctggccgtcatcaggttcaacctcatattcaactgctgggcct...,275,1343
XM_011519170.2,tttcaaaaaaaaaaaaaaaaggctgaacaaactgaaaaaccagcgg...,647,1916
XM_011515526.2,ggtctgactccagtagcccatgttataaataaaggtttggtgccac...,98,1514


In [6]:
# Pick the second sampled sequence by position. `.iloc` is explicit: a plain
# integer key on this string-labelled Series relies on a positional fallback
# that pandas 2.x deprecates in favour of label lookup.
seq = mRNA_subsample.sequence.iloc[1]

# Request the ten top-ranked ORF candidates (not just the best one), without
# the sequence text; each result carries 'start', 'end' and 'probability'.
find_orf(seq, return_best=False, return_top=10, include_seq=False)

[{'start': 1827, 'end': 3093, 'probability': 0.9999755620956421},
 {'start': 1818, 'end': 3093, 'probability': 0.9999737739562988},
 {'start': 1827, 'end': 3075, 'probability': 0.9999463558197021},
 {'start': 1818, 'end': 3075, 'probability': 0.9999431371688843},
 {'start': 1163, 'end': 3086, 'probability': 0.9999359846115112},
 {'start': 1163, 'end': 3068, 'probability': 0.9999338388442993},
 {'start': 1163, 'end': 3002, 'probability': 0.999932050704956},
 {'start': 1163, 'end': 3116, 'probability': 0.9999222755432129},
 {'start': 1818, 'end': 3330, 'probability': 0.9999005794525146},
 {'start': 1818, 'end': 3408, 'probability': 0.9998998641967773}]