# Setup 

## Import packages 

In [63]:
# General 
import os 
import numpy as np
# For running bash scripts from inside python ... 
import subprocess
# For manipulating string objects 
import re
# for generating any necessary directories
import pathlib 

In [2]:
# For working with sequence objects 
from Bio.Seq import Seq

In [3]:
# For fetching sequences from Entrez 
from Bio import Entrez
from Bio import SeqIO

In [4]:
# For extracting features 
from Bio.SeqFeature import SeqFeature, FeatureLocation
# For creating SeqRecord objects 
from Bio.SeqRecord import SeqRecord

## Misc

In [5]:
Entrez.email = "kehaliwoldemichael@gmail.com"  # Always tell NCBI who you are

# Functions 

## Sequence

In [6]:
def seq_returnEntrez(sequenceID, retType):
    """Fetch a nucleotide record from NCBI Entrez and parse it.

    Parameters
    ----------
    sequenceID
        Identifier passed to ``Entrez.efetch`` as ``id``.
    retType
        Value passed to ``Entrez.efetch`` as ``rettype`` (e.g. "gb" or
        "fasta").

    Returns
    -------
    tuple
        ``(seqRecord, handle)`` where ``seqRecord`` is the record parsed by
        ``SeqIO.read`` and ``handle`` is a second, still-open efetch handle
        for the same query. The caller is responsible for closing it.
    """
    # Parse with the format that matches what was requested. Previously the
    # parser was always "gb", so any non-GenBank retType could not be read.
    # Unknown values keep the old GenBank behaviour.
    parseFormats = {"gb": "genbank", "gbwithparts": "genbank", "fasta": "fasta"}
    parseFormat = parseFormats.get(str(retType).lower(), "genbank")

    with Entrez.efetch(
        db="nucleotide", rettype=retType, retmode="text", id=sequenceID
    ) as handle:
        seqRecord = SeqIO.read(handle, parseFormat)

    # The handle above is consumed and closed by the `with` block, so a fresh
    # request is issued to give the caller an unread handle.
    handle = Entrez.efetch(db="nucleotide", id=sequenceID, rettype=retType, retmode="text")

    return seqRecord, handle

## Metrics 

In [7]:
# Returns GC content
def metric_gcContent(sequence):
    """Return the fraction (0.0-1.0) of bases in `sequence` that are G or C.

    Parameters
    ----------
    sequence
        Any object supporting ``len()`` and ``.count(str)``, e.g. a ``str``
        or the ``Seq`` objects used in this notebook.

    Returns
    -------
    float
        (#G + #C) / len(sequence); 0.0 for an empty sequence.
    """
    seqLength = len(sequence)
    # An empty sequence has no GC content; avoid dividing by zero.
    if seqLength == 0:
        return 0.0
    # Count both cases so lowercase (soft-masked) bases are not ignored.
    gcCount = sum(sequence.count(base) for base in ("G", "C", "g", "c"))
    return gcCount / seqLength

# Sequence

## Loading sequences 

In [8]:
geneName = 'Fezf2'
fileName = os.getcwd() + '/Output/biomaRt/Reverse_' + geneName + '.fasta'
fileName

'/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/biomaRt/Reverse_Fezf2.fasta'

In [9]:
# Loading sequences for gene exons 
exon_records = list(SeqIO.parse(fileName, "fasta"))
exon_records  

[SeqRecord(seq=Seq('TAGTGGTTCTGTTTATTGAGTCATATATGTGTAATATTCCGTGTTCGCTTGTAC...TCC'), id='Fezf2', name='Fezf2', description='Fezf2', dbxrefs=[]),
 SeqRecord(seq=Seq('CTTTTTCCCCCACCGCCAAGGAGATGCGTTCCGAGCCATGCAGCGTGTCTCTTC...CTA'), id='Fezf2', name='Fezf2', description='Fezf2', dbxrefs=[]),
 SeqRecord(seq=Seq('CTTGCCGCACACTTCGCAGGTGAAGTTTTTGGGTTTGCTGTCAGTAGAGCCCCC...AGT'), id='Fezf2', name='Fezf2', description='Fezf2', dbxrefs=[])]

In [10]:
len(exon_records[2].seq)

890

In [11]:
seq_record = exon_records[2]

In [12]:
metric_gcContent(seq_record.seq)

0.6348314606741573

## Selecting sensor 

In [13]:
seq_record

SeqRecord(seq=Seq('CTTGCCGCACACTTCGCAGGTGAAGTTTTTGGGTTTGCTGTCAGTAGAGCCCCC...AGT'), id='Fezf2', name='Fezf2', description='Fezf2', dbxrefs=[])

In [14]:
seq = seq_record.seq
len(seq)

890

In [82]:
def _motifStarts(motif, text):
    """Return the start index of each non-overlapping match of `motif` in `text`."""
    return [m.start() for m in re.finditer(motif, text)]


def check_sesRNA(sequence, length):
    """Slide a window of `length` bases over `sequence` and keep passing windows.

    The window advances one base at a time. A window is kept when all of the
    following hold:

    1. fewer than 4 stop-codon triplets (TAG/TAA/TGA) occur anywhere in it
       (matches are not restricted to a single reading frame);
    2. its GC content is strictly between 40% and 65%;
    3. its last ATG starts before its last TGG (a window without any ATG is
       treated as having its last ATG at index 0);
    4. at least one TGG starts within 10 bases of the window centre;
    5. at least one of those central TGGs is 20 or more bases away from
       every stop-codon triplet (trivially true when there are none).

    Parameters
    ----------
    sequence
        Sliceable sequence supporting ``len()`` and ``str()``, e.g. a
        ``Seq`` or ``str``.
    length : int
        Window size in bases; must be at least 1.

    Returns
    -------
    list
        The passing windows (slices of `sequence`), in order of start index.

    Raises
    ------
    ValueError
        If `length` is smaller than 1.

    Side effects
    ------------
    Rebinds the module-level globals ``numPass`` (number of passing
    windows), ``total`` (number of windows examined) and ``numTGG`` (TGG
    count of each passing window), which later notebook cells read.
    """
    global numPass, total, numTGG

    if length < 1:
        raise ValueError("length must be a positive integer")

    numPass = 0
    total = 0
    numTGG = []

    center = length / 2
    stopCodons = ('TAG', 'TAA', 'TGA')
    sesSeq = []

    # Iterate over the `sequence` argument (the original read the notebook
    # global `seq` instead, silently ignoring what the caller passed in).
    for start in range(len(sequence) - length + 1):
        total += 1
        subsequence = sequence[start:start + length]
        text = str(subsequence)

        # A TGG near the centre is mandatory; windows without any TGG are
        # rejected here instead of raising IndexError further down.
        indiciesTGG = _motifStarts('TGG', text)
        centralTGGs = [i for i in indiciesTGG if abs(i - center) < 10]
        if not centralTGGs:
            continue

        # Index of every stop-codon triplet in the window.
        indiciesStop = [
            i for codon in stopCodons for i in _motifStarts(codon, text)
        ]
        if len(indiciesStop) >= 4:
            continue

        # GC content as a percentage.
        gcContent = metric_gcContent(subsequence) * 100
        if not 40 < gcContent < 65:
            continue

        # The last ATG must start before the last TGG.
        indiciesATG = _motifStarts('ATG', text)
        lastATG = indiciesATG[-1] if indiciesATG else 0
        if lastATG >= indiciesTGG[-1]:
            continue

        # At least one central TGG must be 20+ bases from every stop codon.
        # `all()` over an empty list is True, so a window without stop
        # codons passes instead of failing on `min()` of an empty array.
        if not any(
            all(abs(stop - tgg) >= 20 for stop in indiciesStop)
            for tgg in centralTGGs
        ):
            continue

        numPass += 1
        numTGG.append(len(indiciesTGG))
        sesSeq.append(subsequence)

    return sesSeq

In [83]:
seq.count('ATG')

11

In [84]:
sesRNAs = check_sesRNA(seq, 204)

In [85]:
len(sesRNAs)

27

In [18]:
numTGG

[8,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 11,
 11,
 11]

In [19]:
total 

687

In [20]:
str(sesRNAs[0])

'CTCGTTCAGCGGTCAAGGCCGAGTTCTCCTTCAGCACCTGCTCCAGCGGCGCATGCAAGCGCTCCTTATGGGGATAGGAAGCTGGGTGGGGGAACTTGTCCGCAGTCAGGCTGGCCAGTTTGGCATTCTCCAGCAGAAAAAGCTTGGGGTGAGCAGCCAGGGAAGTGGGGGCCTGTGCGTTGAGGAGGCCAGATGGGAAAAGGT'

In [21]:
# Print every multiple of 3 in the half-open range [200, 300).
for i in range(200, 300):
    if i % 3 != 0:
        continue
    print(i)

201
204
207
210
213
216
219
222
225
228
231
234
237
240
243
246
249
252
255
258
261
264
267
270
273
276
279
282
285
288
291
294
297


## Outputting sesRNA

In [22]:
# Create the BioPython output directory if it does not exist yet
pathlib.Path('Output/BioPython').mkdir(parents=True, exist_ok=True)

# Build one SeqRecord per selected sesRNA window. The description and the
# output file name are derived from geneName (as the ID already was), so the
# cell keeps working when a different gene is loaded.
# NOTE(review): every record shares the same ID; confirm downstream tools
# accept duplicate FASTA identifiers.
outputID = geneName + '_sesRNA'
outputDescription = "sesRNA for " + geneName
outputSeqMulti = [
    SeqRecord(sesRNA, id=outputID, description=outputDescription)
    for sesRNA in sesRNAs
]

# Write all records to a single multi-record FASTA file
outputName = "Output/BioPython/" + geneName + "_sesRNA_V3.fasta"
with open(outputName, "w") as output_handle:
    SeqIO.write(outputSeqMulti, output_handle, "fasta")

In [23]:
len(sesRNAs)

27

In [26]:
# Write output fasta files 
outputName = "Output/sesRNA_test.fasta"
with open(outputName, "w") as output_handle:
    SeqIO.write(outputSeqMulti[0], output_handle, "fasta")

In [27]:
str(outputSeqMulti[0].seq)

'CTCGTTCAGCGGTCAAGGCCGAGTTCTCCTTCAGCACCTGCTCCAGCGGCGCATGCAAGCGCTCCTTATGGGGATAGGAAGCTGGGTGGGGGAACTTGTCCGCAGTCAGGCTGGCCAGTTTGGCATTCTCCAGCAGAAAAAGCTTGGGGTGAGCAGCCAGGGAAGTGGGGGCCTGTGCGTTGAGGAGGCCAGATGGGAAAAGGT'

In [28]:
os.getcwd()

'/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code'

In [34]:
# `-c` tells bash to execute the following string as a command; without the
# dash, bash treats 'c' as the name of a script file to run.
subprocess.Popen(['bash', '-c', '. rnaFold_prob.sh'])

<Popen: returncode: None args: ['bash', 'c', '. rnaFold_prob.sh']>

# Secondary structure 

In [61]:
# Run the folding script and capture everything it writes to stdout (bytes).
# The `with` block closes the pipe and waits for the child process to exit,
# so no open file descriptor or un-reaped process is left behind.
with subprocess.Popen("./rnaFold_prob.sh", shell=True, stdout=subprocess.PIPE) as process:
    subprocess_return = process.stdout.read()

In [62]:
float(subprocess_return)

0.00195248

In [64]:
from seqfold import dg, dg_cache, fold

In [67]:
# just returns minimum free energy
dg("GGGAGGTCGTTACATCTGGGTAACACCGGTACTGATCCGGTGACCTCCC", temp = 37.0)  # -12.94

-13.4

# Misc

In [None]:
# For returning index of findings 
searchCodon = 'TAG'
[m.start() for m in re.finditer(searchCodon, str(seq))]

In [None]:
testSeq = sesRNAs[0]
testSeq

In [None]:
lastATG = [m.start() for m in re.finditer('ATG', str(testSeq))][-1]
lastTGG = [m.start() for m in re.finditer('TGG', str(testSeq))][-1]

In [None]:
lastTGG

In [None]:
lastATG

In [None]:
lastATG < lastTGG

In [None]:
searchCodon = 'ATG'
[m.start() for m in re.finditer(searchCodon, str(testSeq))][-1]

In [None]:
seq.count('TAG') < 4

In [None]:
seq[0:100].count('TAG')

In [None]:
stopCodons = ['TAG', 'TAA', 'TGA']
stopCodons 

In [None]:
indiciesTGG

In [None]:
indiciesStop

In [None]:
length = 200 
center = length/2

In [None]:
arrayStop = np.array(indiciesStop)
arrayIndicies = np.array(indiciesTGG) 
centralTGGs = arrayIndicies[abs(arrayIndicies - center) < 10]

In [None]:
centralTGGs

In [None]:
np.in1d(centralTGGs,arrayStop)

In [None]:
# Check if array contains values that are within range of values in another array 
any((min(abs(arrayStop - i)) > 10) for i in centralTGGs)

In [None]:
centralTGGs

In [None]:
indiciesStop

In [None]:
testStop = [90, 16, 174]

In [None]:
(min(abs(arrayStop - centralTGGs[0])) > 10)

In [None]:
min(abs(arrayStop - centralTGGs[0])) > 10

In [None]:
centeralTGGs = offset.min()
centeralTGGs

In [None]:
centralTGGs = np.all(offset == offset.min())
centralTGGs

In [None]:
# Positions in `offset` that equal its minimum value.
centeralTGGs = np.where(offset == offset.min())
# Echo the variable that was just assigned (the original echoed the
# undefined name `centeralTGG`, which raises NameError).
centeralTGGs

In [None]:
offset = abs(arrayIndicies - center) 
centerTGG = indiciesTGG[np.argmin(offset)]

In [None]:
any(abs(x - centerTGG) < 10 for x in indiciesStop)

In [None]:
indiciesStop = []
for codons in stopCodons:
    indiciesStop.extend([m.start() for m in re.finditer(codons, str(testSeq))])

In [None]:
len(indiciesStop)

In [None]:
[m.start() for m in re.finditer('TGA', str(testSeq))]

In [None]:
[m.start() for m in re.finditer('TAA', str(testSeq))]

In [None]:
[m.start() for m in re.finditer('TAG', str(testSeq))]

In [None]:
# `count` takes a single motif, not a list, so count each stop codon
# separately and add the results.
sum(testSeq.count(codon) for codon in stopCodons)

In [None]:
indiciesTGG

In [None]:
len(indiciesTGG)

In [None]:
testSeq = sesRNAs[0]

In [None]:
lastTGG = [m.start() for m in re.finditer('TGG', str(testSeq))][-1]

In [None]:
testSeq

In [None]:
indiciesTGG = [m.start() for m in re.finditer('TGG', str(testSeq))]

In [None]:
start = 0 
stop = 200

In [None]:
middle = (start + stop) / 2

In [None]:
abs(middle - indiciesTGG[0])

In [None]:
type(indiciesTGG)

In [None]:
indiciesTGG

In [None]:
# Apply the comparison to each index; `any(indiciesTGG) > 2` compared a bool
# with 2 and was therefore always False.
# NOTE(review): assumes the intent was "is any TGG index greater than 2" - confirm.
any(x > 2 for x in indiciesTGG)

In [None]:
length = 200 

In [None]:
any(abs(x - (length/2)) < 20 for x in indiciesTGG)

In [None]:
testList = [50, 60, 170, 200]

In [None]:
any(abs(x - (length/2)) < 10 for x in testList)

In [None]:
testSeq

In [None]:
os.path.isdir('Output/BioPython')

In [None]:
outputFileName = os.getcwd() + ''

In [None]:
testSeq