# Setup 

## Import packages 

In [1]:
# General 
import os 
import numpy as np
# For running bash scripts from inside python ... 
import subprocess
# For manipulating string objects 
import re
# for generating any necessary directories
import pathlib 
# For manipulating list objects 
import itertools 

In [2]:
# For working with sequence objects 
from Bio.Seq import Seq

In [3]:
# For fetching sequences from Entrez 
from Bio import Entrez
from Bio import SeqIO

In [4]:
# For extracting features 
from Bio.SeqFeature import SeqFeature, FeatureLocation
# For creating SeqRecord objects 
from Bio.SeqRecord import SeqRecord

## Misc

In [5]:
Entrez.email = "kehaliwoldemichael@gmail.com"  # Always tell NCBI who you are

# Functions 

## Sequence

In [6]:
def seq_returnEntrez(sequenceID, retType):
    with Entrez.efetch(
        db="nucleotide", rettype=retType, retmode="text", id=sequenceID
    ) as handle:
        seqRecord = SeqIO.read(handle, "gb")  # using "gb" as an alias for "genbank"
        
    handle = Entrez.efetch(db="nucleotide", id=sequenceID, rettype=retType, retmode="text")
    
    return seqRecord, handle 

In [7]:
# Checks if continuous open reading frame by translating to stop ... 
def check_cORF(sequence):
    return len(sequence.translate(to_stop=True)) == len(sequence)/3

In [8]:
# Return sesRNAs that are in CDS 
def check_inCDS(sequence, searchSequence, isoForm, typeSes): 
    if typeSes == 'Reverse':
        return 0 != searchSequence[isoForm].seq.count(sequence.reverse_complement())
    elif typeSes == 'Complement':
        return 0 != searchSequence[isoForm].seq.count(sequence.complement())

In [9]:
# Checking for in frame TGG and ATG (both number and indices of occurances)
def return_inFrame(sequence, choice):
    # Definnig stop codons 
    stopCodons = ['TAG', 'TAA', 'TGA']
    
    # Generating list of codons in sequence 
    strSeq = str(sequence)
    codons = [strSeq for strSeq in re.split(r'(\w{3})', strSeq) if strSeq]
    
    # Number of in frame TGG and ATG 
    num_inF_TGG = codons.count('TGG')
    num_inF_ATG = codons.count('ATG')
    num_inF_Stop = codons.count(stopCodons[0]) + codons.count(stopCodons[1]) + codons.count(stopCodons[2])
    
    # Indices of TGG, ATG, and defined stop codons 
    indicesTGG = np.array([key for key, val in enumerate(codons) if val == 'TGG'])*3
    indicesATG = np.array([key for key, val in enumerate(codons) if val == 'ATG'])*3
    indiciesStop = np.array([key for key, val in enumerate(codons) if val in stopCodons])*3
    
    if choice == 'all': return num_inF_TGG, num_inF_ATG, num_inF_Stop, indicesTGG, indicesATG, indiciesStop 
    if choice == 'numTGG': return num_inF_TGG

In [10]:
# Return sesRNAs that are in CDS 
def return_inCDS(sesRNAs, CDS, isoForm, typeSes): 
    temp_cds_sesRNAs = []
    
    for sesRNA in sesRNAs: 
        if typeSes == 'Reverse':
            if 0 != CDS[isoForm].seq.count(sesRNA.reverse_complement()):
                temp_cds_sesRNAs.append(sesRNA)
        elif typeSes == 'Complement':
            if 0 != CDS[isoForm].seq.count(sesRNA.complement()):
                temp_cds_sesRNAs.append(sesRNA)
    
    return temp_cds_sesRNAs

## Metrics 

In [11]:
# Returns GC content 
def metric_gcContent(sequence):
    return (sequence.count("G") + sequence.count("C"))/(len(sequence))

# Sequence

## Loading sequences 

In [12]:
geneName = 'Fezf2'

In [13]:
# Loading sequences for reverse complement gene exons 
rC_fileName = os.getcwd() + '/Output/biomaRt/Reverse_' + geneName + '.fasta'
rC_exon_records = list(SeqIO.parse(rC_fileName, "fasta"))
rC_exon_records  

[SeqRecord(seq=Seq('TAGTGGTTCTGTTTATTGAGTCATATATGTGTAATATTCCGTGTTCGCTTGTAC...TCC'), id='Fezf2', name='Fezf2', description='Fezf2', dbxrefs=[]),
 SeqRecord(seq=Seq('CTTTTTCCCCCACCGCCAAGGAGATGCGTTCCGAGCCATGCAGCGTGTCTCTTC...CTA'), id='Fezf2', name='Fezf2', description='Fezf2', dbxrefs=[]),
 SeqRecord(seq=Seq('CTTGCCGCACACTTCGCAGGTGAAGTTTTTGGGTTTGCTGTCAGTAGAGCCCCC...AGT'), id='Fezf2', name='Fezf2', description='Fezf2', dbxrefs=[])]

In [14]:
# Loading sequences for complement gene exons 
C_fileName = os.getcwd() + '/Output/biomaRt/Complement_' + geneName + '.fasta'
C_exon_records = list(SeqIO.parse(C_fileName, "fasta"))
C_exon_records  

[SeqRecord(seq=Seq('CCTTGATGTTCTTGGTGTTCGAGTGTGTGTCGCCGCTCTTCGTCATGTTTACGT...GAT'), id='Fezf2', name='Fezf2', description='Fezf2', dbxrefs=[]),
 SeqRecord(seq=Seq('ATCAACGGAGGAAATTTCAAACTCCCCGCCACCGCCGCCGGCCGTCCGCGCCCC...TTC'), id='Fezf2', name='Fezf2', description='Fezf2', dbxrefs=[]),
 SeqRecord(seq=Seq('TGAATCTCCTCTCTCCGACACGGGACCGGGTCGGACCGAGTCGAACCGCGCGGT...TTC'), id='Fezf2', name='Fezf2', description='Fezf2', dbxrefs=[])]

In [15]:
# Loading sequences for gene CDS 
CDS_fileName = os.getcwd() + '/Output/biomaRt/CDS_' + geneName + '.fasta'
CDS = list(SeqIO.parse(CDS_fileName, "fasta"))
CDS  

[SeqRecord(seq=Seq('ATGGCCAGCTCAGCTTCCCTGGAGACCATGGTGCCCCCGGCCTGCCCGCGCGCT...TGA'), id='Fezf2', name='Fezf2', description='Fezf2', dbxrefs=[])]

In [16]:
len(CDS[0].seq)

1368

In [17]:
len(rC_exon_records[2].seq)

890

In [18]:
seq_record = rC_exon_records[1]

In [19]:
metric_gcContent(seq_record.seq)

0.5761316872427984

## Selecting sensor 

In [20]:
seq_record

SeqRecord(seq=Seq('CTTTTTCCCCCACCGCCAAGGAGATGCGTTCCGAGCCATGCAGCGTGTCTCTTC...CTA'), id='Fezf2', name='Fezf2', description='Fezf2', dbxrefs=[])

In [21]:
seq = seq_record.seq
len(seq)

243

In [22]:
def generate_sesRNA(sequence, length, searchSequence, isoForm, typeSes):
    start = 0
    center = length/2
    
    global numPass, total, numTGG  
    numPass = 0 
    total = 0 
    numTGG = []
    
    sesSeq = []
    startSeq = []
    stopSeq = []
    
    # For storing number of in frame TGG, ATG, and Stop codons 
    num_inF_TGGs = []
    num_inF_ATGs = []
    num_inF_Stops = []
    
    while(start <= (len(sequence) - length)):
        # Defining current sub-sequence to process 
        subsequence = sequence[start:(start+length)]
        
        # Genrating index of any stop codons 
        stopCodons = ['TAG', 'TAA', 'TGA']
        indiciesStop = []
        for codons in stopCodons:
            indiciesStop.extend([m.start() for m in re.finditer(codons, str(subsequence))])
        
        # GC content 
        gcContent = metric_gcContent(subsequence)*100
        # Index of last ATG and TGG 
        lastATG = 0 
        if(subsequence.count('ATG') != 0):
            lastATG = [m.start() for m in re.finditer('ATG', str(subsequence))][-1]
        if(subsequence.count('TGG') != 0):
            lastTGG = [m.start() for m in re.finditer('TGG', str(subsequence))][-1]
        # Getting indicies of TGG 
        indiciesTGG = [m.start() for m in re.finditer('TGG', str(subsequence))]
        # Generating arrays indicies for TGGs and stop codons 
        arrayStop = np.array(indiciesStop)
        arrayIndicies = np.array(indiciesTGG) 
        centralTGGs = arrayIndicies[abs(arrayIndicies - center) < 10]
        
        num_inF_TGG, num_inF_ATG, num_inF_Stop, indices_inF_TGG, indices_inF_ATG, indices_inF_Stop = return_inFrame(subsequence, 'all')
        numATG = subsequence.count('ATG')
        
        # Only proceed if passed 
        # cond1 = len(indiciesStop) < 4 
#         cond1 = check_cORF(subsequence)
        cond1 = num_inF_Stop <= 0
        
        cond2 = num_inF_TGG >= 2
        
        cond3 = num_inF_ATG == 0 
#         if num_inF_TGG != 0 and num_inF_ATG != 0:
#             cond3 = (min(indices_inF_TGG) > max(indices_inF_ATG)) # just making sure that all in frame ATG's upstream of all in frame TGG's 
#         else:
#             cond3 = num_inF_ATG == 0 
        
        cond4 = gcContent > 40
        cond5 = gcContent < 65
        
        # cond4 = lastATG < lastTGG 
        # Checking if TGG near center of subsequence 
        cond6 = any(abs(x - center) < 10 for x in indices_inF_TGG)
        
        # Checking if any central array is more than 10 by away from an in frame stop
#         cond7 = any((min(abs(arrayStop - i)) >= 20) for i in centralTGGs) # Original ... broken condition ... just as a check 
        if num_inF_Stop != 0:
            cond7 = any((min(abs(indices_inF_Stop - i)) >= 20) for i in centralTGGs)
        else:
            cond7 = True
        
        
        if(cond1 & cond2 & cond3 & cond4 & cond5 & cond6 & cond7):
            # Only include if in region of gene (currently in CDS) 
            if check_inCDS(subsequence, searchSequence, isoForm, typeSes):
                numPass += 1
                    
                numTGG.append(subsequence.count('TGG'))
                
                # Appending passesed subsequences 
                sesSeq.append(subsequence)
                # Appending indices of start for sesnsor (relative to start of exon)
                startSeq.append(start)
                stopSeq.append(start+length)
                
                # Appending number of in frame TGGs, ATGs, and Stop codons 
                num_inF_TGGs.append(num_inF_TGG)
                num_inF_ATGs.append(num_inF_ATG)
                num_inF_Stops.append(num_inF_Stop)
            
        total += 1 
        # Updating start index 
        start += 1 
    
    return sesSeq, startSeq, stopSeq, num_inF_TGGs, num_inF_ATGs, num_inF_Stops 

In [23]:
sesRNAs = generate_sesRNA(seq, 204, CDS, 0, 'Reverse')
sesRNAs

([], [], [], [], [], [])

In [24]:
len(sesRNAs)

6

In [25]:
# Returns sesRNAs for each exon
def generate_sesRNAs_multiExon(exon_records, length, searchSequence, isoForm, typeSes):
    tempAll_sesRNAs = []
    tempAll_startSeq = []
    tempAll_stopSeq = []
    temp_num_inF_TGGs = []
    temp_num_inF_ATGs = []
    temp_num_inF_Stops = []
    
    for record in exon_records:
        tempSeq = record.seq 
        temp_sesRNAs, temp_startSeq, temp_stopSeq, single_num_inF_TGGs, single_num_inF_ATGs, single_num_inF_Stops = generate_sesRNA(tempSeq, length, searchSequence, isoForm, typeSes)
        
        tempAll_sesRNAs.extend(temp_sesRNAs)
        tempAll_startSeq.append(temp_startSeq)
        tempAll_stopSeq.append(temp_stopSeq)
        
        temp_num_inF_TGGs.append(single_num_inF_TGGs)
        temp_num_inF_ATGs.append(single_num_inF_ATGs)
        temp_num_inF_Stops.append(single_num_inF_Stops)
        
        # Printing number of passed sequences for current exon 
        print(len(temp_sesRNAs))
        
    return tempAll_sesRNAs, tempAll_startSeq, tempAll_stopSeq, temp_num_inF_TGGs, temp_num_inF_ATGs, temp_num_inF_Stops

In [26]:
rC_multiExon_sesRNAs, rC_multi_startSeq, rC_multi_stopSeq, rC_num_inF_TGGs, rC_num_inF_ATGs, rC_num_inF_Stops = generate_sesRNAs_multiExon(rC_exon_records, 228, CDS, 0, 'Reverse')

0
0
15


In [27]:
rC_multi_startSeq

[[],
 [],
 [153, 156, 159, 162, 165, 168, 171, 183, 186, 189, 192, 195, 198, 201, 204]]

In [28]:
rC_multi_stopSeq

[[],
 [],
 [381, 384, 387, 390, 393, 396, 399, 411, 414, 417, 420, 423, 426, 429, 432]]

In [29]:
# Just looking at 'window' in which sesRNAs are being produced 
max(rC_multi_stopSeq[2]) - min(rC_multi_startSeq[2])

279

In [30]:
rC_num_inF_TGGs

[[], [], [4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]]

In [31]:
rC_num_inF_ATGs

[[], [], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [32]:
rC_num_inF_Stops

[[], [], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [33]:
# Checking GC content of sesRNAs 
for sequence in rC_multiExon_sesRNAs:
    print(metric_gcContent(sequence))

0.6096491228070176
0.6008771929824561
0.6096491228070176
0.6096491228070176
0.6140350877192983
0.6096491228070176
0.6052631578947368
0.618421052631579
0.618421052631579
0.6228070175438597
0.6271929824561403
0.6228070175438597
0.618421052631579
0.6228070175438597
0.6271929824561403


In [34]:
all_indices_inF_TGG = []
all_indices_inF_ATG = []

for sequence in rC_multiExon_sesRNAs:
    num_inF_TGG, num_inF_ATG, num_inF_Stops, indicesTGG, indicesATG, indicesStops = return_inFrame(sequence, 'all')
    all_indices_inF_TGG.append(indicesTGG)
    all_indices_inF_ATG.append(indicesATG)
    print(indicesTGG)
    # print(num_inF_TGG)

[ 12 123 153 156]
[  9 120 150 153]
[  6 117 147 150]
[  3 114 144 147]
[  0 111 141 144]
[108 138 141]
[105 135 138]
[ 93 123 126]
[ 90 120 123]
[ 87 117 120]
[ 84 114 117]
[ 81 111 114]
[ 78 108 111]
[ 75 105 108]
[ 72 102 105]


In [None]:
C_multiExon_sesRNAs, C_multi_startSeq, C_num_inF_TGGs, C_num_inF_ATGs, C_num_inF_Stops = generate_sesRNAs_multiExon(C_exon_records, 204, CDS, 0, 'Complement')

In [None]:
C_multiExon_sesRNAs

In [None]:
# Just additional check if in CDS 
cds_sesRNAs = return_inCDS(rC_multiExon_sesRNAs, CDS, 0, 'Reverse')
cds_sesRNAs

In [None]:
for i in range(200, 300):
    if(i%3 == 0):
        print(i)

In [None]:
testSeq = sesRNAs[0]
testSeq

In [None]:
str(testSeq)

In [None]:
?split()

In [None]:
subsequence = str(testSeq)
codons = [subsequence for subsequence in re.split(r'(\w{3})', subsequence) if subsequence]

In [None]:
tempSeq = Seq('GTTCTCCTTCAGCACCTGCTCCAGCGGCGCATGCAAGCGCTCCTTATGGGGATAGGAAGCTGGGTGGGGGAACTTGTCCGCAGTCAGGCTGGCCAGTTTGGCATTCTCCAGCAGAAAAAGCTTGGGGTGAGCAGCCAGGGAAGTGGGGGCCTGTGCGTTGAGGAGGCCAGATGGGAAAAGGTGGCCTCCGAGGAGCTCCGATGG')

In [None]:
check_cORF(tempSeq)

In [None]:
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")

In [None]:
check_cORF(coding_dna)

# Outputing sesRNA

In [None]:
# Generating BioPython directory if does not exist 
pathlib.Path('Output/BioPython').mkdir(parents=True, exist_ok=True)

# Generate SeqRecord object for each sequence and append to list 
outputID = geneName + '_sesRNA'
outputDescription = "sesRNA for " + geneName

# Generating sequence record objects (for seperate storage)
outputSeqMulti_DNA = []
outputSeqMulti_RNA = []
for i in rC_multiExon_sesRNAs:
    outputSeqMulti.append(SeqRecord(i, id = outputID, description = outputDescription))
    
# Write output fasta files 
version = "V6"
outputName = "Output/BioPython/" + geneName + "_sesRNA_" + version + ".fasta" 
with open(outputName, "w") as output_handle:
    SeqIO.write(outputSeqMulti, output_handle, "fasta")

In [None]:
# Given sequence ... converts to in frame TGGs to TAGs and in frame stops so that first 'T' becomes 'G'
# Had to be careful to only work with in frame codons ... initally had made the mistake to just use string.replace ... this would change out of frame codons as well 
def convert_DNA(sequence, numberConvert):
    # Converting to string object for manipulation 
    strSeq = str(sequence)
    # Generating in frame object variables 
    num_inF_TGG, num_inF_ATG, num_inF_Stop, indicesTGG, indicesATG, indicesStop = return_inFrame(Seq(strSeq), 'all')
    print(num_inF_TGG)
    # print(num_inF_Stop)

    # Replacing in frame stop codons in sequence 
    for stop in indicesStop: 
        stopPairs = [("TAG", "GAG"), ("TAA", "GAA"), ("TGA", "GGA")]
        stopSeq = strSeq[stop:stop+3]
        [stopSeq := stopSeq.replace(a, b) for a, b in stopPairs]
        strSeq = strSeq[:stop] + stopSeq + strSeq[stop+3:]
    
    # Setting number convert to all if 'All' selected as number of TGG to convert 
    if numberConvert == 'All': numberConvert = num_inF_TGG
    # Converting TGG's ... up to number set ... and in order from starting with most central 
    # Sorts indicees by distance from center 
    sorted_indices_centralTGG = np.array(sorted(indicesTGG - (len(strSeq)/2), key = abs)) + (len(strSeq)/2)
    # Converts in frame TGG's ... starting from most central TGG ... up to limit set by numberConvert 
    for i in range(numberConvert):
        currentIndex = int(sorted_indices_centralTGG[i])
        strSeq = strSeq[:currentIndex] + 'TAG' + strSeq[currentIndex+3:]
    # Returns RNA 
    return Seq(strSeq).transcribe()

In [None]:
testSeq = Seq('TGGGAGTAGTGGTGGTAATGA')
testStr = str(testSeq)

In [None]:
convert_DNA(testSeq, 1)

In [None]:
# Test that all and numbering is working 
convert_DNA(testSeq, 3) == convert_DNA(testSeq, 'All')

In [None]:
convert_DNA(rC_multiExon_sesRNAs[0], 'All')

In [None]:
# Testing that continious reading frame being produced if number set to 0 
len(convert_DNA(rC_multiExon_sesRNAs[0], 0).translate(to_stop = True)) == len(rC_multiExon_sesRNAs[0])/3

In [None]:
# Checking that right number of TGG being converted 
return_inFrame(convert_DNA(rC_multiExon_sesRNAs[0], 'All').back_transcribe(), 'numTGG')

In [None]:
convert_DNA(testSeq, 1) - (len(testSeq)/2)

In [None]:
abs(convert_DNA(testSeq, 1) - (len(testSeq)/2))

In [None]:
np.sort(abs(convert_DNA(testSeq, 1) - (len(testSeq)/2))) + (len(testSeq)/2)

In [None]:
np.array(sorted(convert_DNA(testSeq, 1) - (len(testSeq)/2), key = abs)) + (len(testSeq)/2)

In [None]:
# Function for saving both original template DNA sequence and the converted RNA given list of sesRNAs Seq objects
# Could change later to make for separate output files ... have not decided yet whether to just put them together into one labeled fasta file 
def save_sesRNAs(sequences_sesRNAs, geneName, version, numConvertTGG):
    # Generating BioPython directory if does not exist 
    pathlib.Path('Output/BioPython').mkdir(parents=True, exist_ok=True)
    
    # Defining save name and description for outputs (original DNA and converted RNA)
    DNA_outputID = geneName + '_sesRNA'
    DNA_outputDescription = "sesRNA DNA original for " + geneName
    convertedRNA_outputID = geneName + '_sesRNA'
    convertedRNA_outputDescription = "sesRNA converted RNA for " + geneName
    
    # Creating empty list for storing sequences 
    outputSeqMulti_DNA = []
    outputSeqMulti_convertedRNA= []
    
    # Generating SeqRecord objects in preparation for writing FASTA files ... edited description to include number of sequence
    i = 1
    for sequence in sequences_sesRNAs:
        outputSeqMulti_DNA.append(SeqRecord(sequence, id = DNA_outputID+str(i), description = DNA_outputDescription))
        outputSeqMulti_convertedRNA.append(SeqRecord(convert_DNA(sequence, numberConvertTGG), id = convertedRNA_outputID+str(i), description = convertedRNA_outputDescription))
        i += 1
        
    # Write output fasta files 
    DNA_outputName = "Output/BioPython/" + geneName + "_sesRNA_" + version + ".fasta" 
    convertedRNA_outputName = "Output/BioPython/" + geneName + "_sesRNA_convertedRNA_" + version + ".fasta" 
    
    with open(outputName, "w") as output_handle:
        # Writing original template DNA files 
        SeqIO.write(outputSeqMulti_DNA, output_handle, "fasta")
        # Writing orginal template RNA files 
        SeqIO.write(outputSeqMulti_convertedRNA, output_handle, "fasta")

In [None]:
testSeq = rC_multiExon_sesRNAs[0]

In [None]:
testSeq_str = str(testSeq)
testSeq_str

In [None]:
testStr = 'TGGTGGTAG'

In [None]:
testStr.replace('TGG', 'TAG', 1)

In [None]:
?str.replace

In [None]:
replacedSeq = testSeq_str('TGG', 'TAG', c)

In [None]:
len(rC_multiExon_sesRNAs)

# Secondary structure 

In [None]:
# Leading RNAfold as RNA 
sys.path.append("/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Packages/ViennaRNA_Python3/usr/lib/python3.9/site-packages/RNA")
import _RNA as RNAfold 

In [35]:
# Generating Temp 
pathlib.Path('Output/BioPython/Temp').mkdir(parents=True, exist_ok=True)

In [36]:
# Converting to RNA for calculating secondary structure 
sesRNAs_RNA = []
for i in range(len(rC_multiExon_sesRNAs)):
    sesRNAs_RNA.append(rC_multiExon_sesRNAs[i].transcribe())

In [37]:
sesRNAs_RNA

[Seq('GGGAUAGGAAGCUGGGUGGGGGAACUUGUCCGCAGUCAGGCUGGCCAGUUUGGC...GUU'),
 Seq('AUAGGAAGCUGGGUGGGGGAACUUGUCCGCAGUCAGGCUGGCCAGUUUGGCAUU...GAU'),
 Seq('GGAAGCUGGGUGGGGGAACUUGUCCGCAGUCAGGCUGGCCAGUUUGGCAUUCUC...GAC'),
 Seq('AGCUGGGUGGGGGAACUUGUCCGCAGUCAGGCUGGCCAGUUUGGCAUUCUCCAG...CUG'),
 Seq('UGGGUGGGGGAACUUGUCCGCAGUCAGGCUGGCCAGUUUGGCAUUCUCCAGCAG...CGG'),
 Seq('GUGGGGGAACUUGUCCGCAGUCAGGCUGGCCAGUUUGGCAUUCUCCAGCAGAAA...CUU'),
 Seq('GGGGAACUUGUCCGCAGUCAGGCUGGCCAGUUUGGCAUUCUCCAGCAGAAAAAG...GAU'),
 Seq('CGCAGUCAGGCUGGCCAGUUUGGCAUUCUCCAGCAGAAAAAGCUUGGGGUGAGC...GGC'),
 Seq('AGUCAGGCUGGCCAGUUUGGCAUUCUCCAGCAGAAAAAGCUUGGGGUGAGCAGC...GGG'),
 Seq('CAGGCUGGCCAGUUUGGCAUUCUCCAGCAGAAAAAGCUUGGGGUGAGCAGCCAG...CAG'),
 Seq('GCUGGCCAGUUUGGCAUUCUCCAGCAGAAAAAGCUUGGGGUGAGCAGCCAGGGA...CGC'),
 Seq('GGCCAGUUUGGCAUUCUCCAGCAGAAAAAGCUUGGGGUGAGCAGCCAGGGAAGU...AGA'),
 Seq('CAGUUUGGCAUUCUCCAGCAGAAAAAGCUUGGGGUGAGCAGCCAGGGAAGUGGG...AGG'),
 Seq('UUUGGCAUUCUCCAGCAGAAAAAGCUUGGGGUGAGCAGCCAGGGAAGUGGGGGC...CGC'),
 Seq('GGCAUUCUCCAGCA

In [38]:
str(sesRNAs_RNA[0])

'GGGAUAGGAAGCUGGGUGGGGGAACUUGUCCGCAGUCAGGCUGGCCAGUUUGGCAUUCUCCAGCAGAAAAAGCUUGGGGUGAGCAGCCAGGGAAGUGGGGGCCUGUGCGUUGAGGAGGCCAGAUGGGAAAAGGUGGCCUCCGAGGAGCUCCGAUGGUGGGUAAGUGGUGGAGUCCAGGUAGUUGAAGUAGUAGAGAGAGCCGCUGGCCGGCAGCCCCACAGCCUGGUU'

In [39]:
# Just making sure to clear Temp folder before starting 
os.system('rm -rf Output/BioPython/Temp/*')

0

In [40]:
# Writing sequences as seperate fasta files 
i = 1
for sesRNA in sesRNAs_RNA:
    outputName = geneName + '_' + str(i)
    outputDescription = "sesRNA #" + str(i)
    i += 1
    
    outputRecord = SeqRecord(sesRNA, id = outputName, description = outputDescription)
    outputFull = 'Output/BioPython/Temp/' + outputName + '.fasta'
    
    with open(outputFull, "w") as output_handle:
        SeqIO.write(outputRecord, output_handle, "fasta")

In [41]:
len(rC_multiExon_sesRNAs)

15

In [42]:
np.empty([2, 2])

array([[0.00e+000, 1.83e-322],
       [2.32e-322, 2.37e-322]])

In [43]:
# Call RNAfold on each sequence of output 
rnaFold_prob = np.empty([len(rC_multiExon_sesRNAs), 0])
pathTemp = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/BioPython/Temp'
pathOutTemp = pathTemp + '/temp.out'

# sorting files in output of scandir 
for entry in sorted(os.scandir(pathTemp), key=lambda e: e.name):
    command = 'RNAfold -p -d2 --noLP < ' + entry.path + ' > ' + pathOutTemp    
    generateProb = subprocess.run(command, shell=True, stdout=subprocess.PIPE)

    # Moving to Temp directory to work on fasta files 
    currentWD = os.getcwd()
    os.chdir('Output/BioPython/Temp')
    
    # Running script for getting probabilities from RNAfold output file (added to ArchBin btw)
    readProb = subprocess.Popen("rnaFold_prob.sh", shell=True, stdout=subprocess.PIPE)
    returnedProb = readProb.stdout.read()
    # Waiting for last command to finish before storing value in temp.out file 
    readProb.wait()
    rnaFold_prob.append(float(returnedProb))
    
    # For checking which file currently working on (not in order for some reason) 
    print(entry.path)
    
    # Removing temp.out after finishing each run 
    os.system('rm -rf temp.out')
    # Return to initial working directory 
    os.chdir(currentWD)

AttributeError: 'numpy.ndarray' object has no attribute 'append'

In [None]:
# Displaying ensemble frequency for secondary structures 
array_rnaFold_prob = np.array(rnaFold_prob)*100
array_rnaFold_prob

In [None]:
array_rnaFold_prob[1:11]

In [None]:
rC_multi_startSeq

In [None]:
rC_multiExon_sesRNAs

In [None]:
str(rC_multiExon_sesRNAs[8])

In [None]:
str(rC_multiExon_sesRNAs[18])

In [None]:
from seqfold import dg, dg_cache, fold

In [None]:
# just returns minimum free energy
dg("GGGAGGTCGTTACATCTGGGTAACACCGGTACTGATCCGGTGACCTCCC", temp = 37.0)  # -12.94

# Test Intarna (test strength of binding, off-target, ...)  

In [None]:
import intarnapvalue
import sys

# Misc

In [None]:
# For returning index of findings 
searchCodon = 'TAG'
[m.start() for m in re.finditer(searchCodon, str(seq))]

In [None]:
testSeq = sesRNAs[0]
testSeq

In [None]:
lastATG = [m.start() for m in re.finditer('ATG', str(testSeq))][-1]
lastTGG = [m.start() for m in re.finditer('TGG', str(testSeq))][-1]

In [None]:
lastTGG

In [None]:
lastATG

In [None]:
lastATG < lastTGG

In [None]:
searchCodon = 'ATG'
[m.start() for m in re.finditer(searchCodon, str(testSeq))][-1]

In [None]:
seq.count('TAG') < 4

In [None]:
seq[0:100].count('TAG')

In [None]:
stopCodons = ['TAG', 'TAA', 'TGA']
stopCodons 

In [None]:
indiciesTGG

In [None]:
indiciesStop

In [None]:
length = 200 
center = length/2

In [None]:
arrayStop = np.array(indiciesStop)
arrayIndicies = np.array(indiciesTGG) 
centralTGGs = arrayIndicies[abs(arrayIndicies - center) < 10]

In [None]:
centralTGGs

In [None]:
np.in1d(centralTGGs,arrayStop)

In [None]:
# Check if array contains values that are within range of values in another array 
any((min(abs(arrayStop - i)) > 10) for i in centralTGGs)

In [None]:
centralTGGs

In [None]:
indiciesStop

In [None]:
testStop = [90, 16, 174]

In [None]:
(min(abs(arrayStop - centralTGGs[0])) > 10)

In [None]:
min(abs(arrayStop - centralTGGs[0])) > 10

In [None]:
centeralTGGs = offset.min()
centeralTGGs

In [None]:
centralTGGs = np.all(offset == offset.min())
centralTGGs

In [None]:
centeralTGGs = np.where(offset == offset.min())
centeralTGG

In [None]:
offset = abs(arrayIndicies - center) 
centerTGG = indiciesTGG[np.argmin(offset)]

In [None]:
any(abs(x - centerTGG) < 10 for x in indiciesStop)

In [None]:
indiciesStop = []
for codons in stopCodons:
    indiciesStop.extend([m.start() for m in re.finditer(codons, str(testSeq))])

In [None]:
len(indiciesStop)

In [None]:
[m.start() for m in re.finditer('TGA', str(testSeq))]

In [None]:
[m.start() for m in re.finditer('TAA', str(testSeq))]

In [None]:
[m.start() for m in re.finditer('TAG', str(testSeq))]

In [None]:
testSeq.count(stopCodons)

In [None]:
indiciesTGG

In [None]:
len(indiciesTGG)

In [None]:
testSeq = sesRNAs[0]

In [None]:
lastTGG = [m.start() for m in re.finditer('TGG', str(testSeq))][-1]

In [None]:
testSeq

In [None]:
indiciesTGG = [m.start() for m in re.finditer('TGG', str(testSeq))]

In [None]:
start = 0 
stop = 200

In [None]:
middle = (start + stop) / 2

In [None]:
abs(middle - indiciesTGG[0])

In [None]:
type(indiciesTGG)

In [None]:
indiciesTGG

In [None]:
any(indiciesTGG) > 2

In [None]:
length = 200 

In [None]:
any(abs(x - (length/2)) < 20 for x in indiciesTGG)

In [None]:
testList = [50, 60, 170, 200]

In [None]:
any(abs(x - (length/2)) < 10 for x in testList)

In [None]:
testSeq

In [None]:
os.path.isdir('Output/BioPython')

In [None]:
outputFileName = os.getcwd() + ''

In [None]:
testSeq