# Setup 

## Import packages 

In [1]:
# General 
import os 
import numpy as np
import pandas as pd 
# For running bash scripts from inside python ... 
import subprocess
# For manipulating string objects 
import re
# for generating any necessary directories
import pathlib 
# For manipulating list objects 
import itertools 

In [2]:
# For working with sequence objects 
from Bio.Seq import Seq

In [3]:
# For fetching sequences from Entrez 
from Bio import Entrez
from Bio import SeqIO

In [4]:
# For extracting features 
from Bio.SeqFeature import SeqFeature, FeatureLocation
# For creating SeqRecord objects 
from Bio.SeqRecord import SeqRecord

In [5]:
# For running R cells 
%load_ext rpy2.ipython

## Misc

In [6]:
geneName = 'PlxnD1'
species = 'Rat'

Entrez.email = "kehaliwoldemichael@gmail.com"  # Always tell NCBI who you are

# Functions 

In [7]:
import sys
# Importing module of personal functions 
sys.path.append('/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/kCellReadR/')
from kCellReadR import * 

# Sequence

## Loading sequences 

In [8]:
rC_exon_records, C_exon_records, CDS, cDNA = load_referenceSequences(geneName, species)

## Selecting sensor 

In [9]:
parameters = parameters_sesRNA('Both', 0, 204, 1, 1, 'None', 30, 70, 10, 20)
all_sesRNAs, all_sequenceMetrics, all_sesRNA_objs = generate_all_sesRNAs(rC_exon_records, C_exon_records, CDS, parameters)

0
0
0
4
0
0
0
20


In [10]:
len(all_sesRNAs)

24

In [11]:
all_sequenceMetrics[all_sequenceMetrics['TypeSeq'] == 'Complement']

Unnamed: 0,SeqNumber,TypeSeq,Exon,StartSeq,StopSeq,firstTGG,centralTGG,second_cTGG,numTGG,numATG,numStop,gcContent
0,1,Complement,4,469.0,673.0,111.0,111.0,126.0,3.0,0.0,1.0,66.2
1,2,Complement,4,472.0,676.0,108.0,108.0,123.0,3.0,0.0,1.0,66.2
2,3,Complement,4,475.0,679.0,105.0,105.0,120.0,4.0,0.0,1.0,66.2
3,4,Complement,4,478.0,682.0,102.0,102.0,117.0,4.0,0.0,1.0,65.7
4,5,Complement,4,481.0,685.0,99.0,99.0,114.0,5.0,0.0,1.0,65.7
5,6,Complement,4,484.0,688.0,96.0,96.0,111.0,5.0,0.0,1.0,65.2
6,7,Complement,4,487.0,691.0,93.0,108.0,93.0,5.0,0.0,1.0,65.7
7,8,Complement,4,490.0,694.0,90.0,105.0,108.0,5.0,0.0,1.0,65.7
8,9,Complement,4,493.0,697.0,87.0,102.0,105.0,5.0,0.0,1.0,65.7
9,10,Complement,4,496.0,700.0,84.0,102.0,99.0,5.0,0.0,1.0,65.2


In [12]:
# Initial if want to check variable length sesRNAs 
for i in range(200, 300):
    if(i%3 == 0):
        print(i)

201
204
207
210
213
216
219
222
225
228
231
234
237
240
243
246
249
252
255
258
261
264
267
270
273
276
279
282
285
288
291
294
297


# Secondary structure 

## Working 

In [13]:
generate_RNApred(all_sesRNAs, all_sequenceMetrics, geneName)

Unnamed: 0,SeqNumber,TypeSeq,Exon,StartSeq,StopSeq,firstTGG,centralTGG,second_cTGG,numTGG,numATG,numStop,gcContent,mfe
0,1,Reverse,4,436.0,640.0,99.0,99.0,,1.0,0.0,1.0,65.7,-93.5
1,2,Reverse,4,439.0,643.0,96.0,96.0,,1.0,0.0,1.0,65.7,-94.9
2,3,Reverse,4,442.0,646.0,93.0,93.0,,1.0,0.0,1.0,65.2,-90.0
3,4,Reverse,4,479.0,683.0,111.0,111.0,174.0,4.0,0.0,1.0,68.1,-96.9
0,1,Complement,4,469.0,673.0,111.0,111.0,126.0,3.0,0.0,1.0,66.2,-92.0
1,2,Complement,4,472.0,676.0,108.0,108.0,123.0,3.0,0.0,1.0,66.2,-91.9
2,3,Complement,4,475.0,679.0,105.0,105.0,120.0,4.0,0.0,1.0,66.2,-87.4
3,4,Complement,4,478.0,682.0,102.0,102.0,117.0,4.0,0.0,1.0,65.7,-84.4
4,5,Complement,4,481.0,685.0,99.0,99.0,114.0,5.0,0.0,1.0,65.7,-86.3
5,6,Complement,4,484.0,688.0,96.0,96.0,111.0,5.0,0.0,1.0,65.2,-84.9


In [14]:
def generate_mfeProb(sequenceMetrics): 
    rnaFold_prob = []

    martBase = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/biomaRt/'
    martBasePath = martBase + species
    # Loading sequences for gene CDS
    CDS_fileName = martBasePath + '/CDS_' + geneName + '.fasta'
    
    pathTemp = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/BioPython/Temp'
    pathOutTempFold = pathTemp + '/temp.out'

    # sorting files in output of scandir 
    for entry in sorted(os.scandir(pathTemp), key=lambda e: e.name):
        # Defining command for RNAfold 
        commandFold = 'RNAfold -p -d2 --noLP < ' + entry.path + ' > ' + pathOutTempFold    
        # Generating RNAfold predictions 
        generateProb = subprocess.run(commandFold, shell=True, stdout=subprocess.PIPE)
        
        # Moving to Temp directory to work on fasta files 
        currentWD = os.getcwd()
        os.chdir('/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/BioPython/Temp')

        # Running script for getting probabilities from RNAfold output file (added to ArchBin btw)
        readProb = subprocess.Popen("rnaFold_prob.sh", shell=True, stdout=subprocess.PIPE)
        returnedProb = readProb.stdout.read()
        # Waiting for last command to finish before storing value in temp.out file 
        readProb.wait()
        # Append frequences ... convert to percentage 
        rnaFold_prob.append(round(float(returnedProb)*100, 3))
        
        # Removing temp.out after finishing each run 
        os.system('rm -rf temp.out')
        os.system('rm -rf temp.csv')
        # Return to initial working directory 
        os.chdir(currentWD)

    # Removing files generated by RNAfold 
    os.system('rm -rf *ss.ps')
    os.system('rm -rf *dp.ps')
    
    # Adding RNA fold mfe ensemble frequency to sequenceMetrics 
    sequenceMetrics['mfeFreq'] = rnaFold_prob
    
    return sequenceMetrics  

In [15]:
fold_sequenceMetrics = generate_mfeProb(all_sequenceMetrics)
fold_sequenceMetrics 

Unnamed: 0,SeqNumber,TypeSeq,Exon,StartSeq,StopSeq,firstTGG,centralTGG,second_cTGG,numTGG,numATG,numStop,gcContent,mfe,mfeFreq
0,1,Reverse,4,436.0,640.0,99.0,99.0,,1.0,0.0,1.0,65.7,-93.5,2.193
1,2,Reverse,4,439.0,643.0,96.0,96.0,,1.0,0.0,1.0,65.7,-94.9,3.208
2,3,Reverse,4,442.0,646.0,93.0,93.0,,1.0,0.0,1.0,65.2,-90.0,1.363
3,4,Reverse,4,479.0,683.0,111.0,111.0,174.0,4.0,0.0,1.0,68.1,-96.9,1.163
0,1,Complement,4,469.0,673.0,111.0,111.0,126.0,3.0,0.0,1.0,66.2,-92.0,1.368
1,2,Complement,4,472.0,676.0,108.0,108.0,123.0,3.0,0.0,1.0,66.2,-91.9,1.493
2,3,Complement,4,475.0,679.0,105.0,105.0,120.0,4.0,0.0,1.0,66.2,-87.4,0.645
3,4,Complement,4,478.0,682.0,102.0,102.0,117.0,4.0,0.0,1.0,65.7,-84.4,1.875
4,5,Complement,4,481.0,685.0,99.0,99.0,114.0,5.0,0.0,1.0,65.7,-86.3,14.153
5,6,Complement,4,484.0,688.0,96.0,96.0,111.0,5.0,0.0,1.0,65.2,-84.9,0.631


In [16]:
def output_intaRNA(sequenceMetrics):
    # Call RNAfold on each sequence of output 
    top_intarnaE = []

    intarna_columns = ['E', 'start1', 'end1', 'start2', 'end2']
    all_sorted_intarna = []
    useful_intarna =  pd.DataFrame(columns = intarna_columns)

    martBase = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/biomaRt/'
    martBasePath = martBase + species
    # Loading sequences for gene CDS
    CDS_fileName = martBasePath + '/CDS_' + geneName + '.fasta'
    
    pathTemp = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/BioPython/Temp'
    pathOutTempIntaRNA = pathTemp + '/temp.csv'

    # sorting files in output of scandir 
    for entry in sorted(os.scandir(pathTemp), key=lambda e: e.name):
        # For checking which file currently working on (not in order for some reason) 
        print(entry.path)

        # Defining command for IntaRNA
        numThreads = 10
        commandInta = 'IntaRNA -t ' + CDS_fileName + ' -q ' + entry.path + ' --threads ' + str(numThreads) + ' --outMode=C > ' + pathOutTempIntaRNA
        # Generating IntaRNA predictions 
        os.system(commandInta)
        # Reading in csv file with information 
        intarnaOutput = pd.read_csv(pathOutTempIntaRNA, sep = ';')
        # Saving pd output as list of lists ... would be a pain to compute multiple times if desired 
        sorted_intarna = intarnaOutput.sort_values('E', ascending=True)
        # Concatinating select columuns of dataframe for inclusion into sequence metrics 
        all_sorted_intarna.append(sorted_intarna)
        print(sorted_intarna)
        # Getting first row ... with highest energy values ... and appending to DataFrame 
        useful_intarna = useful_intarna.append(sorted_intarna[intarna_columns].iloc[0:1])
        
        # Moving to Temp directory to work on fasta files 
        currentWD = os.getcwd()
        os.chdir('/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/BioPython/Temp')

        # Removing temp.out after finishing each run 
        os.system('rm -rf temp.out')
        os.system('rm -rf temp.csv')
        # Return to initial working directory 
        os.chdir(currentWD)

    # Removing files generated by RNAfold 
    os.system('rm -rf *ss.ps')
    os.system('rm -rf *dp.ps')
    # Removing temp fasta files 
#     resetCommand = 'cd ' + pathTemp + ' && rm -rf *'
#     os.system(resetCommand)
    
    # Have to reset index since useful_intarna is slice of many pd.DataFrames 
    out_sequenceMetrics = pd.concat([sequenceMetrics.reset_index(), useful_intarna.reset_index()], axis = 1)
    
    return out_sequenceMetrics 

In [17]:
intarna_sequenceMetrics = output_intaRNA(fold_sequenceMetrics)
intarna_sequenceMetrics

/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/BioPython/Temp/PlxnD1_01.fasta
      id1  start1  end1        id2  start2  end2  \
0  Plxnd1      54   202  PlxnD1_01      53   201   
1  Plxnd1      54   202  PlxnD1_01      53   201   

                                            subseqDP  \
0  GCCGCCGGCCUCCGGCACCGGGGGCAGCCGUCUGCUCGUGGGCGCC...   
1  GCCGCCGGCCUCCGGCACCGGGGGCAGCCGUCUGCUCGUGGGCGCC...   

                                            hybridDP       E  
0  ((((((((((((((((((((((((((((((((((((((((((((((... -239.05  
1  ((((((((((((((((((((((((((((((((((((((((((((((... -239.05  
/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/BioPython/Temp/PlxnD1_02.fasta
      id1  start1  end1        id2  start2  end2  \
0  Plxnd1      54   202  PlxnD1_02      50   198   
1  Plxnd1      54   202  PlxnD1_02      50   198   

                                            subseqDP  \
0  GCCGCCGGCCUCCGGCACCGG

Unnamed: 0,index,SeqNumber,TypeSeq,Exon,StartSeq,StopSeq,firstTGG,centralTGG,second_cTGG,numTGG,...,numStop,gcContent,mfe,mfeFreq,index.1,E,start1,end1,start2,end2
0,0,1,Reverse,4,436.0,640.0,99.0,99.0,,1.0,...,1.0,65.7,-93.5,2.193,0,-239.05,54,202,53,201
1,1,2,Reverse,4,439.0,643.0,96.0,96.0,,1.0,...,1.0,65.7,-94.9,3.208,0,-239.05,54,202,50,198
2,2,3,Reverse,4,442.0,646.0,93.0,93.0,,1.0,...,1.0,65.2,-90.0,1.363,0,-239.05,54,202,47,195
3,3,4,Reverse,4,479.0,683.0,111.0,111.0,174.0,4.0,...,1.0,68.1,-96.9,1.163,0,-239.05,54,202,10,158
4,0,1,Complement,4,469.0,673.0,111.0,111.0,126.0,3.0,...,1.0,66.2,-92.0,1.368,0,-239.05,54,202,28,176
5,1,2,Complement,4,472.0,676.0,108.0,108.0,123.0,3.0,...,1.0,66.2,-91.9,1.493,0,-239.05,54,202,31,179
6,2,3,Complement,4,475.0,679.0,105.0,105.0,120.0,4.0,...,1.0,66.2,-87.4,0.645,0,-239.05,54,202,34,182
7,3,4,Complement,4,478.0,682.0,102.0,102.0,117.0,4.0,...,1.0,65.7,-84.4,1.875,0,-239.05,54,202,37,185
8,4,5,Complement,4,481.0,685.0,99.0,99.0,114.0,5.0,...,1.0,65.7,-86.3,14.153,0,-239.05,54,202,40,188
9,5,6,Complement,4,484.0,688.0,96.0,96.0,111.0,5.0,...,1.0,65.2,-84.9,0.631,0,-239.05,54,202,43,191


In [None]:
def output_RNApred(sequenceMetrics):
    # Call RNAfold on each sequence of output 
    rnaFold_prob = []
    top_intarnaE = []

    intarna_columns = ['E', 'start1', 'end1', 'start2', 'end2']
    all_sorted_intarna = []
    useful_intarna =  pd.DataFrame(columns = intarna_columns)

    martBase = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/biomaRt/'
    martBasePath = martBase + species
    # Loading sequences for gene CDS
    CDS_fileName = martBasePath + '/CDS_RNA_' + geneName + '.fasta'
    
    pathTemp = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/BioPython/Temp'
    pathOutTempFold = pathTemp + '/temp.out'
    pathOutTempIntaRNA = pathTemp + '/temp.csv'

    # sorting files in output of scandir 
    for entry in sorted(os.scandir(pathTemp), key=lambda e: e.name):
        # For checking which file currently working on (not in order for some reason) 
        print(entry.path)

        # Defining command for RNAfold 
        commandFold = 'RNAfold -p -d2 --noLP < ' + entry.path + ' > ' + pathOutTempFold    
        # Generating RNAfold predictions 
        generateProb = subprocess.run(commandFold, shell=True, stdout=subprocess.PIPE)

        # Defining command for IntaRNA
        numThreads = 10
        commandInta = 'IntaRNA -t ' + CDS_fileName + ' -q ' + entry.path + ' --threads ' + str(numThreads) + ' --outMode=C > ' + pathOutTempIntaRNA
        # Generating IntaRNA predictions 
        os.system(commandInta)
        # Reading in csv file with information 
        intarnaOutput = pd.read_csv(pathOutTempIntaRNA, sep = ';')
        # Saving pd output as list of lists ... would be a pain to compute multiple times if desired 
        sorted_intarna = intarnaOutput.sort_values('E', ascending=True)
        # Concatinating select columuns of dataframe for inclusion into sequence metrics 
        all_sorted_intarna.append(sorted_intarna)
        # Getting first row ... with highest energy values ... and appending to DataFrame 
        useful_intarna = useful_intarna.append(sorted_intarna[intarna_columns].iloc[0:1])
        
        # Moving to Temp directory to work on fasta files 
        currentWD = os.getcwd()
        os.chdir('/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/BioPython/Temp')

        # Running script for getting probabilities from RNAfold output file (added to ArchBin btw)
        readProb = subprocess.Popen("rnaFold_prob.sh", shell=True, stdout=subprocess.PIPE)
        returnedProb = readProb.stdout.read()
        # Waiting for last command to finish before storing value in temp.out file 
        readProb.wait()
        # Append frequences ... convert to percentage 
        rnaFold_prob.append(float(returnedProb)*100)

        
        # Removing temp.out after finishing each run 
        os.system('rm -rf temp.out')
        os.system('rm -rf temp.csv')
        # Return to initial working directory 
        os.chdir(currentWD)

    # Removing files generated by RNAfold 
    os.system('rm -rf *ss.ps')
    os.system('rm -rf *dp.ps')
    # Removing temp fasta files 
    resetCommand = 'cd ' + pathTemp + ' && rm -rf *'
    os.system(resetCommand)
    
    # Adding RNA fold mfe ensemble frequency to sequenceMetrics 
    sequenceMetrics['mfeFreq'] = rnaFold_prob
    # Have to reset index since useful_intarna is slice of many pd.DataFrames 
    out_sequenceMetrics = pd.concat([sequenceMetrics.reset_index(), useful_intarna.reset_index()], axis = 1)
    
    return out_sequenceMetrics 

In [None]:
higherSequenceMetrics = output_RNApred(all_sequenceMetrics)
higherSequenceMetrics

In [None]:
outputPath = os.getcwd() + '/' + outputFile

In [None]:
outputPath

In [None]:
intarnaOutput = pd.read_csv(outputPath, sep = ';')

In [None]:
intarnaOutput

In [None]:
rnaFold_prob

In [None]:
# Add as column in sequence metrics dataframe 
sequenceMetrics['rnaFoldProb'] = rnaFold_prob
# Concatenate IntaRNA output as additional columns 
pd.concat(sequenceMetrics, useful_intarna, axis = 1)
sequenceMetrics

In [None]:
temp_metrics.sort_values('rnaFoldProb', ascending=True).iloc[0]

In [None]:
sortedMetrics.loc[10, :]

In [None]:
sortedMetrics

In [None]:
sortedMetrics[['Exon', 'numTGG']]

# Outputing sesRNA

In [None]:
# Generating BioPython directory if does not exist 
pathlib.Path('Output/BioPython').mkdir(parents=True, exist_ok=True)

# Generate SeqRecord object for each sequence and append to list 
outputID = geneName + '_sesRNA'
outputDescription = "sesRNA for " + geneName

# Generating sequence record objects (for seperate storage)
outputSeqMulti_DNA = []
outputSeqMulti_RNA = []
for i in rC_multiExon_sesRNAs:
    outputSeqMulti.append(SeqRecord(i, id = outputID, description = outputDescription))
    
# Write output fasta files 
version = "V6"
outputName = "Output/BioPython/" + geneName + "_sesRNA_" + version + ".fasta" 
with open(outputName, "w") as output_handle:
    SeqIO.write(outputSeqMulti, output_handle, "fasta")

In [None]:
# Given sequence ... converts to in frame TGGs to TAGs and in frame stops so that first 'T' becomes 'G'
# Had to be careful to only work with in frame codons ... initally had made the mistake to just use string.replace ... this would change out of frame codons as well 
def convert_DNA(sequence, numberConvert):
    # Converting to string object for manipulation 
    strSeq = str(sequence)
    # Generating in frame object variables 
    num_inF_TGG, num_inF_ATG, num_inF_Stop, indicesTGG, indicesATG, indicesStop = return_inFrame(Seq(strSeq), 'all')
    print(num_inF_TGG)
    # print(num_inF_Stop)

    # Replacing in frame stop codons in sequence 
    for stop in indicesStop: 
        stopPairs = [("TAG", "GAG"), ("TAA", "GAA"), ("TGA", "GGA")]
        stopSeq = strSeq[stop:stop+3]
        [stopSeq := stopSeq.replace(a, b) for a, b in stopPairs]
        strSeq = strSeq[:stop] + stopSeq + strSeq[stop+3:]
    
    # Setting number convert to all if 'All' selected as number of TGG to convert 
    if numberConvert == 'All': numberConvert = num_inF_TGG
    # Converting TGG's ... up to number set ... and in order from starting with most central 
    # Sorts indicees by distance from center 
    sorted_indices_centralTGG = np.array(sorted(indicesTGG - (len(strSeq)/2), key = abs)) + (len(strSeq)/2)
    # Converts in frame TGG's ... starting from most central TGG ... up to limit set by numberConvert 
    for i in range(numberConvert):
        currentIndex = int(sorted_indices_centralTGG[i])
        strSeq = strSeq[:currentIndex] + 'TAG' + strSeq[currentIndex+3:]
    # Returns RNA 
    return Seq(strSeq).transcribe()

In [None]:
testSeq = Seq('TGGGAGTAGTGGTGGTAATGA')
testStr = str(testSeq)

In [None]:
convert_DNA(testSeq, 1)

In [None]:
# Test that all and numbering is working 
convert_DNA(testSeq, 3) == convert_DNA(testSeq, 'All')

In [None]:
convert_DNA(rC_multiExon_sesRNAs[0], 'All')

In [None]:
# Testing that continious reading frame being produced if number set to 0 
len(convert_DNA(rC_multiExon_sesRNAs[0], 0).translate(to_stop = True)) == len(rC_multiExon_sesRNAs[0])/3

In [None]:
# Checking that right number of TGG being converted 
return_inFrame(convert_DNA(rC_multiExon_sesRNAs[0], 'All').back_transcribe(), 'numTGG')

In [None]:
convert_DNA(testSeq, 1) - (len(testSeq)/2)

In [None]:
abs(convert_DNA(testSeq, 1) - (len(testSeq)/2))

In [None]:
np.sort(abs(convert_DNA(testSeq, 1) - (len(testSeq)/2))) + (len(testSeq)/2)

In [None]:
np.array(sorted(convert_DNA(testSeq, 1) - (len(testSeq)/2), key = abs)) + (len(testSeq)/2)

In [None]:
# Function for saving both original template DNA sequence and the converted RNA given list of sesRNAs Seq objects
# Could change later to make for separate output files ... have not decided yet whether to just put them together into one labeled fasta file 
def save_sesRNAs(sequences_sesRNAs, geneName, version, numConvertTGG):
    # Generating BioPython directory if does not exist 
    pathlib.Path('Output/BioPython').mkdir(parents=True, exist_ok=True)
    
    # Defining save name and description for outputs (original DNA and converted RNA)
    DNA_outputID = geneName + '_sesRNA'
    DNA_outputDescription = "sesRNA DNA original for " + geneName
    convertedRNA_outputID = geneName + '_sesRNA'
    convertedRNA_outputDescription = "sesRNA converted RNA for " + geneName
    
    # Creating empty list for storing sequences 
    outputSeqMulti_DNA = []
    outputSeqMulti_convertedRNA= []
    
    # Generating SeqRecord objects in preparation for writing FASTA files ... edited description to include number of sequence
    i = 1
    for sequence in sequences_sesRNAs:
        outputSeqMulti_DNA.append(SeqRecord(sequence, id = DNA_outputID+str(i), description = DNA_outputDescription))
        outputSeqMulti_convertedRNA.append(SeqRecord(convert_DNA(sequence, numberConvertTGG), id = convertedRNA_outputID+str(i), description = convertedRNA_outputDescription))
        i += 1
        
    # Write output fasta files 
    DNA_outputName = "Output/BioPython/" + geneName + "_sesRNA_" + version + ".fasta" 
    convertedRNA_outputName = "Output/BioPython/" + geneName + "_sesRNA_convertedRNA_" + version + ".fasta" 
    
    with open(outputName, "w") as output_handle:
        # Writing original template DNA files 
        SeqIO.write(outputSeqMulti_DNA, output_handle, "fasta")
        # Writing orginal template RNA files 
        SeqIO.write(outputSeqMulti_convertedRNA, output_handle, "fasta")

# Not working or not being used 

In [None]:
sys.path.append("/usr/share/ViennaRNA")

In [None]:
# Leading RNAfold as RNA 
import sys
sys.path.append("/usr/lib/python3.9/site-packages/RNA")
import _RNA as RNA

In [None]:
import pyseqlib

In [None]:
from pyseqlib import pyRNAfold

In [None]:
pyRNAfold i

In [None]:
pyseqlib.fold_compound(sequence)

In [None]:
RNA.fold_compound(sequence)

In [None]:
sys.path.append("/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Packages/RNA")
import _RNA as RNA

In [None]:
sys.path.append("/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions")
import RNA_Fold

In [None]:
md = RNA.md()

In [None]:
import example

In [None]:
sequence = "CGCAGGGAUACCCGCG"
 
# create new fold_compound object
fc = RNA.fold_compound(sequence)
 
# compute minimum free energy (mfe) and corresponding structure
(ss, mfe) = fc.mfe()
 

In [None]:
# The RNA sequence
seq = "GAGUAGUGGAACCAGGCUAUGUUUGUGACUCGCAGACUAACA"
 
# compute minimum free energy (MFE) and corresponding structure
(ss, mfe) = RNA.fold(seq)

In [None]:
import pickle 

In [None]:
mfe

In [None]:
pathOuput = "/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/seqObject.p"
pickle.dump(testSeq, open(pathOuput, 'wb'))

In [None]:
# Running script for getting probabilities from RNAfold output file (added to ArchBin btw)
pathFuncPython = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/'
# command = 'python ' + '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/RNA_Fold.py'
# command = 'cd ' + pathFuncPython + ' | ./RNA_Fold.py'
# command = 'cd ' + pathFuncPython + ' | ls -a'

command = 'ls -a'
readProb = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
returnedProb = readProb.stdout.read()

In [None]:
returnedProb

In [None]:
command 

In [None]:
str(rC_multiExon_sesRNAs[0])

In [None]:
testSeq = str(rC_multiExon_sesRNAs[0])
testSeq

In [None]:
RNA.fold(testSeq)

In [None]:
def compute_mfeFreq(sequence):
    # create a fold_compound object for the current sequence
    fc = RNA.fold_compound(sequence)

    # compute the MFE and corresponding structure
    (mfe_struct, mfe) = fc.mfe()


    # compute partition function
    (bp_propensity, dG) = fc.pf()

    # compute frequency of MFE structure (the 'hard' way)
    kT = RNA.exp_param().kT / 1000.

    prob_mfe = math.exp((dG - mfe) / kT)
    
    return prob_mfe

In [None]:
rC_multi_startSeq

In [None]:
rC_multiExon_sesRNAs

In [None]:
str(rC_multiExon_sesRNAs[8])

In [None]:
str(rC_multiExon_sesRNAs[18])

In [None]:
from seqfold import dg, dg_cache, fold

In [None]:
# just returns minimum free energy
dg("GGGAGGTCGTTACATCTGGGTAACACCGGTACTGATCCGGTGACCTCCC", temp = 37.0)  # -12.94

# Test Intarna (test strength of binding, off-target, ...)  

In [None]:
import intarnapvalue

In [None]:
testSeq = str(rC_multiExon_sesRNAs[0])
testSeq

In [None]:
target = str(CDS[0].seq)
target 

In [None]:
from intarnapvalue.intarna_pvalue import IntaRNApvalue

In [None]:
?IntaRNApvalue

In [None]:
?intarna_pvalue

In [None]:
IntaRNApvalue(['--query', testSeq, '--target', target])

In [None]:
IntaRNApvalue['-q', 'AGGAUG', '-t', 'UUUAUCGUU', '-s', '10', '-m', 'b', '-d', 'gauss', '--threads', '3']

In [None]:
command = 'python -m intarnapvalue --query GCUGAAAAACAUAACCCAUAAAAUGCUAGCUGUACCAGGAACCA --target GGUUUCUUCGCCUCUGCGUUCACCAAAGUGUUCACCC -s 10 --shuffle-mode b --threads 0' 

In [None]:
readProb = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
readProb.wait()
returnedProb = readProb.stdout.read()
returnedProb

In [None]:
# Running script for getting probabilities from RNAfold output file (added to ArchBin btw)
readProb = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
returnedProb = readProb.stdout.read()
# Waiting for last command to finish before storing value in temp.out file 
readProb.wait()

In [None]:
pd.read_csv('/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/output.csv', sep = ';')

In [None]:
pd.read_csv('/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/output.csv', sep = ';')

In [None]:
pd.read_csv('/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/output.csv', sep = ';')

In [None]:
# Trying to load entire transcripome ... obviously failed ... 
allRat_fileName = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Data/Sequences/Reference/Ensembl/All/Rattus_norvegicus.Rnor_6.0.cdna.all.fa'
allRat = list(SeqIO.parse(allRat_fileName, "fasta"))

In [None]:
python -m intarnapvalue --query GCUGAAAAACAUAACCCAUAAAAUGCUAGCUGUACCAGGAACCA --target GGUUUCUUCGCCUCUGCGUUCACCAAAGUGUUCACCC --scores 10000 --shuffle-mode b --threads 0

In [None]:
intarnapvalue --query GCUGAAAAACAUAACCCAUAAAAUGCUAGCUGUACCAGGAACCA --target GGUUUCUUCGCCUCUGCGUUCACCAAAGUGUUCACCC --scores 10000 --shuffle-mode b --threads 0

# Misc

In [None]:
# For returning index of findings 
searchCodon = 'TAG'
[m.start() for m in re.finditer(searchCodon, str(seq))]

In [None]:
testSeq = sesRNAs[0]
testSeq

In [None]:
lastATG = [m.start() for m in re.finditer('ATG', str(testSeq))][-1]
lastTGG = [m.start() for m in re.finditer('TGG', str(testSeq))][-1]

In [None]:
lastTGG

In [None]:
lastATG

In [None]:
lastATG < lastTGG

In [None]:
searchCodon = 'ATG'
[m.start() for m in re.finditer(searchCodon, str(testSeq))][-1]

In [None]:
seq.count('TAG') < 4

In [None]:
seq[0:100].count('TAG')

In [None]:
stopCodons = ['TAG', 'TAA', 'TGA']
stopCodons 

In [None]:
indiciesTGG

In [None]:
indiciesStop

In [None]:
length = 200 
center = length/2

In [None]:
arrayStop = np.array(indiciesStop)
arrayIndicies = np.array(indiciesTGG) 
centralTGGs = arrayIndicies[abs(arrayIndicies - center) < 10]

In [None]:
centralTGGs

In [None]:
np.in1d(centralTGGs,arrayStop)

In [None]:
# Check if array contains values that are within range of values in another array 
any((min(abs(arrayStop - i)) > 10) for i in centralTGGs)

In [None]:
centralTGGs

In [None]:
indiciesStop

In [None]:
testStop = [90, 16, 174]

In [None]:
(min(abs(arrayStop - centralTGGs[0])) > 10)

In [None]:
min(abs(arrayStop - centralTGGs[0])) > 10

In [None]:
centeralTGGs = offset.min()
centeralTGGs

In [None]:
centralTGGs = np.all(offset == offset.min())
centralTGGs

In [None]:
centeralTGGs = np.where(offset == offset.min())
centeralTGG

In [None]:
offset = abs(arrayIndicies - center) 
centerTGG = indiciesTGG[np.argmin(offset)]

In [None]:
any(abs(x - centerTGG) < 10 for x in indiciesStop)

In [None]:
indiciesStop = []
for codons in stopCodons:
    indiciesStop.extend([m.start() for m in re.finditer(codons, str(testSeq))])

In [None]:
len(indiciesStop)

In [None]:
[m.start() for m in re.finditer('TGA', str(testSeq))]

In [None]:
[m.start() for m in re.finditer('TAA', str(testSeq))]

In [None]:
[m.start() for m in re.finditer('TAG', str(testSeq))]

In [None]:
testSeq.count(stopCodons)

In [None]:
indiciesTGG

In [None]:
len(indiciesTGG)

In [None]:
testSeq = sesRNAs[0]

In [None]:
lastTGG = [m.start() for m in re.finditer('TGG', str(testSeq))][-1]

In [None]:
testSeq

In [None]:
indiciesTGG = [m.start() for m in re.finditer('TGG', str(testSeq))]

In [None]:
start = 0 
stop = 200

In [None]:
middle = (start + stop) / 2

In [None]:
abs(middle - indiciesTGG[0])

In [None]:
type(indiciesTGG)

In [None]:
indiciesTGG

In [None]:
any(indiciesTGG) > 2

In [None]:
length = 200 

In [None]:
any(abs(x - (length/2)) < 20 for x in indiciesTGG)

In [None]:
testList = [50, 60, 170, 200]

In [None]:
any(abs(x - (length/2)) < 10 for x in testList)

In [None]:
testSeq

In [None]:
os.path.isdir('Output/BioPython')

In [None]:
outputFileName = os.getcwd() + ''

In [None]:
testSeq