# Setup 

## Import packages 

In [1]:
# General 
import os 
import numpy as np
import pandas as pd 
# For running bash scripts from inside python ... 
import subprocess
# For manipulating string objects 
import re
# for generating any necessary directories
import pathlib 
# For manipulating list objects 
import itertools 

In [2]:
# For working with sequence objects 
from Bio.Seq import Seq

In [3]:
# For fetching sequences from Entrez 
from Bio import Entrez
from Bio import SeqIO

In [4]:
# For extracting features 
from Bio.SeqFeature import SeqFeature, FeatureLocation
# For creating SeqRecord objects 
from Bio.SeqRecord import SeqRecord

In [5]:
# For running R cells 
%load_ext rpy2.ipython

## Misc

In [45]:
geneName = 'PlxnD1'
species = 'Rat'

Entrez.email = "kehaliwoldemichael@gmail.com"  # Always tell NCBI who you are

# Functions 

In [39]:
import sys
# Importing module of personal functions 
sys.path.append('/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/kCellReadR/')
from kCellReadR import * 

# Sequence

## Loading sequences 

In [46]:
rC_exon_records, C_exon_records, CDS, cDNA = load_referenceSequences(geneName, species)

In [47]:
CDS

[SeqRecord(seq=Seq('ATGCTCAACGTGGCCGCCAACCACCCCAACGCGTCCACCGTGGGACTAGTGCTG...TGA'), id='Plxnd1', name='Plxnd1', description='Plxnd1', dbxrefs=[])]

In [48]:
cDNA

[SeqRecord(seq=Seq('CTGCCGCTGCCTCTGCTGCTGCTGCTTGGGGCGGCACGGGCGGGCGCCCTAGAG...ACC'), id='Plxnd1', name='Plxnd1', description='Plxnd1', dbxrefs=[])]

In [9]:
len(CDS[0].seq)

1368

In [10]:
len(rC_exon_records[2].seq)

890

In [11]:
seq_record = rC_exon_records[1]

In [12]:
metric_gcContent(seq_record.seq)

0.5761316872427984

In [13]:
seq_record

SeqRecord(seq=Seq('CTTTTTCCCCCACCGCCAAGGAGATGCGTTCCGAGCCATGCAGCGTGTCTCTTC...CTA'), id='Fezf2', name='Fezf2', description='Fezf2', dbxrefs=[])

In [14]:
seq = seq_record.seq
len(seq)

243

## Selecting sensor 

In [15]:
parameters = parameters_sesRNA('Reverse', 0, 204, 3, 0, 'None', 30, 70, 20, 10)

In [16]:
parameters.print_parameters()

[instance attributes]
typeSeq = Reverse
isoform = 0
length = 204
num_inF_TGG = 3
num_inF_Stop = 0
num_inF_ATG = None
minGC = 30
maxGC = 70
nearCenter = 20
fromStop = 10


In [17]:
seq_record = rC_exon_records[0]

In [18]:
seq = seq_record.seq
len(seq)

671

In [19]:
sesRNAs, sequenceMetrics, sesRNA_objs = generate_sesRNA(seq, CDS, parameters, 4)
sesRNAs

[]

In [20]:
sesRNA_objs

[]

In [21]:
sequenceMetrics

Unnamed: 0,Exon,StartSeq,StopSeq,firstTGG,centralTGG,second_cTGG,numTGG,numATG,numStop,gcContent


In [22]:
rC_parameters = parameters_sesRNA('Reverse', 0, 204, 3, 0, 'None', 30, 70, 20, 20)
rC_multiExon_sesRNAs, rC_sequenceMetrics, rC_sesRNA_objs = generate_sesRNAs_multiExon(rC_exon_records, CDS, rC_parameters)

0
0
24


In [23]:
rC_parameters.print_parameters()

[instance attributes]
typeSeq = Reverse
isoform = 0
length = 204
num_inF_TGG = 3
num_inF_Stop = 0
num_inF_ATG = None
minGC = 30
maxGC = 70
nearCenter = 20
fromStop = 20


In [24]:
rC_multiExon_sesRNAs[0]

Seq('ATAGGAAGCTGGGTGGGGGAACTTGTCCGCAGTCAGGCTGGCCAGTTTGGCATT...GGC')

In [25]:
rC_sequenceMetrics

Unnamed: 0,SeqNumber,Exon,StartSeq,StopSeq,firstTGG,centralTGG,second_cTGG,numTGG,numATG,numStop,gcContent
0,1,3,156.0,360.0,9.0,120.0,150.0,4.0,0.0,0.0,59.313725
1,2,3,159.0,363.0,6.0,117.0,147.0,4.0,0.0,0.0,60.784314
2,3,3,162.0,366.0,3.0,114.0,144.0,4.0,0.0,0.0,60.784314
3,4,3,165.0,369.0,0.0,111.0,141.0,4.0,0.0,0.0,61.27451
4,5,3,168.0,372.0,108.0,108.0,138.0,3.0,0.0,0.0,61.27451
5,6,3,171.0,375.0,105.0,105.0,135.0,3.0,0.0,0.0,61.27451
6,7,3,174.0,378.0,102.0,102.0,132.0,3.0,0.0,0.0,60.784314
7,8,3,177.0,381.0,99.0,99.0,129.0,3.0,0.0,0.0,60.784314
8,9,3,180.0,384.0,96.0,96.0,126.0,3.0,0.0,0.0,60.784314
9,10,3,183.0,387.0,93.0,93.0,123.0,3.0,0.0,0.0,60.784314


In [26]:
testOut = str(rC_multiExon_sesRNAs[23])
testDash = 'CTTGGGGTGAGCAGCCAGGGAAGTGGGGGCCTGTGCGTTGAGGAGGCCAGATGGGAAAAGGTGGCCTCCGAGGAGCTCCGATGGTGGGTAAGTGGTGGAGTCCAGGTAGTTGAAGTAGTAGAGAGAGCCGCTGGCCGGCAGCCCCACAGCCTGGTTGATGACCTGCGGCTTGATGACCCTGCCGGCGGGCAGCGCAGAAGGCGC'

In [27]:
testOut == testDash

True

In [28]:
rC_exon_records 

[SeqRecord(seq=Seq('TAGTGGTTCTGTTTATTGAGTCATATATGTGTAATATTCCGTGTTCGCTTGTAC...TCC'), id='Fezf2', name='Fezf2', description='Fezf2', dbxrefs=[]),
 SeqRecord(seq=Seq('CTTTTTCCCCCACCGCCAAGGAGATGCGTTCCGAGCCATGCAGCGTGTCTCTTC...CTA'), id='Fezf2', name='Fezf2', description='Fezf2', dbxrefs=[]),
 SeqRecord(seq=Seq('CTTGCCGCACACTTCGCAGGTGAAGTTTTTGGGTTTGCTGTCAGTAGAGCCCCC...AGT'), id='Fezf2', name='Fezf2', description='Fezf2', dbxrefs=[])]

In [29]:
rC_sesRNA_objs[0].second_cTGG

150

In [30]:
parameters.print_parameters()

[instance attributes]
typeSeq = Reverse
isoform = 0
length = 204
num_inF_TGG = 3
num_inF_Stop = 0
num_inF_ATG = None
minGC = 30
maxGC = 70
nearCenter = 20
fromStop = 10


In [31]:
# Just for quickly checking things ... 
all_indices_inF_TGG = []
all_indices_inF_ATG = []
    
for sequence in rC_multiExon_sesRNAs:
    num_inF_TGG, num_inF_ATG, num_inF_Stops, indicesTGG, indicesATG, indicesStops = return_inFrame(sequence, 'all')
    all_indices_inF_TGG.append(indicesTGG)
    all_indices_inF_ATG.append(indicesATG)
    print(indicesTGG)
    # print(num_inF_TGG)

[  9 120 150 153]
[  6 117 147 150]
[  3 114 144 147]
[  0 111 141 144]
[108 138 141]
[105 135 138]
[102 132 135]
[ 99 129 132]
[ 96 126 129]
[ 93 123 126]
[ 90 120 123]
[ 87 117 120]
[ 84 114 117]
[ 81 111 114]
[ 78 108 111]
[ 75 105 108]
[ 72 102 105]
[ 69  99 102]
[66 96 99]
[63 93 96]
[60 90 93]
[57 87 90]
[54 84 87]
[51 81 84]


In [32]:
max(rC_sequenceMetrics['StartSeq'])

225.0

In [33]:
# Just looking at 'window' in which sesRNAs are being produced 
max(rC_sequenceMetrics['StopSeq']) - min(rC_sequenceMetrics['StartSeq'])

273.0

In [34]:
# Checking GC content of sesRNAs 
for sequence in rC_multiExon_sesRNAs:
    print(metric_gcContent(sequence))

0.5931372549019608
0.6078431372549019
0.6078431372549019
0.6127450980392157
0.6127450980392157
0.6127450980392157
0.6078431372549019
0.6078431372549019
0.6078431372549019
0.6078431372549019
0.6029411764705882
0.6127450980392157
0.6078431372549019
0.6029411764705882
0.5980392156862745
0.5980392156862745
0.6127450980392157
0.6127450980392157
0.6274509803921569
0.6274509803921569
0.6323529411764706
0.6274509803921569
0.6372549019607843
0.6470588235294118


In [None]:
# Just additional check if in CDS 
cds_sesRNAs = return_inCDS(rC_multiExon_sesRNAs, CDS, 0, 'Reverse')
cds_sesRNAs

In [None]:
# Checking complement
C_parameters = parameters_sesRNA('Complement', 0, 204, 1, 1, 'All upstream')
C_multiExon_sesRNAs, C_sequenceMetrics, C_sesRNA_objs = generate_sesRNAs_multiExon(C_exon_records, CDS, C_parameters)

In [None]:
C_sequenceMetrics 

In [None]:
C_multiExon_sesRNAs

In [None]:
# Initial if want to check variable length sesRNAs 
for i in range(200, 300):
    if(i%3 == 0):
        print(i)

In [None]:
testSeq = sesRNAs[0]
testSeq

In [None]:
str(testSeq)

In [None]:
?split()

In [None]:
subsequence = str(testSeq)
codons = [subsequence for subsequence in re.split(r'(\w{3})', subsequence) if subsequence]

In [None]:
tempSeq = Seq('GTTCTCCTTCAGCACCTGCTCCAGCGGCGCATGCAAGCGCTCCTTATGGGGATAGGAAGCTGGGTGGGGGAACTTGTCCGCAGTCAGGCTGGCCAGTTTGGCATTCTCCAGCAGAAAAAGCTTGGGGTGAGCAGCCAGGGAAGTGGGGGCCTGTGCGTTGAGGAGGCCAGATGGGAAAAGGTGGCCTCCGAGGAGCTCCGATGG')

In [None]:
check_cORF(tempSeq)

In [None]:
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")

In [None]:
check_cORF(coding_dna)

# Secondary structure 

## Working 

In [None]:
def generate_RNApred(sesRNAs_DNA, sequenceMetrics):
    """Fold every sesRNA and store its minimum free energy in ``sequenceMetrics``.

    The DNA sesRNAs are transcribed to RNA, the ``Output/BioPython/Temp``
    folder is emptied, one temporary FASTA file per sesRNA is written there,
    and the minimum free energies are added as the ``'mfe'`` column.

    Parameters
    ----------
    sesRNAs_DNA : list
        DNA sesRNA sequences; each element must provide ``.transcribe()``.
    sequenceMetrics : pandas.DataFrame
        Metrics table that receives the ``'mfe'`` column (modified in place).
        Presumably one row per sesRNA, in the same order as ``sesRNAs_DNA``
        -- TODO confirm against the caller.

    Returns
    -------
    pandas.DataFrame
        The same ``sequenceMetrics`` object, now with the ``'mfe'`` column.
    """
    import shutil

    # Loading RNAfold as RNA (path is machine specific); only extend sys.path once
    viennaPath = "/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Packages/ViennaRNA_Python3/usr/lib/python3.9/site-packages/RNA"
    if viennaPath not in sys.path:
        sys.path.append(viennaPath)
    import _RNA as RNA

    # Generating Temp
    tempDir = pathlib.Path('Output/BioPython/Temp')
    tempDir.mkdir(parents=True, exist_ok=True)

    # Generating RNA of sesRNA
    sesRNAs_RNA = return_sesRNA_RNA(sesRNAs_DNA)

    # Just making sure to clear Temp folder before starting
    for leftover in tempDir.iterdir():
        if leftover.is_dir() and not leftover.is_symlink():
            shutil.rmtree(leftover)
        else:
            leftover.unlink()

    # Creating temporary fasta files of sesRNAs (RNA)
    all_mfe = output_temp_sesRNA(sesRNAs_RNA)
    if all_mfe is None:
        # The helper wrote the files but did not hand back the energies,
        # so fold here to obtain them.
        all_mfe = [RNA.fold(str(sesRNA))[1] for sesRNA in sesRNAs_RNA]

    # Add as column in sequence metrics dataframe
    sequenceMetrics['mfe'] = all_mfe
    return sequenceMetrics

In [None]:
# Converting to RNA for calculating secondary structure
def return_sesRNA_RNA(sesRNAs_DNA):
    """Transcribe each DNA sesRNA into RNA.

    Parameters
    ----------
    sesRNAs_DNA : iterable
        DNA sequences; each element must provide a ``.transcribe()`` method.

    Returns
    -------
    list
        The transcribed sequences, in the same order as the input.
    """
    # The original indexed `sesRNA_DNA`, a name that does not exist (NameError);
    # iterate over the actual parameter instead.
    return [sesRNA_DNA.transcribe() for sesRNA_DNA in sesRNAs_DNA]

In [None]:
def output_temp_sesRNA(sesRNAs_RNA):
    """Fold each RNA sesRNA and write it to its own temporary FASTA file.

    Each sequence is written to ``Output/BioPython/Temp/<geneName>_<NN>.fasta``,
    where ``NN`` is the zero-padded, 1-based position of the sequence in
    ``sesRNAs_RNA``.

    NOTE(review): relies on the notebook-level names ``RNA`` (ViennaRNA
    bindings), ``geneName``, ``SeqRecord`` and ``SeqIO``; ``RNA`` in particular
    must have been imported globally before calling this function.

    Parameters
    ----------
    sesRNAs_RNA : list
        RNA sequences to fold and write out.

    Returns
    -------
    list
        The minimum free energy reported by ``RNA.fold`` for each sequence,
        in input order.
    """
    outputDir = 'Output/BioPython/Temp/'
    pathlib.Path(outputDir).mkdir(parents=True, exist_ok=True)

    # Zero-pad the running number so that the files sort in processing order;
    # at least two digits, wider when there are 100 or more sequences.
    padWidth = max(2, len(str(len(sesRNAs_RNA))))

    all_mfe = []
    for i, sesRNA in enumerate(sesRNAs_RNA, start=1):
        (ss, mfe) = RNA.fold(str(sesRNA))
        all_mfe.append(mfe)

        numSes = str(i).zfill(padWidth)

        # Defining output name
        outputName = geneName + '_' + numSes
        outputDescription = "sesRNA #" + numSes

        # Writing sequences as separate fasta files
        outputRecord = SeqRecord(sesRNA, id = outputName, description = outputDescription)
        outputFull = outputDir + outputName + '.fasta'

        with open(outputFull, "w") as output_handle:
            SeqIO.write(outputRecord, output_handle, "fasta")

    # Hand the energies back so the caller can add them to the metrics table
    return all_mfe

In [None]:
def output_RNApred(targetFasta=None, return_all=False):
    """Run RNAfold and IntaRNA on every temporary sesRNA FASTA file.

    For each ``*.fasta`` file in the temporary folder (processed in name
    order) this:

    1. runs ``RNAfold -p -d2 --noLP`` with the file on stdin and stores the
       output in ``temp.out``;
    2. runs ``IntaRNA`` with the file as query against ``targetFasta`` and
       stores the CSV output (``--outMode=C``) in ``temp.csv``;
    3. runs ``rnaFold_prob.sh`` inside the temporary folder and records the
       number it prints, multiplied by 100 (presumably the script reads
       ``temp.out`` -- TODO confirm);
    4. reads ``temp.csv``, sorts it by ``E`` and collects the result.

    Parameters
    ----------
    targetFasta : str, optional
        Path of the target FASTA handed to ``IntaRNA -t``. When omitted, the
        notebook-level variable ``CDS_fileName`` is used.
    return_all : bool, optional
        When False (default) only the IntaRNA table of the last processed file
        is returned. When True a tuple
        ``(rnaFold_prob, useful_intarna, all_sorted_intarna)`` is returned.

    Returns
    -------
    pandas.DataFrame or tuple
        See ``return_all``. The default result is an empty DataFrame when the
        temporary folder holds no FASTA files.
    """
    rnaFold_prob = []

    # NOTE(review): the original listed 'start', which is not a column IntaRNA
    # writes; 'start1' matches the other names here -- confirm the CSV header.
    intarna_columns = ['E', 'start1', 'end1', 'start2', 'end2']
    all_sorted_intarna = []
    useful_frames = []
    intarnaOutput = pd.DataFrame()

    if targetFasta is None:
        # Falling back to the notebook-level target file
        targetFasta = CDS_fileName

    pathTemp = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/BioPython/Temp'
    pathOutTempFold = pathTemp + '/temp.out'
    pathOutTempIntaRNA = pathTemp + '/temp.csv'

    # Sorting files in output of scandir; only FASTA files are sequences, so
    # leftover temp.out / temp.csv files are never fed to the tools
    fastaEntries = sorted(
        (entry for entry in os.scandir(pathTemp)
         if entry.is_file() and entry.name.endswith('.fasta')),
        key=lambda e: e.name,
    )

    for entry in fastaEntries:
        # For checking which file currently working on
        print(entry.path)

        # Generating RNAfold predictions (argument list + file handles instead
        # of a shell string, so paths with spaces are handled correctly)
        with open(entry.path) as fastaIn, open(pathOutTempFold, 'w') as foldOut:
            subprocess.run(['RNAfold', '-p', '-d2', '--noLP'],
                           stdin=fastaIn, stdout=foldOut, check=True)

        # Generating IntaRNA predictions
        numThreads = 10
        with open(pathOutTempIntaRNA, 'w') as intarnaOut:
            subprocess.run(['IntaRNA', '-t', str(targetFasta), '-q', entry.path,
                            '--threads', str(numThreads), '--outMode=C'],
                           stdout=intarnaOut, check=True)

        # Running script for getting probabilities from RNAfold output file
        # (added to ArchBin btw). It is run inside the Temp directory via
        # `cwd`, so the notebook's working directory is never changed.
        readProb = subprocess.run("rnaFold_prob.sh", shell=True, cwd=pathTemp,
                                  stdout=subprocess.PIPE, check=True)
        # Append frequencies ... convert to percentage
        rnaFold_prob.append(float(readProb.stdout.decode().strip()) * 100)

        # Reading in csv file that IntaRNA just produced
        intarnaOutput = pd.read_csv(pathOutTempIntaRNA, sep = ';')
        sorted_intarna = intarnaOutput.sort_values('E', ascending=True)
        all_sorted_intarna.append(sorted_intarna)
        # Collecting select columns for inclusion into sequence metrics
        useful_frames.append(sorted_intarna[intarna_columns])

        # Removing temp.out and temp.csv after finishing each run
        for tempFile in (pathOutTempFold, pathOutTempIntaRNA):
            if os.path.exists(tempFile):
                os.remove(tempFile)

    # Removing files generated by RNAfold in the current working directory
    for pattern in ('*ss.ps', '*dp.ps'):
        for psFile in pathlib.Path.cwd().glob(pattern):
            if psFile.is_file():
                psFile.unlink()

    # Concatenate once, after the loop, instead of growing a frame per file
    if useful_frames:
        useful_intarna = pd.concat(useful_frames, ignore_index=True)
    else:
        useful_intarna = pd.DataFrame(columns = intarna_columns)

    if return_all:
        return rnaFold_prob, useful_intarna, all_sorted_intarna
    return intarnaOutput

In [None]:
outputPath = os.getcwd() + '/' + outputFile

In [None]:
outputPath

In [None]:
intarnaOutput = pd.read_csv(outputPath, sep = ';')

In [None]:
intarnaOutput

In [None]:
rnaFold_prob

In [None]:
# Add as column in sequence metrics dataframe 
sequenceMetrics['rnaFoldProb'] = rnaFold_prob
# Concatenate IntaRNA output as additional columns 
# (pd.concat takes a list of frames and returns a new frame, so the result
# has to be assigned; rows are aligned on the index)
sequenceMetrics = pd.concat([sequenceMetrics, useful_intarna], axis = 1)
sequenceMetrics

In [None]:
temp_metrics.sort_values('rnaFoldProb', ascending=True).iloc[0]

In [None]:
sortedMetrics.loc[10, :]

In [None]:
sortedMetrics

In [None]:
sortedMetrics[['Exon', 'numTGG']]

# Outputting sesRNA

In [None]:
# Generating BioPython directory if does not exist 
pathlib.Path('Output/BioPython').mkdir(parents=True, exist_ok=True)

# Generate SeqRecord object for each sequence and append to list 
outputID = geneName + '_sesRNA'
outputDescription = "sesRNA for " + geneName

# Generating sequence record objects (for separate storage)
outputSeqMulti_DNA = []
outputSeqMulti_RNA = []
for i in rC_multiExon_sesRNAs:
    # Appending to the DNA list defined above (`outputSeqMulti` never existed)
    outputSeqMulti_DNA.append(SeqRecord(i, id = outputID, description = outputDescription))
    
# Write output fasta files 
version = "V6"
outputName = "Output/BioPython/" + geneName + "_sesRNA_" + version + ".fasta" 
with open(outputName, "w") as output_handle:
    SeqIO.write(outputSeqMulti_DNA, output_handle, "fasta")

In [35]:
# Given sequence ... converts in frame TGGs to TAGs and in frame stops so that first 'T' becomes 'G'
# Had to be careful to only work with in frame codons ... initially had made the mistake to just use string.replace ... this would change out of frame codons as well 
def convert_DNA(sequence, numberConvert):
    """Convert a DNA sesRNA template into its edited RNA form.

    Every in-frame stop codon reported by ``return_inFrame`` is rewritten
    (TAG -> GAG, TAA -> GAA, TGA -> GGA). Then up to ``numberConvert`` in-frame
    TGG codons are rewritten to TAG, starting with the TGG closest to the
    center of the sequence. Finally the sequence is transcribed.

    Parameters
    ----------
    sequence : Seq or str
        DNA sequence to convert.
    numberConvert : int or 'All'
        Number of in-frame TGG codons to convert; ``'All'`` converts every one.
        A number larger than the available in-frame TGGs converts all of them
        instead of raising an IndexError.

    Returns
    -------
    Seq
        The converted sequence, transcribed to RNA.
    """
    # Converting to string object for manipulation 
    strSeq = str(sequence)
    # Generating in frame object variables 
    num_inF_TGG, num_inF_ATG, num_inF_Stop, indicesTGG, indicesATG, indicesStop = return_inFrame(Seq(strSeq), 'all')
    print(num_inF_TGG)

    # Replacing in frame stop codons in sequence (anything else is left as is)
    stopToSense = {"TAG": "GAG", "TAA": "GAA", "TGA": "GGA"}
    for stop in indicesStop:
        stopSeq = strSeq[stop:stop+3]
        strSeq = strSeq[:stop] + stopToSense.get(stopSeq, stopSeq) + strSeq[stop+3:]

    # Sorts indices by distance from center (stable, so ties keep their order).
    # Works for a list as well as for a numpy array of indices.
    center = len(strSeq) / 2
    sorted_indices_centralTGG = sorted((int(index) for index in indicesTGG),
                                       key=lambda index: abs(index - center))

    # Setting number convert to all if 'All' selected as number of TGG to convert 
    if numberConvert == 'All': numberConvert = num_inF_TGG
    # Never ask for more TGGs than are actually available
    numberConvert = min(numberConvert, len(sorted_indices_centralTGG))

    # Converts in frame TGG's ... starting from most central TGG ... up to limit set by numberConvert 
    for i in range(numberConvert):
        currentIndex = sorted_indices_centralTGG[i]
        strSeq = strSeq[:currentIndex] + 'TAG' + strSeq[currentIndex+3:]
    # Returns RNA 
    return Seq(strSeq).transcribe()

In [36]:
testSeq = Seq('TGGGAGTAGTGGTGGTAATGA')
testStr = str(testSeq)

In [37]:
convert_DNA(testSeq, 1)

3


Seq('UGGGAGGAGUAGUGGGAAGGA')

In [None]:
# Test that all and numbering is working 
convert_DNA(testSeq, 3) == convert_DNA(testSeq, 'All')

In [None]:
convert_DNA(rC_multiExon_sesRNAs[0], 'All')

In [None]:
# Testing that a continuous reading frame is produced if number is set to 0 
len(convert_DNA(rC_multiExon_sesRNAs[0], 0).translate(to_stop = True)) == len(rC_multiExon_sesRNAs[0])/3

In [None]:
# Checking that right number of TGG being converted 
return_inFrame(convert_DNA(rC_multiExon_sesRNAs[0], 'All').back_transcribe(), 'numTGG')

In [None]:
convert_DNA(testSeq, 1) - (len(testSeq)/2)

In [None]:
abs(convert_DNA(testSeq, 1) - (len(testSeq)/2))

In [None]:
np.sort(abs(convert_DNA(testSeq, 1) - (len(testSeq)/2))) + (len(testSeq)/2)

In [None]:
np.array(sorted(convert_DNA(testSeq, 1) - (len(testSeq)/2), key = abs)) + (len(testSeq)/2)

In [None]:
# Function for saving both original template DNA sequence and the converted RNA given list of sesRNAs Seq objects
def save_sesRNAs(sequences_sesRNAs, geneName, version, numConvertTGG):
    """Write the sesRNA template DNA and the converted RNA as FASTA files.

    Two files are written into ``Output/BioPython`` (created if missing):

    * ``<geneName>_sesRNA_<version>.fasta`` -- the original DNA sequences
    * ``<geneName>_sesRNA_convertedRNA_<version>.fasta`` -- the output of
      ``convert_DNA`` for each sequence

    Records are numbered from 1 in the order given; a DNA record and its
    converted RNA share the same id and differ in the description.

    Parameters
    ----------
    sequences_sesRNAs : iterable
        sesRNA sequences (Seq objects).
    geneName : str
        Gene name used in record ids, descriptions and file names.
    version : str
        Version tag appended to the file names.
    numConvertTGG : int or 'All'
        Passed to ``convert_DNA`` as the number of in-frame TGGs to convert.
    """
    # Generating BioPython directory if does not exist 
    pathlib.Path('Output/BioPython').mkdir(parents=True, exist_ok=True)
    
    # Defining save name and description for outputs (original DNA and converted RNA)
    DNA_outputID = geneName + '_sesRNA'
    DNA_outputDescription = "sesRNA DNA original for " + geneName
    convertedRNA_outputID = geneName + '_sesRNA'
    convertedRNA_outputDescription = "sesRNA converted RNA for " + geneName
    
    # Creating empty list for storing sequences 
    outputSeqMulti_DNA = []
    outputSeqMulti_convertedRNA = []
    
    # Generating SeqRecord objects in preparation for writing FASTA files ... id includes number of sequence
    for i, sequence in enumerate(sequences_sesRNAs, start=1):
        outputSeqMulti_DNA.append(SeqRecord(sequence, id = DNA_outputID+str(i), description = DNA_outputDescription))
        # The parameter is `numConvertTGG` (the original referenced an undefined name)
        outputSeqMulti_convertedRNA.append(SeqRecord(convert_DNA(sequence, numConvertTGG), id = convertedRNA_outputID+str(i), description = convertedRNA_outputDescription))
        
    # Write output fasta files 
    DNA_outputName = "Output/BioPython/" + geneName + "_sesRNA_" + version + ".fasta" 
    convertedRNA_outputName = "Output/BioPython/" + geneName + "_sesRNA_convertedRNA_" + version + ".fasta" 
    
    # Writing original template DNA file 
    with open(DNA_outputName, "w") as output_handle:
        SeqIO.write(outputSeqMulti_DNA, output_handle, "fasta")
    # Writing converted RNA file (separate file, since the ids repeat those of the DNA records)
    with open(convertedRNA_outputName, "w") as output_handle:
        SeqIO.write(outputSeqMulti_convertedRNA, output_handle, "fasta")

# Not working or not being used 

In [None]:
sys.path.append("/usr/share/ViennaRNA")

In [None]:
# Leading RNAfold as RNA 
import sys
sys.path.append("/usr/lib/python3.9/site-packages/RNA")
import _RNA as RNA

In [None]:
import pyseqlib

In [None]:
from pyseqlib import pyRNAfold

In [None]:
pyRNAfold i

In [None]:
pyseqlib.fold_compound(sequence)

In [None]:
RNA.fold_compound(sequence)

In [None]:
sys.path.append("/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Packages/RNA")
import _RNA as RNA

In [None]:
sys.path.append("/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions")
import RNA_Fold

In [None]:
md = RNA.md()

In [None]:
import example

In [None]:
sequence = "CGCAGGGAUACCCGCG"
 
# create new fold_compound object
fc = RNA.fold_compound(sequence)
 
# compute minimum free energy (mfe) and corresponding structure
(ss, mfe) = fc.mfe()
 

In [None]:
# The RNA sequence
seq = "GAGUAGUGGAACCAGGCUAUGUUUGUGACUCGCAGACUAACA"
 
# compute minimum free energy (MFE) and corresponding structure
(ss, mfe) = RNA.fold(seq)

In [None]:
import pickle 

In [None]:
mfe

In [None]:
pathOuput = "/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/seqObject.p"
# Context manager makes sure the file is flushed and closed after dumping
with open(pathOuput, 'wb') as pickleFile:
    pickle.dump(testSeq, pickleFile)

In [None]:
# Running script for getting probabilities from RNAfold output file (added to ArchBin btw)
pathFuncPython = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/'
# command = 'python ' + '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/RNA_Fold.py'
# command = 'cd ' + pathFuncPython + ' | ./RNA_Fold.py'
# command = 'cd ' + pathFuncPython + ' | ls -a'

command = 'ls -a'
readProb = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
returnedProb = readProb.stdout.read()

In [None]:
returnedProb

In [None]:
command 

In [None]:
str(rC_multiExon_sesRNAs[0])

In [None]:
testSeq = str(rC_multiExon_sesRNAs[0])
testSeq

In [None]:
RNA.fold(testSeq)

In [None]:
def compute_mfeFreq(sequence):
    """Return the frequency of the MFE structure in the thermodynamic ensemble.

    Uses the notebook-level ViennaRNA module ``RNA``: the minimum free energy
    (``mfe``) and the ensemble free energy from the partition function (``dG``)
    are combined as ``exp((dG - mfe) / kT)``.

    Parameters
    ----------
    sequence : str
        Sequence handed to ``RNA.fold_compound``.

    Returns
    -------
    float
        Frequency of the MFE structure.
    """
    # `math` is not imported anywhere else in the notebook
    import math

    # create a fold_compound object for the current sequence
    fc = RNA.fold_compound(sequence)

    # compute the MFE and corresponding structure
    (mfe_struct, mfe) = fc.mfe()

    # compute partition function
    (bp_propensity, dG) = fc.pf()

    # compute frequency of MFE structure (the 'hard' way)
    # kT is divided by 1000, presumably to bring it to the same unit as the energies -- TODO confirm
    kT = RNA.exp_param().kT / 1000.

    prob_mfe = math.exp((dG - mfe) / kT)
    
    return prob_mfe

In [None]:
rC_multi_startSeq

In [None]:
rC_multiExon_sesRNAs

In [None]:
str(rC_multiExon_sesRNAs[8])

In [None]:
str(rC_multiExon_sesRNAs[18])

In [None]:
from seqfold import dg, dg_cache, fold

In [None]:
# just returns minimum free energy
dg("GGGAGGTCGTTACATCTGGGTAACACCGGTACTGATCCGGTGACCTCCC", temp = 37.0)  # -12.94

# Test IntaRNA (test strength of binding, off-target, ...)  

In [None]:
import intarnapvalue

In [None]:
testSeq = str(rC_multiExon_sesRNAs[0])
testSeq

In [None]:
target = str(CDS[0].seq)
target 

In [None]:
from intarnapvalue.intarna_pvalue import IntaRNApvalue

In [None]:
?IntaRNApvalue

In [None]:
?intarna_pvalue

In [None]:
IntaRNApvalue(['--query', testSeq, '--target', target])

In [None]:
# IntaRNApvalue has to be called with the argument list (as in the cell above), not subscripted
IntaRNApvalue(['-q', 'AGGAUG', '-t', 'UUUAUCGUU', '-s', '10', '-m', 'b', '-d', 'gauss', '--threads', '3'])

In [None]:
command = 'python -m intarnapvalue --query GCUGAAAAACAUAACCCAUAAAAUGCUAGCUGUACCAGGAACCA --target GGUUUCUUCGCCUCUGCGUUCACCAAAGUGUUCACCC -s 10 --shuffle-mode b --threads 0' 

In [None]:
readProb = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
# communicate() reads stdout while waiting for the process; calling wait()
# before reading a PIPE can deadlock once the pipe buffer fills up
returnedProb, _ = readProb.communicate()
returnedProb

In [None]:
# Running script for getting probabilities from RNAfold output file (added to ArchBin btw)
readProb = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
returnedProb = readProb.stdout.read()
# Waiting for last command to finish before storing value in temp.out file 
readProb.wait()

In [None]:
pd.read_csv('/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/output.csv', sep = ';')

In [None]:
pd.read_csv('/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/output.csv', sep = ';')

In [None]:
pd.read_csv('/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/output.csv', sep = ';')

In [None]:
# Trying to load entire transcriptome ... obviously failed ... 
allRat_fileName = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Data/Sequences/Reference/Ensembl/All/Rattus_norvegicus.Rnor_6.0.cdna.all.fa'
allRat = list(SeqIO.parse(allRat_fileName, "fasta"))

In [None]:
python -m intarnapvalue --query GCUGAAAAACAUAACCCAUAAAAUGCUAGCUGUACCAGGAACCA --target GGUUUCUUCGCCUCUGCGUUCACCAAAGUGUUCACCC --scores 10000 --shuffle-mode b --threads 0

In [None]:
intarnapvalue --query GCUGAAAAACAUAACCCAUAAAAUGCUAGCUGUACCAGGAACCA --target GGUUUCUUCGCCUCUGCGUUCACCAAAGUGUUCACCC --scores 10000 --shuffle-mode b --threads 0

# Misc

In [None]:
# For returning index of findings 
searchCodon = 'TAG'
[m.start() for m in re.finditer(searchCodon, str(seq))]

In [None]:
testSeq = sesRNAs[0]
testSeq

In [None]:
lastATG = [m.start() for m in re.finditer('ATG', str(testSeq))][-1]
lastTGG = [m.start() for m in re.finditer('TGG', str(testSeq))][-1]

In [None]:
lastTGG

In [None]:
lastATG

In [None]:
lastATG < lastTGG

In [None]:
searchCodon = 'ATG'
[m.start() for m in re.finditer(searchCodon, str(testSeq))][-1]

In [None]:
seq.count('TAG') < 4

In [None]:
seq[0:100].count('TAG')

In [None]:
stopCodons = ['TAG', 'TAA', 'TGA']
stopCodons 

In [None]:
indiciesTGG

In [None]:
indiciesStop

In [None]:
length = 200 
center = length/2

In [None]:
arrayStop = np.array(indiciesStop)
arrayIndicies = np.array(indiciesTGG) 
centralTGGs = arrayIndicies[abs(arrayIndicies - center) < 10]

In [None]:
centralTGGs

In [None]:
np.in1d(centralTGGs,arrayStop)

In [None]:
# Check if array contains values that are within range of values in another array 
any((min(abs(arrayStop - i)) > 10) for i in centralTGGs)

In [None]:
centralTGGs

In [None]:
indiciesStop

In [None]:
testStop = [90, 16, 174]

In [None]:
(min(abs(arrayStop - centralTGGs[0])) > 10)

In [None]:
min(abs(arrayStop - centralTGGs[0])) > 10

In [None]:
centeralTGGs = offset.min()
centeralTGGs

In [None]:
centralTGGs = np.all(offset == offset.min())
centralTGGs

In [None]:
# Positions at which the offset from the center is minimal
centeralTGGs = np.where(offset == offset.min())
# Display the variable assigned above (`centeralTGG` was never defined)
centeralTGGs

In [None]:
offset = abs(arrayIndicies - center) 
centerTGG = indiciesTGG[np.argmin(offset)]

In [None]:
any(abs(x - centerTGG) < 10 for x in indiciesStop)

In [None]:
indiciesStop = []
for codons in stopCodons:
    indiciesStop.extend([m.start() for m in re.finditer(codons, str(testSeq))])

In [None]:
len(indiciesStop)

In [None]:
[m.start() for m in re.finditer('TGA', str(testSeq))]

In [None]:
[m.start() for m in re.finditer('TAA', str(testSeq))]

In [None]:
[m.start() for m in re.finditer('TAG', str(testSeq))]

In [None]:
# count() takes a single subsequence, not a list, so add up the counts per stop codon
sum(testSeq.count(codon) for codon in stopCodons)

In [None]:
indiciesTGG

In [None]:
len(indiciesTGG)

In [None]:
testSeq = sesRNAs[0]

In [None]:
lastTGG = [m.start() for m in re.finditer('TGG', str(testSeq))][-1]

In [None]:
testSeq

In [None]:
indiciesTGG = [m.start() for m in re.finditer('TGG', str(testSeq))]

In [None]:
start = 0 
stop = 200

In [None]:
middle = (start + stop) / 2

In [None]:
abs(middle - indiciesTGG[0])

In [None]:
type(indiciesTGG)

In [None]:
indiciesTGG

In [None]:
any(indiciesTGG) > 2

In [None]:
length = 200 

In [None]:
any(abs(x - (length/2)) < 20 for x in indiciesTGG)

In [None]:
testList = [50, 60, 170, 200]

In [None]:
any(abs(x - (length/2)) < 10 for x in testList)

In [None]:
testSeq

In [None]:
os.path.isdir('Output/BioPython')

In [None]:
outputFileName = os.getcwd() + ''

In [None]:
testSeq