# Setup 

## Import packages 

In [1]:
# General 
import os 
import numpy as np
import pandas as pd 
# For running bash scripts from inside python ... 
import subprocess
# For manipulating string objects 
import re
# for generating any necessary directories
import pathlib 
# For manipulating list objects 
import itertools 

In [2]:
# For working with sequence objects 
from Bio.Seq import Seq

In [3]:
# For fetching sequences from Entrez 
from Bio import Entrez
from Bio import SeqIO

In [4]:
# For extracting features 
from Bio.SeqFeature import SeqFeature, FeatureLocation
# For creating SeqRecord objects 
from Bio.SeqRecord import SeqRecord

In [5]:
# For running R cells 
%load_ext rpy2.ipython

# Functions 

In [6]:
import sys
# Importing module of personal functions 
sys.path.append('/home/user1/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code')
from kCellReadR import * 

## Misc

In [7]:
speciesName = 'Rat'
geneName = 'Fezf2'

Entrez.email = "kehaliwoldemichael@gmail.com"  # Always tell NCBI who you are

# Sequence

## Loading sequences 

In [8]:
ensembl_transcriptIDs = return_ensemblTranscriptIDs(speciesName, geneName)
ensembl_transcriptIDs

'ENSRNOT00000012452'

In [9]:
variantTable = table_transcriptsInfo(ensembl_transcriptIDs)
variantTable

Unnamed: 0,TranscriptNum,TranscriptID,TranscriptName,Assembly,Type,AA_Length,Is_Canonical
0,1,ENSRNOT00000012452,Fezf2-201,Rnor_6.0,protein_coding,455,True


In [10]:
spliceVariant = 1

In [11]:
geneName

'Fezf2'

In [12]:
speciesName

'Rat'

In [13]:
rC_exon_records, C_exon_records, CDS, cDNA = load_referenceSequences(speciesName, geneName, spliceVariant)

## Selecting sensor 

In [14]:
# Report the sequence length of every reverse-complement exon record
for exonRecord in rC_exon_records:
    exonLength = len(exonRecord.seq)
    print(exonLength)

243
890
671


In [15]:
# Search criteria for candidate sesRNAs. The positional arguments surface as the attributes
# reported by parameters.print_parameters() in the next cell:
#   species, gene, spliceVariant, seqDirection='Reverse', length=192, num_inF_TGG=2,
#   num_inF_Stop=0, inF_ATG='All upstream', minGC=40, maxGC=75, nearCenter=50, fromStop=10
parameters = parameters_sesRNA(speciesName, geneName,  spliceVariant, 'Reverse', 192, 2, 0, 'All upstream', 40, 75, 50, 10)

In [16]:
parameters.print_parameters()

[instance attributes]
species = Rat
gene = Fezf2
spliceVariant = 1
seqDirection = Reverse
length = 192
num_inF_TGG = 2
num_inF_Stop = 0
inF_ATG = All upstream
minGC = 40
maxGC = 75
nearCenter = 50
fromStop = 10


In [17]:
rC_CDS = [CDS[0].reverse_complement()]

In [18]:
variantTable['Type'][0]

'protein_coding'

In [19]:
# Generate the candidate sesRNAs together with their metrics table and sesRNA objects.
# NOTE(review): the whole 'Type' column is passed here, whereas the previous cell inspects
# variantTable['Type'][0] — confirm generate_all_sesRNAs expects the Series rather than one string.
all_sesRNAs, all_sequenceMetrics, all_sesRNA_objs = generate_all_sesRNAs(rC_exon_records, C_exon_records, cDNA, parameters, variantTable['Type'])

In [20]:
all_sequenceMetrics

Unnamed: 0,SeqNumber,TypeSeq,Exon,ExonFrac,ExonProtFrac,StartSeq,StopSeq,firstTGG,centralTGG,second_cTGG,numTGG,numTTGG,numTGGA,numTTGGA,numATG,numStop,gcCont
0,1,Reverse,2,1/1,1/1,87.0,279.0,78.0,78.0,189.0,2.0,1,1,0,2.0,0.0,60.9
1,2,Reverse,2,1/1,1/1,90.0,282.0,75.0,75.0,186.0,2.0,1,1,0,2.0,0.0,60.9
2,3,Reverse,2,1/1,1/1,93.0,285.0,72.0,72.0,183.0,2.0,1,1,0,2.0,0.0,60.4
3,4,Reverse,2,1/1,1/1,96.0,288.0,69.0,69.0,180.0,2.0,1,1,0,2.0,0.0,60.4
4,5,Reverse,2,1/1,1/1,99.0,291.0,66.0,66.0,177.0,2.0,1,1,0,2.0,0.0,61.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9,65,Reverse,3,1/1,1/1,332.0,524.0,39.0,93.0,117.0,4.0,0,0,0,0.0,0.0,49.0
10,66,Reverse,3,1/1,1/1,335.0,527.0,36.0,90.0,114.0,4.0,0,0,0,0.0,0.0,50.5
11,67,Reverse,3,1/1,1/1,338.0,530.0,33.0,87.0,111.0,4.0,0,0,0,0.0,0.0,51.0
12,68,Reverse,3,1/1,1/1,341.0,533.0,30.0,84.0,108.0,5.0,0,0,0,0.0,0.0,51.6


In [21]:
all_sequenceMetrics[all_sequenceMetrics['numTTGG'] > 0]

Unnamed: 0,SeqNumber,TypeSeq,Exon,ExonFrac,ExonProtFrac,StartSeq,StopSeq,firstTGG,centralTGG,second_cTGG,numTGG,numTTGG,numTGGA,numTTGGA,numATG,numStop,gcCont
0,1,Reverse,2,1/1,1/1,87.0,279.0,78.0,78.0,189.0,2.0,1,1,0,2.0,0.0,60.9
1,2,Reverse,2,1/1,1/1,90.0,282.0,75.0,75.0,186.0,2.0,1,1,0,2.0,0.0,60.9
2,3,Reverse,2,1/1,1/1,93.0,285.0,72.0,72.0,183.0,2.0,1,1,0,2.0,0.0,60.4
3,4,Reverse,2,1/1,1/1,96.0,288.0,69.0,69.0,180.0,2.0,1,1,0,2.0,0.0,60.4
4,5,Reverse,2,1/1,1/1,99.0,291.0,66.0,66.0,177.0,2.0,1,1,0,2.0,0.0,61.5
5,6,Reverse,2,1/1,1/1,102.0,294.0,63.0,63.0,174.0,2.0,1,1,0,2.0,0.0,60.9
6,7,Reverse,2,1/1,1/1,105.0,297.0,60.0,60.0,171.0,2.0,1,1,0,2.0,0.0,60.9
7,8,Reverse,2,1/1,1/1,108.0,300.0,57.0,57.0,168.0,2.0,1,1,0,2.0,0.0,61.5
8,9,Reverse,2,1/1,1/1,111.0,303.0,54.0,54.0,165.0,2.0,1,1,0,2.0,0.0,61.5
9,10,Reverse,2,1/1,1/1,114.0,306.0,51.0,51.0,162.0,2.0,1,1,0,2.0,0.0,62.0


In [22]:
seqTdT = 'ttacttgtacagctcgtccatgccgtacaggaacaggtggtggcggccctcggagcgctcgtactgttccacgatggtgtagtcctcgttgtgggaggtgatgtccagcttggtgtccacgtagtagtagccgggcagttgcacgggcttcttggccatgtagatggtcttgaactccaccaggtagtggccgccgtccttcagcttcagggcctggtggatctcgcccttcagcacgccgtcgcgggggtacaggcgctcggtggaggcctcccagcccatggtcttcttctgcattacggggccgtcgggggggaagttggtgccgcgcatcttcaccttgtagatcagcgtgccgtcctgcagggaggagtcctgggtcacggtcaccagaccgccgtcctcgaagttcatcacgcgctcccacttgaagccctcggggaaggacagcttcttgtaatcggggatgtcggcggggtgcttcacgtacgccttggagccgtacatgaactggggggacaggatgtcccaggcgaagggcagggggccgcccttggtcaccttcagcttggcggtctgggtgccctcgtaggggcggccctcgccctcgccctcgatctcgaactcgtggccgttcatggagccctccatgcgcaccttgaagcgcatgaactctttgatgacggccatgttgttgtcctcggaggaggcggtgccggagctgccgctgccggtgctgccggtgccatgccccaggaacaggtggtggcggccctcggagcgctcgtactgttccacgatggtgtagtcctcgttgtgggaggtgatgtccagcttggtgtccacgtagtagtagccgggcagttgcacgggcttcttggccatgtagatggtcttgaactccaccaggtagtggccgccgtccttcagcttcagggcctggtggatctcgcccttcagcacgccgtcgcgggggtacaggcgctcggtggaggcctcccagcccatggtcttcttctgcattacggggccgtcgggggggaagttggtgccgcgcatcttcaccttgtagatcagcgtgccgtcctgcagggaggagtcctgggtcacggtcaccagaccgccgtcctcgaagttcatcacgcgctcccacttgaagccctcggggaaggacagcttcttgtaatcggggatgtcggcggggtgcttcacgtacgccttggagccgtacatgaactggggggacaggatgtcccaggcgaagggcagggggccgcccttggtcaccttcagcttggcggtctgggtgccctcgtaggggcggccctcgccctcgccctcgatctcgaactcgtggccgttcatggagccctccatgcgcaccttgaagcgcatgaactctttgatgacctcctcgcccttgctcaccat'
seqTdT = seqTdT.upper()
seqTdT

'TTACTTGTACAGCTCGTCCATGCCGTACAGGAACAGGTGGTGGCGGCCCTCGGAGCGCTCGTACTGTTCCACGATGGTGTAGTCCTCGTTGTGGGAGGTGATGTCCAGCTTGGTGTCCACGTAGTAGTAGCCGGGCAGTTGCACGGGCTTCTTGGCCATGTAGATGGTCTTGAACTCCACCAGGTAGTGGCCGCCGTCCTTCAGCTTCAGGGCCTGGTGGATCTCGCCCTTCAGCACGCCGTCGCGGGGGTACAGGCGCTCGGTGGAGGCCTCCCAGCCCATGGTCTTCTTCTGCATTACGGGGCCGTCGGGGGGGAAGTTGGTGCCGCGCATCTTCACCTTGTAGATCAGCGTGCCGTCCTGCAGGGAGGAGTCCTGGGTCACGGTCACCAGACCGCCGTCCTCGAAGTTCATCACGCGCTCCCACTTGAAGCCCTCGGGGAAGGACAGCTTCTTGTAATCGGGGATGTCGGCGGGGTGCTTCACGTACGCCTTGGAGCCGTACATGAACTGGGGGGACAGGATGTCCCAGGCGAAGGGCAGGGGGCCGCCCTTGGTCACCTTCAGCTTGGCGGTCTGGGTGCCCTCGTAGGGGCGGCCCTCGCCCTCGCCCTCGATCTCGAACTCGTGGCCGTTCATGGAGCCCTCCATGCGCACCTTGAAGCGCATGAACTCTTTGATGACGGCCATGTTGTTGTCCTCGGAGGAGGCGGTGCCGGAGCTGCCGCTGCCGGTGCTGCCGGTGCCATGCCCCAGGAACAGGTGGTGGCGGCCCTCGGAGCGCTCGTACTGTTCCACGATGGTGTAGTCCTCGTTGTGGGAGGTGATGTCCAGCTTGGTGTCCACGTAGTAGTAGCCGGGCAGTTGCACGGGCTTCTTGGCCATGTAGATGGTCTTGAACTCCACCAGGTAGTGGCCGCCGTCCTTCAGCTTCAGGGCCTGGTGGATCTCGCCCTTCAGCACGCCGTCGCGGGGGTACAGGCGCTCGGTGGAGGCCTC

In [31]:
len(Seq(seqTdT).translate()) == len(seqTdT)/3

True

In [36]:
num_inF_TGG, num_inF_TTGG, num_inF_TGGA, num_inF_TTGGA, num_inF_ATG, num_inF_Stop, indices_inF_TGG, \
    indices_inF_ATG, indices_inF_Stop = \
    return_inFrame(Seq(seqTdT).reverse_complement(), 'all')

In [37]:
num_inF_TGG

6

In [38]:
num_inF_TTGG

0

In [39]:
num_inF_TGGA

0

In [40]:
num_inF_TTGGA

0

In [None]:
# Wrap the metrics in a pd.DataFrame
df = pd.DataFrame(all_sequenceMetrics)
# Serialise the DataFrame (index included as the first value of each row) to a JSON string.
# Nothing is written to stdout here; the string is only stored in df_json.
df_json = df.reset_index().to_json(orient="values")


In [None]:
df_json

In [None]:
# Candidate lengths for variable-length sesRNAs: every multiple of 3 in [200, 300)
for candidateLength in range(201, 300, 3):
    print(candidateLength)

# Secondary structure 

## Working 

In [None]:
generate_RNApred(all_sesRNAs, all_sequenceMetrics, geneName, 1)

In [None]:
def generate_mfeProb(sequenceMetrics, species, spliceVariant,
                     pathTemp='/home/user1/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/Output/BioPython/Temp'):
    """Add the RNAfold MFE-structure ensemble frequency (as a percentage) to ``sequenceMetrics``.

    Every file found in ``pathTemp`` (visited in file-name order) is piped through
    ``RNAfold -p -d2 --noLP``; the helper script ``rnaFold_prob.sh`` is then run inside
    ``pathTemp`` and the single number it prints is multiplied by 100 and rounded to 3 decimals.

    Parameters
    ----------
    sequenceMetrics : pandas.DataFrame
        Metrics table with one row per file in ``pathTemp``. It is modified IN PLACE:
        a ``'mfeFreq'`` column is added (or overwritten).
    species, spliceVariant
        Not used by the computation; kept so existing calls keep working.
    pathTemp : str, optional
        Directory holding the per-sesRNA input files and the scratch files
        ``temp.out`` / ``temp.csv``. Defaults to the path that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same ``sequenceMetrics`` object, now carrying ``'mfeFreq'``.

    Raises
    ------
    subprocess.CalledProcessError
        If RNAfold or ``rnaFold_prob.sh`` exits with a non-zero status.
    ValueError
        If the helper output is not a number, or the number of folded files does not
        match the number of rows in ``sequenceMetrics``.
    """
    scratchNames = ('temp.out', 'temp.csv')
    pathOutTempFold = os.path.join(pathTemp, 'temp.out')

    # Snapshot the inputs up front and ignore scratch files left behind by an aborted run,
    # otherwise a stale temp.out would be folded as if it were a sesRNA.
    inputEntries = sorted(
        (entry for entry in os.scandir(pathTemp) if entry.name not in scratchNames),
        key=lambda e: e.name)

    rnaFold_prob = []
    for entry in inputEntries:
        try:
            # Explicit file handles replace the shell '<' / '>' redirects, so paths that
            # contain spaces or shell metacharacters are handled safely.
            with open(entry.path) as foldIn, open(pathOutTempFold, 'w') as foldOut:
                subprocess.run(['RNAfold', '-p', '-d2', '--noLP'],
                               stdin=foldIn, stdout=foldOut, check=True)

            # The helper runs inside pathTemp via cwd=, so the notebook's own working
            # directory is never changed (and cannot be left changed after an error).
            readProb = subprocess.run('rnaFold_prob.sh', shell=True, cwd=pathTemp,
                                      stdout=subprocess.PIPE, check=True)
            # Append frequency ... convert to percentage
            rnaFold_prob.append(round(float(readProb.stdout.decode().strip()) * 100, 3))
        finally:
            # Remove the per-run scratch files even when a step above failed
            for scratchName in scratchNames:
                scratchPath = os.path.join(pathTemp, scratchName)
                if os.path.exists(scratchPath):
                    os.remove(scratchPath)

    # Removing the *ss.ps / *dp.ps files generated by RNAfold in the current working directory
    for pattern in ('*ss.ps', '*dp.ps'):
        for plotFile in pathlib.Path.cwd().glob(pattern):
            if plotFile.is_file():
                plotFile.unlink()

    if len(rnaFold_prob) != len(sequenceMetrics):
        raise ValueError(
            'Folded ' + str(len(rnaFold_prob)) + ' files from ' + pathTemp + ' but sequenceMetrics has '
            + str(len(sequenceMetrics)) + ' rows')

    # Adding RNA fold mfe ensemble frequency to sequenceMetrics
    sequenceMetrics['mfeFreq'] = rnaFold_prob

    return sequenceMetrics

In [None]:
fold_sequenceMetrics = generate_mfeProb(all_sequenceMetrics, speciesName, spliceVariant)
fold_sequenceMetrics 

In [None]:
def output_RIblast(sequenceMetrics, geneName, species, spliceVariant, targetName,
                   path_tempRIblast='/home/user1/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/Output/RIblast/',
                   path_sesRNAs='/home/user1/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/Output/BioPython/Temp'):
    """Run RIblast for every sesRNA file against the gene's CDS or cDNA and collect the two best hits.

    A RIblast database is built from the target FASTA, each file in ``path_sesRNAs`` is
    searched against it (in file-name order), and for each query the two rows with the lowest
    ' Hybridization Energy' are placed side by side in one output row.

    Side effects: each sesRNA input file is deleted after it has been searched, and the
    non-hidden contents of ``path_tempRIblast`` are deleted at the end.

    Parameters
    ----------
    sequenceMetrics
        Not used; kept so existing calls keep working.
    geneName : str
        Gene name used in the target file name and the RIblast database/CSV names.
    species : str
        Species name; spaces are replaced by underscores to build the path.
    spliceVariant : int or str
        Splice variant number used in the target file name.
    targetName : {'CDS', 'cDNA'}
        Which reference FASTA to search against.
    path_tempRIblast, path_sesRNAs : str, optional
        RIblast scratch directory and directory of sesRNA FASTA files. Default to the paths
        that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per sesRNA file with eight columns: the four energy columns of the best hit
        followed by the same four (identically named) columns of the second-best hit.

    Raises
    ------
    ValueError
        If ``targetName`` is neither 'CDS' nor 'cDNA'.
    subprocess.CalledProcessError
        If a RIblast invocation exits with a non-zero status.

    Notes
    -----
    Relies on the module-level name ``ensembl_BasePath``, which is not defined in this
    notebook — presumably supplied by the ``kCellReadR`` star import; TODO confirm.
    """
    import shutil

    targetTags = {'CDS': '_cds_', 'cDNA': '_cdna_'}
    if targetName not in targetTags:
        # Previously this fell through and crashed later with an unbound target_fileName
        raise ValueError("targetName must be 'CDS' or 'cDNA', got " + repr(targetName))

    spliceVariant = str(spliceVariant)
    save_speciesName = species.replace(" ", "_")
    gene_BasePath = ensembl_BasePath + '/' + save_speciesName + '/' + geneName
    target_fileName = gene_BasePath + targetTags[targetName] + spliceVariant + '_' + save_speciesName + '.fasta'
    print(target_fileName)

    query_Name = os.path.join(path_tempRIblast, geneName + '_db')
    # Path and file name for output CSV
    outputName = os.path.join(path_tempRIblast, geneName + '.csv')

    # Generating query database (argument list instead of a shell string: safe for odd paths)
    subprocess.run(['RIblast', 'db', '-i', target_fileName, '-o', query_Name], check=True)

    # Column names carry the leading space that RIblast writes after each comma
    energyColumns = [' Accessibility Energy', ' Hybridization Energy', ' Interaction Energy', ' BasePair']
    columns_RIblast = energyColumns + energyColumns

    # One single-row frame per sesRNA; concatenated once at the end. DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
    perQueryRows = []

    # Iteratively generating calculations for sesRNA-target interaction
    # Made sure to go through sesRNA files in order
    for entry in sorted(os.scandir(path_sesRNAs), key=lambda e: e.name):
        # Running RIblast calculations
        subprocess.run(['RIblast', 'ris', '-i', entry.path, '-o', outputName, '-d', query_Name], check=True)
        print(entry.path)

        # Equivalent to the former `sed -i 1,2d` followed by read_csv(skiprows=[1]): drop the
        # first two lines of the file and the line right after the header row.
        # NOTE(review): that third skipped line may be a real hit — confirm against the RIblast output format.
        outputCSV = pd.read_csv(outputName, skiprows=[0, 1, 3])
        # Sorting by hybridization energy ... have to have extra white space before column name
        sorted_outputCSV = outputCSV.sort_values(' Hybridization Energy')

        rankedEnergies = sorted_outputCSV[energyColumns]
        topHybridizationE = rankedEnergies.iloc[0:1].reset_index(drop=True)
        secondHybridizationE = rankedEnergies.iloc[1:2].reset_index(drop=True)
        perQueryRows.append(pd.concat([topHybridizationE, secondHybridizationE], axis=1))

        # Clearing csv
        if os.path.exists(outputName):
            os.remove(outputName)
        # Clear BioPython temp fasta file for sesRNA
        if os.path.exists(entry.path):
            os.remove(entry.path)

    # Clear RIblast Temp directory (non-hidden entries only, like the former `rm -rf dir/*`)
    for leftover in pathlib.Path(path_tempRIblast).glob('*'):
        if leftover.name.startswith('.'):
            continue
        if leftover.is_dir() and not leftover.is_symlink():
            shutil.rmtree(leftover)
        else:
            leftover.unlink()

    if not perQueryRows:
        return pd.DataFrame(columns=columns_RIblast)
    return pd.concat(perQueryRows)

In [None]:
outputRIblast = output_RIblast(fold_sequenceMetrics, geneName, speciesName, spliceVariant, 'cDNA')
outputRIblast

In [None]:
metricsTable_higherOrder = pd.concat([fold_sequenceMetrics.reset_index(drop=True), outputRIblast.reset_index(drop=True).iloc[:, 0:4]], axis = 1)

In [None]:
metricsTable_higherOrder

In [None]:
sesChoice = 6
chosenSequence = all_sesRNAs[sesChoice - 1]

In [None]:
str(chosenSequence)

In [None]:
return_inFrame(chosenSequence, 'all')

In [None]:
from Bio.Restriction import *

In [None]:
len(EcoRI.search(chosenSequence)) == 0

In [None]:
len(AscI.search(chosenSequence)) == 0

In [None]:
testSeq = 'CCTCCTCGCTGCCCTCGGACTTGAGGATGTCCATCTGCAGCCCTTGCCGATGCTCCATGTCCAGGTCGTCGCAGTGGGCGAAGCCCACCGCCTCCTCGTCGGTGGCCGCCTGGAAGCCCATCCTGGCGAACATGCCGCTCATCTTCGCCTGGGATTTGTGCGACACCGAGGTGGCCACGTTGGAGAGCTTGCTGCGGAGGAGGG'

In [None]:
testSeq = 'CCTCCTCGCTGCCCTCGGACTTGAGGATGTCCATCTGCAGCCCTTGCCGATGCTCCATGTCCAGGTCGTCGCAGTGGGCGAAGCCCACCGCCTCCTCGTCGGTGGCCGCCTGGAAGCCCATCCTGGCGAACATGCCGCTCATCTTCGCCTGGGATTTGTGCGACACCGAGGTGGCCACGTTGGAGAGCTTGCTGCGGAGGAGGG'

In [None]:
return_inFrame(Seq(testSeq), 'all')

In [None]:
# Sanity check (same form as the seqTdT check above): one amino acid per codon of testSeq
len(Seq(testSeq).translate()) == len(testSeq)/3

In [None]:
save_all_sesRNAs_DNA(all_sesRNAs, speciesName, geneName)

In [None]:
AscI.site

In [None]:
testSeq = 'GGTAGCTCGACTGGCTTCTACCTTTCGGGTACCTTCACGGGCTTTTCTTTCGGGGCGACACGCTGACGTGGACGAAGCCCGACGGGGCCGCCATATAGTAGCGGTAGTACTCGCCTGACCCTAAGACGTAAAGGAAGCCGTAGGCTACATTGGACCCTCACCGGTAGCACCTATACCAGTTATTGTCGTGGT'

In [None]:
testSeq[::-1]

In [None]:
check_inSearchSeq(all_sesRNAs[0], CDS, 'Complement')

In [None]:
def output_intaRNA(sequenceMetrics, species=None, gene=None):
    """Run IntaRNA for every sesRNA file against the gene's CDS and attach the best hit per file.

    Each file in the BioPython Temp directory (visited in file-name order) is used as the
    IntaRNA query against ``CDS_<gene>.fasta``. The hits are sorted by ascending 'E' and the
    first row (lowest energy) is kept.

    Parameters
    ----------
    sequenceMetrics : pandas.DataFrame
        Metrics table; expected to have one row per sesRNA file (rows are paired by position).
    species : str, optional
        Sub-directory of the biomaRt output folder. Defaults to the notebook-level
        ``speciesName`` (the function used to read an undefined global ``species``).
    gene : str, optional
        Gene name used in the CDS file name. Defaults to the notebook-level ``geneName``.

    Returns
    -------
    pandas.DataFrame
        ``sequenceMetrics.reset_index()`` and the per-file best hits (``reset_index()`` too)
        concatenated column-wise, so the result carries two 'index' columns as before.

    Raises
    ------
    subprocess.CalledProcessError
        If IntaRNA exits with a non-zero status.

    Notes
    -----
    NOTE(review): the paths below contain '/Rotations/', unlike the paths used by
    generate_mfeProb and output_RIblast — confirm which location is current.
    """
    if species is None:
        species = speciesName
    if gene is None:
        gene = geneName

    intarna_columns = ['E', 'start1', 'end1', 'start2', 'end2']
    numThreads = 10

    martBase = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/biomaRt/'
    martBasePath = martBase + species
    # Loading sequences for gene CDS
    CDS_fileName = martBasePath + '/CDS_' + gene + '.fasta'

    pathTemp = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/BioPython/Temp'
    pathOutTempIntaRNA = pathTemp + '/temp.csv'
    scratchNames = ('temp.out', 'temp.csv')

    # Snapshot the inputs and skip scratch files left over from an aborted run
    inputEntries = sorted(
        (entry for entry in os.scandir(pathTemp) if entry.name not in scratchNames),
        key=lambda e: e.name)

    # Best hit per file; concatenated once at the end (DataFrame.append was removed in pandas 2.0)
    bestHits = []
    for entry in inputEntries:
        # For checking which file is currently being processed
        print(entry.path)
        try:
            # Argument list + explicit output handle instead of a shell string with '>'
            with open(pathOutTempIntaRNA, 'w') as intarnaOut:
                subprocess.run(
                    ['IntaRNA', '-t', CDS_fileName, '-q', entry.path,
                     '--threads', str(numThreads), '--outMode=C'],
                    stdout=intarnaOut, check=True)
            # Reading in csv file with information
            intarnaOutput = pd.read_csv(pathOutTempIntaRNA, sep = ';')
            sorted_intarna = intarnaOutput.sort_values('E', ascending=True)
            print(sorted_intarna)
            # First row after the ascending sort, i.e. the LOWEST energy hit
            bestHits.append(sorted_intarna[intarna_columns].iloc[0:1])
        finally:
            # Remove scratch files without changing the working directory
            for scratchName in scratchNames:
                scratchPath = os.path.join(pathTemp, scratchName)
                if os.path.exists(scratchPath):
                    os.remove(scratchPath)

    # Removing RNAfold plot files from the current working directory (kept from the original cleanup)
    for pattern in ('*ss.ps', '*dp.ps'):
        for plotFile in pathlib.Path.cwd().glob(pattern):
            if plotFile.is_file():
                plotFile.unlink()

    useful_intarna = pd.concat(bestHits) if bestHits else pd.DataFrame(columns = intarna_columns)

    # Have to reset index since useful_intarna is built from slices of many pd.DataFrames
    out_sequenceMetrics = pd.concat([sequenceMetrics.reset_index(), useful_intarna.reset_index()], axis = 1)

    return out_sequenceMetrics

In [None]:
intarna_sequenceMetrics = output_intaRNA(fold_sequenceMetrics)
intarna_sequenceMetrics

In [None]:
def output_RNApred(sequenceMetrics, species=None, gene=None):
    """Run RNAfold and IntaRNA for every sesRNA file and attach both results to ``sequenceMetrics``.

    For each file in the BioPython Temp directory (file-name order):
    RNAfold output is written to ``temp.out``, IntaRNA hits against ``CDS_<gene>.fasta`` are
    written to ``temp.csv``, the lowest-'E' IntaRNA hit is kept, and ``rnaFold_prob.sh`` (run
    inside the Temp directory) supplies a number that is multiplied by 100.

    Side effects: ``sequenceMetrics`` gains an ``'mfeFreq'`` column IN PLACE, and the whole
    Temp directory (non-hidden entries) is emptied at the end.

    Parameters
    ----------
    sequenceMetrics : pandas.DataFrame
        Metrics table with one row per sesRNA file.
    species : str, optional
        Sub-directory of the biomaRt output folder. Defaults to the notebook-level
        ``speciesName`` (the function used to read an undefined global ``species``).
    gene : str, optional
        Gene name used in the CDS file name. Defaults to the notebook-level ``geneName``.

    Returns
    -------
    pandas.DataFrame
        ``sequenceMetrics.reset_index()`` and the best IntaRNA hits (``reset_index()`` too)
        concatenated column-wise, so the result carries two 'index' columns as before.

    Raises
    ------
    subprocess.CalledProcessError
        If RNAfold, IntaRNA or ``rnaFold_prob.sh`` exits with a non-zero status.

    Notes
    -----
    NOTE(review): the paths below contain '/Rotations/', unlike the paths used by
    generate_mfeProb and output_RIblast — confirm which location is current.
    """
    import shutil

    if species is None:
        species = speciesName
    if gene is None:
        gene = geneName

    intarna_columns = ['E', 'start1', 'end1', 'start2', 'end2']
    numThreads = 10

    martBase = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/biomaRt/'
    martBasePath = martBase + species
    # Loading sequences for gene CDS
    CDS_fileName = martBasePath + '/CDS_' + gene + '.fasta'

    pathTemp = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/BioPython/Temp'
    pathOutTempFold = pathTemp + '/temp.out'
    pathOutTempIntaRNA = pathTemp + '/temp.csv'
    scratchNames = ('temp.out', 'temp.csv')

    # Snapshot the inputs and skip scratch files left over from an aborted run
    inputEntries = sorted(
        (entry for entry in os.scandir(pathTemp) if entry.name not in scratchNames),
        key=lambda e: e.name)

    rnaFold_prob = []
    # Best hit per file; concatenated once at the end (DataFrame.append was removed in pandas 2.0)
    bestHits = []
    for entry in inputEntries:
        # For checking which file is currently being processed
        print(entry.path)
        try:
            # RNAfold: explicit handles replace the shell '<' / '>' redirects
            with open(entry.path) as foldIn, open(pathOutTempFold, 'w') as foldOut:
                subprocess.run(['RNAfold', '-p', '-d2', '--noLP'],
                               stdin=foldIn, stdout=foldOut, check=True)

            # IntaRNA: argument list + explicit output handle
            with open(pathOutTempIntaRNA, 'w') as intarnaOut:
                subprocess.run(
                    ['IntaRNA', '-t', CDS_fileName, '-q', entry.path,
                     '--threads', str(numThreads), '--outMode=C'],
                    stdout=intarnaOut, check=True)
            # Reading in csv file with information
            intarnaOutput = pd.read_csv(pathOutTempIntaRNA, sep = ';')
            sorted_intarna = intarnaOutput.sort_values('E', ascending=True)
            # First row after the ascending sort, i.e. the LOWEST energy hit
            bestHits.append(sorted_intarna[intarna_columns].iloc[0:1])

            # The helper runs inside pathTemp via cwd=, so the notebook's working directory
            # is never changed (and cannot be left changed after an error).
            readProb = subprocess.run('rnaFold_prob.sh', shell=True, cwd=pathTemp,
                                      stdout=subprocess.PIPE, check=True)
            # Append frequency ... convert to percentage
            rnaFold_prob.append(float(readProb.stdout.decode().strip())*100)
        finally:
            # Removing temp.out / temp.csv after finishing each run, even on failure
            for scratchName in scratchNames:
                scratchPath = os.path.join(pathTemp, scratchName)
                if os.path.exists(scratchPath):
                    os.remove(scratchPath)

    # Removing the *ss.ps / *dp.ps files generated by RNAfold in the current working directory
    for pattern in ('*ss.ps', '*dp.ps'):
        for plotFile in pathlib.Path.cwd().glob(pattern):
            if plotFile.is_file():
                plotFile.unlink()

    # Removing temp fasta files (non-hidden entries only, like the former `cd ... && rm -rf *`)
    for leftover in pathlib.Path(pathTemp).glob('*'):
        if leftover.name.startswith('.'):
            continue
        if leftover.is_dir() and not leftover.is_symlink():
            shutil.rmtree(leftover)
        else:
            leftover.unlink()

    # Adding RNA fold mfe ensemble frequency to sequenceMetrics
    sequenceMetrics['mfeFreq'] = rnaFold_prob

    useful_intarna = pd.concat(bestHits) if bestHits else pd.DataFrame(columns = intarna_columns)
    # Have to reset index since useful_intarna is built from slices of many pd.DataFrames
    out_sequenceMetrics = pd.concat([sequenceMetrics.reset_index(), useful_intarna.reset_index()], axis = 1)

    return out_sequenceMetrics

In [None]:
higherSequenceMetrics = output_RNApred(all_sequenceMetrics)
higherSequenceMetrics

# Outputting sesRNA

In [None]:
testSeq = Seq('TGGGAGTAGTGGTGGTAATGA')
testStr = str(testSeq)

In [None]:
testStr

In [None]:
convert_DNA(testSeq, 1)

In [None]:
# Test that all and numbering is working 
convert_DNA(testSeq, 3) == convert_DNA(testSeq, 'All')

In [None]:
convert_DNA(rC_multiExon_sesRNAs[0], 'All')

In [None]:
# Testing that a continuous reading frame is produced when the number is set to 0
len(convert_DNA(rC_multiExon_sesRNAs[0], 0).translate(to_stop = True)) == len(rC_multiExon_sesRNAs[0])/3

In [None]:
# Checking that right number of TGG being converted 
return_inFrame(convert_DNA(rC_multiExon_sesRNAs[0], 'All').back_transcribe(), 'numTGG')

In [None]:
convert_DNA(testSeq, 1) - (len(testSeq)/2)

In [None]:
abs(convert_DNA(testSeq, 1) - (len(testSeq)/2))

In [None]:
np.sort(abs(convert_DNA(testSeq, 1) - (len(testSeq)/2))) + (len(testSeq)/2)

In [None]:
np.array(sorted(convert_DNA(testSeq, 1) - (len(testSeq)/2), key = abs)) + (len(testSeq)/2)

In [None]:
# Function for saving both original template DNA sequence and the converted RNA given list of sesRNAs Seq objects
# Could change later to make for separate output files ... have not decided yet whether to just put them together into one labeled fasta file 
def save_sesRNAs(sequences_sesRNAs, geneName, version, numConvertTGG):
    """Write the sesRNA template DNA sequences and their converted RNA versions to FASTA files.

    Two files are written under ``Output/BioPython`` (created if missing):
    ``<geneName>_sesRNA_<version>.fasta`` with the original sequences and
    ``<geneName>_sesRNA_convertedRNA_<version>.fasta`` with ``convert_DNA(sequence, numConvertTGG)``
    for each sequence. Records are numbered from 1 in input order.

    Parameters
    ----------
    sequences_sesRNAs : iterable
        Sequences to save; each one is wrapped in a SeqRecord and passed to ``convert_DNA``.
    geneName : str
        Gene name used in record ids, descriptions and file names.
    version : str or int
        Version tag inserted into the file names.
    numConvertTGG
        Forwarded unchanged as the second argument of ``convert_DNA``.

    Returns
    -------
    None
    """
    # Generating BioPython directory if does not exist
    pathlib.Path('Output/BioPython').mkdir(parents=True, exist_ok=True)

    # Defining save name and description for outputs (original DNA and converted RNA)
    DNA_outputID = geneName + '_sesRNA'
    DNA_outputDescription = "sesRNA DNA original for " + geneName
    convertedRNA_outputID = geneName + '_sesRNA'
    convertedRNA_outputDescription = "sesRNA converted RNA for " + geneName

    # Creating empty lists for storing sequences
    outputSeqMulti_DNA = []
    outputSeqMulti_convertedRNA = []

    # Generating SeqRecord objects in preparation for writing FASTA files ... id includes number of sequence
    for seqNumber, sequence in enumerate(sequences_sesRNAs, start=1):
        outputSeqMulti_DNA.append(
            SeqRecord(sequence, id = DNA_outputID + str(seqNumber), description = DNA_outputDescription))
        # Uses the numConvertTGG parameter (the body previously referenced an undefined name)
        outputSeqMulti_convertedRNA.append(
            SeqRecord(convert_DNA(sequence, numConvertTGG), id = convertedRNA_outputID + str(seqNumber),
                      description = convertedRNA_outputDescription))

    # Output fasta file names; str() lets callers pass a numeric version
    DNA_outputName = "Output/BioPython/" + geneName + "_sesRNA_" + str(version) + ".fasta"
    convertedRNA_outputName = "Output/BioPython/" + geneName + "_sesRNA_convertedRNA_" + str(version) + ".fasta"

    # Each set goes to its own file (the body previously opened an undefined `outputName`)
    with open(DNA_outputName, "w") as output_handle:
        # Writing original template DNA sequences
        SeqIO.write(outputSeqMulti_DNA, output_handle, "fasta")
    with open(convertedRNA_outputName, "w") as output_handle:
        # Writing converted RNA sequences
        SeqIO.write(outputSeqMulti_convertedRNA, output_handle, "fasta")

# Not working or not being used 

In [None]:
sys.path.append("/usr/share/ViennaRNA")

In [None]:
# Loading RNAfold as RNA
import sys
sys.path.append("/usr/lib/python3.9/site-packages/RNA")
import _RNA as RNA

In [None]:
import pyseqlib

In [None]:
from pyseqlib import pyRNAfold

In [None]:
pyRNAfold i

In [None]:
pyseqlib.fold_compound(sequence)

In [None]:
RNA.fold_compound(sequence)

In [None]:
sys.path.append("/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Packages/RNA")
import _RNA as RNA

In [None]:
sys.path.append("/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions")
import RNA_Fold

In [None]:
md = RNA.md()

In [None]:
import example

In [None]:
sequence = "CGCAGGGAUACCCGCG"
 
# create new fold_compound object
fc = RNA.fold_compound(sequence)
 
# compute minimum free energy (mfe) and corresponding structure
(ss, mfe) = fc.mfe()
 

In [None]:
# The RNA sequence
seq = "GAGUAGUGGAACCAGGCUAUGUUUGUGACUCGCAGACUAACA"
 
# compute minimum free energy (MFE) and corresponding structure
(ss, mfe) = RNA.fold(seq)

In [None]:
import pickle 

In [None]:
mfe

In [None]:
pathOuput = "/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/seqObject.p"
# The context manager flushes and closes the file; the bare open() left the handle dangling
with open(pathOuput, 'wb') as pickleFile:
    pickle.dump(testSeq, pickleFile)

In [None]:
# Running script for getting probabilities from RNAfold output file (added to ArchBin btw)
pathFuncPython = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/'
# command = 'python ' + '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/RNA_Fold.py'
# command = 'cd ' + pathFuncPython + ' | ./RNA_Fold.py'
# command = 'cd ' + pathFuncPython + ' | ls -a'

command = 'ls -a'
readProb = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
returnedProb = readProb.stdout.read()

In [None]:
returnedProb

In [None]:
command 

In [None]:
str(rC_multiExon_sesRNAs[0])

In [None]:
testSeq = str(rC_multiExon_sesRNAs[0])
testSeq

In [None]:
RNA.fold(testSeq)

In [None]:
def compute_mfeFreq(sequence):
    """Return the frequency of the MFE structure in the ensemble for ``sequence``.

    Computes the minimum free energy ``mfe`` and the partition-function free energy ``dG``
    with the ``RNA`` module and returns ``exp((dG - mfe) / kT)``.

    Parameters
    ----------
    sequence : str
        Sequence passed to ``RNA.fold_compound``.

    Returns
    -------
    float
        ``exp((dG - mfe) / kT)`` with ``kT = RNA.exp_param().kT / 1000``.
    """
    # Imported here because the notebook never imports math at the top level
    import math

    # create a fold_compound object for the current sequence
    fc = RNA.fold_compound(sequence)

    # compute the MFE (the structure itself is not needed)
    mfe = fc.mfe()[1]

    # compute partition function (only the free energy is needed)
    dG = fc.pf()[1]

    # compute frequency of MFE structure (the 'hard' way)
    kT = RNA.exp_param().kT / 1000.

    prob_mfe = math.exp((dG - mfe) / kT)

    return prob_mfe

In [None]:
rC_multi_startSeq

In [None]:
rC_multiExon_sesRNAs

In [None]:
str(rC_multiExon_sesRNAs[8])

In [None]:
str(rC_multiExon_sesRNAs[18])

In [None]:
from seqfold import dg, dg_cache, fold

In [None]:
# just returns minimum free energy
dg("GGGAGGTCGTTACATCTGGGTAACACCGGTACTGATCCGGTGACCTCCC", temp = 37.0)  # -12.94

# Test Intarna (test strength of binding, off-target, ...)  

In [None]:
import intarnapvalue

In [None]:
testSeq = str(rC_multiExon_sesRNAs[0])
testSeq

In [None]:
target = str(CDS[0].seq)
target 

In [None]:
from intarnapvalue.intarna_pvalue import IntaRNApvalue

In [None]:
?IntaRNApvalue

In [None]:
?intarna_pvalue

In [None]:
IntaRNApvalue(['--query', testSeq, '--target', target])

In [None]:
# Call the class with an argument list, as in the previous cell; subscripting it with [...] is a TypeError
IntaRNApvalue(['-q', 'AGGAUG', '-t', 'UUUAUCGUU', '-s', '10', '-m', 'b', '-d', 'gauss', '--threads', '3'])

In [None]:
command = 'python -m intarnapvalue --query GCUGAAAAACAUAACCCAUAAAAUGCUAGCUGUACCAGGAACCA --target GGUUUCUUCGCCUCUGCGUUCACCAAAGUGUUCACCC -s 10 --shuffle-mode b --threads 0' 

In [None]:
readProb = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
# communicate() drains stdout while waiting for the process to exit. Calling wait() before
# read() can deadlock once the child fills the pipe buffer.
returnedProb = readProb.communicate()[0]
returnedProb

In [None]:
# Running script for getting probabilities from RNAfold output file (added to ArchBin btw)
readProb = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
returnedProb = readProb.stdout.read()
# Waiting for last command to finish before storing value in temp.out file 
readProb.wait()

In [None]:
pd.read_csv('/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/output.csv', sep = ';')

In [None]:
pd.read_csv('/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/output.csv', sep = ';')

In [None]:
pd.read_csv('/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/output.csv', sep = ';')

In [None]:
# Trying to load the entire transcriptome ... obviously failed ...
allRat_fileName = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Data/Sequences/Reference/Ensembl/All/Rattus_norvegicus.Rnor_6.0.cdna.all.fa'
allRat = list(SeqIO.parse(allRat_fileName, "fasta"))

In [None]:
python -m intarnapvalue --query GCUGAAAAACAUAACCCAUAAAAUGCUAGCUGUACCAGGAACCA --target GGUUUCUUCGCCUCUGCGUUCACCAAAGUGUUCACCC --scores 10000 --shuffle-mode b --threads 0

In [None]:
intarnapvalue --query GCUGAAAAACAUAACCCAUAAAAUGCUAGCUGUACCAGGAACCA --target GGUUUCUUCGCCUCUGCGUUCACCAAAGUGUUCACCC --scores 10000 --shuffle-mode b --threads 0

# Misc

In [None]:
# For returning index of findings 
searchCodon = 'TAG'
[m.start() for m in re.finditer(searchCodon, str(seq))]

In [None]:
testSeq = sesRNAs[0]
testSeq

In [None]:
lastATG = [m.start() for m in re.finditer('ATG', str(testSeq))][-1]
lastTGG = [m.start() for m in re.finditer('TGG', str(testSeq))][-1]

In [None]:
lastTGG

In [None]:
lastATG

In [None]:
lastATG < lastTGG

In [None]:
searchCodon = 'ATG'
[m.start() for m in re.finditer(searchCodon, str(testSeq))][-1]

In [None]:
seq.count('TAG') < 4

In [None]:
seq[0:100].count('TAG')

In [None]:
stopCodons = ['TAG', 'TAA', 'TGA']
stopCodons 

In [None]:
indiciesTGG

In [None]:
indiciesStop

In [None]:
length = 200 
center = length/2

In [None]:
arrayStop = np.array(indiciesStop)
arrayIndicies = np.array(indiciesTGG) 
centralTGGs = arrayIndicies[abs(arrayIndicies - center) < 10]

In [None]:
centralTGGs

In [None]:
np.in1d(centralTGGs,arrayStop)

In [None]:
# Check if array contains values that are within range of values in another array 
any((min(abs(arrayStop - i)) > 10) for i in centralTGGs)

In [None]:
centralTGGs

In [None]:
indiciesStop

In [None]:
testStop = [90, 16, 174]

In [None]:
(min(abs(arrayStop - centralTGGs[0])) > 10)

In [None]:
min(abs(arrayStop - centralTGGs[0])) > 10

In [None]:
centeralTGGs = offset.min()
centeralTGGs

In [None]:
centralTGGs = np.all(offset == offset.min())
centralTGGs

In [None]:
# Positions where the offset from the center is minimal
centeralTGGs = np.where(offset == offset.min())
# Display the variable assigned above (the cell previously echoed the undefined name `centeralTGG`)
centeralTGGs

In [None]:
offset = abs(arrayIndicies - center) 
centerTGG = indiciesTGG[np.argmin(offset)]

In [None]:
any(abs(x - centerTGG) < 10 for x in indiciesStop)

In [None]:
# Start positions of every stop-codon match in testSeq, grouped in stopCodons order
indiciesStop = [
    hit.start()
    for stopCodon in stopCodons
    for hit in re.finditer(stopCodon, str(testSeq))
]

In [None]:
len(indiciesStop)

In [None]:
[m.start() for m in re.finditer('TGA', str(testSeq))]

In [None]:
[m.start() for m in re.finditer('TAA', str(testSeq))]

In [None]:
[m.start() for m in re.finditer('TAG', str(testSeq))]

In [None]:
# count() takes a single sub-sequence, not a list: total the matches over each stop codon
sum(testSeq.count(stopCodon) for stopCodon in stopCodons)

In [None]:
indiciesTGG

In [None]:
len(indiciesTGG)

In [None]:
testSeq = sesRNAs[0]

In [None]:
lastTGG = [m.start() for m in re.finditer('TGG', str(testSeq))][-1]

In [None]:
testSeq

In [None]:
indiciesTGG = [m.start() for m in re.finditer('TGG', str(testSeq))]

In [None]:
start = 0 
stop = 200

In [None]:
middle = (start + stop) / 2

In [None]:
abs(middle - indiciesTGG[0])

In [None]:
type(indiciesTGG)

In [None]:
indiciesTGG

In [None]:
# `any(...) > 2` compared a bool with 2 and was always False.
# NOTE(review): assumed intent is "more than two TGG matches" — confirm.
len(indiciesTGG) > 2

In [None]:
length = 200 

In [None]:
any(abs(x - (length/2)) < 20 for x in indiciesTGG)

In [None]:
testList = [50, 60, 170, 200]

In [None]:
any(abs(x - (length/2)) < 10 for x in testList)

In [None]:
testSeq

In [None]:
os.path.isdir('Output/BioPython')

In [None]:
outputFileName = os.getcwd() + ''

In [None]:
testSeq