# Setup 

## Import packages 

In [1]:
# General 
import os 
import numpy as np
import pandas as pd 

# For manipulating string objects 
import re
# for generating any necessary directories
import pathlib 
# For manipulating list objects 
import itertools 

In [2]:
pd.set_option('display.max_rows', None)

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
# For working with sequence objects 
from Bio.Seq import Seq

In [5]:
# For fetching sequences from Entrez 
from Bio import Entrez  
from Bio import SeqIO

In [6]:
# For extracting features 
from Bio.SeqFeature import SeqFeature, FeatureLocation
# For creating SeqRecord objects 
from Bio.SeqRecord import SeqRecord

In [7]:
# For running R cells 
%load_ext rpy2.ipython  

# Functions 

In [8]:
import sys
# Importing module of personal functions 
sys.path.append('/home/user1/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code')
from kCellReadR import * 

## Misc

In [9]:
speciesName = 'Human'
geneName = 'Arc'

Entrez.email = "kehaliwoldemichael@gmail.com"  # Always tell NCBI who you are

# Sequence

## Loading sequences 

In [10]:
ensembl_transcriptIDs = return_ensemblTranscriptIDs(speciesName, geneName)
ensembl_transcriptIDs

['ENST00000356613', 'ENST00000581404']

In [11]:
variantTable = table_transcriptsInfo(ensembl_transcriptIDs)  
variantTable

Unnamed: 0,TranscriptNum,TranscriptID,TranscriptName,Assembly,Type,AA_Length,Is_Canonical
0,1,ENST00000356613,ARC-201,GRCh38,protein_coding,396,True
1,2,ENST00000581404,ARC-202,GRCh38,processed_transcript,no protein,False


In [12]:
# Choose the canonical transcript.
# The flag is compared as text so the filter works whether `Is_Canonical`
# holds the strings 'True'/'False' or real booleans.
canonicalRows = variantTable[variantTable['Is_Canonical'].astype(str) == 'True']
if canonicalRows.empty:
    raise ValueError('No canonical transcript found for ' + geneName + ' (' + speciesName + ')')
# Take the scalar explicitly: calling int() on a whole Series is deprecated in pandas
spliceVariant = int(canonicalRows['TranscriptNum'].iloc[0])
spliceVariant

1

In [13]:
geneName

'Arc'

In [14]:
speciesName

'Human'

In [15]:
rC_exon_records, rC_intron_records, CDS, cDNA, genomic = load_referenceSequences(speciesName, geneName, spliceVariant)

In [16]:
rC_exon_records

[SeqRecord(seq=Seq('CTGAGGGGCCAGGAAGCACCGGGACATCAGGTCTGCTCTGCTGCGTGTGCGACT...TGC'), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq=Seq('CTGTTGTCACTCTCCTGGCTCTGATGGTGCCAGCTGCTGCCTCAGCCGGATTTG...GAG'), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq=Seq('GAAGTTTCAGTGTTTTATTAACAAAATCTTACAAAAAGAGTCTGTCTCTGGGGT...GAC'), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[])]

In [17]:
rC_intron_records

[SeqRecord(seq=Seq('ctgggggagagaaagcgcgggtccatgtgagagctggctcagcaaactgcccct...cac'), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq=Seq('ctgaaagaacaggagagtgggcagtgaaggaccacgcaggacagtagccagggc...tac'), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[])]

## Selecting sensor 

In [18]:
import random 
n = random.random() 
print(n)


0.22221481658708298


In [19]:
len(genomic[0])

3431

In [20]:
len(CDS[0])

1191

In [21]:
# Print the length of each reverse-complement exon record
for exon in rC_exon_records:
    exonLength = len(exon.seq)
    print(exonLength)

1996
145
809


In [22]:
len(rC_exon_records)

3

In [23]:
rC_CDS = [CDS[0].reverse_complement()]
rC_genomic = [genomic[0].reverse_complement()]

In [24]:
# Candidate sesRNA lengths: every multiple of 3 in [192, 300)
# (192 is itself divisible by 3, so stepping by 3 visits exactly those values)
for i in range(192, 300, 3):
    print(i)

192
195
198
201
204
207
210
213
216
219
222
225
228
231
234
237
240
243
246
249
252
255
258
261
264
267
270
273
276
279
282
285
288
291
294
297


In [25]:
# Longer the sesRNA ... the better it binds ... but the higher likelihood to introduce stop codons and ATGs 
# Want around 60 bp distance between multiple TAGs ... 
sesRNA_length = 192
targetChoice = 'exon' # exon, cds, genomic
parameters = parameters_sesRNA(speciesName, geneName,  spliceVariant, sesRNA_length, 1, 2, 'None', 40, 70, 20, 20)

In [26]:
# Pick the sequences to design sesRNAs against (`target`) and the sequences
# the candidates are checked against (`test`) from the configured target choice.
if targetChoice == 'exon':
    target = rC_exon_records
    test = CDS
elif targetChoice == 'cds':
    target = rC_CDS
    test = CDS
elif targetChoice == 'genomic':
    target = rC_genomic
    test = genomic
else:
    # Fail loudly: otherwise `target`/`test` would stay undefined, or keep
    # values left over from a previous run of this cell.
    raise ValueError("targetChoice must be 'exon', 'cds' or 'genomic', got " + repr(targetChoice))

In [27]:
parameters.print_parameters()

[instance attributes]
species = Human
gene = Arc
spliceVariant = 1
length = 192
num_inF_TGG = 1
num_inF_Stop = 2
inF_ATG = None
minGC = 40
maxGC = 70
nearCenter = 20
fromStop = 20


In [28]:
sesRNA_length/2

96.0

In [29]:
all_sesRNAs, all_sequenceMetrics, all_sesRNA_objs = generate_all_sesRNAs(target, test, parameters, variantTable['Type'])
all_sequenceMetrics

1
3


Unnamed: 0,sesNum,SeqNum,ExonFrac,ExonProtFrac,CDSFrac,CDNAFrac,StartSeq,StopSeq,firstTGG,centralTGG,second_cTGG,numTGG,numTTGG,numTGGA,numTTGGA,numATG,numStop,gcCont
0,1,1,1/2,1/1,1/1,1/2,824.0,1016.0,114.0,114.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.1
1,2,1,1/2,1/1,1/1,1/2,827.0,1019.0,111.0,111.0,,1.0,0.0,0.0,0.0,0.0,1.0,64.6
2,3,1,1/2,1/1,1/1,1/2,830.0,1022.0,108.0,108.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.6
3,4,1,1/2,1/1,1/1,1/2,833.0,1025.0,105.0,105.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.1
4,5,1,1/2,1/1,1/1,1/2,836.0,1028.0,102.0,102.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.6
5,6,1,1/2,1/1,1/1,1/2,839.0,1031.0,99.0,99.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.6
6,7,1,1/2,1/1,1/1,1/2,842.0,1034.0,96.0,96.0,,1.0,0.0,0.0,0.0,0.0,2.0,63.5
7,8,1,1/2,1/1,1/1,1/2,845.0,1037.0,93.0,93.0,,1.0,0.0,0.0,0.0,0.0,2.0,63.0
8,9,1,1/2,1/1,1/1,1/2,848.0,1040.0,90.0,90.0,189.0,2.0,1.0,0.0,0.0,0.0,2.0,63.0
9,10,1,1/2,1/1,1/1,1/2,851.0,1043.0,87.0,87.0,186.0,2.0,1.0,0.0,0.0,0.0,2.0,63.5


In [30]:
str(all_sesRNAs[0])

'TGATCTCCTCCTCGTCCGCGTCCACGTAGAGCGTCTGGTACAGGTCCCGCTTGCGCCACAGGAACTGGTCCAGCGGCTCGCCCTGCTTCTGCGGCAGGTCCAGCTCGCGCTGGATGGCCTCTCGGGACAGCGTGCCCTCGCTGTACTGCAGGAACTCCTTCTTGAACTCCACCCAGTTCTTCACGGAGCCCT'

In [31]:
str(all_sesRNAs[0].reverse_complement())

'AGGGCTCCGTGAAGAACTGGGTGGAGTTCAAGAAGGAGTTCCTGCAGTACAGCGAGGGCACGCTGTCCCGAGAGGCCATCCAGCGCGAGCTGGACCTGCCGCAGAAGCAGGGCGAGCCGCTGGACCAGTTCCTGTGGCGCAAGCGGGACCTGTACCAGACGCTCTACGTGGACGCGGACGAGGAGGAGATCA'

# Secondary structure 

## Working 

In [32]:
generate_RNApred(all_sesRNAs, all_sequenceMetrics, geneName, 1)

Unnamed: 0,sesNum,SeqNum,ExonFrac,ExonProtFrac,CDSFrac,CDNAFrac,StartSeq,StopSeq,firstTGG,centralTGG,second_cTGG,numTGG,numTTGG,numTGGA,numTTGGA,numATG,numStop,gcCont,mfe
0,1,1,1/2,1/1,1/1,1/2,824.0,1016.0,114.0,114.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.1,-71.5
1,2,1,1/2,1/1,1/1,1/2,827.0,1019.0,111.0,111.0,,1.0,0.0,0.0,0.0,0.0,1.0,64.6,-73.7
2,3,1,1/2,1/1,1/1,1/2,830.0,1022.0,108.0,108.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.6,-76.0
3,4,1,1/2,1/1,1/1,1/2,833.0,1025.0,105.0,105.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.1,-76.6
4,5,1,1/2,1/1,1/1,1/2,836.0,1028.0,102.0,102.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.6,-76.6
5,6,1,1/2,1/1,1/1,1/2,839.0,1031.0,99.0,99.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.6,-75.8
6,7,1,1/2,1/1,1/1,1/2,842.0,1034.0,96.0,96.0,,1.0,0.0,0.0,0.0,0.0,2.0,63.5,-75.8
7,8,1,1/2,1/1,1/1,1/2,845.0,1037.0,93.0,93.0,,1.0,0.0,0.0,0.0,0.0,2.0,63.0,-74.7
8,9,1,1/2,1/1,1/1,1/2,848.0,1040.0,90.0,90.0,189.0,2.0,1.0,0.0,0.0,0.0,2.0,63.0,-75.1
9,10,1,1/2,1/1,1/1,1/2,851.0,1043.0,87.0,87.0,186.0,2.0,1.0,0.0,0.0,0.0,2.0,63.5,-76.3


In [33]:
fold_sequenceMetrics = generate_mfeProb(all_sequenceMetrics, geneName, speciesName, spliceVariant)
fold_sequenceMetrics 

Unnamed: 0,sesNum,SeqNum,ExonFrac,ExonProtFrac,CDSFrac,CDNAFrac,StartSeq,StopSeq,firstTGG,centralTGG,second_cTGG,numTGG,numTTGG,numTGGA,numTTGGA,numATG,numStop,gcCont,mfe,mfeFreq
0,1,1,1/2,1/1,1/1,1/2,824.0,1016.0,114.0,114.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.1,-71.5,1.339
1,2,1,1/2,1/1,1/1,1/2,827.0,1019.0,111.0,111.0,,1.0,0.0,0.0,0.0,0.0,1.0,64.6,-73.7,1.818
2,3,1,1/2,1/1,1/1,1/2,830.0,1022.0,108.0,108.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.6,-76.0,1.938
3,4,1,1/2,1/1,1/1,1/2,833.0,1025.0,105.0,105.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.1,-76.6,2.124
4,5,1,1/2,1/1,1/1,1/2,836.0,1028.0,102.0,102.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.6,-76.6,2.191
5,6,1,1/2,1/1,1/1,1/2,839.0,1031.0,99.0,99.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.6,-75.8,1.88
6,7,1,1/2,1/1,1/1,1/2,842.0,1034.0,96.0,96.0,,1.0,0.0,0.0,0.0,0.0,2.0,63.5,-75.8,2.05
7,8,1,1/2,1/1,1/1,1/2,845.0,1037.0,93.0,93.0,,1.0,0.0,0.0,0.0,0.0,2.0,63.0,-74.7,1.555
8,9,1,1/2,1/1,1/1,1/2,848.0,1040.0,90.0,90.0,189.0,2.0,1.0,0.0,0.0,0.0,2.0,63.0,-75.1,0.539
9,10,1,1/2,1/1,1/1,1/2,851.0,1043.0,87.0,87.0,186.0,2.0,1.0,0.0,0.0,0.0,2.0,63.5,-76.3,2.02


In [34]:
# Optionally test the sesRNAs against a different species than the one they were generated against
#test_speciesName = 'Crab-eating macaque'
test_speciesName = speciesName

# Download reference sequences only if the test species differs from the initial species
if test_speciesName != speciesName:
    test_ensembl_transcriptIDs = return_ensemblTranscriptIDs(test_speciesName, geneName)
    test_variantTable = table_transcriptsInfo(test_ensembl_transcriptIDs)
    # Choose the canonical transcript; compare the flag as text so both
    # string and boolean `Is_Canonical` columns are handled.
    test_canonicalRows = test_variantTable[test_variantTable['Is_Canonical'].astype(str) == 'True']
    if test_canonicalRows.empty:
        raise ValueError('No canonical transcript found for ' + geneName + ' (' + test_speciesName + ')')
    # Take the scalar explicitly: calling int() on a whole Series is deprecated in pandas
    test_spliceVariant = int(test_canonicalRows['TranscriptNum'].iloc[0])
    test_rC_exon_records, test_C_exon_records, test_CDS, test_cDNA, test_genomic = load_referenceSequences(test_speciesName, geneName, test_spliceVariant)

In [35]:
metricsTable_higherOrder, outputRIblast = output_RIblast(fold_sequenceMetrics, geneName, test_speciesName, spliceVariant, 'cDNA')
outputRIblast

/home/user1/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/Output/EnsemblSeq/Human/Arc_cdna-1_Human.fasta
RIblast ris mode has started.
Rna interaction search of query:Arc_01 sesRNA #01 has started.
Rna interaction search of query:Arc_01 sesRNA #01 has finished.
RIblast ris mode has finished.
/home/user1/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/Output/BioPython/Temp/Arc_01.fasta
RIblast ris mode has started.
Rna interaction search of query:Arc_02 sesRNA #02 has started.
Rna interaction search of query:Arc_02 sesRNA #02 has finished.
RIblast ris mode has finished.
/home/user1/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/Output/BioPython/Temp/Arc_02.fasta
RIblast ris mode has started.
Rna interaction search of query:Arc_03 sesRNA #03 has started.
Rna interaction search of query:Arc_03 sesRNA #03 has finished.
RIblast ris mode has finished.
/home/user1/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/Output/BioPytho

Unnamed: 0,Accessibility Energy,Hybridization Energy,Interaction Energy,BasePair,Accessibility Energy.1,Hybridization Energy.1,Interaction Energy.1,BasePair.1
0,99.2752,-211.93,-112.655,(0-143:1168-1025),22.174,-39.22,-17.046,(165-191:1003-977)
0,113.096,-228.96,-115.864,(0-155:1165-1010),21.287,-39.47,-18.183,(162-188:1003-977)
0,23.4079,-42.74,-19.3321,(159-191:1003-971),5.32976,-13.88,-8.55024,(105-117:680-669)
0,65.2411,-119.31,-54.0689,(104-191:1055-968),5.62114,-13.88,-8.25886,(102-114:680-669)
0,24.9233,-52.79,-27.8667,(153-191:1003-965),5.80326,-13.88,-8.07674,(99-111:680-669)
0,26.0439,-56.89,-30.8461,(150-191:1003-962),,,,
0,26.5589,-60.2,-33.6411,(147-191:1003-959),,,,
0,30.0003,-64.18,-34.1797,(144-191:1003-956),,,,
0,31.0746,-69.63,-38.5554,(141-191:1003-953),6.12856,-15.43,-9.30144,(172-191:423-407)
0,103.678,-194.69,-91.0123,(0-131:1141-1010),33.213,-75.82,-42.607,(138-191:1003-950)


In [36]:
metricsTable_higherOrder

Unnamed: 0,sesNum,SeqNum,ExonFrac,ExonProtFrac,CDSFrac,CDNAFrac,StartSeq,StopSeq,firstTGG,centralTGG,second_cTGG,numTGG,numTTGG,numTGGA,numTTGGA,numATG,numStop,gcCont,mfe,mfeFreq,Accessibility Energy,Hybridization Energy,Interaction Energy,BasePair
0,1,1,1/2,1/1,1/1,1/2,824.0,1016.0,114.0,114.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.1,-71.5,1.339,99.2752,-211.93,-112.655,(0-143:1168-1025)
1,2,1,1/2,1/1,1/1,1/2,827.0,1019.0,111.0,111.0,,1.0,0.0,0.0,0.0,0.0,1.0,64.6,-73.7,1.818,113.096,-228.96,-115.864,(0-155:1165-1010)
2,3,1,1/2,1/1,1/1,1/2,830.0,1022.0,108.0,108.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.6,-76.0,1.938,23.4079,-42.74,-19.3321,(159-191:1003-971)
3,4,1,1/2,1/1,1/1,1/2,833.0,1025.0,105.0,105.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.1,-76.6,2.124,65.2411,-119.31,-54.0689,(104-191:1055-968)
4,5,1,1/2,1/1,1/1,1/2,836.0,1028.0,102.0,102.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.6,-76.6,2.191,24.9233,-52.79,-27.8667,(153-191:1003-965)
5,6,1,1/2,1/1,1/1,1/2,839.0,1031.0,99.0,99.0,,1.0,0.0,0.0,0.0,0.0,2.0,64.6,-75.8,1.88,26.0439,-56.89,-30.8461,(150-191:1003-962)
6,7,1,1/2,1/1,1/1,1/2,842.0,1034.0,96.0,96.0,,1.0,0.0,0.0,0.0,0.0,2.0,63.5,-75.8,2.05,26.5589,-60.2,-33.6411,(147-191:1003-959)
7,8,1,1/2,1/1,1/1,1/2,845.0,1037.0,93.0,93.0,,1.0,0.0,0.0,0.0,0.0,2.0,63.0,-74.7,1.555,30.0003,-64.18,-34.1797,(144-191:1003-956)
8,9,1,1/2,1/1,1/1,1/2,848.0,1040.0,90.0,90.0,189.0,2.0,1.0,0.0,0.0,0.0,2.0,63.0,-75.1,0.539,31.0746,-69.63,-38.5554,(141-191:1003-953)
9,10,1,1/2,1/1,1/1,1/2,851.0,1043.0,87.0,87.0,186.0,2.0,1.0,0.0,0.0,0.0,2.0,63.5,-76.3,2.02,103.678,-194.69,-91.0123,(0-131:1141-1010)


## Choose sesRNA

In [None]:
sesChoice = 13
chosenSequence = all_sesRNAs[sesChoice - 1]

In [None]:
str(chosenSequence)

In [None]:
return_inFrame(chosenSequence, 'all')

In [None]:
from Bio.Restriction import *

In [None]:
len(EcoRI.search(chosenSequence)) == 0

In [None]:
len(AscI.search(chosenSequence)) == 0

In [None]:
len(HindIII.search(chosenSequence)) == 0

In [None]:
testSeq = 'CCTCCTCGCTGCCCTCGGACTTGAGGATGTCCATCTGCAGCCCTTGCCGATGCTCCATGTCCAGGTCGTCGCAGTGGGCGAAGCCCACCGCCTCCTCGTCGGTGGCCGCCTGGAAGCCCATCCTGGCGAACATGCCGCTCATCTTCGCCTGGGATTTGTGCGACACCGAGGTGGCCACGTTGGAGAGCTTGCTGCGGAGGAGGG'

In [None]:
testSeq = 'CCTCCTCGCTGCCCTCGGACTTGAGGATGTCCATCTGCAGCCCTTGCCGATGCTCCATGTCCAGGTCGTCGCAGTGGGCGAAGCCCACCGCCTCCTCGTCGGTGGCCGCCTGGAAGCCCATCCTGGCGAACATGCCGCTCATCTTCGCCTGGGATTTGTGCGACACCGAGGTGGCCACGTTGGAGAGCTTGCTGCGGAGGAGGG'

In [None]:
return_inFrame(Seq(testSeq), 'all')

In [None]:
save_all_sesRNAs_DNA(all_sesRNAs, speciesName, geneName)

In [None]:
AscI.site

In [None]:
testSeq = 'GGTAGCTCGACTGGCTTCTACCTTTCGGGTACCTTCACGGGCTTTTCTTTCGGGGCGACACGCTGACGTGGACGAAGCCCGACGGGGCCGCCATATAGTAGCGGTAGTACTCGCCTGACCCTAAGACGTAAAGGAAGCCGTAGGCTACATTGGACCCTCACCGGTAGCACCTATACCAGTTATTGTCGTGGT'

In [None]:
testSeq[::-1]

In [None]:
check_inSearchSeq(all_sesRNAs[0], CDS, 'Complement')

In [None]:
def output_intaRNA(sequenceMetrics, species=None, gene=None):
    """Run IntaRNA on every temp query file and attach the best hit to the metrics.

    Each file in the temp directory (processed in name order) is used as the
    IntaRNA query against the gene's CDS FASTA. The ';'-separated output is
    sorted by the 'E' column in ascending order and the first row (lowest
    energy) is kept.

    Parameters
    ----------
    sequenceMetrics : pd.DataFrame
        Per-sesRNA metrics table. Presumably one row per temp query file, in
        the same order as the sorted file names -- TODO confirm against caller.
    species : str, optional
        Species sub-directory under the biomaRt output folder. Defaults to the
        notebook-level `speciesName`.
    gene : str, optional
        Gene name used to build the CDS FASTA file name. Defaults to the
        notebook-level `geneName`.

    Returns
    -------
    pd.DataFrame
        `sequenceMetrics.reset_index()` joined column-wise with the best-hit
        columns 'E', 'start1', 'end1', 'start2', 'end2' (also index-reset).

    Raises
    ------
    subprocess.CalledProcessError
        If IntaRNA exits with a non-zero status.
    """
    # Local import: the notebook's import cell does not load subprocess
    import subprocess

    # Fall back to the notebook-level configuration when not given explicitly
    if species is None:
        species = speciesName
    if gene is None:
        gene = geneName

    intarna_columns = ['E', 'start1', 'end1', 'start2', 'end2']
    numThreads = 10

    martBase = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/biomaRt/'
    martBasePath = martBase + species
    # FASTA holding the CDS of the gene (IntaRNA target)
    CDS_fileName = martBasePath + '/CDS_' + gene + '.fasta'

    pathTemp = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/BioPython/Temp'
    pathOutTempIntaRNA = pathTemp + '/temp.csv'
    tempNames = ('temp.out', 'temp.csv')

    # Snapshot the query files in name order, skipping scratch outputs that a
    # previous (interrupted) run may have left behind in the same directory.
    queryEntries = sorted(
        (e for e in os.scandir(pathTemp) if e.is_file() and e.name not in tempNames),
        key=lambda e: e.name,
    )

    best_hits = []
    for entry in queryEntries:
        # Show which file is currently being processed
        print(entry.path)

        # Run IntaRNA with an argument list (no shell), CSV output redirected to the temp file
        with open(pathOutTempIntaRNA, 'w') as csvHandle:
            subprocess.run(
                ['IntaRNA', '-t', CDS_fileName, '-q', entry.path,
                 '--threads', str(numThreads), '--outMode=C'],
                stdout=csvHandle, check=True,
            )

        # Read the predictions and order them from lowest to highest energy
        intarnaOutput = pd.read_csv(pathOutTempIntaRNA, sep=';')
        sorted_intarna = intarnaOutput.sort_values('E', ascending=True)
        print(sorted_intarna)
        # Keep only the first row, i.e. the lowest-energy interaction
        best_hits.append(sorted_intarna[intarna_columns].iloc[0:1])

        # Remove the scratch outputs after each run
        for tempName in tempNames:
            tempPath = os.path.join(pathTemp, tempName)
            if os.path.exists(tempPath):
                os.remove(tempPath)

    # Remove any RNAfold plot files from the current working directory
    for pattern in ('*ss.ps', '*dp.ps'):
        for plotFile in pathlib.Path('.').glob(pattern):
            if plotFile.is_file():
                plotFile.unlink()

    # Build the hits table once (DataFrame.append no longer exists in pandas >= 2.0)
    if best_hits:
        useful_intarna = pd.concat(best_hits)
    else:
        useful_intarna = pd.DataFrame(columns=intarna_columns)

    # Reset both indexes: the hits are slices of many different DataFrames
    out_sequenceMetrics = pd.concat([sequenceMetrics.reset_index(), useful_intarna.reset_index()], axis=1)

    return out_sequenceMetrics

In [None]:
intarna_sequenceMetrics = output_intaRNA(fold_sequenceMetrics)
intarna_sequenceMetrics

In [None]:
def output_RNApred(sequenceMetrics, species=None, gene=None):
    """Run RNAfold and IntaRNA on every temp query file and extend the metrics table.

    For each file in the temp directory (processed in name order):
    RNAfold (`-p -d2 --noLP`) writes to `temp.out`, the `rnaFold_prob.sh`
    script is run inside the temp directory and its printed value is stored
    (multiplied by 100), and IntaRNA is run against the gene's CDS FASTA with
    the lowest-'E' row of its output kept.

    Side effects: adds/overwrites the 'mfeFreq' column of the `sequenceMetrics`
    object passed in, and finally deletes every non-hidden entry in the temp
    directory.

    Parameters
    ----------
    sequenceMetrics : pd.DataFrame
        Per-sesRNA metrics table; must have as many rows as there are query
        files, otherwise assigning 'mfeFreq' raises.
    species : str, optional
        Species sub-directory under the biomaRt output folder. Defaults to the
        notebook-level `speciesName`.
    gene : str, optional
        Gene name used to build the CDS FASTA file name. Defaults to the
        notebook-level `geneName`.

    Returns
    -------
    pd.DataFrame
        `sequenceMetrics.reset_index()` joined column-wise with the best-hit
        columns 'E', 'start1', 'end1', 'start2', 'end2' (also index-reset).

    Raises
    ------
    subprocess.CalledProcessError
        If RNAfold or IntaRNA exits with a non-zero status.
    """
    # Local imports: the notebook's import cell does not load these modules
    import shutil
    import subprocess

    # Fall back to the notebook-level configuration when not given explicitly
    if species is None:
        species = speciesName
    if gene is None:
        gene = geneName

    intarna_columns = ['E', 'start1', 'end1', 'start2', 'end2']
    numThreads = 10

    martBase = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/biomaRt/'
    martBasePath = martBase + species
    # FASTA holding the CDS of the gene (IntaRNA target)
    CDS_fileName = martBasePath + '/CDS_' + gene + '.fasta'

    pathTemp = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Output/BioPython/Temp'
    pathOutTempFold = pathTemp + '/temp.out'
    pathOutTempIntaRNA = pathTemp + '/temp.csv'
    tempNames = ('temp.out', 'temp.csv')

    # Snapshot the query files in name order, skipping scratch outputs that a
    # previous (interrupted) run may have left behind in the same directory.
    queryEntries = sorted(
        (e for e in os.scandir(pathTemp) if e.is_file() and e.name not in tempNames),
        key=lambda e: e.name,
    )

    rnaFold_prob = []
    best_hits = []
    for entry in queryEntries:
        # Show which file is currently being processed
        print(entry.path)

        # RNAfold reads the query on stdin and writes its report to temp.out
        with open(entry.path) as foldIn, open(pathOutTempFold, 'w') as foldOut:
            subprocess.run(['RNAfold', '-p', '-d2', '--noLP'],
                           stdin=foldIn, stdout=foldOut, check=True)

        # IntaRNA predictions, CSV output redirected to temp.csv
        with open(pathOutTempIntaRNA, 'w') as csvHandle:
            subprocess.run(
                ['IntaRNA', '-t', CDS_fileName, '-q', entry.path,
                 '--threads', str(numThreads), '--outMode=C'],
                stdout=csvHandle, check=True,
            )
        intarnaOutput = pd.read_csv(pathOutTempIntaRNA, sep=';')
        sorted_intarna = intarnaOutput.sort_values('E', ascending=True)
        # Keep only the first row, i.e. the lowest-energy interaction
        best_hits.append(sorted_intarna[intarna_columns].iloc[0:1])

        # Run the helper script inside the temp directory (it is looked up on
        # PATH by the shell); run() waits for it and collects its stdout.
        probResult = subprocess.run("rnaFold_prob.sh", shell=True, cwd=pathTemp,
                                    stdout=subprocess.PIPE)
        # Convert the printed frequency to a percentage
        rnaFold_prob.append(float(probResult.stdout.decode().strip()) * 100)

        # Remove the scratch outputs after each run
        for tempName in tempNames:
            tempPath = os.path.join(pathTemp, tempName)
            if os.path.exists(tempPath):
                os.remove(tempPath)

    # Remove the plot files RNAfold wrote to the current working directory
    for pattern in ('*ss.ps', '*dp.ps'):
        for plotFile in pathlib.Path('.').glob(pattern):
            if plotFile.is_file():
                plotFile.unlink()

    # Empty the temp directory (hidden entries are left alone, as with `rm -rf *`)
    for leftover in os.scandir(pathTemp):
        if leftover.name.startswith('.'):
            continue
        if leftover.is_dir(follow_symlinks=False):
            shutil.rmtree(leftover.path)
        else:
            os.remove(leftover.path)

    # Adding RNAfold MFE ensemble frequency to sequenceMetrics (mutates the caller's table)
    sequenceMetrics['mfeFreq'] = rnaFold_prob

    # Build the hits table once (DataFrame.append no longer exists in pandas >= 2.0)
    if best_hits:
        useful_intarna = pd.concat(best_hits)
    else:
        useful_intarna = pd.DataFrame(columns=intarna_columns)

    # Reset both indexes: the hits are slices of many different DataFrames
    out_sequenceMetrics = pd.concat([sequenceMetrics.reset_index(), useful_intarna.reset_index()], axis=1)

    return out_sequenceMetrics

In [None]:
higherSequenceMetrics = output_RNApred(all_sequenceMetrics)
higherSequenceMetrics

# Outputting sesRNA

In [None]:
testSeq = Seq('TGGGAGTAGTGGTGGTAATGA')
testStr = str(testSeq)

In [None]:
testStr

In [None]:
convert_DNA(testSeq, 1)

In [None]:
# Test that all and numbering is working 
convert_DNA(testSeq, 3) == convert_DNA(testSeq, 'All')

In [None]:
convert_DNA(rC_multiExon_sesRNAs[0], 'All')

In [None]:
# Testing that a continuous reading frame is produced if number is set to 0 
len(convert_DNA(rC_multiExon_sesRNAs[0], 0).translate(to_stop = True)) == len(rC_multiExon_sesRNAs[0])/3

In [None]:
# Checking that right number of TGG being converted 
return_inFrame(convert_DNA(rC_multiExon_sesRNAs[0], 'All').back_transcribe(), 'numTGG')

In [None]:
convert_DNA(testSeq, 1) - (len(testSeq)/2)

In [None]:
abs(convert_DNA(testSeq, 1) - (len(testSeq)/2))

In [None]:
np.sort(abs(convert_DNA(testSeq, 1) - (len(testSeq)/2))) + (len(testSeq)/2)

In [None]:
np.array(sorted(convert_DNA(testSeq, 1) - (len(testSeq)/2), key = abs)) + (len(testSeq)/2)

In [None]:
# Function for saving both original template DNA sequence and the converted RNA given list of sesRNAs Seq objects
# Could change later to make for separate output files ... have not decided yet whether to just put them together into one labeled fasta file 
def save_sesRNAs(sequences_sesRNAs, geneName, version, numConvertTGG):
    """Write sesRNA template DNA and converted sequences to two FASTA files.

    Files are written under `Output/BioPython/` (created if missing):
    `<geneName>_sesRNA_<version>.fasta` for the original DNA templates and
    `<geneName>_sesRNA_convertedRNA_<version>.fasta` for the converted sequences.

    Parameters
    ----------
    sequences_sesRNAs : iterable of Bio.Seq.Seq
        sesRNA template sequences; records are numbered from 1 in this order.
    geneName : str
        Gene name used in record IDs, descriptions and file names.
    version : str or int
        Suffix appended to the output file names.
    numConvertTGG : int or str
        Forwarded unchanged as the second argument of `convert_DNA`
        (other cells call it with numbers and with 'All').
    """
    # Generating BioPython directory if it does not exist
    outputDir = pathlib.Path('Output/BioPython')
    outputDir.mkdir(parents=True, exist_ok=True)

    # Record IDs and descriptions for the two outputs (original DNA and converted RNA)
    DNA_outputID = geneName + '_sesRNA'
    DNA_outputDescription = "sesRNA DNA original for " + geneName
    convertedRNA_outputID = geneName + '_sesRNA'
    convertedRNA_outputDescription = "sesRNA converted RNA for " + geneName

    outputSeqMulti_DNA = []
    outputSeqMulti_convertedRNA = []

    # One SeqRecord per sequence for each output; the 1-based position is appended to the ID
    for i, sequence in enumerate(sequences_sesRNAs, start=1):
        outputSeqMulti_DNA.append(
            SeqRecord(sequence, id=DNA_outputID + str(i), description=DNA_outputDescription))
        # NOTE(review): assumes convert_DNA returns a Seq here -- confirm in kCellReadR
        outputSeqMulti_convertedRNA.append(
            SeqRecord(convert_DNA(sequence, numConvertTGG),
                      id=convertedRNA_outputID + str(i),
                      description=convertedRNA_outputDescription))

    # str() so that a numeric version also works in the file names
    DNA_outputName = "Output/BioPython/" + geneName + "_sesRNA_" + str(version) + ".fasta"
    convertedRNA_outputName = "Output/BioPython/" + geneName + "_sesRNA_convertedRNA_" + str(version) + ".fasta"

    # Writing original template DNA sequences
    with open(DNA_outputName, "w") as output_handle:
        SeqIO.write(outputSeqMulti_DNA, output_handle, "fasta")
    # Writing converted sequences to their own file
    with open(convertedRNA_outputName, "w") as output_handle:
        SeqIO.write(outputSeqMulti_convertedRNA, output_handle, "fasta")

# Not working or not being used 

In [None]:
seqTdT = 'ttacttgtacagctcgtccatgccgtacaggaacaggtggtggcggccctcggagcgctcgtactgttccacgatggtgtagtcctcgttgtgggaggtgatgtccagcttggtgtccacgtagtagtagccgggcagttgcacgggcttcttggccatgtagatggtcttgaactccaccaggtagtggccgccgtccttcagcttcagggcctggtggatctcgcccttcagcacgccgtcgcgggggtacaggcgctcggtggaggcctcccagcccatggtcttcttctgcattacggggccgtcgggggggaagttggtgccgcgcatcttcaccttgtagatcagcgtgccgtcctgcagggaggagtcctgggtcacggtcaccagaccgccgtcctcgaagttcatcacgcgctcccacttgaagccctcggggaaggacagcttcttgtaatcggggatgtcggcggggtgcttcacgtacgccttggagccgtacatgaactggggggacaggatgtcccaggcgaagggcagggggccgcccttggtcaccttcagcttggcggtctgggtgccctcgtaggggcggccctcgccctcgccctcgatctcgaactcgtggccgttcatggagccctccatgcgcaccttgaagcgcatgaactctttgatgacggccatgttgttgtcctcggaggaggcggtgccggagctgccgctgccggtgctgccggtgccatgccccaggaacaggtggtggcggccctcggagcgctcgtactgttccacgatggtgtagtcctcgttgtgggaggtgatgtccagcttggtgtccacgtagtagtagccgggcagttgcacgggcttcttggccatgtagatggtcttgaactccaccaggtagtggccgccgtccttcagcttcagggcctggtggatctcgcccttcagcacgccgtcgcgggggtacaggcgctcggtggaggcctcccagcccatggtcttcttctgcattacggggccgtcgggggggaagttggtgccgcgcatcttcaccttgtagatcagcgtgccgtcctgcagggaggagtcctgggtcacggtcaccagaccgccgtcctcgaagttcatcacgcgctcccacttgaagccctcggggaaggacagcttcttgtaatcggggatgtcggcggggtgcttcacgtacgccttggagccgtacatgaactggggggacaggatgtcccaggcgaagggcagggggccgcccttggtcaccttcagcttggcggtctgggtgccctcgtaggggcggccctcgccctcgccctcgatctcgaactcgtggccgttcatggagccctccatgcgcaccttgaagcgcatgaactctttgatgacctcctcgcccttgctcaccat'
seqTdT = seqTdT.upper()
seqTdT

In [None]:
len(Seq(seqTdT).translate()) == len(seqTdT)/3

In [None]:
num_inF_TGG, num_inF_TTGG, num_inF_TGGA, num_inF_TTGGA, num_inF_ATG, num_inF_Stop, indices_inF_TGG, \
    indices_inF_ATG, indices_inF_Stop = \
    return_inFrame(Seq(seqTdT).reverse_complement(), 'all')

In [None]:
num_inF_TGG

In [None]:
num_inF_TTGG

In [None]:
num_inF_TGGA

In [None]:
num_inF_TTGGA

In [None]:
# Generating pd.Dataframe
df = pd.DataFrame(all_sequenceMetrics)
# Converting DataFrame to json and dumping it to std.out
df_json = df.reset_index().to_json(orient="values")


In [None]:
df_json

In [None]:
# Candidate sesRNA lengths: every multiple of 3 in [200, 300)
# (201 is the first multiple of 3 at or above 200)
for i in range(201, 300, 3):
    print(i)

In [None]:
additional_sesRNA = 'gagaggaagggcagaggcaccaacccgggcggaggaggaggcgcggcggcggcggcggctcagaccccctccccggcccgcatctgtgcagctttccgggcgatgccagaatagatgccggggcaatgtcccgccgcaaacagggcaacccgcagcacttgtcccagagggaactcatcacgcgtaagtgtctgccgcacgcgcgaagggccggcggctggggctccgggcgcccggggctgg'

In [None]:
len(additional_sesRNA)

In [None]:
# Duplicate the row at position 11 onto the end of the table.
# DataFrame.append was removed in pandas 2.0; concatenating the one-row slice
# (double brackets keep it a DataFrame) keeps the index label and column dtypes.
all_sequenceMetrics = pd.concat([all_sequenceMetrics, all_sequenceMetrics.iloc[[11]]])

In [None]:
all_sequenceMetrics

In [None]:
len(all_sesRNAs)

In [None]:
all_sesRNAs.append(Seq(additional_sesRNA.upper()))

In [None]:
len(all_sesRNAs)

In [None]:
all_sesRNAs

In [None]:
sys.path.append("/usr/share/ViennaRNA")

In [None]:
# Loading RNAfold as RNA 
import sys
sys.path.append("/usr/lib/python3.9/site-packages/RNA")
import _RNA as RNA

In [None]:
import pyseqlib

In [None]:
from pyseqlib import pyRNAfold

In [None]:
pyRNAfold i

In [None]:
pyseqlib.fold_compound(sequence)

In [None]:
RNA.fold_compound(sequence)

In [None]:
sys.path.append("/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Packages/RNA")
import _RNA as RNA

In [None]:
sys.path.append("/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions")
import RNA_Fold

In [None]:
md = RNA.md()

In [None]:
import example

In [None]:
sequence = "CGCAGGGAUACCCGCG"
 
# create new fold_compound object
fc = RNA.fold_compound(sequence)
 
# compute minimum free energy (mfe) and corresponding structure
(ss, mfe) = fc.mfe()
 

In [None]:
# The RNA sequence
seq = "GAGUAGUGGAACCAGGCUAUGUUUGUGACUCGCAGACUAACA"
 
# compute minimum free energy (MFE) and corresponding structure
(ss, mfe) = RNA.fold(seq)

In [None]:
import pickle 

In [None]:
mfe

In [None]:
pathOuput = "/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/seqObject.p"
pickle.dump(testSeq, open(pathOuput, 'wb'))

In [None]:
# Running script for getting probabilities from RNAfold output file (added to ArchBin btw)
pathFuncPython = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/'
# command = 'python ' + '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/RNA_Fold.py'
# command = 'cd ' + pathFuncPython + ' | ./RNA_Fold.py'
# command = 'cd ' + pathFuncPython + ' | ls -a'

command = 'ls -a'
readProb = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
returnedProb = readProb.stdout.read()

In [None]:
returnedProb

In [None]:
command 

In [None]:
str(rC_multiExon_sesRNAs[0])

In [None]:
testSeq = str(rC_multiExon_sesRNAs[0])
testSeq

In [None]:
RNA.fold(testSeq)

In [None]:
def compute_mfeFreq(sequence):
    """Return the frequency of the MFE structure in the thermodynamic ensemble.

    Computed as exp((dG - mfe) / kT), where `mfe` comes from
    `fold_compound.mfe()`, `dG` is the second value returned by
    `fold_compound.pf()`, and kT is `RNA.exp_param().kT / 1000`.

    Parameters
    ----------
    sequence : str
        Sequence passed to `RNA.fold_compound`.

    Returns
    -------
    float
        exp((dG - mfe) / kT).
    """
    # Local import: `math` is not imported anywhere else in this notebook
    import math

    # create a fold_compound object for the current sequence
    fc = RNA.fold_compound(sequence)

    # compute the MFE and corresponding structure (only the energy is used below)
    (_mfe_struct, mfe) = fc.mfe()

    # compute partition function; the second returned value is used as dG
    (_bp_propensity, dG) = fc.pf()

    # compute frequency of MFE structure (the 'hard' way); kT is divided by 1000 as in the original
    kT = RNA.exp_param().kT / 1000.

    prob_mfe = math.exp((dG - mfe) / kT)

    return prob_mfe

In [None]:
rC_multi_startSeq

In [None]:
rC_multiExon_sesRNAs

In [None]:
str(rC_multiExon_sesRNAs[8])

In [None]:
str(rC_multiExon_sesRNAs[18])

In [None]:
from seqfold import dg, dg_cache, fold

In [None]:
# just returns minimum free energy
dg("GGGAGGTCGTTACATCTGGGTAACACCGGTACTGATCCGGTGACCTCCC", temp = 37.0)  # -12.94

# Test IntaRNA (test strength of binding, off-target, ...)  

In [None]:
import intarnapvalue

In [None]:
testSeq = str(rC_multiExon_sesRNAs[0])
testSeq

In [None]:
target = str(CDS[0].seq)
target 

In [None]:
from intarnapvalue.intarna_pvalue import IntaRNApvalue

In [None]:
?IntaRNApvalue

In [None]:
?intarna_pvalue

In [None]:
IntaRNApvalue(['--query', testSeq, '--target', target])

In [None]:
# Call the class with an argument list (as in the '--query'/'--target' cell above);
# subscripting the class with [...] raises a TypeError.
IntaRNApvalue(['-q', 'AGGAUG', '-t', 'UUUAUCGUU', '-s', '10', '-m', 'b', '-d', 'gauss', '--threads', '3'])

In [None]:
command = 'python -m intarnapvalue --query GCUGAAAAACAUAACCCAUAAAAUGCUAGCUGUACCAGGAACCA --target GGUUUCUUCGCCUCUGCGUUCACCAAAGUGUUCACCC -s 10 --shuffle-mode b --threads 0' 

In [None]:
# communicate() drains stdout while waiting for the process; calling wait()
# before reading can deadlock once the child fills the pipe buffer.
readProb = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
returnedProb = readProb.communicate()[0]
returnedProb

In [None]:
# Running script for getting probabilities from RNAfold output file (added to ArchBin btw)
readProb = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
returnedProb = readProb.stdout.read()
# Waiting for last command to finish before storing value in temp.out file 
readProb.wait()

In [None]:
pd.read_csv('/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/output.csv', sep = ';')

In [None]:
pd.read_csv('/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/output.csv', sep = ';')

In [None]:
pd.read_csv('/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Code/Functions/output.csv', sep = ';')

In [None]:
# Trying to load the entire transcriptome ... obviously failed ... 
allRat_fileName = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Data/Sequences/Reference/Ensembl/All/Rattus_norvegicus.Rnor_6.0.cdna.all.fa'
allRat = list(SeqIO.parse(allRat_fileName, "fasta"))

In [None]:
python -m intarnapvalue --query GCUGAAAAACAUAACCCAUAAAAUGCUAGCUGUACCAGGAACCA --target GGUUUCUUCGCCUCUGCGUUCACCAAAGUGUUCACCC --scores 10000 --shuffle-mode b --threads 0

In [None]:
intarnapvalue --query GCUGAAAAACAUAACCCAUAAAAUGCUAGCUGUACCAGGAACCA --target GGUUUCUUCGCCUCUGCGUUCACCAAAGUGUUCACCC --scores 10000 --shuffle-mode b --threads 0

# Misc

In [None]:
# For returning index of findings 
searchCodon = 'TAG'
[m.start() for m in re.finditer(searchCodon, str(seq))]

In [None]:
testSeq = sesRNAs[0]
testSeq

In [None]:
lastATG = [m.start() for m in re.finditer('ATG', str(testSeq))][-1]
lastTGG = [m.start() for m in re.finditer('TGG', str(testSeq))][-1]

In [None]:
lastTGG

In [None]:
lastATG

In [None]:
lastATG < lastTGG

In [None]:
searchCodon = 'ATG'
[m.start() for m in re.finditer(searchCodon, str(testSeq))][-1]

In [None]:
seq.count('TAG') < 4

In [None]:
seq[0:100].count('TAG')

In [None]:
stopCodons = ['TAG', 'TAA', 'TGA']
stopCodons 

In [None]:
indiciesTGG

In [None]:
indiciesStop

In [None]:
length = 200 
center = length/2

In [None]:
arrayStop = np.array(indiciesStop)
arrayIndicies = np.array(indiciesTGG) 
centralTGGs = arrayIndicies[abs(arrayIndicies - center) < 10]

In [None]:
centralTGGs

In [None]:
np.in1d(centralTGGs,arrayStop)

In [None]:
# Check if array contains values that are within range of values in another array 
any((min(abs(arrayStop - i)) > 10) for i in centralTGGs)

In [None]:
centralTGGs

In [None]:
indiciesStop

In [None]:
testStop = [90, 16, 174]

In [None]:
(min(abs(arrayStop - centralTGGs[0])) > 10)

In [None]:
min(abs(arrayStop - centralTGGs[0])) > 10

In [None]:
centeralTGGs = offset.min()
centeralTGGs

In [None]:
centralTGGs = np.all(offset == offset.min())
centralTGGs

In [None]:
centeralTGGs = np.where(offset == offset.min())
centeralTGG

In [None]:
offset = abs(arrayIndicies - center) 
centerTGG = indiciesTGG[np.argmin(offset)]

In [None]:
any(abs(x - centerTGG) < 10 for x in indiciesStop)

In [None]:
indiciesStop = []
for codons in stopCodons:
    indiciesStop.extend([m.start() for m in re.finditer(codons, str(testSeq))])

In [None]:
len(indiciesStop)

In [None]:
[m.start() for m in re.finditer('TGA', str(testSeq))]

In [None]:
[m.start() for m in re.finditer('TAA', str(testSeq))]

In [None]:
[m.start() for m in re.finditer('TAG', str(testSeq))]

In [None]:
# count() takes a single sub-sequence, not a list: total the count for each stop codon
sum(testSeq.count(codon) for codon in stopCodons)

In [None]:
indiciesTGG

In [None]:
len(indiciesTGG)

In [None]:
testSeq = sesRNAs[0]

In [None]:
lastTGG = [m.start() for m in re.finditer('TGG', str(testSeq))][-1]

In [None]:
testSeq

In [None]:
indiciesTGG = [m.start() for m in re.finditer('TGG', str(testSeq))]

In [None]:
start = 0 
stop = 200

In [None]:
middle = (start + stop) / 2

In [None]:
abs(middle - indiciesTGG[0])

In [None]:
type(indiciesTGG)

In [None]:
indiciesTGG

In [None]:
any(indiciesTGG) > 2

In [None]:
length = 200 

In [None]:
any(abs(x - (length/2)) < 20 for x in indiciesTGG)

In [None]:
testList = [50, 60, 170, 200]

In [None]:
any(abs(x - (length/2)) < 10 for x in testList)

In [None]:
testSeq

In [None]:
os.path.isdir('Output/BioPython')

In [None]:
outputFileName = os.getcwd() + ''

In [None]:
testSeq