# Setup 

## Import packages 

In [1]:
# General 
import os 
import numpy as np
import pandas as pd 

# For manipulating string objects 
import re
# for generating any necessary directories
import pathlib 
# For manipulating list objects 
import itertools 

In [2]:
# Show every row when a DataFrame is displayed (None disables row truncation)
pd.set_option('display.max_rows', None)

In [3]:
# Show every column when a DataFrame is displayed (None disables column truncation)
pd.set_option('display.max_columns', None)

In [4]:
# For working with sequence objects 
from Bio.Seq import Seq

In [5]:
# For fetching sequences from Entrez 
from Bio import Entrez  
from Bio import SeqIO

In [6]:
# For extracting features 
from Bio.SeqFeature import SeqFeature, FeatureLocation
# For creating SeqRecord objects 
from Bio.SeqRecord import SeqRecord

# Functions 

In [7]:
import sys
# Importing module of personal functions 
# NOTE(review): both sys.path entries are absolute, machine-specific paths; the imports fail on any other machine until they are edited
sys.path.append('/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR')
# Wildcard import: helper names used later that are not defined in this notebook presumably come from here or from py_modules — TODO confirm
from kCellReadR import * 

sys.path.append('/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/alCellREADR')
from py_modules import *

## Misc

In [8]:
# Species and gene that the sesRNAs are designed for; both are passed to the helper calls below
speciesName = 'Mouse'
geneName = 'Sox14'

# Sequence

## Loading sequences 

In [9]:
# Look up the Ensembl transcript IDs for this species/gene and display them
# (helper is not defined in this notebook; presumably it queries Ensembl — TODO confirm)
ensembl_transcriptIDs = return_ensemblTranscriptIDs(speciesName, geneName)
ensembl_transcriptIDs

['ENSMUST00000054819', 'ENSMUST00000183065']

In [10]:
# Build a table with one row per transcript ID and display it
variantTable = table_transcriptsInfo(ensembl_transcriptIDs)  
variantTable

Unnamed: 0,TranscriptNum,TranscriptID,TranscriptName,Assembly,Type,AA_Length,Is_Canonical
0,1,ENSMUST00000054819,Sox14-201,GRCm39,protein_coding,240,True
1,2,ENSMUST00000183065,Sox14-202,GRCm39,protein_coding,177,False


In [11]:
# Chooses canonical transcript
# NOTE(review): the filter compares against the string 'True', which assumes Is_Canonical holds strings rather than booleans — confirm against table_transcriptsInfo
# .item() extracts the single matching TranscriptNum (it raises ValueError unless exactly one row matches);
# calling int() directly on a one-element Series is deprecated in recent pandas
spliceVariant = int(variantTable.loc[variantTable['Is_Canonical'] == 'True', 'TranscriptNum'].item())
spliceVariant

1

In [12]:
# Display the gene currently selected
geneName

'Sox14'

In [13]:
# Display the species currently selected
speciesName

'Mouse'

In [14]:
# Load the reference sequences for the chosen splice variant, unpacked into five objects
# (the rC_ prefix presumably means reverse complement — TODO confirm against load_referenceSequences)
rC_exon_records, rC_intron_records, CDS, cDNA, genomic = load_referenceSequences(speciesName, geneName, spliceVariant)

In [15]:
# Inspect the exon records (used as the target when targetChoice == 'exon')
rC_exon_records

[SeqRecord(seq=Seq('TTACTTTTTTGCAATGACAGATTGTCTTTATTCAAAACGTCTTTGTTCAACAAA...TGG'), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[])]

In [16]:
# Inspect the intron records
rC_intron_records

[]

In [17]:
# Length of the first CDS record's sequence
len(CDS[0].seq)

723

## Selecting sensor 

In [18]:
import random 
# Draw and print a random float in [0.0, 1.0)
# NOTE(review): no seed is set, so the value changes on every run; n is not referenced again in the visible cells
n = random.random() 
print(n)

0.11579901698759132


In [19]:
# Length of the first genomic record
len(genomic[0])

2065

In [20]:
# Length of the first CDS record
len(CDS[0])

723

In [21]:
# Print the sequence length of each exon record
for exon in rC_exon_records:
    print(len(str(exon.seq)))

2065


In [22]:
# Number of exon records
len(rC_exon_records)

1

In [23]:
# Reverse-complement the first CDS and genomic records, each wrapped in a single-element list
# so they can be assigned to `target` in the same way as rC_exon_records
rC_CDS = [CDS[0].reverse_complement()]
rC_genomic = [genomic[0].reverse_complement()]

In [24]:
# List the candidate lengths in [192, 300) that are multiples of 3 (for checking variable-length sesRNAs)
for i in range(192, 300):
    # Skip any length that does not divide evenly into codons
    if i % 3:
        continue
    print(i)

192
195
198
201
204
207
210
213
216
219
222
225
228
231
234
237
240
243
246
249
252
255
258
261
264
267
270
273
276
279
282
285
288
291
294
297


In [25]:
# Longer the sesRNA ... the better it binds ... but the higher likelihood to introduce stop codons and ATGs 
# Want around 60 bp distance between multiple TAGs ... 
sesRNA_length = 300
targetChoice = 'exon' # exon, cds, genomic
# Positional arguments after sesRNA_length, as echoed by parameters.print_parameters():
# num_inF_TGG=1, num_inF_Stop=2, inF_ATG='None', minGC=40, maxGC=70, nearCenter=20, fromStop=20
parameters = parameters_sesRNA(speciesName, geneName,  spliceVariant, sesRNA_length, 1, 2, 'None', 40, 70, 20, 20)

In [26]:
# Map targetChoice onto the (target, test) pair handed to generate_all_sesRNAs;
# any other value leaves both names untouched
if targetChoice in ('exon', 'cds'):
    # Exon and CDS targets are both paired with the CDS records as `test`
    target = rC_exon_records if targetChoice == 'exon' else rC_CDS
    test = CDS
elif targetChoice == 'genomic':
    target = rC_genomic
    test = genomic

In [27]:
# Echo the configured sesRNA parameters
parameters.print_parameters()

[instance attributes]
species = Mouse
gene = Sox14
spliceVariant = 1
length = 300
num_inF_TGG = 1
num_inF_Stop = 2
inF_ATG = None
minGC = 40
maxGC = 70
nearCenter = 20
fromStop = 20


In [28]:
# Half the sesRNA length; `/` is true division, so the result is a float
sesRNA_length/2

150.0

In [29]:
# Generate the candidate sesRNAs for the chosen target with the configured parameters;
# unpacked into the sequences, a metrics table, and sesRNA objects (per the variable names)
all_sesRNAs, all_sequenceMetrics, all_sesRNA_objs = generate_all_sesRNAs(target, test, parameters, variantTable['Type'])
# Display the metrics table
all_sequenceMetrics

1


  all_sequenceMetrics = all_sequenceMetrics.append(temp_sequenceMetrics)


Unnamed: 0,sesNum,SeqNum,ExonFrac,ExonProtFrac,CDSFrac,CDNAFrac,StartSeq,StopSeq,firstTGG,centralTGG,second_cTGG,numTGG,numTTGG,numTGGA,numTTGGA,numATG,numStop,gcCont
0,1,1,1/2,1/2,1/2,1/2,906,1206,42,168,123,9,1,3,0,0,2,65.7
1,2,1,1/2,1/2,1/2,1/2,909,1209,39,165,120,9,1,3,0,0,2,65.7
2,3,1,1/2,1/2,1/2,1/2,912,1212,36,162,117,9,1,3,0,0,2,65.3
3,4,1,1/2,1/2,1/2,1/2,915,1215,33,159,180,9,1,3,0,0,2,65.7
4,5,1,1/2,1/2,1/2,1/2,918,1218,30,156,177,9,1,3,0,0,2,66.0
5,6,1,1/2,1/2,1/2,1/2,921,1221,27,153,174,9,1,3,0,0,2,66.7
6,7,1,1/2,1/2,1/2,1/2,924,1224,24,150,171,9,1,3,0,0,2,66.3
7,8,1,1/2,1/2,1/2,1/2,927,1227,21,147,168,9,1,3,0,0,2,66.7
8,9,1,1/2,1/2,1/2,1/2,930,1230,18,144,165,9,1,3,0,0,2,67.0
9,10,1,1/2,1/2,1/2,1/2,933,1233,15,141,162,9,1,3,0,0,2,67.0


# Secondary structure 

## Working 

In [30]:
# Run the RNA prediction helper on the candidates; the displayed table gains an `mfe` column
# NOTE(review): all_sequenceMetrics also shows `mfe` in the following cell, so the helper appears to update it in place — TODO confirm
generate_RNApred(all_sesRNAs, all_sequenceMetrics, geneName, 1)

Unnamed: 0,sesNum,SeqNum,ExonFrac,ExonProtFrac,CDSFrac,CDNAFrac,StartSeq,StopSeq,firstTGG,centralTGG,second_cTGG,numTGG,numTTGG,numTGGA,numTTGGA,numATG,numStop,gcCont,mfe
0,1,1,1/2,1/2,1/2,1/2,906,1206,42,168,123,9,1,3,0,0,2,65.7,-125.4
1,2,1,1/2,1/2,1/2,1/2,909,1209,39,165,120,9,1,3,0,0,2,65.7,-129.8
2,3,1,1/2,1/2,1/2,1/2,912,1212,36,162,117,9,1,3,0,0,2,65.3,-124.4
3,4,1,1/2,1/2,1/2,1/2,915,1215,33,159,180,9,1,3,0,0,2,65.7,-125.0
4,5,1,1/2,1/2,1/2,1/2,918,1218,30,156,177,9,1,3,0,0,2,66.0,-128.7
5,6,1,1/2,1/2,1/2,1/2,921,1221,27,153,174,9,1,3,0,0,2,66.7,-129.4
6,7,1,1/2,1/2,1/2,1/2,924,1224,24,150,171,9,1,3,0,0,2,66.3,-131.6
7,8,1,1/2,1/2,1/2,1/2,927,1227,21,147,168,9,1,3,0,0,2,66.7,-130.5
8,9,1,1/2,1/2,1/2,1/2,930,1230,18,144,165,9,1,3,0,0,2,67.0,-128.9
9,10,1,1/2,1/2,1/2,1/2,933,1233,15,141,162,9,1,3,0,0,2,67.0,-131.4


In [31]:
# Display the metrics table again (the recorded output now includes the mfe column)
all_sequenceMetrics

Unnamed: 0,sesNum,SeqNum,ExonFrac,ExonProtFrac,CDSFrac,CDNAFrac,StartSeq,StopSeq,firstTGG,centralTGG,second_cTGG,numTGG,numTTGG,numTGGA,numTTGGA,numATG,numStop,gcCont,mfe
0,1,1,1/2,1/2,1/2,1/2,906,1206,42,168,123,9,1,3,0,0,2,65.7,-125.4
1,2,1,1/2,1/2,1/2,1/2,909,1209,39,165,120,9,1,3,0,0,2,65.7,-129.8
2,3,1,1/2,1/2,1/2,1/2,912,1212,36,162,117,9,1,3,0,0,2,65.3,-124.4
3,4,1,1/2,1/2,1/2,1/2,915,1215,33,159,180,9,1,3,0,0,2,65.7,-125.0
4,5,1,1/2,1/2,1/2,1/2,918,1218,30,156,177,9,1,3,0,0,2,66.0,-128.7
5,6,1,1/2,1/2,1/2,1/2,921,1221,27,153,174,9,1,3,0,0,2,66.7,-129.4
6,7,1,1/2,1/2,1/2,1/2,924,1224,24,150,171,9,1,3,0,0,2,66.3,-131.6
7,8,1,1/2,1/2,1/2,1/2,927,1227,21,147,168,9,1,3,0,0,2,66.7,-130.5
8,9,1,1/2,1/2,1/2,1/2,930,1230,18,144,165,9,1,3,0,0,2,67.0,-128.9
9,10,1,1/2,1/2,1/2,1/2,933,1233,15,141,162,9,1,3,0,0,2,67.0,-131.4


In [32]:
# Code block for testing against different species than sesRNA was generated against 
#test_speciesName = 'Crab-eating macaque'
test_speciesName = speciesName

# Load the test species' transcripts and reference sequences only if it differs from the initial species 
if test_speciesName != speciesName:
    test_ensembl_transcriptIDs = return_ensemblTranscriptIDs(test_speciesName, geneName)
    test_variantTable = table_transcriptsInfo(test_ensembl_transcriptIDs)
    # Chooses canonical transcript
    # .item() extracts the single matching TranscriptNum (it raises ValueError unless exactly one row matches);
    # calling int() directly on a one-element Series is deprecated in recent pandas
    test_spliceVariant = int(test_variantTable.loc[test_variantTable['Is_Canonical'] == 'True', 'TranscriptNum'].item())
    # NOTE(review): the second unpacked name is test_C_exon_records, whereas the same position is
    # rC_intron_records in the earlier load_referenceSequences call — confirm the intended name
    test_rC_exon_records, test_C_exon_records, test_CDS, test_cDNA, test_genomic = load_referenceSequences(test_speciesName, geneName, test_spliceVariant)

In [33]:
# Run RIblast for each sesRNA against the test species' cDNA; returns the metrics table
# extended with the interaction-energy columns plus the RIblast output table
# NOTE(review): spliceVariant (from the original species) is passed here rather than test_spliceVariant —
# confirm this is intended when test_speciesName differs from speciesName
metricsTable_higherOrder, outputRIblast = output_RIblast(all_sequenceMetrics, geneName, test_speciesName, spliceVariant, 'cDNA')
outputRIblast

/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/EnsemblSeq/Mouse/Sox14_cdna-1_Mouse.fasta
RIblast ris mode has started.
Rna interaction search of query:Sox14_01 sesRNA #01 has started.
Rna interaction search of query:Sox14_01 sesRNA #01 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_01.fasta


  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)
  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)


RIblast ris mode has started.
Rna interaction search of query:Sox14_02 sesRNA #02 has started.
Rna interaction search of query:Sox14_02 sesRNA #02 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_02.fasta
RIblast ris mode has started.
Rna interaction search of query:Sox14_03 sesRNA #03 has started.
Rna interaction search of query:Sox14_03 sesRNA #03 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_03.fasta
RIblast ris mode has started.
Rna interaction search of query:Sox14_04 sesRNA #04 has started.
Rna interaction search of query:Sox14_04 sesRNA #04 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_04.fasta


  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)
  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)


RIblast ris mode has started.
Rna interaction search of query:Sox14_05 sesRNA #05 has started.
Rna interaction search of query:Sox14_05 sesRNA #05 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_05.fasta
RIblast ris mode has started.
Rna interaction search of query:Sox14_06 sesRNA #06 has started.


  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)
  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)


Rna interaction search of query:Sox14_06 sesRNA #06 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_06.fasta
RIblast ris mode has started.
Rna interaction search of query:Sox14_07 sesRNA #07 has started.
Rna interaction search of query:Sox14_07 sesRNA #07 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_07.fasta


  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)
  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)


RIblast ris mode has started.
Rna interaction search of query:Sox14_08 sesRNA #08 has started.
Rna interaction search of query:Sox14_08 sesRNA #08 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_08.fasta
RIblast ris mode has started.
Rna interaction search of query:Sox14_09 sesRNA #09 has started.
Rna interaction search of query:Sox14_09 sesRNA #09 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_09.fasta
RIblast ris mode has started.
Rna interaction search of query:Sox14_10 sesRNA #10 has started.


  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)


Rna interaction search of query:Sox14_10 sesRNA #10 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_10.fasta
RIblast ris mode has started.
Rna interaction search of query:Sox14_11 sesRNA #11 has started.


  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)


Rna interaction search of query:Sox14_11 sesRNA #11 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_11.fasta
RIblast ris mode has started.
Rna interaction search of query:Sox14_12 sesRNA #12 has started.


  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)


Rna interaction search of query:Sox14_12 sesRNA #12 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_12.fasta
RIblast ris mode has started.
Rna interaction search of query:Sox14_13 sesRNA #13 has started.
Rna interaction search of query:Sox14_13 sesRNA #13 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_13.fasta


  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)
  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)


RIblast ris mode has started.
Rna interaction search of query:Sox14_14 sesRNA #14 has started.
Rna interaction search of query:Sox14_14 sesRNA #14 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_14.fasta
RIblast ris mode has started.
Rna interaction search of query:Sox14_15 sesRNA #15 has started.


  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)
  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)


Rna interaction search of query:Sox14_15 sesRNA #15 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_15.fasta
RIblast ris mode has started.
Rna interaction search of query:Sox14_16 sesRNA #16 has started.
Rna interaction search of query:Sox14_16 sesRNA #16 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_16.fasta


  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)
  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)


RIblast ris mode has started.
Rna interaction search of query:Sox14_17 sesRNA #17 has started.
Rna interaction search of query:Sox14_17 sesRNA #17 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_17.fasta
RIblast ris mode has started.
Rna interaction search of query:Sox14_18 sesRNA #18 has started.
Rna interaction search of query:Sox14_18 sesRNA #18 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_18.fasta
RIblast ris mode has started.
Rna interaction search of query:Sox14_19 sesRNA #19 has started.
Rna interaction search of query:Sox14_19 sesRNA #19 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_19.fasta


  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)
  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)


RIblast ris mode has started.
Rna interaction search of query:Sox14_20 sesRNA #20 has started.
Rna interaction search of query:Sox14_20 sesRNA #20 has finished.
RIblast ris mode has finished.
/Users/kbw29/Dropbox/Research/Neurobiology_PhD/Huang/Projects/CellReadR/Code/CellREADR/Output/BioPython/Temp/Sox14_20.fasta


  useful_RIblast = useful_RIblast.append(temp_RIblast_ouput)


Unnamed: 0,Accessibility Energy,Hybridization Energy,Interaction Energy,BasePair,Accessibility Energy.1,Hybridization Energy.1,Interaction Energy.1,BasePair.1
0,152.493,-443.62,-291.127,(0-299:1158-859),17.6446,-33.61,-15.9654,(9-59:1350-1307)
0,154.181,-444.23,-290.049,(0-299:1155-856),17.9628,-33.61,-15.6472,(6-56:1350-1307)
0,151.148,-442.44,-291.292,(0-299:1152-853),24.9842,-38.95,-13.9658,(80-130:276-209)
0,150.547,-441.78,-291.233,(0-299:1149-850),24.9068,-38.95,-14.0432,(77-127:276-209)
0,153.725,-443.31,-289.585,(0-299:1146-847),24.8673,-38.95,-14.0827,(74-124:276-209)
0,154.647,-445.11,-290.463,(0-299:1143-844),24.7865,-38.95,-14.1635,(71-121:276-209)
0,155.235,-444.53,-289.295,(0-299:1140-841),24.8539,-38.95,-14.0961,(68-118:276-209)
0,155.459,-445.96,-290.501,(0-299:1137-838),25.0115,-38.95,-13.9385,(65-115:276-209)
0,157.156,-445.95,-288.794,(0-299:1134-835),25.1237,-38.95,-13.8263,(62-112:276-209)
0,157.252,-446.84,-289.588,(0-299:1131-832),25.2016,-38.95,-13.7484,(59-109:276-209)


In [34]:
# Rank the candidates by interaction energy, ascending (most negative first)
# Note: the column key has a leading space; sort_values returns a new frame, so metricsTable_higherOrder itself is not reordered
metricsTable_higherOrder.sort_values(' Interaction Energy', axis = 0)

Unnamed: 0,sesNum,SeqNum,ExonFrac,ExonProtFrac,CDSFrac,CDNAFrac,StartSeq,StopSeq,firstTGG,centralTGG,second_cTGG,numTGG,numTTGG,numTGGA,numTTGGA,numATG,numStop,gcCont,mfe,Accessibility Energy,Hybridization Energy,Interaction Energy,BasePair
2,3,1,1/2,1/2,1/2,1/2,912,1212,36,162,117,9,1,3,0,0,2,65.3,-124.4,151.148,-442.44,-291.292,(0-299:1152-853)
3,4,1,1/2,1/2,1/2,1/2,915,1215,33,159,180,9,1,3,0,0,2,65.7,-125.0,150.547,-441.78,-291.233,(0-299:1149-850)
0,1,1,1/2,1/2,1/2,1/2,906,1206,42,168,123,9,1,3,0,0,2,65.7,-125.4,152.493,-443.62,-291.127,(0-299:1158-859)
7,8,1,1/2,1/2,1/2,1/2,927,1227,21,147,168,9,1,3,0,0,2,66.7,-130.5,155.459,-445.96,-290.501,(0-299:1137-838)
5,6,1,1/2,1/2,1/2,1/2,921,1221,27,153,174,9,1,3,0,0,2,66.7,-129.4,154.647,-445.11,-290.463,(0-299:1143-844)
1,2,1,1/2,1/2,1/2,1/2,909,1209,39,165,120,9,1,3,0,0,2,65.7,-129.8,154.181,-444.23,-290.049,(0-299:1155-856)
9,10,1,1/2,1/2,1/2,1/2,933,1233,15,141,162,9,1,3,0,0,2,67.0,-131.4,157.252,-446.84,-289.588,(0-299:1131-832)
4,5,1,1/2,1/2,1/2,1/2,918,1218,30,156,177,9,1,3,0,0,2,66.0,-128.7,153.725,-443.31,-289.585,(0-299:1146-847)
6,7,1,1/2,1/2,1/2,1/2,924,1224,24,150,171,9,1,3,0,0,2,66.3,-131.6,155.235,-444.53,-289.295,(0-299:1140-841)
8,9,1,1/2,1/2,1/2,1/2,930,1230,18,144,165,9,1,3,0,0,2,67.0,-128.9,157.156,-445.95,-288.794,(0-299:1134-835)


# Final sesRNA check

In [61]:
# Sequence of the candidate at index 2 as a plain string
# NOTE(review): execution counts in this section are out of order (61, 38, 59, 60, 52, 58), so these outputs may not reproduce under Restart & Run All
str(all_sesRNAs[2])

'CTGGGAAGAGGATGTAGGCGACAGGGGGCTGCAGGGTGGAGGCAGACCAGGCGGTACAGTTACAGGGCACCACGTAGCCCGGGTTGGTGGGGGACGGGTGCGTGTGCGTGTGCTGGCTGGGGCAGCTGAGACTGCCGAAGGCGCCGTTCTGATAGCCCAGGGTGGACGCGTAGGGCAGCGCGCTGGTGGCCAACGTGTGGGGCACTTCACCCATCTTCTGGATGGCACTGGAACTAAATTGCGCAGGGTCCAGCAGGGAGTAAGGTGCCGAGGCAGGCGGCAAGAAGGCCCGGGCTTTCT'

In [38]:
# Hard-coded test sequence
# NOTE(review): its provenance is not recorded in this notebook, and the same literal is assigned again in two later cells
testSeq = 'CCTCCTCGCTGCCCTCGGACTTGAGGATGTCCATCTGCAGCCCTTGCCGATGCTCCATGTCCAGGTCGTCGCAGTGGGCGAAGCCCACCGCCTCCTCGTCGGTGGCCGCCTGGAAGCCCATCCTGGCGAACATGCCGCTCATCTTCGCCTGGGATTTGTGCGACACCGAGGTGGCCACGTTGGAGAGCTTGCTGCGGAGGAGGG'

In [59]:
# Request the complexity result for the candidate at index 2
# NOTE(review): `idtdna` is not imported explicitly in the visible cells; presumably it arrives through one of the wildcard imports — TODO confirm
output_json = idtdna.return_complexity_score(str(all_sesRNAs[2]))

In [60]:
# Inspect the returned result (a dict in the recorded output)
output_json

{'ActualValue': 65.3,
 'DisplayText': 'The overall GC content of the sequence is 65.3%. Solution: Redesign to reduce the GC content below 58%.',
 'ForwardLocations': [],
 'IsViolated': True,
 'Name': 'Overall High GC',
 'RepeatedSegment': None,
 'ReverseLocations': [],
 'Score': 7.3,
 'StartIndex': 0,
 'TerminalEnd': 0,
 'ThresholdOutput': {'Value': 58,
  'ThresholdType': 1,
  'MinLength': 0,
  'MaxLength': 0,
  'MinPercentage': 0.0,
  'MaxPercentage': 0.0,
  'WindowLength': 0,
  'Quantity': 0},
 'ServiceProductId': 0,
 'MinimumRepeatLength': 0,
 'RepeatPercentage': 0.0,
 'GCPercentage': 0.0,
 'Length': 0,
 'Rank': 0.0}

In [52]:
# Collect the 'Score' field of the complexity result for every candidate sesRNA
scores = [
    idtdna.return_complexity_score(str(sesRNA))['Score']
    for sesRNA in all_sesRNAs
]

In [58]:
# Display the complexity scores, one per candidate
scores 

[7.7,
 7.7,
 7.3,
 7.7,
 8.0,
 9.7,
 8.3,
 8.7,
 9.0,
 9.7,
 13.0,
 11.3,
 11.3,
 11.3,
 13.0,
 11.3,
 13.0,
 16.3,
 18.0,
 16.3]

## Choose sesRNA

In [None]:
# 1-based choice of sesRNA (presumably matching the sesNum column — TODO confirm), converted to a 0-based list index
sesChoice = 3
chosenSequence = all_sesRNAs[sesChoice - 1]

In [None]:
# Chosen sequence as a plain string
str(chosenSequence)

In [None]:
# In-frame metrics for the chosen sequence; the 'all' mode is unpacked into nine values elsewhere in this notebook
return_inFrame(chosenSequence, 'all')

In [None]:
from Bio.Restriction import *

In [None]:
# True when the chosen sequence has no EcoRI site (search returns no cut positions)
len(EcoRI.search(chosenSequence)) == 0

In [None]:
# True when the chosen sequence has no AscI site (search returns no cut positions)
len(AscI.search(chosenSequence)) == 0

In [None]:
# True when the chosen sequence has no HindIII site (search returns no cut positions)
len(HindIII.search(chosenSequence)) == 0

In [None]:
# Hard-coded test sequence (same literal as assigned earlier in the notebook)
testSeq = 'CCTCCTCGCTGCCCTCGGACTTGAGGATGTCCATCTGCAGCCCTTGCCGATGCTCCATGTCCAGGTCGTCGCAGTGGGCGAAGCCCACCGCCTCCTCGTCGGTGGCCGCCTGGAAGCCCATCCTGGCGAACATGCCGCTCATCTTCGCCTGGGATTTGTGCGACACCGAGGTGGCCACGTTGGAGAGCTTGCTGCGGAGGAGGG'

In [None]:
# NOTE(review): duplicate of the assignment in the preceding cell; one of the two can be removed
testSeq = 'CCTCCTCGCTGCCCTCGGACTTGAGGATGTCCATCTGCAGCCCTTGCCGATGCTCCATGTCCAGGTCGTCGCAGTGGGCGAAGCCCACCGCCTCCTCGTCGGTGGCCGCCTGGAAGCCCATCCTGGCGAACATGCCGCTCATCTTCGCCTGGGATTTGTGCGACACCGAGGTGGCCACGTTGGAGAGCTTGCTGCGGAGGAGGG'

In [None]:
# In-frame metrics for the hard-coded test sequence (wrapped in a Seq first)
return_inFrame(Seq(testSeq), 'all')

In [None]:
# Save every candidate sesRNA via the helper (the output location is decided inside the helper and is not visible here)
save_all_sesRNAs_DNA(all_sesRNAs, speciesName, geneName)

In [None]:
# Show the AscI recognition sequence
AscI.site

# Outputting sesRNA

In [None]:
# Short toy sequence for exercising convert_DNA, plus its plain-string form
# (this rebinds testSeq from the str used above to a Seq object)
testSeq = Seq('TGGGAGTAGTGGTGGTAATGA')
testStr = str(testSeq)

In [None]:
# Display the string form of the toy sequence
testStr

In [None]:
# Convert the toy sequence with the second argument set to 1 (presumably the number of TGGs to convert — TODO confirm)
convert_DNA(testSeq, 1)

In [None]:
# Check that 'All' and an explicit number give the same result
# (the toy sequence has three in-frame TGG codons, so 3 is presumably equivalent to 'All' — TODO confirm)
convert_DNA(testSeq, 3) == convert_DNA(testSeq, 'All')

In [None]:
# NOTE(review): rC_multiExon_sesRNAs is not defined in any visible cell; on a fresh kernel this raises NameError unless it is defined elsewhere
convert_DNA(rC_multiExon_sesRNAs[0], 'All')

In [None]:
# Testing that a continuous reading frame is produced if the number is set to 0:
# translating up to the first stop should yield one residue per codon of the full sequence
len(convert_DNA(rC_multiExon_sesRNAs[0], 0).translate(to_stop = True)) == len(rC_multiExon_sesRNAs[0])/3

In [None]:
# Checking that the right number of TGGs is being converted: back-transcribe the converted sequence and count in-frame TGGs
return_inFrame(convert_DNA(rC_multiExon_sesRNAs[0], 'All').back_transcribe(), 'numTGG')

In [None]:
# Function for saving both original template DNA sequence and the converted RNA given list of sesRNAs Seq objects
def save_sesRNAs(sequences_sesRNAs, geneName, version, numConvertTGG):
    """Write sesRNA template DNA and converted RNA sequences to two FASTA files.

    The files are written under ``Output/BioPython/`` (created if missing):

    * ``<geneName>_sesRNA_<version>.fasta`` holds the original template DNA.
    * ``<geneName>_sesRNA_convertedRNA_<version>.fasta`` holds the converted RNA.

    Records in both files are numbered from 1 in input order and share the
    same IDs (``<geneName>_sesRNA<i>``), which is why they go to separate files.

    Parameters
    ----------
    sequences_sesRNAs : iterable of Seq
        The sesRNA template DNA sequences to save.
    geneName : str
        Gene name used in the record IDs, descriptions and file names.
    version : str or int
        Version label inserted into the output file names.
    numConvertTGG : int or str
        Passed through unchanged as the second argument of ``convert_DNA``.

    Returns
    -------
    None
    """
    # Generating BioPython directory if does not exist 
    pathlib.Path('Output/BioPython').mkdir(parents=True, exist_ok=True)

    # Defining save name and description for outputs (original DNA and converted RNA)
    DNA_outputID = geneName + '_sesRNA'
    DNA_outputDescription = "sesRNA DNA original for " + geneName
    convertedRNA_outputID = geneName + '_sesRNA'
    convertedRNA_outputDescription = "sesRNA converted RNA for " + geneName

    # Creating empty list for storing sequences 
    outputSeqMulti_DNA = []
    outputSeqMulti_convertedRNA = []

    # Generating SeqRecord objects in preparation for writing FASTA files ... the ID includes the number of the sequence
    for i, sequence in enumerate(sequences_sesRNAs, start=1):
        outputSeqMulti_DNA.append(
            SeqRecord(sequence, id=DNA_outputID + str(i), description=DNA_outputDescription)
        )
        # Fixed: the original referenced the undefined name `numberConvertTGG` here
        outputSeqMulti_convertedRNA.append(
            SeqRecord(
                convert_DNA(sequence, numConvertTGG),
                id=convertedRNA_outputID + str(i),
                description=convertedRNA_outputDescription,
            )
        )

    # Output file names; str() lets a numeric version label be passed as well
    DNA_outputName = "Output/BioPython/" + geneName + "_sesRNA_" + str(version) + ".fasta"
    convertedRNA_outputName = "Output/BioPython/" + geneName + "_sesRNA_convertedRNA_" + str(version) + ".fasta"

    # Fixed: the original opened the undefined name `outputName` and wrote both record sets to that one handle.
    # Writing original template DNA file
    with open(DNA_outputName, "w") as output_handle:
        SeqIO.write(outputSeqMulti_DNA, output_handle, "fasta")
    # Writing converted RNA file
    with open(convertedRNA_outputName, "w") as output_handle:
        SeqIO.write(outputSeqMulti_convertedRNA, output_handle, "fasta")

# Not working or not being used 

In [None]:
# Unpack every in-frame metric for the reverse complement of seqTdT:
# six counts followed by three index collections (per the variable names)
# NOTE(review): seqTdT is not defined in any visible cell
(
    num_inF_TGG, num_inF_TTGG, num_inF_TGGA, num_inF_TTGGA, num_inF_ATG, num_inF_Stop,
    indices_inF_TGG, indices_inF_ATG, indices_inF_Stop,
) = return_inFrame(Seq(seqTdT).reverse_complement(), 'all')

In [None]:
# Display the in-frame TGG count unpacked above
num_inF_TGG

In [None]:
# Display the in-frame TTGG count unpacked above
num_inF_TTGG

In [None]:
# Display the in-frame TGGA count unpacked above
num_inF_TGGA

In [None]:
# Display the in-frame TTGGA count unpacked above
num_inF_TTGGA

In [None]:
# Trying to load the entire transcriptome ... obviously failed ... 
# NOTE(review): absolute path under /home/user1, which differs from the /Users/kbw29 paths used at the top of the notebook
allRat_fileName = '/home/user1/Dropbox/Research/Neurobiology_PhD/Rotations/Huang/Projects/CellReadR/Data/Sequences/Reference/Ensembl/All/Rattus_norvegicus.Rnor_6.0.cdna.all.fa'
# list() materializes every parsed record in memory at once
allRat = list(SeqIO.parse(allRat_fileName, "fasta"))