# Gillespie Benchmark Performance Figure

In [1]:
import numpy as np
import helperfunctions as hf
import RFE_landscape as RFE
import matplotlib.pyplot as plt
import glob, sys, os

In [2]:
class Gillespie:
    """Stochastic simulation of RNA folding in which each move adds or breaks one stem.

    The stems, the compatibility matrix, the enumerated structures and the
    entropies all come from ``RFE.RNALandscape`` and the ``hf`` helper module
    (see ``initialize``). ``runGillespie`` then calls ``MonteCarloStep``
    repeatedly until the simulated time reaches ``maxTime`` and returns the
    final structure in dot-bracket notation.
    """

    def __init__(self, sequence, frozen, maxTime, toPrint = True):
        """Precompute the folding landscape of ``sequence`` and reset the simulation state.

        Parameters
        ----------
        sequence : the RNA sequence to fold.
        frozen   : stored on the instance; not read anywhere else in this class.
        maxTime  : simulated time at which ``runGillespie`` stops.
        toPrint  : when true, every accepted move is printed.
        """

        # We begin by initializing all the components that we will need throughout the algorithm.
        # This includes all possible stems, structures, entropies, and Gibbs free energy.

        self.sequence = sequence # sequence that we will fold
        self.frozen = frozen # if we have a frozen stem that we would like to include in the final calculation

        # Names returned by initialize(), in order:
        # STableBPs, STableStructure, compatibilityMatrix, allStructures, allStructures2, stemEnergies, stemEntropies, totalEntropies
        self.allPossibleStems, self.STableStructure, self.compatibilityMatrix, self.allStructures, self.allStructures2, self.stemEnergies, self.stemEntropies, self.totalEntropies = self.initialize(sequence)

        # Same stems as allPossibleStems, each paired with its index: [stem, index].
        self.allPossibleStems2 = [ [self.allPossibleStems[i], i] for i in range(len(self.allPossibleStems))]

        # need to convert the enthalpy to the Gibbs free energy
        # (310.15 is presumably the temperature in kelvin -- it is passed as T = 310.15 elsewhere in this class)
        self.stemGFEnergies = hf.LegendreTransform(self.stemEnergies, self.stemEntropies, 310.15)

        # initialize the current structure arrays
        self.currentStructure = []          # stems currently formed
        self.stemsInCurrentStructure = []   # indices of those stems, kept parallel to currentStructure

        # initial starting values for the flux, time, and cutoff
        self.totalFlux = 0
        self.maxTime = maxTime
        self.time = 0

        self.nextPossibleStems = [] #initialize
        self.nextPossibleRates = [] #initialize

        self.toPrint = toPrint

    def initialize(self, sequence):
        """Build the free-energy landscape for ``sequence`` and collect what the simulation needs.

        Returns the 8-tuple ``(STableBPs, STableStructure, compatibilityMatrix,
        allStructures, allStructures2, stemEnergies, stemEntropies, totalEntropies)``.
        """
        # See Kimchi et al. (https://www.biorxiv.org/content/10.1101/338921v1)

        # Call RNALandscape to initialize all the quantities that we will need throughout our algorithm

        q = RFE.RNALandscape([sequence])
        q.calculateFELandscape()

        sequenceInNumbers = q.sequenceInNumbers
        numStems = q.numStems
        STableStructure = q.STableStructure
        STableBPs = q.STableBPs
        compatibilityMatrix = q.C
        stemEnergies, stemEntropies = hf.calculateStemFreeEnergiesPairwise(numStems, STableStructure, sequenceInNumbers)
        allLoopEntropies = q.allLoopEntropies
        allBondEntropies = q.allBondEntropies
        allDuplexEntropies = q.allDuplexEntropies
        # allStructures is q.structures converted by hf.structure2stem; allStructures2 is the raw q.structures
        allStructures = hf.structure2stem(q.structures, STableBPs)
        allStructures2 = q.structures

        totalEntropies = hf.totalEntropyPerStructure(allLoopEntropies, allBondEntropies, allDuplexEntropies)

        return(STableBPs, STableStructure, compatibilityMatrix, allStructures, allStructures2, stemEnergies, stemEntropies, totalEntropies)

    def convert2dot(self, currentStructure):
        """Return ``currentStructure`` as a dot-bracket string of length ``len(self.sequence)``.

        ``currentStructure`` is indexed as ``currentStructure[stem][pair][0 or 1]``:
        element 0 of each pair is written as '(' and element 1 as ')'. Every
        position that belongs to no pair is written as '.'.
        """
        # Function to convert the notation of the current structure to dot bracket notation
        # Not written to handle pseudoknots yet
        representation = ''
        dotbracket = [0]*len(self.sequence)
        # find the pseudoknots first and add those in the dotbracket notation

        for i in range(len(currentStructure)):
            for j in range(len(currentStructure[i])):
                # NOTE(review): the local name `open` shadows the builtin inside this method
                open = currentStructure[i][j][0]
                close = currentStructure[i][j][1]
                dotbracket[open] = 1
                dotbracket[close] = 2
        # convert 0's, 1's, and 2's into '.', '(', ')'

        for element in dotbracket:
            if element == 0:
                representation += '.'
            elif element == 1:
                representation += '('
            else:
                representation += ')'

        return(representation)

    def MonteCarloStep(self):
        """Advance the simulation by one move (add a stem or break a stem).

        The simulated time is advanced first, then a move is drawn with the
        random number ``r1`` from the normalized rates. Returns ``self`` once a
        move has been applied.

        NOTE(review): if no index satisfies ``trial >= r1`` (or, in the break
        branch, the chosen index is not found in ``stemsInCurrentStructure``)
        the method falls through and implicitly returns ``None``; the time has
        already been advanced in that case.
        """

        # Begin with any frozen constraints that will need to be considered, but we will add this
        # component last

        # This is our first move!
        # (self.time starts at 0, so this branch runs while the time is still falsy)
        if not self.time:
            C = self.compatibilityMatrix #for readability we rename this

            # generate two random numbers
            r1 = np.random.random()
            r2 = np.random.random()

            # calculate the transition rates for all the states via the hf helper module:
            # kind = 1 with the stem entropies (stored as allRates),
            # kind = 0 with the stem Gibbs free energies (stored as ratesBreak).
            self.allRates = hf.calculateStemRates(self.stemEntropies, kB =  0.0019872, T = 310.15, kind = 1)
            self.ratesBreak = hf.calculateStemRates(self.stemGFEnergies, kB = 0.0019872, T = 310.15, kind = 0)
            self.totalFlux = sum([r[0] for r in self.allRates]) # the sum of all the rates
            # NOTE(review): the waiting time is not divided by totalFlux here; the
            # flux-scaled increment is the commented-out line below.
            self.time += abs(np.log(r2))
            #self.time += (abs(np.log(r2))/self.totalFlux) # increment the reaction time for next state
            normalizedRates = hf.normalize(self.allRates) # normalize the rates such that they sum to one


            for i in range(len(normalizedRates)):
                # the slice [:i] excludes entry i itself
                # NOTE(review): hf.partialSum is not visible here -- confirm it sums the rate component
                trial = hf.partialSum(normalizedRates[:i])

                if trial >= r1:
                    stateEntropy = self.stemEntropies[i]
                    nextMove = self.allPossibleStems[i]
                    self.currentStructure.append(nextMove)  # append the stem to the current structure
                    self.stemsInCurrentStructure.append(i)  # append the index of this stem into a list to keep track of what stems are coming in and out of current structure
                    # update the user on what move was made
                    if self.toPrint:
                        print('Time: %0.4fs | Added Stem: %s | Current Structure: %s' %(self.time, str(nextMove), self.convert2dot(self.currentStructure)))


                    # we now need to calculate the next set of possible moves and
                    # the rates corresponding to these moves

                    for m in range(len(self.allPossibleStems)):
                        if C[i, m] and m != i:
                            self.nextPossibleStems.append([self.allPossibleStems[m], m]) # format of this array will be [stem_m , and m = index of stem from larger array]

                    trialStructures, trialIndices = hf.makeTrialStructures(self.currentStructure, self.nextPossibleStems, self.allStructures, len(self.sequence))
                    self.nextPossibleRates = hf.updateReactionRates(trialStructures, trialIndices, self.allStructures, self.totalEntropies, stateEntropy, len(self.sequence))
                    # the rate for breaking the stem just added goes at the front of the list
                    self.nextPossibleRates.insert(0, self.ratesBreak[i])

                    self.totalFlux = sum([r[0] for r in self.nextPossibleRates])

                    self.nextPossibleRates = hf.normalize(self.nextPossibleRates)
                    return(self)

        else:
            # Now we are in our 2+ move.

            # generate two random numbers
            r1 = np.random.random()
            r2 = np.random.random()

            # update time
            # NOTE(review): as in the first move, the increment is not divided by totalFlux
            #self.time += (abs(np.log(r2))/self.totalFlux)
            self.time += abs(np.log(r2))
            # find the next move
            for i in range(len(self.nextPossibleRates)):
                trial = hf.partialSum(self.nextPossibleRates[:i])

                if trial >= r1:

                    # Each rate entry is read as [rate, add-flag, stem index].
                    if self.nextPossibleRates[i][1]: # this will be true if we have chosen to add a stem
                        # 0.0019872 is the same value passed as kB to hf.calculateStemRates
                        stateEntropy = (0.0019872)* np.log(self.nextPossibleRates[i][0])
                        index = self.nextPossibleRates[i][2] # the index of the stem that we will add
                        nextMove = hf.findStem(index, self.nextPossibleStems)
                        stemIndex = nextMove[1]

                        self.currentStructure.append(nextMove[0])
                        self.stemsInCurrentStructure.append(stemIndex)
                        if self.toPrint:
                            print('Time: %0.4fs | Added Stem: %s | Current Structure: %s' %(self.time, str(nextMove[0]), self.convert2dot(self.currentStructure)))
                        # check for new stems that could be compatible with the structure
                        self.nextPossibleStems = hf.findNewStems(self.stemsInCurrentStructure, self.allPossibleStems2, self.allStructures2)
                        trialStructures, trialIndices = hf.makeTrialStructures(self.currentStructure, self.nextPossibleStems, self.allStructures, len(self.sequence))
                        self.nextPossibleRates = hf.updateReactionRates(trialStructures, trialIndices, self.allStructures, self.totalEntropies, stateEntropy, len(self.sequence))

                        # prepend one break rate for every stem currently in the structure
                        for ind in self.stemsInCurrentStructure:
                            self.nextPossibleRates.insert(0, hf.findRate(ind, self.ratesBreak))
                        self.totalFlux = sum([r[0] for r in self.nextPossibleRates])
                        self.nextPossibleRates = hf.normalize(self.nextPossibleRates)
                        return(self)

                    else:
                        # we have chosen to break a stem
                        # We will now find the stem to break in our current structure, then populate a list of
                        # new stems to consider for the next move.
                        stemIndexToRemove = self.nextPossibleRates[i][2]
                        stemToBreak = hf.findStem(stemIndexToRemove, self.allPossibleStems2)
                        stateEntropy = (0.0019872) * np.log(self.nextPossibleRates[i][0])
                        for k in range(len(self.stemsInCurrentStructure)): #searching for the stem to break
                            if stemIndexToRemove == self.stemsInCurrentStructure[k]:
                                # deleting during the scan is safe only because the method returns
                                # a few lines below, before the next iteration of this loop
                                del self.currentStructure[k]
                                del self.stemsInCurrentStructure[k]
                                if self.toPrint:
                                    print('Time: %0.4fs | Broke Stem: %s | Current Structure: %s' %(self.time, str(stemToBreak[0]), self.convert2dot(self.currentStructure)))

                                if len(self.currentStructure) == 0:

                                    # no stems left: go back to the full list of stems and their rates
                                    self.nextPossibleRates = hf.normalize(self.allRates)
                                    self.nextPossibleStems = self.allPossibleStems2
                                    # NOTE(review): here the flux is summed after normalization, unlike the other branches
                                    self.totalFlux = sum([r[0] for r in self.nextPossibleRates])
                                else:
                                    self.nextPossibleStems = hf.findNewStems(self.stemsInCurrentStructure, self.allPossibleStems2, self.allStructures2)
                                    trialStructures, trialIndices = hf.makeTrialStructures(self.currentStructure, self.nextPossibleStems, self.allStructures, len(self.sequence))
                                    self.nextPossibleRates = hf.updateReactionRates(trialStructures, trialIndices, self.allStructures, self.totalEntropies, stateEntropy, len(self.sequence))
                                    for ind in self.stemsInCurrentStructure:
                                        self.nextPossibleRates.insert(0, hf.findRate(ind, self.ratesBreak))
                                    self.totalFlux = sum([r[0] for r in self.nextPossibleRates])
                                    self.nextPossibleRates = hf.normalize(self.nextPossibleRates)
                                return(self)

    def runGillespie(self):
        """Run ``MonteCarloStep`` until ``self.time`` reaches ``self.maxTime``.

        Returns the final structure as a dot-bracket string.
        """
        # run the gillespie algorithm until we reach maxTime
        while self.time < self.maxTime:
            self.MonteCarloStep()
        return(self.convert2dot(self.currentStructure))

In [22]:
def readFile(file, maxLength = 50):
    """Read a CT (connectivity table) file and return its sequence and dot-bracket string.

    The first whitespace-separated token of the first line is parsed as the
    sequence length. Files whose declared length is ``maxLength`` or more are
    not parsed and yield two empty strings, as does an empty file.

    For every record line, column 0 is the nucleotide index, column 1 the base
    and column 4 the index of the pairing partner (0 for unpaired). Blank
    lines among the records are ignored.

    Parameters
    ----------
    file      : path of the CT file to read.
    maxLength : exclusive upper bound on the declared length; defaults to 50,
                the cutoff that used to be hard-coded.

    Returns
    -------
    (seq, dots) : the bases joined in index order, and one of '(', '.', ')'
                  per parsed nucleotide.

    Raises
    ------
    ValueError : if the length token, an index or a partner column is not an integer.
    IndexError : if the first line is blank or a non-blank record has fewer than five columns.
    """
    record = {}
    with open(file, "r") as f:
        header = f.readline()
        if header and int(header.split()[0]) < maxLength:
            # NOTE(review): the line directly after the header is skipped,
            # exactly as the previous `f.readlines()[1:]` did. With a standard
            # one-line CT header this drops nucleotide 1 -- confirm against the
            # data set and remove this readline() if that is unintended.
            f.readline()
            for line in f:
                fields = line.split()
                if not fields:
                    # a blank (e.g. trailing) line used to raise IndexError
                    continue
                record[int(fields[0])] = [fields[1], int(fields[4])]

    positions = sorted(record)
    seq = "".join(record[n][0] for n in positions)

    symbols = []
    for n in positions:
        partner = record[n][1]
        if partner > n:
            symbols.append('(')    # partner lies downstream: opening bracket
        elif partner == 0:
            symbols.append('.')    # unpaired
        elif partner < n:
            symbols.append(')')    # partner lies upstream: closing bracket
    return seq, "".join(symbols)

def compareStructs(pred, answer):
    """Count the positions at which ``pred`` and ``answer`` differ.

    Only the first ``len(pred)`` positions are examined, and ``answer`` is
    indexed directly, so an ``answer`` shorter than ``pred`` raises IndexError
    once the missing position is reached.
    """
    return sum(1 for pos in range(len(pred)) if pred[pos] != answer[pos])

def str_append(s):
    """Return ``s`` concatenated onto an empty string (i.e. ``s`` itself for a str)."""
    return '' + s

In [23]:
# Collect every file in the CT data-set directory.
# glob.glob returns matches in arbitrary order; sorting makes the file order
# (and therefore sequences[0] and the benchmark iteration numbers) reproducible.
seq_files = sorted(glob.glob('/Users/harrisonlabollita/Library/Mobile Documents/com~apple~CloudDocs/Arizona State University/Sulc group/data_set/ct_files/*'))

In [24]:
# Parse every CT file and keep the sequence, its reference dot-bracket
# structure and its length in three parallel lists.
sequences = []
dotbrackets = []
seqLength = []

for file in seq_files:
    seq, dots = readFile(file)
    # readFile returns an empty sequence for files it does not parse (declared
    # length at or above its cutoff). The previous test `seq != 0` is always
    # true for a string, so those empty entries were kept; test truthiness instead.
    if seq:
        sequences.append(seq)
        dotbrackets.append(dots)
        seqLength.append(len(seq))

120
119
74
111
73
75
77
76
72
76
120
76
71
352
365
71
74
121
76
76
76
73
101
119
119
115
74
76
366
72
72
333
74
73
72
74
72
72
122
75
77
89
118
73
71
72
85
77
126
74
275
294
74
73
72
120
76
366
123
128
76
73
121
118
72
72
73
73
74
73
229
363
270
75
77
75
71
76
72
73
305
73
73
74
73
73
554
75
74
298
73
75
71
310
85
74
267
73
75
359
83
74
73
76
74
76
72
299
74
77
73
72
74
73
75
75
76
72
73
76
75
76
118
73
71
74
83
386
98
83
121
73
76
73
76
74
73
76
74
75
71
71
353
77
513
77
76
73
73
77
72
76
284
73
73
73
74
77
74
73
71
74
73
282
339
77
76
72
76
354
73
73
71
74
72
71
72
76
72
74
331
73
84
77
76
71
71
72
74
340
86
73
73
74
72
78
74
92
84
71
78
99
74
74
75
84
116
71
74
370
73
75
73
91
84
74
76
74
77
93
89
76
376
81
355
74
77
87
74
72
73
77
77
84
123
115
120
77
75
76
74
85
91
72
77
286
122
75
96
120
121
72
123
118
299
77
263
72
73
76
365
73
74
315
81
71
71
75
115
72
83
392
64
75
75
74
267
74
73
73
77
323
119
303
73
77
73
93
74
1497
98
74
75
89
88
71
76
88
76
75
73
93
1542
77
119
293
119
72
7

117
342
77
77
74
71
303
72
467
71
91
76
73
75
74
91
397
73
91
74
73
74
73
90
76
118
451
81
71
72
90
251
74
74
74
327
78
72
78
64
77
120
117
90
76
73
357
73
85
77
310
88
82
121
77
81
74
75
90
72
121
118
124
73
76
77
75
74
91
355
72
72
72
73
73
89
76
74
77
303
376
88
76
89
71
267
77
75
260
73
86
120
118
286
73
72
74
77
354
75
75
357
299
287
75
75
119
420
86
357
74
71
311
120
71
119
96
83
72
270
76
312
88
78
73
126
74
72
77
73
71
119
79
86
87
252
74
392
86
93
284
362
76
76
74
73
72
93
76
75
74
119
76
77
81
75
74
74
90
71
76
72
81
117
74
90
75
73
82
119
74
95
73
299
74
77
74
76
87
84
293
77
74
89
76
75
75
76
76
71
73
75
234
72
77
447
53
77
81
76
73
76
305
76
72
77
368
120
76
100
73
71
294
74
73
72
71
91
76
76
118
73
72
606
423
76
77
73
72
77
72
73
76
73
116
77
73
76
74
77
76
72
73
74
73
74
103
76
74
73
76
73
75
525
75
72
77
76
102
73
349
116
73
71
76
72
77
118
76
77
75
76
139
71
71
72
119
308
363
72
76
74
76
76
339
74
107
308
83
74
74
73
74
120
72
76
76
71
75
74
75
76
387
118
77
76
77
75
8

73
77
75
73
73
120
75
72
120
90
75
76
77
82
75
86
120
74
120
122
72
269
77
120
72
72
302
93
123
77
85
251
70
73
77
77
59
72
76
158
87
119
77
74
117
341
85
350
77
77
74
123
120
122
77
74
350
72
75
76
90
493
74
73
96
89
96
76
76
82
90
72
79
74
75
138
118
279
73
75
72
75
115
87
76
77
99
116
145
318
76
75
82
96
74
73
117
523
120
73
85
73
76
75
75
89
74
74
82
76
76
74
72
352
86
396
76
77
88
229
76
2923
124
72
73
88
72
75
72
73
91
76
122
359
575
74
85
418
77
75
270
77
86
73
100
113
72
100
78
75
75
77
75
73
72
87
89
73
318
119
74
533
87
74
119
387
348
86
70
91
73
98
76
72
77
71
363
76
345
72
85
103
362
73
129
76
82
101
81
118
72
76
78
71
87
116
117
118
366
73
72
129
85
77
121
118
73
76
120
297
75
106
87
122
73
73
76
84
116
76
76
75
73
74
74
84
77
354
77
85
85
76
74
77
91
77
120
74
72
119
74
421
302
77
75
118
73
73
73
72
74
73
302
119
115
74
77
72
72
73
70
84
81
66
307
77
73
73
82
76
72
85
89
75
72
74
119
77
77
121
74
71
311
412
77
448
72
74
89
76
96
73
76
119
567
75
84
76
75
76
78
72
73
77
73

ValueError: invalid literal for int() with base 10: '16s_A.fulgidus_domain1.ct'

In [11]:
# Summary of the loaded benchmark set: number of sequences, then the
# mean, longest and shortest sequence length.
print(len(sequences))
print(np.mean(seqLength))
print(np.max(seqLength))
print(np.min(seqLength))

19
33.94736842105263
41
28


In [14]:
# Sanity check: show the first sequence above its reference dot-bracket structure.
print(sequences[0])
print(dotbrackets[0])

GCCAACCCGGUCAGGUCCGGAAGGAAGCAGCCG
.......((((....(((....)))....))))


In [15]:
# Fold every benchmark sequence once (maxTime = 5, no per-move printing) and
# record the fraction of positions whose dot-bracket symbol differs from the
# reference structure.
total_misses = []
iterations = 0
for i, seq in enumerate(sequences):
    print("Starting iteration:", i + 1)
    G = Gillespie(seq, [], maxTime = 5, toPrint = False)
    structure = G.runGillespie()
    # per-sequence error rate: mismatching positions divided by sequence length
    mistakes = compareStructs(structure, dotbrackets[i]) / len(seq)
    total_misses.append(mistakes)
    iterations = i + 1

Starting iteration: 1
Starting iteration: 2
Starting iteration: 3
Starting iteration: 4
Starting iteration: 5
Starting iteration: 6
Starting iteration: 7
Starting iteration: 8
Starting iteration: 9
Starting iteration: 10
Starting iteration: 11
Starting iteration: 12
Starting iteration: 13
Starting iteration: 14
Starting iteration: 15
Starting iteration: 16
Starting iteration: 17
Starting iteration: 18
Starting iteration: 19


In [16]:
# Number of per-sequence error rates collected by the benchmark loop.
len(total_misses)

19

In [17]:
# Mean fraction of mismatching positions over all benchmark sequences.
print(np.mean(total_misses))

0.41340344039601773


In [18]:
# Best (lowest) and worst (highest) per-sequence mismatch fraction.
print(np.min(total_misses))
print(np.max(total_misses))

0.18181818181818182
0.6341463414634146
