In [1]:
import json

In [21]:
# Read the filtered library records from disk into memory.
with open('filtered_pdb_with_reads.json') as json_file:
    filtered_seqs = json.load(json_file)

In [22]:
# Save all of the dot-bracket structures that MXfold2 generated for the library sequences.
#
# The file is read line by line and each non-blank line is classified as:
#   * a header      - starts with '>'; the text after '>' becomes the record key
#   * a sequence    - made up only of the characters A, U, G and C
#   * a structure   - anything else; only the text before the first space is kept
#                     (presumably the rest is the energy MXfold2 prints - TODO confirm)
# A record is written to mxfold2_info when its structure line is reached.

mxfold2_info = {}
RNA_ALPHABET = set('AUGC')

with open('mxfold2/folded_seqs.fasta','r') as f:
    for line in f:
        line = line.rstrip()
        if not line:
            # A blank line carries no data; without this guard it would be
            # treated as an empty structure and overwrite the previous record.
            continue
        if line.startswith('>'):
            key = line[1:]
        elif set(line) <= RNA_ALPHABET:
            # Subset test rather than equality: a sequence that happens to lack
            # one of the four bases (e.g. no G) is still a sequence.
            seq = line
        else:
            struct = line.split(' ')[0]
            mxfold2_info[key] = {
                'sequence':seq,
                'mxfold2_structure':struct}

In [23]:
def dbParser(struct):
    """Turn a dot-bracket structure into an edge list of base pairs.

    The structure is assumed to be properly nested (a context-free grammar):
    only the characters '(', ')' and '.' are accepted, and every ')' pairs
    with the most recently opened, still unpaired '('.

    Parameters
    ----------
    struct : str
        Dot-bracket string, one character per nucleotide.

    Returns
    -------
    list of tuple
        One ``(open_index, close_index)`` tuple per base pair, using 0-based
        positions in ``struct``, listed in the order the pairs are closed.

    Raises
    ------
    ValueError
        If ``struct`` contains a character other than '(', ')' or '.', a ')'
        with no matching '(', or a '(' that is never closed.
    """

    # positions of '(' that have not been paired yet, and the output edge list
    helix_stack = []
    edge_list = []

    # iterate over the nucleotides in the structure
    for i, x in enumerate(struct):
        if x == '(':
            # open a helix
            helix_stack.append(i)
        elif x == '.':
            # unpaired region, don't update helix
            continue
        elif x == ')':
            # close a helical base pair
            if not helix_stack:
                raise ValueError(
                    "unbalanced structure: ')' at position %d has no matching '('" % i)

            # pop the last opened position from the stack and store the base pair
            edge_list.append((helix_stack.pop(), i))
        else:
            raise ValueError(
                "unexpected character %r at position %d in dot-bracket structure" % (x, i))

    if helix_stack:
        # Leftover openings would otherwise be dropped silently, giving an
        # incomplete edge list.
        raise ValueError(
            "unbalanced structure: '(' at position %d is never closed" % helix_stack[-1])

    return edge_list

In [24]:
# Attach an edge list, derived from the dot-bracket structure, to every library record.

for record in mxfold2_info.values():
    record['mxfold2_edge_list'] = dbParser(record['mxfold2_structure'])

In [25]:
# Write the collected records (sequence, structure, edge list) out as JSON.
with open('mxfold2_results.json','w') as out_file:
    json.dump(mxfold2_info, out_file)