In [100]:
import json
import requests

import re

from tqdm.notebook import tqdm

from Bio import Align, SeqIO

In [3]:
# Load the filtered library entries; later cells iterate this dict and read
# each entry's 'pdb' and 'sequence' fields.
with open('filtered_pdb_with_reads.json','r') as f:
    filtered_seqs = json.loads(f.read())

In [5]:
# Download the FASTA files for all of the sequences
#
# missing_seqs collects (pdb id, library entry) for every entry whose download failed.
# pdb_set holds the pdb ids that have already been fetched successfully, so that
# several library entries pointing at the same PDB entry trigger only one request.

missing_seqs = []
pdb_set = set()

for k,v in tqdm(filtered_seqs.items(),total=len(filtered_seqs)):
    pdb_id = v['pdb']

    # skip PDB entries whose FASTA file was already written by an earlier iteration
    if pdb_id in pdb_set:
        continue

    r = requests.get(f"https://www.rcsb.org/fasta/entry/{pdb_id}")
    if r.status_code == requests.codes.ok:
        with open(f"data/fasta/{pdb_id}.fasta", 'w') as f:
            f.write(r.text)
        # record the id only after a successful write; failed ids are retried
        # (and recorded in missing_seqs) for every entry that references them
        pdb_set.add(pdb_id)
    else:
        missing_seqs.append((pdb_id,v))

  0%|          | 0/1287 [00:00<?, ?it/s]

In [13]:
# Remove extraneous sequence from the library
# this particular sequence had a "pdb-like" id 88F8 and was incorrectly kept in during filtering

extraneous_key = 'design_name:88F8: Deinococcus radiodurans R1 chromosome 1'

# guard the deletion so that re-running this cell does not raise KeyError
if extraneous_key in filtered_seqs:
    del filtered_seqs[extraneous_key]

In [272]:
def getLongestMatch(str1,str2):
    """
    Get the indexes of the longest contiguous (gap-free) aligned block between str1 and str2.

    The two sequences are locally aligned with Biopython's PairwiseAligner
    (match +2, mismatch -1, gap -1). The first alignment returned is split into
    its gap-free blocks and the longest block is selected; when several blocks
    share the maximum length, the last one in alignment order is returned.

    Parameters
    ----------
    str1, str2 : str
        Sequences to align. The returned indexes refer to str1 (the first
        argument passed to the aligner).

    Returns
    -------
    tuple
        (block, length), where block is the (start, end) pair of the longest
        gap-free aligned block and length is abs(end - start).
        (None, 0) is returned when either input is empty or nothing aligns.
    """

    # PairwiseAligner raises ValueError on zero-length sequences; report "no alignment" instead
    if len(str1) == 0 or len(str2) == 0:
        return (None,0)

    # Set up aligner class with gap penalties to prefer contiguous sequences
    aligner = Align.PairwiseAligner()
    aligner.mode = 'local'

    aligner.match_score = 2
    aligner.gap_score = -1
    aligner.mismatch_score = -1

    # align the two sequences
    alignments = aligner.align(str1,str2)

    # identify if there is an alignment (return no alignment if not)
    try:
        best_alignment = alignments[0]
    except IndexError:
        return (None,0)

    # gap-free blocks of the alignment, expressed as (start, end) pairs on str1
    blocks = list(best_alignment.aligned[0])
    if not blocks:
        return (None,0)

    def block_length(block):
        return abs(block[1]-block[0])

    # max() returns the first maximal element it sees; iterating in reverse
    # therefore selects the last of several equally long blocks
    max_len_contig = max(reversed(blocks),key=block_length)

    # return the indices corresponding to the longest contiguous alignment and its length
    return (max_len_contig,block_length(max_len_contig))

In [273]:
# For every library sequence, scan the chains of its PDB FASTA file and keep the
# chain that shares the longest contiguous aligned block with the library sequence.

for seq_name,entry in tqdm(filtered_seqs.items(),total=len(filtered_seqs)):
    # best candidate so far: chain description, block indexes, block length
    best_chain = None
    best_idxs = None
    best_len = 0

    with open(f"data/fasta/{entry['pdb']}.fasta",'r') as fasta_handle:
        # one FASTA record per chain (or group of chains) in the PDB entry
        for record in SeqIO.parse(fasta_handle,'fasta'):
            contig_idxs,contig_len = getLongestMatch(entry['sequence'],str(record.seq))

            # strictly longer only: on ties the earliest record is kept
            if contig_len > best_len:
                best_chain = record.description
                best_idxs = contig_idxs
                best_len = contig_len

    # store the winning chain on the library entry (both stay None if nothing aligned)
    filtered_seqs[seq_name].update({'chain':best_chain,
                                    'fasta_idxs':best_idxs})

  0%|          | 0/1286 [00:00<?, ?it/s]

In [301]:
# store the edge index for the base pairs stored in the FR3D database
#
# For every library entry, read data/FR3D/<pdb>.csv, pick out the lines that
# mention two different residues of the matched chain, and store the pair as an
# edge (offset + residue number - 1) in FR3D_pairs[<entry key>]['FR3D_edge_list'].

FR3D_pairs = {}

for k,v in tqdm(filtered_seqs.items(),total=len(filtered_seqs)):
    # the chain-matching cell stores None for both fields when no chain aligned;
    # such entries cannot be mapped onto a chain, so they are skipped
    if v['chain'] is None or v['fasta_idxs'] is None:
        continue

    edge_list = []

    # prefer the "[auth X]" chain id in the FASTA description, else fall back to "Chain(s) X"
    chain_ids = (re.findall(r'\[auth ([A-Za-z0-9]+)\]',v['chain'])
                 or re.findall(r'Chain[s]* ([A-Za-z0-9]+)',v['chain']))
    chain_id = chain_ids[0]

    # start index of the longest aligned block found by the chain-matching cell
    # NOTE(review): used as the shift from residue numbers to sequence indexes — confirm
    offset = int(v['fasta_idxs'][0])

    # residue identifiers look like <pdb>|1|<chain>|<base>|<number>; compile once per entry
    node_pattern = re.compile(
        rf"{re.escape(v['pdb'])}\|1\|{re.escape(chain_id)}\|([AUGC])\|(\d+)")

    with open(f"data/FR3D/{v['pdb']}.csv",'r') as f:
        # residue number -> base for every residue of the chain seen in the file
        # (not read again in this cell; kept in case another cell inspects it)
        fred_sequence = {}
        for line in f:
            line = line.rstrip().replace('"','')
            nodes = node_pattern.findall(line)
            for node in nodes:
                fred_sequence[int(node[1])] = node[0]

            # an edge needs exactly two distinct residues of this chain
            if len(nodes) != 2 or nodes[0][1] == nodes[1][1]:
                continue

            words = line.split(',')
            # keep the pair only if at least one of columns 1, 3, 4 is non-empty
            # (presumably the interaction annotations — confirm against the FR3D CSV layout)
            if len(words[1]+words[3]+words[4]) > 0:
                edge_list.append((offset+int(nodes[0][1])-1,offset+int(nodes[1][1])-1))

    FR3D_pairs[k] = {
        'sequence':v['sequence'],
        'FR3D_edge_list':edge_list}

  0%|          | 0/1286 [00:00<?, ?it/s]

In [None]:
def getLongestSubsequence(l):
    """
    Placeholder, not implemented yet (TODO).

    Currently ignores its argument and returns None.
    """
    return None

In [241]:
# Serialize the collected FR3D edge lists and write them to disk as JSON
with open('FR3D_annotations.json','w') as f:
    f.write(json.dumps(FR3D_pairs))