# Download RNA hairpin and internal loops from [RNA 3D Motif Atlas](http://rna.bgsu.edu/rna3dhub/motifs)

In [1]:
import os, sys, shutil
import pathlib
import glob as glob
import numpy as np
import json
import wget
import warnings

In [2]:
def make_dir(output_path):
    """
    Create a fresh, empty directory at ``output_path``.

    If the directory already exists it is deleted together with all of its
    contents (a notice is printed first) and then recreated. Missing parent
    directories are created as needed.

    Parameters
    ----------
    output_path : str
        Directory to (re)create

    Returns
    -------
    """

    if os.path.isdir(output_path):
        # Wipe the previous contents so nothing from an earlier run survives.
        notice = ">remove directory: {}"
        print(notice.format(output_path))
        shutil.rmtree(output_path)

    fresh_dir = pathlib.Path(output_path)
    fresh_dir.mkdir(parents=True, exist_ok=True)

In [3]:
def loadfile(file):
    """
    Load json file and get motif IDs

    The json file is expected to hold a sequence of records, each with a
    "motif_id" key. The IDs are returned in file order, duplicates included.
    If an ID occurs more than once a warning naming the duplicated IDs is
    issued; the returned list is not altered.

    Parameters
    ----------
    file : str
        Json file in full path

    Returns
    -------
    motif_ids : list of str
        List of motif id

    Raises
    ------
    KeyError
        If a record has no "motif_id" key
    """

    with open(file, 'r') as f:
        records = json.load(f)

    motif_ids = [record["motif_id"] for record in records]

    # Comparing len(motif_ids) with len(records) can never fail because the
    # list is built one-to-one from the records, so duplicates are detected
    # explicitly here instead.
    seen = set()
    duplicates = set()
    for motif_id in motif_ids:
        if motif_id in seen:
            duplicates.add(motif_id)
        seen.add(motif_id)

    if duplicates:
        # Warn rather than raise: the caller still gets every row of the file.
        warnings.warn(
            "Duplicate motif id found in {}: {}".format(
                file, ", ".join(sorted(map(str, duplicates)))
            )
        )

    return motif_ids

In [4]:
def download_cif(output_path, release_version, motif_ids, url):
    """
    Download RNA coordinates using BGSU APIs (https://www.bgsu.edu/research/rna/APIs.html)

    Each motif is downloaded to ``<output_path>/dump/<motif_id>.cif`` and then
    passed to ``extract_model``. A failure for one motif is reported as a
    warning and the loop moves on to the next motif.

    Parameters
    ----------
    output_path : str
        Output directory. It must already contain a "dump" sub-directory,
        which receives the raw downloaded cif files.
    release_version : str
        Release version of RNA 3D Motif Atlas (e.g. HairpinLoopMotifAtlasRelease3.57)
    motif_ids : list of str
        Motif id (e.g. HL_6SVS_002)
    url : str
        API url to download RNA coordinates. The motif id is appended to it.

    Returns
    -------
    """

    for motif_id in motif_ids:
        _url = url + motif_id
        cif = os.path.join(output_path, "dump", motif_id + ".cif")

        # check duplicate motif entry
        if os.path.exists(cif):
            warnings.warn("{} already exists. Duplicate motif entry.".format(motif_id))

        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit can still stop a long download loop.
        try:
            wget.download(_url, out=cif, bar=None)
        except Exception as err:
            warnings.warn("Could not download {}\n{}\n{}".format(motif_id, _url, err))
            continue

        # Handled separately so an extraction problem is not misreported as a
        # download failure.
        try:
            extract_model(output_path, release_version, cif)
        except Exception as err:
            warnings.warn("Could not extract model from {}\n{}".format(cif, err))

In [5]:
def extract_model(output_path, release_version, cif):
    """
    Extract the first model from the cif file. The downloaded cif file contains two models. Model 1 is the RNA motif of interest and model 2 is the neighboring structures of model 1.

    Lines are copied from the start of the file up to, but not including, the
    second line whose first non-whitespace character is "#". The result is
    written to ``output_path`` under the same file name as ``cif``.

    Parameters
    ----------
    output_path : str
        Directory the extracted cif file is written to
    release_version : str
        Release version of RNA 3D Motif Atlas (e.g. HairpinLoopMotifAtlasRelease3.57).
        Not used by this function.
    cif : str
        Downloaded cif file in full path

    Returns
    -------
    """
    kept_lines = []
    marker_count = 0
    with open(cif, "r") as f:
        for line in f:
            # lstrip() rather than split()[0]: blank or whitespace-only lines
            # have no first token and would otherwise raise IndexError.
            if line.lstrip().startswith("#"):
                marker_count += 1
                if marker_count == 2:
                    # Second "#" line reached: stop before copying it.
                    break

            kept_lines.append(line)

    ofile = os.path.join(output_path, os.path.basename(cif))
    with open(ofile, "w") as wf:
        wf.writelines(kept_lines)

In [8]:
if __name__ == "__main__":

    # Download the coordinates of every motif listed in the release json files
    # found under <base_path>/data into <base_path>/data/<release_version>.
    url = "http://rna.bgsu.edu/rna3dhub/rest/getCoordinates?coord="
    filenames = [ "HairpinLoopMotifAtlasRelease3.61.json", "InternalLoopMotifAtlasRelease3.61.json" ]

    # The base path is the current working directory, or its parent when the
    # working directory is named "notebooks". The previous
    # str.strip('notebooks') removed any of those *characters* from both ends
    # of the path, which mangles directories such as ".../test".
    cwd = os.getcwd()
    if os.path.basename(cwd) == "notebooks":
        base_path = os.path.dirname(cwd)
    else:
        base_path = cwd

    # download cif for each motif
    for filename in filenames:
        release_version = os.path.splitext(filename)[0]
        print(release_version)

        # create output directory
        output_path = os.path.join(base_path, "data", release_version)
        make_dir(output_path)
        make_dir(os.path.join(output_path, "dump"))

        file = os.path.join(base_path, "data", filename)
        motif_ids = loadfile(file)
        # Pass the release name, not the json path, as release_version.
        download_cif(output_path, release_version, motif_ids, url)

HairpinLoopMotifAtlasRelease3.61
InternalLoopMotifAtlasRelease3.61


#### redo if download was unsuccessful