In [1]:
import RNA

In [3]:
import RNA
import os
# import viennarna

def dna_to_rna(dna_sequence):
    rna_sequence = dna_sequence.upper().replace("T", "U")
    return rna_sequence

def predict_and_plot_rna(sequence, output_file="./rna_structure.png"):
    if not sequence or any(base not in "ACGU" for base in sequence.upper()):
        sequence = dna_to_rna(sequence)
        # raise ValueError("Invalid RNA sequence. Please use only A, C, G, and U.")
    
    fc = RNA.fold_compound(sequence)
    structure, mfe = fc.mfe()
    accessibility = fc.bpp() 
    print(accessibility)

    print(f"Predicted Structure: {structure}")
    print(f"Minimum Free Energy (MFE): {mfe} kcal/mol")

    RNA.PS_rna_plot(sequence, structure, "./rna_structure.ps")
    for i, prob in enumerate(accessibility):
        print(f"Position {i+1}: Probability of accessibility = {prob:.4f}")

    try:
        os.system(f"gs -dSAFER -dBATCH -dNOPAUSE -sDEVICE=png16m -r300 -sOutputFile={output_file} ./rna_structure.ps")
        print(f"Structure plot saved as {output_file}")
    except Exception as e:
        print(f"Error during file conversion: {e}")
    finally:
        if os.path.exists("rna_structure.ps"):
            os.remove("rna_structure.ps")

rna_sequence = "AGGAGGCGGGAGGCCGAGGCAGCAAAGCTG"
predict_and_plot_rna(rna_sequence)

()
Predicted Structure: ....(((.....)))....((((...))))
Minimum Free Energy (MFE): -7.199999809265137 kcal/mol
GPL Ghostscript 9.26 (2018-11-20)
Copyright (C) 2018 Artifex Software, Inc.  All rights reserved.
This software comes with NO WARRANTY: see the file PUBLIC for details.
Loading NimbusSans-Regular font from /usr/share/ghostscript/9.26/Resource/Font/NimbusSans-Regular... 4506700 2929098 1997272 697443 2 done.
Structure plot saved as ./rna_structure.png


In [4]:

def process_fasta_and_predict(file_path):
    """Read sequences from a FASTA file and process each."""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    with open(file_path, "r") as fasta_file:
        sequences = {}
        current_header = None
        current_sequence = []

        # Parse the FASTA file
        for line in fasta_file:
            line = line.strip()
            if line.startswith(">"):  # Header line
                if current_header:
                    # Store the previous sequence
                    sequences[current_header] = "".join(current_sequence)
                current_header = line[1:]  # Remove ">" from header
                current_sequence = []
            else:
                current_sequence.append(line)
        
        # Add the last sequence
        if current_header:
            sequences[current_header] = "".join(current_sequence)

    # Process each sequence
    for header, sequence in sequences.items():
        print(f"\nProcessing: {header}")
        output_file = f"./images/{header.replace(' ', '_')}_structure.png"
        predict_and_plot_rna(sequence, output_file)

# Example usage
fasta_file_path = "/data6/sobhan/RLLM/notebooks/secondary/rnas.fasta"
process_fasta_and_predict(fasta_file_path)


Processing: Binding Natural RNA 1
()
Predicted Structure: ......(((.((((.(((..(((.((.......)).)))..))))))))))...........
Minimum Free Energy (MFE): -21.299999237060547 kcal/mol
GPL Ghostscript 9.26 (2018-11-20)
Copyright (C) 2018 Artifex Software, Inc.  All rights reserved.
This software comes with NO WARRANTY: see the file PUBLIC for details.
Loading NimbusSans-Regular font from /usr/share/ghostscript/9.26/Resource/Font/NimbusSans-Regular... 4506700 2929187 1997272 700276 2 done.
Structure plot saved as ./images/Binding_Natural_RNA_1_structure.png

Processing: Natural RNA 1
()
Predicted Structure: .............((..((((.((((((((((........))))))))))..)).)).))..
Minimum Free Energy (MFE): -23.200000762939453 kcal/mol
GPL Ghostscript 9.26 (2018-11-20)
Copyright (C) 2018 Artifex Software, Inc.  All rights reserved.
This software comes with NO WARRANTY: see the file PUBLIC for details.
Loading NimbusSans-Regular font from /usr/share/ghostscript/9.26/Resource/Font/NimbusSans-Regular... 4506

In [5]:
import json
import RNA

def classify_rna_structure(sequence, structure, weights):
    """
    Classify each nucleotide in an RNA sequence into structural elements
    such as stem, hairpin loop, bulge, internal loop, multiloop, and external region.

    Args:
        sequence (str): RNA sequence.
        structure (str): Dot-bracket notation of RNA structure.
        weights (list): List of weights for each nucleotide.

    Returns:
        list of dict: Each dictionary contains nucleotide classification.
    """
    stack = []
    paired_positions = {}
    elements = ["External"] * len(sequence)  # Initialize all as external

    # Identify base pairs
    for i, char in enumerate(structure):
        if char == '(':
            stack.append(i)
        elif char == ')':
            if stack:
                j = stack.pop()
                paired_positions[i] = j
                paired_positions[j] = i

    # Identify stems
    for i, char in enumerate(structure):
        if char in '()':
            elements[i] = "Stem"

    # Identify loops and unpaired regions
    i = 0
    while i < len(sequence):
        if structure[i] == '.':
            start = i
            while i < len(sequence) and structure[i] == '.':
                i += 1
            end = i

            # Determine if it's a hairpin loop, bulge, internal loop, or external
            enclosing_pairs = [
                paired_positions.get(start - 1),
                paired_positions.get(end)
            ]
            if enclosing_pairs[0] is not None and enclosing_pairs[1] is not None:
                if abs(enclosing_pairs[0] - enclosing_pairs[1]) == 1:
                    for j in range(start, end):
                        elements[j] = "Hairpin Loop"
                else:
                    for j in range(start, end):
                        elements[j] = "Internal Loop"
            elif enclosing_pairs[0] is not None or enclosing_pairs[1] is not None:
                for j in range(start, end):
                    elements[j] = "Bulge"
            else:
                for j in range(start, end):
                    elements[j] = "External"
        else:
            i += 1

    # Identify multiloops
    for i in range(len(sequence)):
        if elements[i] == "Stem":
            paired = paired_positions[i]
            left_unpaired = (i > 0 and elements[i - 1] not in {"Stem", "External"})
            right_unpaired = (i < len(sequence) - 1 and elements[i + 1] not in {"Stem", "External"})
            if left_unpaired or right_unpaired:
                elements[i] = "Multiloop"

    # Create result
    result = []
    for i, char in enumerate(sequence):
        result.append({
            "nucleotide": char,
            "position": i + 1,
            "element": elements[i],
            "weight": weights[i]
        })
    
    return result

def process_json_and_write_output_with_structure(json_file, output_file):
    """
    Process the RNA JSON file, predict secondary structure, classify structural elements,
    and write annotated nucleotide information to a text file.

    Args:
        json_file (str): Path to the JSON file containing RNA data.
        output_file (str): Path to the output text file.
    """
    with open(json_file, 'r') as f:
        data = json.load(f)
    
    with open(output_file, 'w') as f_out:
        for prediction in data["predictions"]:
            sequence = prediction["sequence"]
            weights = prediction["weights"]
            rna_id = prediction["id"]
            score = prediction["score"]
            
            # Predict RNA structure
            fc = RNA.fold_compound(sequence)
            structure, mfe = fc.mfe()
            
            # Classify structural elements
            annotations = classify_rna_structure(sequence, structure, weights)
            
            # Write to output file
            f_out.write(f"RNA ID: {rna_id}\n")
            f_out.write(f"Sequence: {sequence}\n")
            f_out.write(f"Predicted Structure: {structure}\n")
            f_out.write(f"Score: {score}\n")
            f_out.write(f"Minimum Free Energy (MFE): {mfe} kcal/mol\n")
            f_out.write(f"Annotations:\n")
            for annotation in annotations:
                f_out.write(f"  Position: {annotation['position']}, "
                            f"Nucleotide: {annotation['nucleotide']}, "
                            f"Element: {annotation['element']}, "
                            f"Weight: {annotation['weight']:.4f}\n")
            f_out.write("\n" + "="*50 + "\n\n")

# Example Usage
json_file = "/data6/sobhan/RLLM/results/validation/RBM5v4/RBM5.json"  # Replace with your JSON file path
output_file = "./rna_annotations.txt"
process_json_and_write_output_with_structure(json_file, output_file)