In [2]:
import os
import subprocess
import numpy as np
import re
import tempfile
import lib_forgi

def predict_rna_structure_linearpartition(rna_sequence, linearpartition_path):
    """
    Predict RNA secondary structure using the LinearPartition tool and return structure annotations.

    Parameters:
    rna_sequence (str): Input RNA sequence.
    linearpartition_path (str): Path to the LinearPartition executable.

    Returns:
    dict: Structure annotations with nucleotide positions labeled.
    """
    # Temporary FASTA file to hold the RNA sequence
    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".fasta") as fasta_file:
        fasta_file.write(f">seq\n{rna_sequence}\n")
        fasta_filepath = fasta_file.name

    try:
        # Run LinearPartition to get the structure
        command = f'cat {fasta_filepath} | {linearpartition_path} -m'
        output = subprocess.run(command, shell=True, stdout=subprocess.PIPE, check=True)

        # Decode output and parse dot-bracket notation
        dot_bracket_structure = output.stdout.decode().strip().split('\n')[-1]

        # Use lib_forgi to annotate structure features
        bg = lib_forgi.BulgeGraph()
        bg.from_dotbracket(dot_bracket_structure, None)
        forgi_structure = bg.to_bg_string()

        # Structure annotation lookup
        entity_lookup = {'f': 'dangling start', 't': 'dangling end', 'i': 'internal loop',
                         'h': 'hairpin loop', 'm': 'multi loop', 's': 'stem'}

        annotations = ['unstructured'] * len(rna_sequence)

        def make_node_set(numbers):
            numbers = list(map(int, numbers))
            ans = set()
            while len(numbers) > 1:
                a, b = numbers[:2]
                numbers = numbers[2:]
                for n in range(a - 1, b):
                    ans.add(n)
            return ans

        for line in forgi_structure.split('\n'):
            if line.startswith('define'):
                parts = line.split()
                structure_type = entity_lookup.get(parts[1][0], 'unstructured')
                for n in make_node_set(parts[2:]):
                    annotations[n] = structure_type

        return dict(enumerate(annotations))

    finally:
        os.remove(fasta_filepath)


In [6]:
def get_struct_annotation_viennaRNA(rna_sequence: str, path_to_rnafold: str = "RNAfold") -> list:
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as tmp_fasta:
        tmp_fasta_name = tmp_fasta.name
        tmp_fasta.write(f">test_sequence\n{rna_sequence}\n")
    
    # 2. Run RNAfold (ViennaRNA). The output typically has lines:
    #    sequence, then dot-bracket + energy like "....((..))... (-7.4)"
    command = f"cat {tmp_fasta_name} | {path_to_rnafold}"
    result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
    
    # 3. Parse the output to find dot-bracket structure
    stdout_str = result.stdout.decode('utf-8').strip().split('\n')
    # Example lines:
    # >test_sequence
    # ACGUGAAGGCUUCGAGGCUU
    # ....((..))...((..)) (-3.20)
    dot_bracket = None
    for line in stdout_str:
        line = line.strip()
        # This line usually ends with an energy in parentheses, e.g. "....(...) (-2.30)"
        # We'll extract the portion before the space
        match = re.match(r"([\.\(\)]+)\s+\(.*\)", line)
        if match:
            dot_bracket = match.group(1)
            break
    
    # 4. If found, label using forgi
    if dot_bracket is not None:
        labels = parse_dot_bracket_to_labels(dot_bracket)
    else:
        labels = [" "] * len(rna_sequence)
    
    # 5. Clean up
    try:
        os.remove(tmp_fasta_name)
    except:
        pass
    
    return labels


In [14]:
import re
import numpy as np
import lib_forgi  # imported as "import forgi" if installed as "forgi"
from lib_forgi import BulgeGraph

# Map the forgi single-letter annotation to F, T, I, H, M, S
# forgi annotation -> Our labels
# f = 'dangling start'
# t = 'dangling end'
# i = 'internal loop'
# h = 'hairpin loop'
# m = 'multi loop'
# s = 'stem'

ENTITY_LOOKUP = {
    'f': 0,  # dangling start
    't': 1,  # dangling end
    'i': 2,  # internal loop
    'h': 3,  # hairpin loop
    'm': 4,  # multi loop
    's': 5   # stem
}
LABELS = ['F', 'T', 'I', 'H', 'M', 'S']

def parse_dot_bracket_to_labels(dot_bracket: str):
    """
    Given a dot-bracket string, parse it using forgi to label each nucleotide 
    with one of [F, T, I, H, M, S].
    
    Returns:
        labels: a list of length len(dot_bracket), where each element is 
                one of ['F', 'T', 'I', 'H', 'M', 'S'].
    """
    # Initialize BulgeGraph
    bg = BulgeGraph()
    bg.from_dotbracket(dot_bracket, None)

    # We create a 2D array [6, length_of_seq] = 0
    # For each classification, we set 1 if it belongs to that class
    num_positions = len(dot_bracket)
    structure_matrix = np.zeros((6, num_positions), dtype=int)

    # The bg.to_bg_string() will produce lines like:
    # define f ... ...
    # define i ... ...
    # etc.
    for line in bg.to_bg_string().split('\n'):
        line = line.strip()
        # Example line: "define f 1 3" => nucleotides 1..3 are 'dangling start'
        if line.startswith('define'):
            parts = line.split()
            entity = parts[1][0]  # e.g. 'f'
            entity_index = ENTITY_LOOKUP.get(entity, None)
            if entity_index is not None:
                # The remaining parts are start/end indexes
                # parts might look like ['define', 'f', '2', '5']
                # these indexes are 1-based in forgi, so we convert to 0-based
                start_idx = int(parts[2]) - 1
                end_idx   = int(parts[3]) - 1
                for i in range(start_idx, end_idx + 1):
                    structure_matrix[entity_index, i] = 1

    # Convert the structure matrix to single-label annotation.
    # If a position belongs to multiple categories, we'll pick the first.
    labels_per_nucleotide = []
    for pos in range(num_positions):
        # Find which row is 1
        row_indices = np.where(structure_matrix[:, pos] == 1)[0]
        if len(row_indices) > 0:
            # Take the first annotation found
            label_index = row_indices[0]
            labels_per_nucleotide.append(LABELS[label_index])
        else:
            # If no annotation, we can label as something, or default to 'S' or ' '
            labels_per_nucleotide.append(' ')  # or 'X'

    return labels_per_nucleotide


In [23]:
test_seq = "ACGUGAAGGCUUCCCCGAGGCUU"
v_labels = get_struct_annotation_viennaRNA(test_seq, "RNAfold")
print("ViennaRNA Labels:", v_labels)

ViennaRNA Labels: ['F', 'F', 'F', 'F', 'F', 'F', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'H', 'H', 'H', ' ', ' ', ' ', ' ', ' ', ' ', ' ']


In [5]:
rna_seq = "GCGGAUUUAGCUCAGUUGGAGAGCGCC"

# Example usage of LinearPartition
# linearpartition_annotations = predict_rna_structure_linearpartition(rna_seq, "/path/to/linearpartition")
# print("LinearPartition Structure Annotations:", linearpartition_annotations)

# Example usage of ViennaRNA
viennarna_annotations = predict_rna_structure_viennarna(rna_seq)
print("ViennaRNA Structure Annotations:", viennarna_annotations)


Error in ViennaRNA execution: 'bytes' object has no attribute 'encode'
ViennaRNA Structure Annotations: None
