In [1]:
import pandas as pd
import re

In [3]:
# Load the input file into memory as a list of lines
# (each entry keeps its line terminator).
file_path = "../data_ensembl/21-1-46709983.txt"
# Alternative input:
# file_path = "../data_ensembl/test.txt"
with open(file_path, mode="r") as f:
    lines = list(f)

In [5]:
# Function to extract transitions
def extract_transitions(gen_id, start, end, sequence, chromosome, global_start, global_end, strand, exons, transcript_count):
    """Collect the sequence windows around the exon boundaries of one transcript.

    Every produced row has the layout
    ``[gen_id, chromosome, global_start, position, base_1, base_2, ...]``
    where ``position`` is the exon coordinate the window is anchored on and
    the bases are the individual characters of the window.

    Windows are plain slices of ``sequence``; near either end of the sequence
    they are truncated, so a row can hold fewer bases than the nominal width.

    Parameters
    ----------
    gen_id, chromosome, global_start
        Copied verbatim into the first three columns of every row.
    start, end, global_end, strand
        Accepted for interface compatibility; not used by this function.
    sequence : str
        Sequence that the exon coordinates index into. The splice-signal
        checks compare against lowercase "gt" / "ag" and are case-sensitive.
    exons : sequence of (start, end) pairs
        Exon coordinates, used in the order given (``exons[0]`` is treated as
        the first exon and ``exons[-1]`` as the last).
    transcript_count : int
        When 0, nothing is extracted.

    Returns
    -------
    tuple of four lists ``(data_ei, data_ie, data_ze, data_ez)``
        data_ei : exon -> intron rows, one per consecutive exon pair whose
            intron starts with "gt"; up to 5 bases before the intron start
            plus up to 7 bases from it; anchored on the exon end.
        data_ie : intron -> exon rows, one per consecutive exon pair whose
            intron ends with "ag"; up to 100 bases before the last intron
            base plus up to 5 bases from it; anchored on the exon start.
        data_ze : a single row with up to 500 bases before the first exon
            start plus up to 50 bases from it.
        data_ez : a single row with up to 50 bases before the last exon end
            plus up to 500 bases from it.
        All four lists are empty when ``transcript_count`` is 0 or ``exons``
        is empty.
    """
    data_ei = []
    data_ie = []
    data_ze = []
    data_ez = []

    # Nothing to extract without transcripts. An empty exon list is handled
    # here too, because exons[0] / exons[-1] below would raise IndexError.
    if transcript_count == 0 or not exons:
        return data_ei, data_ie, data_ze, data_ez

    # Walk consecutive exon pairs once; the intron between them yields at most
    # one exon->intron row and one intron->exon row.
    for upstream_exon, downstream_exon in zip(exons, exons[1:]):
        # Exon -> Intron: the intron begins right after the upstream exon end.
        donor_exon_end = upstream_exon[1]
        intron_start = donor_exon_end + 1
        if intron_start + 1 < len(sequence) and sequence[intron_start:intron_start + 2] == "gt":
            window = sequence[max(0, intron_start - 5):intron_start] + sequence[intron_start:intron_start + 7]
            data_ei.append([gen_id, chromosome, global_start, donor_exon_end, *window])

        # Intron -> Exon: the intron ends right before the downstream exon start.
        acceptor_exon_start = downstream_exon[0]
        intron_end = acceptor_exon_start - 1
        # The lower-bound check keeps the slice from wrapping to negative indices.
        if intron_end - 1 >= 0 and sequence[intron_end - 1:intron_end + 1] == "ag":
            window = sequence[max(0, intron_end - 100):intron_end] + sequence[intron_end:intron_end + 5]
            data_ie.append([gen_id, chromosome, global_start, acceptor_exon_start, *window])

    # Intergenic Zone -> First Exon
    first_exon_start = exons[0][0]
    upstream = sequence[max(0, first_exon_start - 500):first_exon_start]
    downstream = sequence[first_exon_start:first_exon_start + 50]
    data_ze.append([gen_id, chromosome, global_start, first_exon_start, *(upstream + downstream)])

    # Last Exon -> Intergenic Zone
    # NOTE(review): the exon->intron check above treats the exon end as the
    # last exon base (intron starts at end + 1), while this window is split at
    # the exon end itself - confirm that is the intended anchoring.
    last_exon_end = exons[-1][1]
    upstream = sequence[max(0, last_exon_end - 50):last_exon_end]
    downstream = sequence[last_exon_end:last_exon_end + 500]
    data_ez.append([gen_id, chromosome, global_start, last_exon_end, *(upstream + downstream)])

    return data_ei, data_ie, data_ze, data_ez

In [6]:
# Accumulators for the rows of each transition type; every one starts out as
# its own empty list and is filled while the input lines are processed.
all_data_ei = []
all_data_ie = []
all_data_ze = []
all_data_ez = []

In [7]:
# Process each line
#
# A gene line is followed by zero or more transcript lines. Each transcript
# line lists [start,end] exon pairs and ends with a single-number bracket that
# is passed to extract_transitions as transcript_count.
#
# Note: this cell appends to the all_data_* accumulators, so running it again
# without re-running the cell that initialises them duplicates every row.
index = 0
gene_regex = re.compile(r"\(\[(.*?)\],\[(\d+)\],\[(\d+)\],\[(.*?)\],\[(\d+)\],\[(\d+)\],\[(\d+)\],(true|false)\)")
transcript_regex = re.compile(r"^\(\[(\d+,\d+)\](,\[(\d+,\d+)\])*,\[(\d+)\]\)$")
exon_regex = re.compile(r"\[(\d+),(\d+)\]")
count_regex = re.compile(r"\[(\d+)\]\)$")

while index < len(lines):
    line = lines[index].strip()
    if line.startswith("("):
        # Extract gene information; lines that do not match the gene layout
        # (including stray transcript lines) are skipped.
        match = gene_regex.match(line)
        if match:
            gen_id, start, end, sequence, chromosome, global_start, global_end, strand = match.groups()
            start, end, chromosome, global_start, global_end = map(int, [start, end, chromosome, global_start, global_end])
            strand = (strand == "true")

            # Accumulate all transcript lines that directly follow the gene
            # line. Each exon set keeps the count read from its own line, so
            # one transcript's count never decides the fate of another's.
            exons_list = []
            transcript_counts = []

            while index + 1 < len(lines) and transcript_regex.match(lines[index + 1].strip()):
                index += 1
                trans_line = lines[index].strip()
                exons = [(int(s), int(e)) for s, e in exon_regex.findall(trans_line)]
                exons_list.append(exons)
                transcript_counts.append(int(count_regex.search(trans_line).group(1)))

            # Process each exon set with its own transcript count
            for exons, transcript_count in zip(exons_list, transcript_counts):
                data_ei, data_ie, data_ze, data_ez = extract_transitions(gen_id, start, end, sequence, chromosome, global_start, global_end, strand, exons, transcript_count)

                all_data_ei.extend(data_ei)
                all_data_ie.extend(data_ie)
                all_data_ze.extend(data_ze)
                all_data_ez.extend(data_ez)

    index += 1

In [8]:
# Save to CSV
#
# One file per transition type. Each entry gives: accumulated rows, output
# path, name of the anchor-position column, and the nominal window width
# (number of B* base columns).
#
# Column names are given to the DataFrame constructor rather than passed as
# to_csv header aliases: an empty result then still produces a header-only
# file, whereas aliasing the columns of an empty, column-less frame raises
# ValueError.
for rows, csv_path, position_column, n_bases in [
    (all_data_ei, "../data/data_ei.csv", "Exon_End", 12),
    (all_data_ie, "../data/data_ie.csv", "Exon_Start", 105),
    (all_data_ze, "../data/data_ze.csv", "Exon_Start", 550),
    (all_data_ez, "../data/data_ez.csv", "Exon_End", 550),
]:
    columns = ["GEN_ID", "Chromosome", "Global_Start", position_column] + [f"B{i+1}" for i in range(n_bases)]
    pd.DataFrame(rows, columns=columns).to_csv(csv_path, index=False)