In [1]:
import argparse
from dataclasses import dataclass


@dataclass
class svid:
    """

    Object to store the sv information

    """

    chromo: str
    pos: int
    svtype: str
    reflen: int
    nonreflen: int
    sourcenode: str
    bubref: str
    bubnonref: str
    sinknode: str


def sv_det(line):
    """
    Determine SV types 
    Input: Each line of gfatools bubble output
    Output: Insertions or Deletions SV based on comparison
            between ref and non-ref allele

    """
    # 1 348029 4 2 s1,s2,s135091,s3 # this part is parsed from the workflow
    chromo, pos, nodes, _, nodelist = line.strip().split() # replaced _ to paths
    bubble = nodelist.split(",")[1:-1] # except first to be stored on succeeding variable
    sourcenode = nodelist.split(",")[0] # source
    sinknode = nodelist.split(",")[-1] # sink , can be redundant as mentioned in bubble variable
    if nodes in ["3", "4"]:
        if nodes == "3":
            rrank, nodelen = nodeinf[bubble[0]] # rrank and nodelen are from graph_len.tsv file in step 1 ; nodeinf[nodeid] = [int(rrank), int(nodelen)]
            if rrank > 0: # if the rank of the first node inside the bubble
                svtype = "Insertion"
                reflen = 0
                nonreflen = nodelen
                bubnonref = bubble[0]
                bubref = 0
            else:
                svtype = "Deletion"
                reflen = nodelen
                nonreflen = 0
                bubnonref = 0
                bubref = bubble[0]
        elif nodes == "4":
            # do not consider looping in the bubble
            if bubble[0] == bubble[1]:
                return None
            for bub in bubble:
                rrank, nodelen = nodeinf[bub]
                if rrank > 0:
                    nonreflen = nodelen
                    bubnonref = bub
                else:
                    reflen = nodelen
                    bubref = bub
            svtype = "AltDel" if nonreflen < reflen else "AltIns"
        return chromo, pos, svtype, reflen, nonreflen, sourcenode, bubref, bubnonref, sinknode
    else:
        return None

In [11]:
nodeinf = {}

with open(f"01_combined_coverage.tsv") as infile:
    next(infile)  # Skip the header row
    for line in infile:
        line_comp = line.strip().split()
        nodeid = line_comp[0]
        nodelen = line_comp[1]
        chromo = line_comp[2]
        pos = line_comp[3]
        rrank = line_comp[4]
        nodeinf[nodeid] = [int(rrank), int(nodelen)]

# Now nodeinf will contain the parsed data excluding the unwanted columns

In [13]:
nodeinf

{'s1': [0, 11093],
 's2': [0, 25],
 's3': [0, 1129],
 's4': [0, 7805],
 's5': [0, 9225],
 's6': [0, 436],
 's7': [0, 39],
 's8': [0, 13418],
 's9': [0, 11606],
 's10': [0, 4],
 's11': [0, 25278],
 's12': [0, 54],
 's13': [0, 1873],
 's14': [0, 235],
 's15': [0, 11348],
 's16': [0, 6580],
 's17': [0, 3569],
 's18': [0, 463],
 's19': [0, 2340],
 's20': [0, 11564],
 's21': [0, 3231],
 's22': [0, 179],
 's23': [0, 7131],
 's24': [0, 6991],
 's25': [0, 59],
 's26': [0, 3474],
 's27': [0, 8734],
 's28': [0, 2642],
 's29': [0, 67],
 's30': [0, 7646],
 's31': [0, 65],
 's32': [0, 6079],
 's33': [0, 10170],
 's34': [0, 1],
 's35': [0, 769],
 's36': [0, 772],
 's37': [0, 4568],
 's38': [0, 424],
 's39': [0, 636],
 's40': [0, 885],
 's41': [0, 487],
 's42': [0, 405],
 's43': [0, 1003],
 's44': [0, 1578],
 's45': [0, 4930],
 's46': [0, 25],
 's47': [0, 32],
 's48': [0, 127],
 's49': [0, 661],
 's50': [0, 8867],
 's51': [0, 13639],
 's52': [0, 306],
 's53': [0, 9611],
 's54': [0, 140],
 's55': [0, 

In [47]:
# path to bubbles /Users/jongpaduhilao/Desktop/LAB Files/Initial_Pangenome_analysis/Trial_4/crysnanto_bubble/asm5.nip.biallelic.bubble.tsv
# return chromo, pos, svtype, reflen, nonreflen, sourcenode, bubref, bubnonref, sinknode

with open(f"asm5.nip.biallelic.bubble.tsv") as infile:
    for line in infile:
        if sv_det(line):
            print(*sv_det(line))

id=Nipponbare|chr01 11093 AltIns 25 74 s1 s2 s191853 s3
id=Nipponbare|chr01 20052 Insertion 0 738 s4 0 s258732 s5
id=Nipponbare|chr01 29713 AltIns 39 829 s6 s7 s191860 s8
id=Nipponbare|chr01 80058 AltDel 54 1 s11 s12 s191861 s13
id=Nipponbare|chr01 81985 Deletion 235 0 s13 s14 0 s15
id=Nipponbare|chr01 93568 Insertion 0 6689 s15 0 s191862 s16
id=Nipponbare|chr01 100148 Deletion 3569 0 s16 s17 0 s18
id=Nipponbare|chr01 135616 AltDel 59 1 s24 s25 s258735 s26
id=Nipponbare|chr01 139149 Insertion 0 2900 s26 0 s191863 s27
id=Nipponbare|chr01 147883 Insertion 0 507 s27 0 s191864 s28
id=Nipponbare|chr01 150525 Deletion 67 0 s28 s29 0 s30
id=Nipponbare|chr01 158238 Deletion 65 0 s30 s31 0 s32
id=Nipponbare|chr01 164382 Insertion 0 113 s32 0 s191865 s33
id=Nipponbare|chr01 174552 AltIns 1 579 s33 s34 s191866 s35
id=Nipponbare|chr01 175322 Insertion 0 222 s35 0 s191867 s36
id=Nipponbare|chr01 176094 Insertion 0 337 s36 0 s191868 s37
id=Nipponbare|chr01 180662 Deletion 424 0 s37 s38 0 s39
id=Nipp

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [48]:
result

This part is for tracing the path in reference to Crysnanto et al. 2021 trace_path.py 

Input:
biallelic_sv.tsv
edge_coverage.tsv

Variables:
Assembly names as anims

In [88]:
from collections import defaultdict
import argparse

def parse_edge_coverage(covfile):
    """

    Create nested dict with [parent][child][assembly] from coverage file

    """
    basedict = lambda: defaultdict(basedict)
    edge_cover = basedict() # initiate dictionary

    with open(covfile) as infile:
        _, _, *assemb = next(infile).strip().split()

        for line in infile:
            parent, child, *covers = line.strip().split()

            for ind, cover in enumerate(covers):
                edge_cover[parent][child][assemb[ind]] = int(cover)

    return edge_cover

edge_cover = parse_edge_coverage('01_combined_coverage_edge.tsv')

FileNotFoundError: [Errno 2] No such file or directory: '01_combined_coverage_edge.tsv'

In [10]:
edge_cover

defaultdict(<function __main__.parse_edge_coverage.<locals>.<lambda>()>,
            {'s1': defaultdict(<function __main__.parse_edge_coverage.<locals>.<lambda>()>,
                         {'s2': defaultdict(<function __main__.parse_edge_coverage.<locals>.<lambda>()>,
                                      {'IRGSP': 1,
                                       'nh232': 1,
                                       'cw02': 0,
                                       'nh236': 1,
                                       'nh286': 0,
                                       'nh273': 0}),
                          's191853': defaultdict(<function __main__.parse_edge_coverage.<locals>.<lambda>()>,
                                      {'IRGSP': 0,
                                       'nh232': 0,
                                       'cw02': 1,
                                       'nh236': 0,
                                       'nh286': 1,
                                       'nh273': 1})}),
    

In [87]:
def generate_path(start, ref, nonref, stop):
    """

    Given source, sink, and ref/non-ref nodes enumerate all possible paths

    """
    ref_path = [x for x in [start, ref, stop] if x != "0"]
    nonref_path = [x for x in [start, nonref, stop] if x != "0"]
    return [ref_path, nonref_path]


def path_traverse(path, anim, edge_cover):
    """

    Check whether path is traversed for a given assembly

    """

    all_cover = []
    for ind, node in enumerate(path[:-1]):
        parent = node
        child = path[ind + 1]
        cover = edge_cover[parent][child][anim]
        all_cover.append(cover)

    if all(all_cover):
        return anim
    else:
        return ""

In [12]:
anims = ["IRGSP", "nh232", "cw02", "nh236", "nh286", "nh273"]
# edge_coverage = edge_cover

In [13]:
def traverse_all_anims(ref_path, nonref_path, ref_len, nonref_len, mutype, anim=anims, svmode="biallelic"):
    """
    Going to the each assembly and check whether it traverse the ref and non-ref path

    """
    # iterate for each assembly whether its support ref or nonref
    ref_list = [path_traverse(ref_path, anim, edge_cover=edge_cover) for anim in anims]
    nonref_list = [path_traverse(nonref_path, anim, edge_cover=edge_cover) for anim in anims]

    # if no assembly can traverse just write noassemb
    if all(x == "" for x in nonref_list):
        nonref_list = ["noassemb"]

    if all(x == "" for x in ref_list):
        ref_list = ["noassemb"]

    # return with svid, ref paths and the label
    print(svmode, "\t", svid, "\t", mutype, "\t", ref_len, "\t", nonref_len, "\t",
            ",".join(ref_path), "\t", ",".join([x for x in ref_list if x]), "\t",
            ",".join(nonref_path), "\t", ",".join([x for x in nonref_list if x]))


In [20]:
# process biallelic bubble
with open(f"asm5.nip.biallelic_sv.tsv") as infile:
    for line in infile:
        # 1 165873 AltDel 497 2 s1 s2 s133016 s3
        linecomp = line.strip().split()
        svid = "_".join(linecomp[:2])
        mutype, ref_len, nonref_len = linecomp[2:5]
        start, ref, nonref, stop = linecomp[5:]
        ref_path, nonref_path = generate_path(start, ref, nonref, stop)

        traverse_all_anims(ref_path, nonref_path, ref_len, nonref_len, mutype)

biallelic 	 id=Nipponbare|chr01_11093 	 AltIns 	 25 	 74 	 s1,s2,s3 	 IRGSP,nh232,nh236 	 s1,s191853,s3 	 cw02,nh286,nh273
biallelic 	 id=Nipponbare|chr01_20052 	 Insertion 	 0 	 738 	 s4,s5 	 IRGSP,nh232,cw02,nh236 	 s4,s258732,s5 	 nh286,nh273
biallelic 	 id=Nipponbare|chr01_29713 	 AltIns 	 39 	 829 	 s6,s7,s8 	 IRGSP,nh232,nh236,nh286,nh273 	 s6,s191860,s8 	 cw02
biallelic 	 id=Nipponbare|chr01_80058 	 AltDel 	 54 	 1 	 s11,s12,s13 	 IRGSP,nh232,nh236 	 s11,s191861,s13 	 cw02,nh286,nh273
biallelic 	 id=Nipponbare|chr01_81985 	 Deletion 	 235 	 0 	 s13,s14,s15 	 IRGSP,nh232,cw02,nh236 	 s13,s15 	 nh286,nh273
biallelic 	 id=Nipponbare|chr01_93568 	 Insertion 	 0 	 6689 	 s15,s16 	 IRGSP,nh232,nh236,nh286,nh273 	 s15,s191862,s16 	 cw02
biallelic 	 id=Nipponbare|chr01_100148 	 Deletion 	 3569 	 0 	 s16,s17,s18 	 IRGSP 	 s16,s18 	 nh232,cw02,nh236,nh286,nh273
biallelic 	 id=Nipponbare|chr01_135616 	 AltDel 	 59 	 1 	 s24,s25,s26 	 IRGSP,nh232,cw02,nh236 	 s24,s258735,s26 	 nh286,nh273
b

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [19]:
cd ../crysnanto_bubble/

/Users/jongpaduhilao/Desktop/LAB Files/Initial_Pangenome_analysis/Trial_4/crysnanto_bubble


Breakpoint and Annotation of Structural Variations
https://github.com/AnimalGenomicsETH/bovine-graphs/blob/main/subworkflows/sv_analysis.py

input:
bubble file - asm_bubble.tsv
bialsv file - asm_biallelic_sv.tsv

Output: 
left_bed - left_breakpoints.bed
right_bed - right_breakpoints.bed

In [8]:
bubble_file = "/Users/jongpaduhilao/Desktop/LAB_Files/Initial_Pangenome_analysis/Trial_4/crysnanto_bubble/asm5.nip.bubble.tsv"

import re

right_bp = {}
with open(bubble_file) as infile:
    for line in infile:
        chromo, left_side, right_side, *_ = line.strip().split('\t')
        # Extract the chromosome identifier (e.g., 'chr01')
        chromo = re.search(r'chr\d+', chromo).group()
        right_bp[f"{chromo}_{left_side}"] = right_side
"""
{'01_11093': '11118',
 '01_12247': '12247'
"""
right_bp

{'chr01_11093': '11118',
 'chr01_12247': '12247',
 'chr01_20052': '20052',
 'chr01_29277': '29277',
 'chr01_29713': '29752',
 'chr01_43170': '54780',
 'chr01_80058': '80112',
 'chr01_81985': '82220',
 'chr01_93568': '93568',
 'chr01_100148': '103717',
 'chr01_104180': '128625',
 'chr01_135616': '135675',
 'chr01_139149': '139149',
 'chr01_147883': '147883',
 'chr01_150525': '150592',
 'chr01_158238': '158303',
 'chr01_164382': '164382',
 'chr01_174552': '174553',
 'chr01_175322': '175322',
 'chr01_176094': '176094',
 'chr01_180662': '181086',
 'chr01_181722': '181722',
 'chr01_182607': '182607',
 'chr01_183094': '183094',
 'chr01_183499': '183499',
 'chr01_184502': '186080',
 'chr01_191010': '191067',
 'chr01_191194': '191855',
 'chr01_200722': '200722',
 'chr01_214361': '214667',
 'chr01_224278': '224418',
 'chr01_225346': '225346',
 'chr01_236635': '236667',
 'chr01_238837': '239267',
 'chr01_239593': '239594',
 'chr01_239605': '239605',
 'chr01_239619': '239672',
 'chr01_256647': '2

In [22]:
def wrote_sv_bed(line, left_file, right_file, mutype="biallelic"):
    if mutype == "biallelic":
    # Remove leading/trailing whitespace and split
        line_comp = [item.strip() for item in line.strip().split()]
        chromo_full, leftcoord = line_comp[:2]
        start_node, stop_node = line_comp[5], line_comp[-1]
        
        # Extract the chromosome identifier (e.g., 'chr01')
        chromo = re.search(r'chr\d+', chromo_full).group()
        
        leftcoord = int(leftcoord)
        sv_comp = f"{chromo}_{leftcoord}"
        
        # sv_comp, *sv_rest = line.strip().split()
        # _, *chromo, leftcoord = sv_comp.split("_")
        # only get the numeric part as the chromosome id
        # chromo = [int(x) for x in chromo if re.search(r"\d+", x)][0]
        # start_node, *_, stop_node = sv_rest[-2].split(",")
    """
    if mutype == "multiallelic": # this part won't work as we don't have multiallelic sv type right now
        # 1_535561        1898    5138    AltIns  s33,s60173,s36
        line_comp = line.strip().split()
        chromo, leftcoord = line_comp[0].split("_")
        leftcoord = int(leftcoord)
        sv_comp = f"{chromo}_{leftcoord}"
        start_node, *_, stop_node = line_comp[-1].split(",")
    """
    # Ensure all components are stripped of whitespace
    chromo = chromo.strip()
    start_node = start_node.strip()
    stop_node = stop_node.strip()
    sv_comp = sv_comp.strip()
    
    # Write the bed file
    left_file.write(
        f"{chromo}\t{leftcoord-1}\t{leftcoord+1}\t{start_node}\t{stop_node}\t{sv_comp}\n")
    
    svid = f"{chromo}_{leftcoord}"
    rightcoord = int(right_bp[svid])
    right_file.write(
        f"{chromo}\t{rightcoord-1}\t{rightcoord+1}\t{start_node}\t{stop_node}\t{sv_comp}\n")

In [23]:
bialsv_file = "asm5.nip.biallelic_sv.tsv"

In [24]:
with open(bialsv_file) as bialfile:
    with open("asm5.nip_left_breakpoints.bed", "a") as left_file, open("asm5.nip_right_breakpoints.bed", "a") as right_file:
        for line in bialfile:
            # process the biallelic breakpoints
            wrote_sv_bed(line, left_file, right_file, mutype="biallelic")
            # process the multiallelic breakpoints
            #    sv_processed = []
            #    mutlist = []
            #    for line in multifile:
            #        svid = line.strip().split()[0]
            #        # sv_comp, *sv_rest = line.strip().split()
            #        # _, *chromo, leftcoord = sv_comp.split("_")
            #        # # only get the numeric part as the chromosome id
            #        # chromo = [int(x) for x in chromo if re.search(r"\d+", x)][0]
            #        # svid = f"{chromo}_{leftcoord}"
            #        if svid not in sv_processed:
            #            sv_processed.append(svid)
            #            wrote_sv_bed(line, left_file, right_file, mutype="multiallelic")
    

In [38]:
ls

00_Crysnanto_unique_IRGSP.tsv
00_Crysnanto_unique_IRGSP.tsvls
00_Crysnanto_vs_minigrapg_IRGSP.tsv
IRGSP_paths.tsv
asm5.nip.biallelic.bubble.tsv
asm5.nip.biallelic_sv.tsv
asm5.nip.bubble.bed
asm5.nip.bubble.tsv
asm5.nip.multiallelic.bubble.tsv
asm5.nip.multiallelic.bubble_greater_8.tsv
asm5.nip_left_breakpoints.bed
asm5.nip_right_breakpoints.bed
path_trace_asm5_biallele.tsv
[31mtrace_path.edited.py[m[m*


In [39]:
gff = "/Users/jongpaduhilao/Desktop/LAB_Files/Initial_Pangenome_analysis/Trial_4/Raw_data/IRGSP-1.0_representative/transcripts.gff"

In [1]:
merged = "/Users/jongpaduhilao/Desktop/LAB_Files/Initial_Pangenome_analysis/Trial_4/Raw_data/data/05x4_biallelic_asm2annot.intersect.tsv"

Annotate Breakpoint, I think this is the start and end of the bubble formations

Input: 
left bed
right bed
gff transcripts IRGSP

Output:
Annot_breakpoints.py

EDIT: this is annot_sv.py

In [37]:
def get_feature_table(mergedbreak):
    """
    Parse the GFF line to get the name, ID, and other attributes from each feature.
    Yields: [start position, strand, ID, Name]
    """
    for feature in mergedbreak:
        # feature.fields[-1] holds the attributes column (e.g., "ID=Os01t0100200-01;Name=Os01t0100200-01")
        attributes = feature[-1].strip()  # Get the last column (attributes) and remove whitespace
        annot = [x.split("=") if "=" in x else [x, "."] for x in attributes.split(";")] # Split attributes at ";" and then "="
        
        try:
            # Create a dictionary from the attribute key-value pairs
            annot2 = {x1: x2 for x1, x2 in annot}  # Safeguard for missing "="
            
            # Extracting 'Parent', 'ID', and 'Name' attributes
            Parent = annot2.get("Parent", "noparent")  # Default to 'noparent' if Parent key is missing
            ID = annot2.get("ID", "noid")  # Default to 'noid' if ID key is missing
            Name = annot2.get("Name", "noname")  # Default to 'noname' if Name key is missing

            # If both 'ID' and 'Name' are missing, assign 'Parent' to both
            if ID == "noid" and Name == "noname":
                ID = Parent
                Name = Parent
            
            # Yield the relevant fields: start position, strand, ID, Name
            yield [feature[3], feature[11], ID, Name]
        
        except:
            # Skip and continue if any error occurs (e.g., malformed attributes)
            continue

def extract_important_feature(prevfeat, curfeat):
    """

    Return the most important feature from gff
    """
    priority = {"CDS": 6, "five_prime_UTR": 5, "three_prime_UTR": 4 ,"exon":3, "mRNA":2,  "gene": 1} # adding 5' UTR and 3' UTR to suit IRGSP annotation
    prevprior = priority.get(prevfeat, 0)
    curprior = priority.get(curfeat, 0)

    if curprior == 0 and prevprior == 0:
        return "intergenic"
    if curprior >= prevprior:
        return curfeat
    else:
        return prevfeat


with open(merged, "r") as file:  # Ensure the correct file path is used
    tsv_file = csv.reader(file, delimiter="\t")

    # Get the processed features and attributes
    parsed_features = list(get_feature_table(tsv_file))


# Example: Print the parsed features for inspection
# in this case, no intergenic becase all features are print. Fix the parsing part
for feature in parsed_features:
    print(feature[1])

.
mRNA
three_prime_UTR
.
.
.
.
.
.
.
.
mRNA
three_prime_UTR
.
.
mRNA
exon
.
.
.
.
.
.
mRNA
five_prime_UTR
CDS
three_prime_UTR
mRNA
five_prime_UTR
CDS
mRNA
three_prime_UTR
.
.
mRNA
exon
.
.
.
mRNA
five_prime_UTR
.
.
mRNA
three_prime_UTR
mRNA
mRNA
mRNA
mRNA
mRNA
.
.
.
.
.
.
.
.
.
.
.
.
.
.
mRNA
mRNA
mRNA
mRNA
.
.
.
.
.
.
mRNA
.
three_prime_UTR
mRNA
mRNA
mRNA
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
mRNA
.
.
.
.
.
.
.
.
mRNA
three_prime_UTR
.
.
.
.
.
.
.
.
mRNA
five_prime_UTR
CDS
mRNA
.
.
.
.
.
mRNA
.
.
.
.
.
.
.
mRNA
mRNA
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
mRNA
three_prime_UTR
.
.
.
.
.
.
.
mRNA
CDS
mRNA
three_prime_UTR
.
.
.
.
.
.
.
mRNA
three_prime_UTR
CDS
three_prime_UTR
mRNA
CDS
three_prime_UTR
mRNA
five_prime_UTR
mRNA
CDS
CDS
mRNA
five_prime_UTR
mRNA
five_prime_UTR
mRNA
mRNA
CDS
mRNA
mRNA
mRNA
mRNA
.
.
.
.
.
mRNA
five_prime_UTR
mRNA
three_prime_UTR
.
mRNA
.
.
.
.
.
.
.
.
.
.
.
.
.
mRNA
CDS
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
mRNA
mRNA
mRNA
mRNA
mRNA
mRNA
.
.
.
.
.
.
.
.
mRNA
.
.
.
.

In [42]:
import csv

def get_feature_table(mergedbreak):
    """
    Parse the TSV line to get the ID and feature type.
    Yields: [svid, feature_type, ID]
    """
    for feature in mergedbreak:
        try:
            svid = feature[3]  # SVID is in the 4th column
            sv_type = feature[4] # type of SV
            feature_type = feature[11]  # mRNA and gene structures
            attributes = feature[-1].strip()
            
            # Parse attributes
            attr_dict = dict(x.split('=') for x in attributes.split(';') if '=' in x)
            
            # Get ID, defaulting to the 'Parent' if 'ID' is not present
            ID = attr_dict.get('ID', attr_dict.get('Parent', 'unknown'))
            
            yield [svid, feature_type, ID, sv_type]
        except Exception as e:
            print(f"Error processing line: {feature}")
            print(f"Error message: {str(e)}")
            continue

def extract_important_feature(prevfeat, curfeat):
    """
    Return the most important feature based on priority.
    """
    priority = {"CDS": 6, "five_prime_UTR": 5, "three_prime_UTR": 4, "exon": 3, "mRNA": 2, "gene": 1}
    prevprior = priority.get(prevfeat, 0)
    curprior = priority.get(curfeat, 0)
    if curprior == 0 and prevprior == 0:
        return "intergenic"
    return curfeat if curprior >= prevprior else prevfeat

def process_features(input_file, output_file):
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        tsv_reader = csv.reader(infile, delimiter='\t')
        feature_gen = get_feature_table(tsv_reader)
        
        current_svid = None
        current_type = "intergenic"
        current_id = ""
        current_sv_type = ""
        
        for svid, feat_type, feat_id, sv_type in feature_gen:
            if svid != current_svid:
                if current_svid:  # Write the previous entry
                    outfile.write(f"{current_svid}\t{current_sv_type}\t{current_type}\t{current_id}\n")
                current_svid = svid
                current_type = "intergenic"
                current_id = feat_id
                current_sv_type = sv_type
            
            current_type = extract_important_feature(current_type, feat_type)
            if current_type != "intergenic":
                current_id = feat_id
        
        # Write the last entry
        if current_svid:
            outfile.write(f"{current_svid}\t{current_sv_type}\t{current_type}\t{current_id}\n")

if __name__ == "__main__":
    input_file = merged  # Replace with your input file path
    output_file = "test2_annot_sv.tsv"  # Replace with your desired output file path
    
    process_features(input_file, output_file)
    print(f"Annotation complete. Results written to {output_file}")

# add the sv type _ real one
# logic of the code
# intergenic? number of intergenic should be the same if this is intersected to locus.gff intergenic == locus.gff "."
# trace the codes function outputs

Annotation complete. Results written to test2_annot_sv.tsv


In [38]:
pwd

'/Users/jongpaduhilao/Desktop/LAB_Files/Initial_Pangenome_analysis/Trial_4'