In [55]:
import argparse
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
import numpy as np

columns=['scaffold','start_scf','end_scf','part','type','contig','start_ctg','end_ctg','strand']
agp = pd.read_csv("/g/data/xl04/ka6418/bassiana/publication-v2/curation/v1/BASDU_SUP_DEEP_hifiasm_yahs.breaks.nomito.agp", sep='\t', header=None,names=columns)
#agp = agp.drop(['start_scf','end_scf','part','type','start_ctg','end_ctg','strand'],axis=1)
agp = agp[agp['contig']!='200']

In [56]:
def split_and_rename(df, column_to_split, new_columns, drop_original=True):
    # Splits a column into multiple columns and optionally drops the original column
    splits = df[column_to_split].str.split(' ', expand=True)
    splits.columns = new_columns
    df = pd.concat([splits, df], axis=1)
    if drop_original:
        df.drop(column_to_split, axis=1, inplace=True)
    return df

In [57]:
def merge_dfs(left_df, right_df, merge_keys, drop_cols=None, how='inner'):
    # Merges two DataFrames on specified keys and drops specified columns
    merged = pd.merge(left_df, right_df, on=merge_keys, how=how)
    if drop_cols:
        merged.drop(drop_cols, axis=1, inplace=True)
    return merged

In [58]:
headers = pd.read_csv("/g/data/xl04/ka6418/bassiana/publication-v2/curation/final_sorted_and_headers/replaceheaders.tsv",sep='\t',header=None,names=['scaffold','renamed'])
headers = split_and_rename(headers,'renamed',['renamedscf','chr'])

In [59]:
headers

Unnamed: 0,renamedscf,chr,scaffold
0,BASDUscf1,chr1,scaffold_1
1,BASDUscf2,chr2,scaffold_2
2,BASDUscf3,chr3,scaffold_3
3,BASDUscf4,chr4,scaffold_4
4,BASDUscf5,chr5,scaffold_5
...,...,...,...
167,BASDUscf170,putY,scaffold_170
168,BASDUscfmt,mtGenome,scaffold_mt
169,BASDUscf172,putY,scaffold_172
170,BASDUscf173,putY,scaffold_173


In [60]:
agp['end_ctg'] = agp['end_ctg'].astype(int)
agp['start_ctg'] = agp['start_ctg'].astype(int)
agp['ctg_covered'] = agp['end_ctg'] - agp['start_ctg']

df = agp

# Keep only relevant columns
df_subset = df[['scaffold', 'contig', 'ctg_covered']]

# Sort by contig and ctg_covered to prioritize higher coverage
df_sorted = df_subset.sort_values(by=['contig', 'ctg_covered'], ascending=[True, False])

# Drop duplicates, keeping the one with the highest coverage
df_unique = df_sorted.drop_duplicates(subset=['contig'], keep='first')

# Reset the index for clarity
agp = df_unique.reset_index(drop=True)


In [61]:
merged = merge_dfs(headers,agp,['scaffold'])
merged = merged[['contig','chr']]

In [63]:
merged.to_csv('/g/data/xl04/ka6418/bassiana/publication-v2/bandage/seqlabels_chr_pctg.csv',index=None)

merged

In [51]:
agp

Unnamed: 0,scaffold,start_scf,end_scf,part,type,contig,start_ctg,end_ctg,strand,ctg_covered
0,scaffold_1,1,104231792,1,W,ptg000001l,1,104231792,+,104231791
2,scaffold_1,104231993,151774118,3,W,ptg000015l,1,47542126,+,47542125
4,scaffold_1,151774319,299325919,5,W,ptg000003l,106001,147657601,+,147551600
5,scaffold_2,1,108794513,1,W,ptg000074l,1,108794513,+,108794512
7,scaffold_2,108794714,150899378,3,W,ptg000009l,1,42104665,-,42104664
...,...,...,...,...,...,...,...,...,...,...
274,scaffold_169,1,19148,1,W,ptg000189l,1,19148,+,19147
275,scaffold_170,1,18326,1,W,ptg000167l,37001,55326,+,18325
276,scaffold_172,1,17258,1,W,ptg000188l,1,17258,+,17257
277,scaffold_173,1,16576,1,W,ptg000115c,1,16576,+,16575


In [54]:

df_final


Unnamed: 0,scaffold,contig,ctg_covered
0,scaffold_1,ptg000001l,104231791
1,scaffold_3,ptg000002l,35701484
2,scaffold_1,ptg000003l,147551600
3,scaffold_9,ptg000004l,38789999
4,scaffold_3,ptg000005l,90404875
...,...,...,...
181,scaffold_172,ptg000188l,17257
182,scaffold_169,ptg000189l,19147
183,scaffold_167,ptg000190l,20678
184,scaffold_168,ptg000191l,19810
