# In [1]:
import sys
import pandas as pd


output_file = '/home/isacco.cenacchi/data/Tirocinio/out/tools_comparison.tsv'
tool1 = '/home/isacco.cenacchi/data/Tirocinio/out/MAGs_minced_Default/MAGs_minced_Default_parsed.tsv'
tool2 = '/home/isacco.cenacchi/data/Tirocinio/out/MAGs_minced_Paper/MAGs_minced_Paper_parsed.tsv'
tool3 = '/home/isacco.cenacchi/data/Tirocinio/out/MAGs_pilercr_PILER1/MAGs_pilercr_PILER1_parsed.tsv'
tool4 = '/home/isacco.cenacchi/data/Tirocinio/out/MAGs_pilercr_PILER2/MAGs_pilercr_PILER2_parsed.tsv'

# Columns read from every input TSV and the dtypes enforced while parsing.
input_columns = ['MAG', 'Contig', 'Start', 'End', 'Spacers', 'Repeats', 'ToolCodename']
input_dtypes = {'MAG': str, 'Contig': str, 'Start': int, 'End': int, 'ToolCodename': str}

# Load the TSV files, one DataFrame per tool.
# A failed load is reported on stderr and then re-raised: continuing would only
# fail later with a NameError on the DataFrames that were never created.
tool_dfs = []
for tool_path in (tool1, tool2, tool3, tool4):
    try:
        tool_dfs.append(
            pd.read_csv(
                tool_path,
                delimiter='\t',
                usecols=input_columns,
                dtype=input_dtypes,
                index_col=False,
            )
        )
    except FileNotFoundError:
        print("The input file does not exist, check the path", tool_path, file=sys.stderr)
        raise
    except ValueError as e:
        # Raised by read_csv when a requested column is missing or a value
        # cannot be converted to the requested dtype.
        print('Errore: ', e, file=sys.stderr)
        print('Check the column names in the input files (MAG, Contig, Start, End, Spacers, Repeats, ToolCodename), and secure that file is a TSV file', file=sys.stderr)
        raise

# Keep the per-tool names available for the rest of the script.
tool1_df, tool2_df, tool3_df, tool4_df = tool_dfs

# Concatenate the per-tool DataFrames into a single table
all_df = pd.concat([tool1_df, tool2_df, tool3_df, tool4_df], ignore_index=True)

# Collapse rows that are identical on 'MAG', 'Contig', 'Start', 'End', 'Spacers', 'Repeats'
# into one row, joining their 'ToolCodename' values.
# dropna=False keeps rows whose key columns contain missing values; with the
# default (dropna=True) groupby silently discards those rows.
combined_df = all_df.groupby(
    ['MAG', 'Contig', 'Start', 'End', 'Spacers', 'Repeats'],
    as_index=False,
    dropna=False,
).agg({
    'ToolCodename': lambda x: ','.join(sorted(set(x)))  # set removes duplicates, sorted gives a consistent order
})

# Function to assign unique IDs and check for overlaps
def assign_id_and_merge_overlaps(df):
    """Give rows with overlapping intervals on the same contig a shared ID.

    Rows are sorted by ``MAG``, ``Contig`` and ``Start``. Within one
    (``MAG``, ``Contig``) pair, rows whose closed ``[Start, End]`` intervals
    overlap -- directly or through a chain of overlapping rows -- receive the
    same integer ID. IDs start at 1 and increase in sorted-row order.
    Despite the name, rows are only labelled; none are merged or removed.

    Args:
        df: DataFrame with ``MAG``, ``Contig``, ``Start`` and ``End`` columns.
            Intervals are assumed to satisfy ``Start <= End``.

    Returns:
        A sorted copy of ``df`` with a fresh 0..n-1 index and an added ``ID``
        column. The input DataFrame is not modified.
    """
    df = df.sort_values(by=['MAG', 'Contig', 'Start']).reset_index(drop=True)

    ids = []
    current_id = 0
    prev_mag = prev_contig = cluster_end = None

    # Single sweep over the sorted rows: because rows are ordered by Start
    # within each contig, a row overlaps some earlier row of its contig exactly
    # when its Start does not exceed the furthest End seen in the current
    # cluster. This replaces the pairwise comparison of every row pair.
    for mag, contig, start, end in zip(df['MAG'], df['Contig'], df['Start'], df['End']):
        same_contig = bool(ids) and mag == prev_mag and contig == prev_contig
        if same_contig and start <= cluster_end:
            # Extends the current cluster; keep track of its furthest End.
            cluster_end = max(cluster_end, end)
        else:
            # New contig, or a gap before this row: open a new cluster.
            current_id += 1
            cluster_end = end
        ids.append(current_id)
        prev_mag, prev_contig = mag, contig

    df['ID'] = ids
    return df

# Label every combined row with the ID of its overlap group
final_df = assign_id_and_merge_overlaps(df=combined_df)

# Write the labelled table to the output path as tab-separated text, without the index column
final_df.to_csv(path_or_buf=output_file, index=False, sep='\t')

