In [None]:
import sys
import os
from datetime import datetime
import pandas as pd
from run_CRISPRtools import assign_id_and_merge_overlaps


# Root directory that is searched recursively for the per-tool parsed outputs.
input_dir = '/Users/isaccocenacchi/Desktop/Tirocinio/out/MAGs_CRISPRtools'
# The comparison table is written inside input_dir and named after its last
# path component.
comparison_file = os.path.join(
    input_dir,
    f"{os.path.basename(input_dir)}_tools_comparison.tsv",
)
# When True, the Cas_* columns are read in addition to the base columns.
cas = True

# Timer start: the elapsed time reported at the end includes file discovery.
start_time = datetime.now()

# Every file below input_dir (any depth) whose name ends in '_parsed.tsv'.
parsed_files = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk(input_dir)
    for name in names
    if name.endswith('_parsed.tsv')
]

# Filled later with one DataFrame per successfully loaded file.
parsed_dfs = []

# Column name -> dtype. Insertion order matters: it defines both the columns
# that are read and, later, the grouping key order.
columns = {
    'MAG': str,
    'Contig': str,
    'Start': int,
    'End': int,
    'Repeats': str,
    'Spacers': str,
    'ToolCodename': str,
}
if cas:
    # NOTE(review): 'Cas_>100000' has one more zero than the upper bound of
    # 'Cas_1000-10000' - confirm it matches the header written by the parser.
    columns.update(dict.fromkeys(
        ('Cas_0-1000', 'Cas_1000-10000', 'Cas_>100000', 'Cas_overlayed'),
        int,
    ))

# Load every parsed TSV, keeping only the expected columns with their dtypes.
# A file that cannot be read is reported and skipped so the remaining files
# can still be compared.
for file in parsed_files:
    try:
        parsed_df = pd.read_csv(file, delimiter='\t',
                                usecols=list(columns),
                                dtype=columns,
                                index_col=False)
    except FileNotFoundError:
        print(f"The parsing file '{file}' does not exist, there was a problem with the parsing")
    except ValueError as e:
        # Raised by read_csv e.g. when an expected column is missing from the header.
        print(f'Check the column names in the parsed file {file}: {e}')
    else:
        parsed_dfs.append(parsed_df)

# Stop here when nothing could be loaded: pd.concat raises ValueError on an
# empty list, so continuing would only replace this message with a traceback.
# A single loaded file is still processed, as before.
if not parsed_dfs:
    print('No files to compare')
    sys.exit(1)

# Stack all per-tool tables into one DataFrame with a fresh 0..n-1 index.
all_df = pd.concat(parsed_dfs, ignore_index=True)


# Group on every loaded column except the tool name, so rows that are
# identical apart from the reporting tool collapse into one.
columns_groupby = [column for column in columns if column != 'ToolCodename']


def _join_tool_codenames(codenames):
    """Return the distinct tool codenames of a group as a sorted, comma-separated string."""
    unique_codenames = set(codenames)
    return ','.join(sorted(unique_codenames))


# One row per distinct key combination; 'ToolCodename' lists every tool that
# reported that row.
combined_df = (
    all_df
    .groupby(columns_groupby, as_index=False)
    .agg({'ToolCodename': _join_tool_codenames})
)

# Hand the de-duplicated table to the project helper that assigns IDs and
# merges overlaps.
final_df = assign_id_and_merge_overlaps(combined_df)

# Write the result as TSV (the DataFrame index is written too, the to_csv default).
final_df.to_csv(comparison_file, sep='\t')

end_time = datetime.now()
# Adding the elapsed timedelta to datetime.min gives a datetime whose clock
# fields equal the elapsed time, so strftime can format it; [:-3] trims the
# microseconds down to milliseconds.
elapsed_as_clock = datetime.min + (end_time - start_time)
elapsed_text = elapsed_as_clock.strftime("%Hh:%Mm:%S.%f")[:-3]
print(f'Tool comparison in {elapsed_text}s')
print(f'Total number of CRISPR arrays: {len(final_df)}')