In [None]:
import pandas as pd

In [None]:
# Input/output paths for this notebook.
# NOTE(review): `snakemake` is not defined anywhere in this notebook; presumably
# it is injected by Snakemake's notebook integration at run time -- confirm.
input_merged_summary, input_cluster_file = (
    snakemake.input.merged_summary,
    snakemake.input.cluster_file,
)
output_representatives, output_checkv_categories = (
    snakemake.output.representatives,
    snakemake.output.checkv_categories,
)

In [None]:
# Pick one representative contig per cluster.
#
# Inputs : `input_cluster_file`   -- TSV, one row per cluster: representative id,
#                                    then a comma-separated list of member ids.
#          `input_merged_summary` -- headerless TSV with the 14 columns named below.
# Outputs: `output_representatives` -- one representative contig id per line.
# Leaves `final_df_rep` (summary rows of the chosen representatives) for later cells.

# --- Cluster table: one row per (representative, member) pair -----------------
clustered_df = pd.read_csv(input_cluster_file, sep="\t", names=["rep", "mem"])
clustered_df["mem"] = clustered_df["mem"].str.split(",")
clustered_df = clustered_df.explode("mem").reset_index(drop=True)

# --- Quality summary table ----------------------------------------------------
checkv_df_vOTUs = pd.read_csv(
    input_merged_summary,
    sep="\t",
    names=["contig_id_vOTU", "contig_length_vOTU", "provirus_vOTU", "proviral_length_vOTU",
           "gene_count_vOTU", "viral_genes_vOTU", "host_genes_vOTU", "checkv_quality_vOTU",
           "miuvig_quality_vOTU", "completeness_vOTU", "completeness_method_vOTU",
           "contamination_vOTU", "kmer_freq_vOTU", "warnings_vOTU"],
)
# `.astype` returns a new frame, so no column is ever assigned on a slice of the
# raw table (which would raise SettingWithCopyWarning).
checkv_df_vOTUs = checkv_df_vOTUs[
    ["contig_id_vOTU", "contig_length_vOTU", "checkv_quality_vOTU",
     "completeness_vOTU", "provirus_vOTU", "warnings_vOTU"]
].astype({"contig_length_vOTU": float, "completeness_vOTU": float})

# Same table with `_rep` suffixes, used to annotate the representative side.
checkv_df_rep = checkv_df_vOTUs.copy()
checkv_df_rep.columns = checkv_df_rep.columns.str.replace("_vOTU$", "_rep", regex=True)

# --- Annotate every (rep, mem) pair with both contigs' summary rows ------------
merged_df = (
    clustered_df
    .merge(checkv_df_rep, left_on="rep", right_on="contig_id_rep", how="left")
    .merge(checkv_df_vOTUs, left_on="mem", right_on="contig_id_vOTU", how="left")
    .drop(["contig_id_vOTU", "contig_id_rep"], axis=1)
)
# Contigs absent from the summary (or without a completeness estimate) count as 0.
numeric_cols = ["completeness_vOTU", "contig_length_vOTU", "completeness_rep"]
merged_df[numeric_cols] = merged_df[numeric_cols].fillna(0)

# --- Best member per cluster: highest completeness, then shortest contig -------
# String aggregation names avoid the deprecation warning for builtin max/min.
top_completeness = merged_df.groupby("rep", sort=False)["completeness_vOTU"].transform("max")
merged_df_best = merged_df[merged_df["completeness_vOTU"] == top_completeness]
shortest_length = merged_df_best.groupby("rep", sort=False)["contig_length_vOTU"].transform("min")
merged_df_best = merged_df_best[merged_df_best["contig_length_vOTU"] == shortest_length]
# Keep the first *whole row* per cluster. `GroupBy.first()` would instead take the
# first non-null value column by column and could stitch together values that
# belong to different members. The stable sort keeps the rows ordered by `rep`.
merged_df_best = (
    merged_df_best
    .sort_values("rep", kind="mergesort")
    .drop_duplicates(subset="rep", keep="first")
    .reset_index(drop=True)
)

# --- Clusters whose best member is the representative itself -------------------
final_singletons = merged_df_best.loc[merged_df_best["rep"] == merged_df_best["mem"], "rep"].tolist()

# --- Clusters where the best member differs: compare quality tiers -------------
quality_rank = {'Complete': 4, 'High-quality': 3, 'Medium-quality': 2,
                'Low-quality': 1, 'Not-determined': 1}
merged_df_diff = merged_df_best[merged_df_best["rep"] != merged_df_best["mem"]]
# Rank only the two quality columns (a frame-wide replace could also rewrite ids
# or warnings). A missing/unknown tier ranks 0 so every cluster still gets
# exactly one representative instead of failing both comparisons below.
merged_df_diff = merged_df_diff.assign(
    checkv_quality_rep=merged_df_diff["checkv_quality_rep"].map(quality_rank).fillna(0),
    checkv_quality_vOTU=merged_df_diff["checkv_quality_vOTU"].map(quality_rank).fillna(0),
)
rep_is_at_least_as_good = merged_df_diff["checkv_quality_rep"] >= merged_df_diff["checkv_quality_vOTU"]

# Keep the original representative on ties or when it has the higher tier...
gr1 = merged_df_diff.loc[rep_is_at_least_as_good, "rep"].tolist()
# ...otherwise promote the better-quality member.
gr2 = merged_df_diff.loc[~rep_is_at_least_as_good, "mem"].tolist()

representatives = final_singletons + gr1 + gr2
final_df_rep = checkv_df_vOTUs[checkv_df_vOTUs["contig_id_vOTU"].isin(representatives)].copy()

# Save representative contig IDs, one per line (no header, no index)
final_df_rep["contig_id_vOTU"].to_csv(output_representatives, index=False, header=False)

# Display the representatives ordered by contig length (final_df_rep itself is not reordered)
final_df_rep.sort_values(by="contig_length_vOTU")


In [None]:
# Number of representatives per value of `checkv_quality_vOTU`: written to
# `output_checkv_categories` (label,count per line, no header) and then displayed.
quality_counts = final_df_rep.groupby("checkv_quality_vOTU").size()
quality_counts.to_csv(output_checkv_categories, index=True, header=False)
quality_counts.to_frame()