In [None]:
import pandas as pd

In [None]:
# File paths handed over by the workflow. `snakemake` is not defined anywhere in
# this notebook; presumably the workflow runner injects it - confirm against the rule.

# --- inputs ---
input_merged_summary = snakemake.input.merged_summary
input_cluster_file = snakemake.input.cluster_file
input_derreplicated_clusters = snakemake.input.derreplicated_clusters

# --- outputs ---
output_representatives = snakemake.output.representatives
output_checkv_categories = snakemake.output.checkv_categories
output_new_clusters = snakemake.output.new_clusters

In [None]:
# Load the cluster table (tab-separated, no header row: representative, then a
# comma-joined member list) and unfold it to one (rep, mem) pair per row.
clustered_df = (
    pd.read_csv(input_cluster_file, sep="\t", names=["rep", "mem"])
    .assign(mem=lambda frame: frame["mem"].str.split(","))
    .explode("mem")
    .reset_index(drop=True)
)

# Load the merged quality summary and keep one row per contig.
#
# `names=` is given without `header=`, so the first line of the file is parsed
# as data. NOTE(review): if the merged summary still carries a header line, the
# float casts below will raise - confirm the file is header-free.
checkv_all_columns = [
    "contig_id_vOTU", "contig_length_vOTU", "provirus_vOTU", "proviral_length_vOTU",
    "gene_count_vOTU", "viral_genes_vOTU", "host_genes_vOTU", "checkv_quality_vOTU",
    "miuvig_quality_vOTU", "completeness_vOTU", "completeness_method_vOTU",
    "contamination_vOTU", "kmer_freq_vOTU", "warnings_vOTU",
]
checkv_kept_columns = [
    "contig_id_vOTU", "contig_length_vOTU", "checkv_quality_vOTU",
    "completeness_vOTU", "provirus_vOTU", "warnings_vOTU",
]

# Built as a single chain so the dtype casts are applied to a fresh frame rather
# than assigned onto a column subset (which triggers SettingWithCopyWarning).
checkv_df_vOTUs = (
    pd.read_csv(input_merged_summary, sep="\t", names=checkv_all_columns)
    .loc[:, checkv_kept_columns]
    .astype({"contig_length_vOTU": float, "completeness_vOTU": float})
    # For repeated contig IDs, keep the first non-null value of every column.
    .groupby("contig_id_vOTU")
    .first()
    .reset_index()
)

# Same summary table, relabelled with a `_rep` suffix so it can be joined onto
# the representative side of each cluster without column-name clashes.
rep_column_names = {
    "contig_id_vOTU": "contig_id_rep",
    "contig_length_vOTU": "contig_length_rep",
    "checkv_quality_vOTU": "checkv_quality_rep",
    "completeness_vOTU": "completeness_rep",
    "provirus_vOTU": "provirus_rep",
    "warnings_vOTU": "warnings_rep",
}
checkv_df_rep = checkv_df_vOTUs.rename(columns=rep_column_names)

# Attach summary columns to both sides of every (rep, mem) pair.
# Left joins keep pairs whose IDs have no summary row; those get NaN.
with_rep_info = clustered_df.merge(
    checkv_df_rep, how="left", left_on="rep", right_on="contig_id_rep"
)
merged_df = with_rep_info.merge(
    checkv_df_vOTUs, how="left", left_on="mem", right_on="contig_id_vOTU"
)

# The join keys duplicate `rep` / `mem`, so they are dropped.
merged_df = merged_df.drop(columns=["contig_id_vOTU", "contig_id_rep"])

# Missing values in these three numeric columns become 0
# (`contig_length_rep` is intentionally left as in the original: not filled).
zero_filled_columns = ["completeness_vOTU", "contig_length_vOTU", "completeness_rep"]
merged_df[zero_filled_columns] = merged_df[zero_filled_columns].fillna(0)
merged_df = merged_df.astype({"completeness_vOTU": float, "contig_length_vOTU": float})

# For every cluster (`rep`) pick one member: highest completeness first, and
# among those the shortest contig.
#
# The aggregations are named by string ("max"/"min"): passing the Python
# builtins to `transform` is deprecated in recent pandas releases.
top_completeness = merged_df.groupby("rep", sort=False)["completeness_vOTU"].transform("max")
merged_df_best = merged_df[merged_df["completeness_vOTU"] == top_completeness]

shortest_length = merged_df_best.groupby("rep", sort=False)["contig_length_vOTU"].transform("min")
merged_df_best = merged_df_best[merged_df_best["contig_length_vOTU"] == shortest_length]

# Ties can leave several rows per cluster. Keep the first row *as a whole*:
# `groupby().first()` would take the first non-null value of each column
# independently and could combine `mem` from one tied row with e.g.
# `warnings_vOTU` from another. Sorting by `rep` mirrors the row order that the
# groupby-based version produced.
merged_df_best = (
    merged_df_best
    .drop_duplicates(subset="rep", keep="first")
    .sort_values("rep")
    .reset_index(drop=True)
)

# Clusters whose selected member is the representative itself need no further
# comparison: the representative is kept as is.
best_is_rep = merged_df_best["rep"] == merged_df_best["mem"]
final_singletons = merged_df_best.loc[best_is_rep, "rep"].tolist()

# Remaining clusters: the selected member differs from the representative.
# Quality labels are turned into ranks so they can be compared with < / >.
# Only the two quality columns are mapped; a frame-wide `replace` would also
# rewrite any other column that happened to contain one of these strings and
# depends on implicit downcasting that pandas has deprecated.
# Labels not listed here (including missing values) become NaN and therefore
# compare as neither greater, smaller nor equal.
quality_rank = {
    "Complete": 4,
    "High-quality": 3,
    "Medium-quality": 2,
    "Low-quality": 1,
    "Not-determined": 1,
}
merged_df_diff = merged_df_best.loc[~best_is_rep].copy()
for quality_column in ("checkv_quality_rep", "checkv_quality_vOTU"):
    merged_df_diff[quality_column] = merged_df_diff[quality_column].map(quality_rank)

# Decide, per cluster, whether the current representative (`rep`) or the
# selected member (`mem`) becomes the final representative:
#   gr1  rep has the higher quality rank                      -> rep
#   gr2  mem has the higher quality rank                      -> mem
#   gr3  equal rank, rep does not carry the length warning    -> rep
#   gr4  equal rank, both carry the length warning            -> rep
#   gr5  equal rank, only rep carries the length warning      -> mem
LENGTH_WARNING = "contig >1.5x longer than expected genome length"

rep_rank = merged_df_diff["checkv_quality_rep"]
mem_rank = merged_df_diff["checkv_quality_vOTU"]

# Literal substring match instead of exact equality, so the warning is still
# recognised when the field holds additional text.
# NOTE(review): assumes one field may list several warnings - confirm against
# the summary file format. For fields holding exactly this warning, or nothing,
# the result is identical to the equality test.
# `astype(str)` keeps this working when a warnings column is entirely missing
# values (NaN becomes "nan", which never matches).
rep_flagged = merged_df_diff["warnings_rep"].astype(str).str.contains(LENGTH_WARNING, regex=False)
mem_flagged = merged_df_diff["warnings_vOTU"].astype(str).str.contains(LENGTH_WARNING, regex=False)

merged_df_greatherthan = merged_df_diff[rep_rank > mem_rank]
gr1 = merged_df_greatherthan["rep"].tolist()

merged_df_smallerthan = merged_df_diff[rep_rank < mem_rank]
gr2 = merged_df_smallerthan["mem"].tolist()

same_rank = rep_rank == mem_rank

merged_df_equal1 = merged_df_diff[same_rank & ~rep_flagged]
gr3 = merged_df_equal1["rep"].tolist()

merged_df_equal2 = merged_df_diff[same_rank & rep_flagged]
gr4 = merged_df_diff.loc[same_rank & rep_flagged & mem_flagged, "rep"].tolist()
gr5 = merged_df_diff.loc[same_rank & rep_flagged & ~mem_flagged, "mem"].tolist()

# Pool every chosen contig ID and pull the matching summary rows.
representatives = [*final_singletons, *gr1, *gr2, *gr3, *gr4, *gr5]
is_representative = checkv_df_vOTUs["contig_id_vOTU"].isin(representatives)
final_df_rep = checkv_df_vOTUs[is_representative]

# Display only: the sorted view is shown as the cell output and is not
# assigned, so `final_df_rep` itself keeps its original row order.
final_df_rep.sort_values(by="contig_length_vOTU")

In [None]:
# Number of representatives per quality label: written to disk (label,count
# rows without a header line) and shown as the cell output.
quality_counts = final_df_rep.groupby("checkv_quality_vOTU").size()
quality_counts.to_csv(output_checkv_categories, index=True, header=False)
quality_counts.to_frame()

In [None]:
# Load the dereplication table (tab-separated, no header row) and express each
# of its members (`mem_d`) in terms of the cluster representative (`rep`) that
# its own dereplication representative (`rep_d`) belongs to.
derreplicated_df = pd.read_csv(input_derreplicated_clusters, sep="\t", names=["rep_d", "mem_d"])
derreplicated_df = derreplicated_df.merge(clustered_df, left_on="rep_d", right_on="mem")
derreplicated_df = derreplicated_df[["rep", "mem_d"]]

# Representatives currently in use, for O(1) membership checks.
rep_set = set(derreplicated_df["rep"])

# member -> representative of the first row in which that member appears.
rep_by_member = (
    derreplicated_df
    .drop_duplicates(subset="mem_d", keep="first")
    .set_index("mem_d")["rep"]
    .to_dict()
)

# Collect old -> new representative renames for every newly selected
# representative that is not already acting as one.
rep_renames = {}
unplaced_references = []
for new_reference in final_df_rep["contig_id_vOTU"]:
    if new_reference in rep_set:
        continue

    old_reference = rep_by_member.get(new_reference)
    if old_reference is None:
        # The contig is not a member of any cluster in this table; previously
        # this raised IndexError. The affected cluster keeps its representative.
        unplaced_references.append(new_reference)
        continue

    # If several new references point at the same old one, the first wins.
    rep_renames.setdefault(old_reference, new_reference)

if unplaced_references:
    print(f"{len(unplaced_references)} selected representative(s) not found in mem_d; skipped:")
    for missing_reference in unplaced_references:
        print(missing_reference)

# Work on a copy so `derreplicated_df` keeps the original assignment.
derreplicated_df_fixed = derreplicated_df.copy()
derreplicated_df_fixed["rep"] = derreplicated_df_fixed["rep"].map(
    lambda current_rep: rep_renames.get(current_rep, current_rep)
)


In [None]:
# Clusters whose representative does not start with "P_" or "E_" but which
# contain members that do are re-pointed to the longest such member.
PRIORITY_PREFIXES = ("P_", "E_")

# contig ID -> contig length, built once instead of scanning the summary table
# for every member.
length_by_contig = (
    checkv_df_rep
    .drop_duplicates(subset="contig_id_rep", keep="first")
    .set_index("contig_id_rep")["contig_length_rep"]
    .to_dict()
)

# Renames are collected first and applied after the loop, so the frame is not
# modified while its own groupby is being iterated.
prefixed_renames = {}
for rep, group in derreplicated_df_fixed.groupby("rep"):
    if rep.startswith(PRIORITY_PREFIXES):
        continue

    members = group["mem_d"]
    # na=False: a missing member ID counts as "no prefix" instead of producing
    # a mask that cannot be used for boolean indexing.
    has_prefix = members.str.startswith("P_", na=False) | members.str.startswith("E_", na=False)
    if not has_prefix.any():
        continue

    print(f"Match found for rep: {rep}")
    matching_mem_d = members[has_prefix]

    # (member, length) pairs for members that have a known length,
    # ordered shortest to longest.
    mem_d_lengths = [
        (candidate, length_by_contig[candidate])
        for candidate in matching_mem_d
        if candidate in length_by_contig
    ]
    mem_d_lengths.sort(key=lambda pair: pair[1])

    print("Matching mem_d values sorted by contig length:")
    for mem_d, length in mem_d_lengths:
        print(f"{mem_d}: {length}")

    if not mem_d_lengths:
        # Previously an IndexError: none of the prefixed members has a length.
        print(f"No contig length found for prefixed members of {rep}; representative unchanged")
        continue

    # Longest candidate is last after the ascending sort.
    new_reference = mem_d_lengths[-1][0]
    print("new_reference:", new_reference)
    prefixed_renames[rep] = new_reference

derreplicated_df_fixed["rep"] = derreplicated_df_fixed["rep"].map(
    lambda current_rep: prefixed_renames.get(current_rep, current_rep)
)

derreplicated_df_fixed.to_csv(output_new_clusters, sep="\t", index=False)


In [None]:
# Save representative contig IDs, one per line with no header.
# `rep` repeats once per cluster member in `derreplicated_df_fixed`, so the
# column is deduplicated first (first-occurrence order is preserved).
derreplicated_df_fixed["rep"].drop_duplicates().to_csv(output_representatives, index=False, header=False)

In [None]:
# Display only: one row per final representative, showing the first non-null
# `mem_d` recorded for it.
derreplicated_df_fixed.groupby(by="rep").first()