# FILTER

In [None]:
import pandas as pd
from Bio import SeqIO   
import matplotlib.pyplot as plt
from matplotlib_venn import venn3

In [None]:
# Bind everything handed over through the injected `snakemake` object to plain
# module-level names, so the remaining cells never touch `snakemake` directly.
smk_in = snakemake.input
smk_out = snakemake.output
smk_params = snakemake.params

# --- inputs: read counts, CheckV summary and the pre-filtered vOTU FASTA ---
# (`input_df_counts_paired` is not referenced by any cell visible in this notebook)
input_df_counts_paired = smk_in.df_counts_paired
input_merged_summary = smk_in.merged_summary
input_vOTUs_prefiltered = smk_in.vOTUs_prefiltered

# --- inputs: VIBRANT ---
input_vibrant_circular = smk_in.vibrant_circular
input_vibrant_positive = smk_in.vibrant_positive
input_vibrant_quality = smk_in.vibrant_quality
input_vibrant_summary = smk_in.vibrant_summary

# --- inputs: VirSorter2 ---
# (`input_virsorter_positive_list` is not referenced by any cell visible in this notebook)
input_virsorter_table = smk_in.virsorter_table
input_virsorter_positive_list = smk_in.virsorter_positive_list

# --- inputs: geNomad ---
# (`input_genomad_plasmid_summary` is not referenced by any cell visible in this notebook)
input_genomad_virus_summary = smk_in.genomad_virus_summary
input_genomad_plasmid_summary = smk_in.genomad_plasmid_summary
input_genomad_viral_fasta = smk_in.genomad_viral_fasta
input_genomad_viral_fasta_conservative = smk_in.genomad_viral_fasta_conservative

# --- outputs ---
output_summary = smk_out.summary
output_filtered_list = smk_out.filtered_list

# --- parameters ---
# `min_votu_len` is the lower bound applied to "checkV_contig_length" in the filter cell.
# (`SAMPLES` is not referenced by any cell visible in this notebook)
SAMPLES = smk_params.samples
min_votu_len = smk_params.min_votu_len

In [None]:
# Preview of the merged CheckV summary, labelled with "_vOTU"-suffixed names.
# `names=` is passed without `header=`, so every line of the file is read as data.
# NOTE: `checkv_df` is read again with "checkV_"-prefixed column names in the
# loading cell that follows; the frame built here only serves as a first look.
checkv_preview_columns = [
    "contig_id_vOTU",
    "contig_length_vOTU",
    "provirus_vOTU",
    "proviral_length_vOTU",
    "gene_count_vOTU",
    "viral_genes_vOTU",
    "host_genes_vOTU",
    "checkv_quality_vOTU",
    "miuvig_quality_vOTU",
    "completeness_vOTU",
    "completeness_method_vOTU",
    "contamination_vOTU",
    "kmer_freq_vOTU",
    "warnings_vOTU",
]
checkv_df = pd.read_csv(input_merged_summary, sep="\t", names=checkv_preview_columns)
checkv_df

In [None]:
def _fasta_record_ids(fasta_path):
    """Return the record IDs found in a FASTA file, in file order."""
    return [record.id for record in SeqIO.parse(fasta_path, "fasta")]


def _read_flag_list(list_path, name_column, flag_column):
    """Read a headerless one-column list and mark every row with "Y".

    The file is parsed with the default (comma) separator; the single column is
    named `name_column` and a constant column `flag_column` is appended.
    """
    flagged = pd.read_csv(list_path, names=[name_column])
    flagged[flag_column] = "Y"
    return flagged


def _read_prefixed_table(table_path, prefix):
    """Read a tab-separated table (first row is the header) and prefix its columns.

    Spaces inside the original column names are replaced by underscores before
    `prefix` is prepended.
    """
    table = pd.read_csv(table_path, sep="\t")
    table.columns = [prefix + str(col).replace(" ", "_") for col in table.columns]
    return table


def _flagged_id_frame(record_ids, name_column, flag_column):
    """Build a two-column frame: the given IDs plus a constant "Y" flag column."""
    frame = pd.DataFrame(record_ids, columns=[name_column])
    frame[flag_column] = "Y"
    return frame


# --- pre-filtered vOTUs: the left-hand side of every later merge ---
vOTU_list = _fasta_record_ids(input_vOTUs_prefiltered)
vOTU_prefiltered = pd.DataFrame(vOTU_list, columns=["vOTU_name"])

# --- CheckV merged summary ---
# `names=` is passed without `header=`, so every line of the file is read as data.
# "checkv_quality" deliberately keeps its lower-case "v": a later cell groups by
# exactly that column name.
checkv_columns = [
    "checkV_contig_id", "checkV_contig_length", "checkV_provirus",
    "checkV_proviral_length", "checkV_gene_count", "checkV_viral_genes",
    "checkV_host_genes", "checkv_quality", "checkV_miuvig_quality",
    "checkV_completeness", "checkV_completeness_method", "checkV_contamination",
    "checkV_kmer_freq", "checkV_warnings",
]
checkv_df = pd.read_csv(input_merged_summary, sep="\t", names=checkv_columns)

# --- VIBRANT: circular / positive name lists, each with a "Y" flag column ---
vibrant_circular_df = _read_flag_list(input_vibrant_circular, "VIBRANT_circular_name", "VIBRANT_circular")
vibrant_positive_df = _read_flag_list(input_vibrant_positive, "VIBRANT_positive_name", "VIBRANT_positive")

# --- VIBRANT: quality table ---
# The three columns are renamed positionally, then collapsed to one row per
# scaffold (first non-null value of each column). The result is *indexed* by
# "VIBRANT_scaffold"; the merge cell refers to that index level by name, so the
# index must not be reset here.
vibrant_quality_df = pd.read_csv(input_vibrant_quality, sep="\t")
vibrant_quality_df.columns = ["VIBRANT_scaffold", "VIBRANT_lifecycle", "VIBRANT_quality"]
vibrant_quality_df = vibrant_quality_df.groupby("VIBRANT_scaffold").first()

# --- per-tool summary tables, columns prefixed with the tool name ---
vibrant_summary_df = _read_prefixed_table(input_vibrant_summary, "VIBRANT_")

virsorter_summary_df = _read_prefixed_table(input_virsorter_table, "VirSorter2_")
# Every row present in the VirSorter2 table is flagged as positive.
virsorter_summary_df["VirSorter2_positive"] = "Y"

genomad_summary_df = _read_prefixed_table(input_genomad_virus_summary, "geNomad_")

# --- geNomad: record IDs of the viral FASTA files, each with a "Y" flag column ---
viral_list = _fasta_record_ids(input_genomad_viral_fasta)
geNomad_viral = _flagged_id_frame(viral_list, "geNomad_viral_name", "geNomad_viral")

viral_list_conservative = _fasta_record_ids(input_genomad_viral_fasta_conservative)
geNomad_viral_conservative = _flagged_id_frame(
    viral_list_conservative, "geNomad_viral_conservative_name", "geNomad_viral_conservative"
)


In [None]:
# Left-join every annotation table onto the pre-filtered vOTU list, one table at
# a time and in this fixed order. Each entry pairs a right-hand frame with the
# key (column, or index level for `vibrant_quality_df`) matched against "vOTU_name".
annotation_joins = [
    (vibrant_circular_df, "VIBRANT_circular_name"),
    (vibrant_positive_df, "VIBRANT_positive_name"),
    (vibrant_quality_df, "VIBRANT_scaffold"),
    (vibrant_summary_df, "VIBRANT_scaffold"),
    (virsorter_summary_df, "VirSorter2_seqname"),
    (genomad_summary_df, "geNomad_seq_name"),
    (geNomad_viral, "geNomad_viral_name"),
    (geNomad_viral_conservative, "geNomad_viral_conservative_name"),
    (checkv_df, "checkV_contig_id"),
]

df_merged = vOTU_prefiltered
for right_frame, right_key in annotation_joins:
    # how="left" keeps every vOTU; tables without a matching row contribute NaN.
    df_merged = df_merged.merge(right_frame, left_on="vOTU_name", right_on=right_key, how="left")

# Remove join keys that merely repeat "vOTU_name".
redundant_keys = ["VIBRANT_circular_name", "VIBRANT_positive_name", "VIBRANT_scaffold", "checkV_contig_id"]
df_merged = df_merged.drop(columns=redundant_keys)

# Display the merged table
df_merged


In [None]:
# Boolean membership masks, one per set drawn in the diagram.
# `.eq("Y")` is False for NaN, i.e. for vOTUs the tool did not flag.
virsorter_hit = df_merged["VirSorter2_positive"].eq("Y")
vibrant_hit = df_merged["VIBRANT_positive"].eq("Y")
genomad_hit = df_merged["geNomad_viral_conservative"].eq("Y")

# Size of each exclusive Venn region; key digits are ordered
# (VirSorter2, VIBRANT, geNomad conservative).
# Summing masks yields 0 for an empty region, whereas indexing
# `value_counts()['Y']` raises KeyError as soon as a set or an intersection
# contains no "Y" at all.
venn_labels = {
    '100': int((virsorter_hit & ~vibrant_hit & ~genomad_hit).sum()),
    '010': int((~virsorter_hit & vibrant_hit & ~genomad_hit).sum()),
    '001': int((~virsorter_hit & ~vibrant_hit & genomad_hit).sum()),
    '110': int((virsorter_hit & vibrant_hit & ~genomad_hit).sum()),
    '101': int((virsorter_hit & ~vibrant_hit & genomad_hit).sum()),
    '011': int((~virsorter_hit & vibrant_hit & genomad_hit).sum()),
    '111': int((virsorter_hit & vibrant_hit & genomad_hit).sum()),
}

# Draw the diagram on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
venn_diagram = venn3(
    subsets=venn_labels,
    set_labels=('VirSorter2_positive', 'VIBRANT_positive', 'geNomad_viral_conservative'),
    ax=ax,
)

# Enlarge the labels; entries may be None for regions/sets that are not drawn.
for text in venn_diagram.set_labels:
    if text:
        text.set_fontsize(14)
for text in venn_diagram.subset_labels:
    if text:
        text.set_fontsize(12)

# Show the Venn diagram
ax.set_title("Venn Diagram of Categories 'VirSorter2_positive', 'VIBRANT_positive', and 'geNomad_viral_conservative'")
plt.show()

In [None]:
# Prediction flags that are tallied into the per-vOTU score "count_Y".
# NOTE(review): `columns_to_count` was referenced by this cell without being
# defined in any cell of the notebook, so a fresh-kernel run stopped here with a
# NameError. The three flags below (one per tool, the same sets drawn in the Venn
# diagram) are an assumption — confirm the intended set. With these three columns,
# the `>= 3` threshold further down keeps only vOTUs flagged by all three.
columns_to_count = ["VirSorter2_positive", "VIBRANT_positive", "geNomad_viral_conservative"]

# Number of "Y" flags per row; NaN (not flagged) counts as 0.
# `.assign` builds a new frame instead of mutating the one displayed by the merge cell.
df_merged = df_merged.assign(count_Y=df_merged[columns_to_count].eq('Y').sum(axis=1))

# Columns to show first; all remaining columns follow in their existing order.
move_cols = ["vOTU_name", "count_Y", "checkV_contig_length", "VirSorter2_positive", "VIBRANT_positive", "geNomad_viral", "geNomad_viral_conservative"]
df_merged = df_merged[move_cols + [col for col in df_merged.columns if col not in move_cols]]

# Keep vOTUs with at least three "Y" flags and a CheckV contig length of at least
# `min_votu_len`. Rows whose length is NaN (no CheckV match) fail the comparison
# and are dropped as well.
passes_filter = (df_merged["count_Y"] >= 3) & (df_merged["checkV_contig_length"] >= min_votu_len)
df_merged_filtered = df_merged[passes_filter]

# One vOTU name per line, no index and no header
df_merged_filtered["vOTU_name"].to_csv(output_filtered_list, index=False, header=False)

# Full annotation table of the retained vOTUs (default CSV settings, index included)
df_merged_filtered.to_csv(output_summary)

# Display the filtered table
df_merged_filtered


In [None]:
# Number of retained vOTUs per value of "checkv_quality"
# (groupby leaves out rows whose quality is NaN).
votus_per_quality = df_merged_filtered.groupby("checkv_quality").size()
votus_per_quality.to_frame()