In [1]:
import pandas as pd

In [None]:
# input_merged_summary="/home/lmf/MOSAIC/ORGANIC/05_vOTUs/checkV_merged_quality_summary.tot.txt"
# input_cluster_file="/home/lmf/MOSAIC/ORGANIC/05_vOTUs/combined_positive_viral_contigs.tot_95-85.clstr"
# output_representatives="/home/lmf/MOSAIC/ORGANIC/05_vOTUs/vOTU_clustering_rep_list.csv"
# output_checkv_categories="/home/lmf/MOSAIC/ORGANIC/05_vOTUs/vOTU_clustering_rep_list_checkv_per_category.csv"

In [None]:
# Bind the workflow-provided paths to plain names used by the cells below.
# `snakemake` is not defined anywhere in this notebook; presumably it is
# injected by Snakemake's notebook integration -- TODO confirm.
input_merged_summary, input_cluster_file = (
    snakemake.input.merged_summary,
    snakemake.input.cluster_file,
)
output_representatives, output_checkv_categories = (
    snakemake.output.representatives,
    snakemake.output.checkv_categories,
)

In [9]:
# Load the tab-separated cluster table (representative, comma-separated
# member list) and reshape it to one row per (rep, mem) pair.
clustered_df = (
    pd.read_csv(input_cluster_file, sep="\t", names=["rep", "mem"])
    .assign(mem=lambda clusters: clusters["mem"].str.split(","))
    .explode("mem")
    .reset_index(drop=True)
)

# Load the merged CheckV quality summary. The file is read as header-less,
# tab-separated text, so the 14 column names are supplied here; every name
# carries the `_vOTU` suffix to mark it as the member-side annotation.
checkv_df_vOTUs = pd.read_csv(
    input_merged_summary,
    sep="\t",
    names=["contig_id_vOTU", "contig_length_vOTU", "provirus_vOTU", "proviral_length_vOTU",
           "gene_count_vOTU", "viral_genes_vOTU", "host_genes_vOTU", "checkv_quality_vOTU",
           "miuvig_quality_vOTU", "completeness_vOTU", "completeness_method_vOTU",
           "contamination_vOTU", "kmer_freq_vOTU", "warnings_vOTU"],
)

# Keep only the columns used downstream and make the two numeric ones float.
# `astype` returns a new frame, so nothing is assigned into a slice of the
# full table (the original column-wise assignment on the sliced frame
# triggered pandas' SettingWithCopyWarning).
checkv_df_vOTUs = checkv_df_vOTUs[
    ["contig_id_vOTU", "contig_length_vOTU", "checkv_quality_vOTU",
     "completeness_vOTU", "provirus_vOTU", "warnings_vOTU"]
].astype({"contig_length_vOTU": float, "completeness_vOTU": float})

# Representative-side view of the CheckV table: an independent copy whose
# columns are relabelled (positionally) with the `_rep` suffix, so the same
# annotations can be joined once per representative and once per member.
checkv_df_rep = checkv_df_vOTUs.copy().set_axis(
    ["contig_id_rep", "contig_length_rep", "checkv_quality_rep",
     "completeness_rep", "provirus_rep", "warnings_rep"],
    axis="columns",
)

# Annotate every (rep, mem) pair with the CheckV record of the representative
# and of the member. Both joins are left joins, so pairs without a CheckV
# match are kept with missing values. The join-key columns duplicate
# `rep` / `mem` and are dropped afterwards.
# Missing completeness / member length values are then replaced by 0 so the
# per-cluster max/min comparisons below never see NaN.
merged_df = (
    clustered_df
    .merge(checkv_df_rep, how="left", left_on="rep", right_on="contig_id_rep")
    .merge(checkv_df_vOTUs, how="left", left_on="mem", right_on="contig_id_vOTU")
    .drop(columns=["contig_id_vOTU", "contig_id_rep"])
    .fillna({"completeness_vOTU": 0, "contig_length_vOTU": 0, "completeness_rep": 0})
    .astype({"completeness_vOTU": float, "contig_length_vOTU": float})
)

# Pick one "best" member per cluster (clusters are keyed by `rep`):
#   1. keep the member(s) with the highest completeness,
#   2. among those keep the shortest contig(s),
#   3. if several rows remain, keep the first one.
# The aggregations are passed by name ("max"/"min"); handing the Python
# builtins to `transform` is deprecated in recent pandas releases.
is_most_complete = (
    merged_df["completeness_vOTU"]
    == merged_df.groupby("rep", sort=False)["completeness_vOTU"].transform("max")
)
merged_df_best = merged_df[is_most_complete]

is_shortest = (
    merged_df_best["contig_length_vOTU"]
    == merged_df_best.groupby("rep", sort=False)["contig_length_vOTU"].transform("min")
)
merged_df_best = merged_df_best[is_shortest]

# `groupby(...).first()` takes the first NON-NULL value of each column
# independently, so a row with a missing field (e.g. no warning, no quality)
# could be stitched together from two different contigs. `drop_duplicates`
# keeps whole rows intact. Sorting by `rep` and resetting the index keeps the
# row order and index layout the groupby version produced.
merged_df_best = (
    merged_df_best
    .drop_duplicates(subset="rep", keep="first")
    .sort_values("rep")
    .reset_index(drop=True)
)

# Clusters whose best member is the representative itself need no comparison:
# the representative is kept as is.
final_singletons = merged_df_best.loc[
    merged_df_best["rep"] == merged_df_best["mem"], "rep"
].tolist()

# For the remaining clusters, compare the CheckV quality tier of the current
# representative with that of the best member.
# The tiers are mapped to integer ranks on the two quality columns only
# (a frame-wide `replace` would also rewrite any other cell that happens to
# hold one of these strings). 'Not-determined' deliberately shares rank 1
# with 'Low-quality', as in the original mapping.
quality_rank = {'Complete': 4, 'High-quality': 3, 'Medium-quality': 2,
                'Low-quality': 1, 'Not-determined': 1}
merged_df_diff = merged_df_best[merged_df_best["rep"] != merged_df_best["mem"]].copy()
for quality_column in ("checkv_quality_rep", "checkv_quality_vOTU"):
    # A missing quality (contig absent from the CheckV table) ranks below
    # every real tier. Left as NaN, both `>=` and `<` evaluate to False and
    # the cluster would silently end up with no representative at all.
    merged_df_diff[quality_column] = merged_df_diff[quality_column].map(quality_rank).fillna(0)

# Keep the current representative unless the best member has a strictly
# better quality tier; ties go to the representative.
rep_is_at_least_as_good = (
    merged_df_diff["checkv_quality_rep"] >= merged_df_diff["checkv_quality_vOTU"]
)
merged_df_greatherthan = merged_df_diff[rep_is_at_least_as_good]
gr1 = merged_df_greatherthan["rep"].tolist()

merged_df_smallerthan = merged_df_diff[~rep_is_at_least_as_good]
gr2 = merged_df_smallerthan["mem"].tolist()

# Final representative set, restricted to contigs that have a CheckV record.
representatives = final_singletons + gr1 + gr2
final_df_rep = checkv_df_vOTUs[checkv_df_vOTUs['contig_id_vOTU'].isin(representatives)]

# Write the selected representative contig IDs, one per line, with neither a
# header row nor the index column.
representative_ids = final_df_rep["contig_id_vOTU"]
representative_ids.to_csv(output_representatives, header=False, index=False)

# Show the representatives ordered by contig length. The sorted frame is only
# displayed as the cell output; `final_df_rep` itself is left unsorted.
final_df_rep.sort_values("contig_length_vOTU")

                                                                  


Unnamed: 0,contig_id_vOTU,contig_length_vOTU,checkv_quality_vOTU,completeness_vOTU,provirus_vOTU,warnings_vOTU
124158,Vir_3_T_tig00004100_len=1016_reads=2_class=con...,718.0,Not-determined,,No,no viral genes detected
97441,Vir_3_2_2_NODE_9072_length_1000_cov_2.717460,1000.0,Low-quality,2.16,No,
37074,Vir_3_T_NODE_45770_length_1000_cov_3.016931,1000.0,Low-quality,2.37,No,no viral genes detected
37075,Vir_3_T_NODE_45771_length_1000_cov_3.003175,1000.0,Not-determined,,No,
37076,Vir_3_T_NODE_45772_length_1000_cov_2.996825,1000.0,Low-quality,0.89,No,
...,...,...,...,...,...,...
125779,Vir_3_T_tig00022782_len=123915_reads=429_class...,125742.0,Medium-quality,88.97,No,
0,Vir_3_T_NODE_4_length_136511_cov_14.214501,136511.0,Low-quality,37.63,No,
125298,Vir_3_T_tig00000162_len=150443_reads=3504_clas...,151838.0,High-quality,100.00,No,contig >1.5x longer than expected genome length
125083,Vir_3_T_tig00000093_len=165649_reads=3063_clas...,164373.0,High-quality,100.00,No,contig >1.5x longer than expected genome length


In [13]:
# Count the representatives per CheckV quality category (computed once, then
# both written to disk and displayed).
# NOTE(review): with index=False and header=False the file contains only the
# counts, in the group order of the category labels, without the labels
# themselves -- confirm that downstream consumers expect this layout.
checkv_category_counts = final_df_rep.groupby("checkv_quality_vOTU").size()
checkv_category_counts.to_csv(output_checkv_categories, index=False, header=False)
checkv_category_counts.to_frame()

Unnamed: 0_level_0,0
checkv_quality_vOTU,Unnamed: 1_level_1
Complete,54
High-quality,181
Low-quality,46362
Medium-quality,360
Not-determined,6593


# BANREP

In [None]:

# merged_df
# Keep only (rep, mem) pairs where the member is a different contig than the
# representative AND the representative's completeness exceeds the member's
# by more than 10. A strict "> member + 10" already rules out equal
# completeness values, so no separate inequality test is needed.
is_other_contig = merged_df["rep"] != merged_df["mem"]
rep_clearly_more_complete = (
    merged_df["completeness_rep"] > merged_df["completeness_vOTU"] + 10
)
merged_df = merged_df[is_other_contig & rep_clearly_more_complete]

merged_df
# clustered_df=clustered_df.merge(checkv_df_rep, left_on="rep", right_on="contig_id_rep", how="left")
# clustered_df = clustered_df.drop('contig_id_rep', axis=1)
# clustered_df=clustered_df[clustered_df["contig_length_rep"].isna()==False]


# merged_df

# # merged_df.groupby('rep').first().groupby('checkv_quality_rep').count()
# merged_df=merged_df[(merged_df["completeness_vOTU"]<merged_df["completeness_rep"])]
# merged_df
# merged_df.groupby('checkv_quality_rep').count()

# merged_df=merged_df.replace('Complete',4)
# merged_df=merged_df.replace('High-quality',3)
# merged_df=merged_df.replace('Medium-quality',2)
# merged_df=merged_df.replace('Low-quality',1)
# merged_df=merged_df.replace('Not-determined',0)
# merged_df['checkv_quality_votu'] =  merged_df['checkv_quality_votu'].fillna(4)
# merged_df
# # merged_df['warnings'] =  merged_df['warnings'].fillna("none")

# # # merged_df
# # merged_df[merged_df["warnings"]!="none"]
# # merged_df=merged_df[merged_df["warnings"]=="none"]


# Write the representative ID of every remaining row to a text file, one ID
# per line (IDs repeat if a representative appears in several rows).
# The `with` block guarantees the file is flushed and closed even if a write
# fails part-way through; the previous open()/close() pair leaked the handle
# on error.
with open("/home/lmf/PhylloVir/VIRAL_WORLD/VIRAL_FRACTION/08_ASSEMBLY_TEST/reference_genomes.txt", "w") as textfile:
    textfile.writelines(element + "\n" for element in merged_df["rep"])
merged_df

# merged_df[merged_df["rep"]=="NODE_3_length_455224_cov_96.893901"]


In [None]:
# Load the contig-to-file table: each line is split on ":" into a file part
# and a contig part. The first character of the contig part is stripped
# (presumably the ">" of a FASTA header -- TODO confirm), and the file part is
# cut at the first "_positive".
# TODO: merge with merged_df on the contig name (the join key on the
# merged_df side was left undecided in the original cell).
assembly_df = (
    pd.read_csv(
        "/home/lmf/PhylloVir/SEQUENCING_TEST_VIROMES/VIRAL_FRACTION/08_ASSEMBLY_TEST/assembled_contigs.txt",
        sep=":",
        names=["file", "contig"],
    )
    .assign(
        contig=lambda hits: hits["contig"].str[1:],
        file=lambda hits: hits["file"].str.split("_positive").str[0],
    )
)
assembly_df


In [None]:
from Bio import SeqIO

# Rewrite the reference FASTA so every record ID is prefixed with the name of
# the file it came from ("<file>_<contig>"), with the sequence on one line.

# Build the contig -> file lookup once instead of scanning the whole table
# for every FASTA record. `drop_duplicates` keeps the first occurrence of a
# contig, matching the previous "take element [0]" behaviour.
contig_to_file = (
    assembly_df
    .drop_duplicates(subset="contig")
    .set_index("contig")["file"]
    .to_dict()
)

# Both files are managed by the `with` statement, so the input handle is
# closed as well (it was previously opened and never closed).
with open("/home/lmf/PhylloVir/SEQUENCING_TEST_VIROMES/VIRAL_FRACTION/08_ASSEMBLY_TEST/reference_genomes.fasta") as in_file, \
        open("/home/lmf/PhylloVir/SEQUENCING_TEST_VIROMES/VIRAL_FRACTION/reference_genomes_formatted.fasta", "w") as out_file:
    for fasta in SeqIO.parse(in_file, 'fasta'):
        name, sequence = fasta.id, str(fasta.seq)
        # Raises KeyError naming the contig if it is absent from assembly_df
        # (previously an IndexError that did not say which contig was missing).
        print(">" + contig_to_file[name] + "_" + name, file=out_file)
        print(sequence, file=out_file)
        
#         new_sequence = some_function(sequence)
#         write_fasta(out_file)
        

In [None]:
cat reference_full_phages.fasta reference_genomes_formatted.fasta > reference_phages.fasta

In [None]:
# import pandas as pd
# work_dir="/home/lmf/PhylloVir/VIRAL_WORLD/VIRAL_FRACTION"
# core_virome_df=pd.read_csv(work_dir + "/core_metadata.csv", index_col=0)
# core_virome_df.rename({"checkv_quality": "checkv_quality_votu"}, axis=1, inplace=True)
# # # color_plot_df
# # checkv_df=pd.read_csv("/home/lmf/PhylloVir/VIRAL_WORLD/VIRAL_FRACTION/05_vOTUs/merged_checkV_quality_summary.txt"
# #                      ,sep="\t", names=["contig_id", "contig_length_votu", "provirus_votu", "proviral_length", "gene_count", "viral_genes", "host_genes", "checkv_quality_votu", "miuvig_quality", "completeness_votu", "completeness_method", "contamination", "kmer_freq", "warnings_votu"])
# # checkv_df=checkv_df[["contig_id", "contig_length_votu", "checkv_quality_votu", "completeness_votu", "provirus_votu", "warnings_votu"]]
# # core_virome_df=core_virome_df.merge(checkv_df, left_on="contig", right_on="contig_id", how="left")
# # core_virome_df = core_virome_df.drop('contig_id_x', "contig, axis=1)


# core_virome_df



In [None]:
seqtk subseq high_quality_genomes_and_workflow.fasta subassembly_vOTU_clustering_rep_list.csv > subassembly_vOTU_clustering_rep.fasta
cp subassembly_vOTU_clustering_rep.fasta 95-85_positive_viral_contigs.tot.fasta

