# SGA Phages

This notebook should be run after SGA_CRISPR_Cas.ipynb

In [4]:
import glob
import os
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from Bio import SearchIO
from Bio import SeqIO
import json
from collections import defaultdict
import skbio
import subprocess as sp
from collections import OrderedDict
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Seq import Seq
from bs4 import BeautifulSoup
import math

In [5]:
rootdir = "to_fill_in"

In [6]:
def makedir(path):
    if not os.path.isdir(path):
        os.mkdir(path)

## Blast drepped spacers to IMG/VR & GVD

In [7]:
makedir(rootdir + "Blast_spacers/")
makedir(rootdir + "Blast_spacers/spacer_blast_db")

cp = "cp {0}All_qc_spacers_derepped.fna {0}Blast_spacers/spacer_blast_db/All_qc_spacers_derepped.fna".format(rootdir)
os.system(cp)

0

In [8]:
#make blast database out of spacers
cmd = 'makeblastdb -in {0}Blast_spacers/spacer_blast_db/All_qc_spacers_derepped.fna -dbtype nucl'.format(rootdir)
os.system(cmd)

0

In [11]:
#blast function
def blast_spacer(query, output):
    cmd = "blastn -task 'blastn-short' -query {0} -db {1}Blast_spacers/spacer_blast_db/All_qc_spacers_derepped.fna -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen' -out {2} -evalue 0.003 -word_size 7 -gapopen 10 -gapextend 2 -penalty -1 -max_target_seqs 1000 -num_threads 16".format(query, rootdir, output)
    print(cmd)

In [12]:
makedir(rootdir + "Blast_spacers/Blast_output")

In [13]:
#IMG and GVD blast commands
#IMG
blast_spacer("path_to_IMGVR_phages/IMGVR_all_nucleotides.fna", rootdir + "Blast_spacers/Blast_output/ALL_Spacers_QC_blast_IMGVR.csv")

#GVD
blast_spacer("path_to_GVD_phages/GVDv1_viralpopulations.fna", rootdir + "Blast_spacers/Blast_output/ALL_Spacers_QC_blast_GVD_human_gut.csv")

blastn -task 'blastn-short' -query path_to_IMGVR_phages/IMGVR_all_nucleotides.fna -db /groups/banfield/projects/multienv/cpr/2020/tm7_sr1_gracili/Env_CPR_Alex/Clean_run_for_pub/Blast_spacers/spacer_blast_db/All_qc_spacers_derepped.fna -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen' -out /groups/banfield/projects/multienv/cpr/2020/tm7_sr1_gracili/Env_CPR_Alex/Clean_run_for_pub/Blast_spacers/Blast_output/ALL_Spacers_QC_blast_IMGVR.csv -evalue 0.003 -word_size 7 -gapopen 10 -gapextend 2 -penalty -1 -max_target_seqs 1000 -num_threads 16
blastn -task 'blastn-short' -query path_to_GVD_phages/GVDv1_viralpopulations.fna -db /groups/banfield/projects/multienv/cpr/2020/tm7_sr1_gracili/Env_CPR_Alex/Clean_run_for_pub/Blast_spacers/spacer_blast_db/All_qc_spacers_derepped.fna -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen' -out /groups/banfield/projects/multienv/cpr/2020/tm7_sr1_grac

### parse blast output, grab spacer hits

IMG/VR

In [14]:
#function to screen blast results to >= 95% coverage and <= 1 mismatch of the spacer
def get_spacer_hits(blast_output, Viral_database):
    df = skbio.io.read(blast_output, format="blast+6", into=pd.DataFrame, default_columns=False,  columns=['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'qlen', 'slen'])
    #one mismatch
    df = df[df["mismatch"] <= 1]
    #95% coverage
    df["Coverage"] = df['length']/df['slen']
    df = df[df["Coverage"] >= 0.95]
    df["Viral_database"] = Viral_database
    return df

In [17]:

IMG_VR_spacer_df = get_spacer_hits(rootdir + "Blast_spacers/Blast_output/ALL_Spacers_QC_blast_IMGVR.csv", "IMG_VR")
IMG_VR_spacer_df["UViG"] = IMG_VR_spacer_df["qseqid"].apply(lambda x: x.split("|")[0])

#host data
IMG_VR_Host_dict = {}
IMG_VR_host_df = pd.read_csv(rootdir + "metadata/IMGVR_all_Host_information.tsv", sep="\t")
for key, row in IMG_VR_host_df.iterrows():
    IMG_VR_Host_dict[row["## UViG"]] = row["Host taxonomy prediction"]


IMG_VR_spacer_df["host"] = IMG_VR_spacer_df.UViG.map(IMG_VR_Host_dict).fillna("none")

In [18]:
#dropping eukaryotic virsues
IMG_VR_spacer_df_no_euk = IMG_VR_spacer_df[~IMG_VR_spacer_df.host.str.contains("Eukaryota", case=False)]
IMG_VR_spacer_df_no_euk = IMG_VR_spacer_df_no_euk[IMG_VR_spacer_df_no_euk.qseqid != "GCA_002966405.1"]

GVD

In [19]:
GVD_Human_Gut_spacer_df = get_spacer_hits(rootdir + "Blast_spacers/Blast_output/ALL_Spacers_QC_blast_GVD_human_gut.csv", "GVD_Human_Gut_Virome")

## get phages from both databases:

In [20]:
#drop duplicates because there are overlaping phages between the databases and get a list of the phages

IMG_VR_GVD_Hits_df = pd.concat([IMG_VR_spacer_df_no_euk.drop(columns=["host", "UViG"]), GVD_Human_Gut_spacer_df])

IMG_VR_phages = IMG_VR_GVD_Hits_df.drop_duplicates(subset="qseqid")[IMG_VR_GVD_Hits_df.Viral_database == "IMG_VR"].qseqid
GVD_phages = IMG_VR_GVD_Hits_df.drop_duplicates(subset="qseqid")[IMG_VR_GVD_Hits_df.Viral_database == "GVD_Human_Gut_Virome"].qseqid

  IMG_VR_phages = IMG_VR_GVD_Hits_df.drop_duplicates(subset="qseqid")[IMG_VR_GVD_Hits_df.Viral_database == "IMG_VR"].qseqid
  GVD_phages = IMG_VR_GVD_Hits_df.drop_duplicates(subset="qseqid")[IMG_VR_GVD_Hits_df.Viral_database == "GVD_Human_Gut_Virome"].qseqid


In [21]:
#write header names to file
#img vr
with open(rootdir + "Blast_spacers/img_vr_phage_hit_list.txt", "w") as file:
    for phage in IMG_VR_phages:
        file.write(phage + "\n")
#gvd
with open(rootdir + "Blast_spacers/GVD_phages_hit_list.txt", "w") as file:
    for phage in GVD_phages:
        file.write(phage + "\n")

In [None]:
#img_vr pullseq
pullseq = 'pullseq -n {0}Blast_spacers/img_vr_phage_hit_list.txt -i path_to_IMGVR_database/IMGVR_all_nucleotides.fna > "{0}Blast_spacers/IMG_VR_hits.fna"'.format(rootdir)
os.system(pullseq)

#gvd pullseq
pullseq = 'pullseq -n {0}Blast_spacers/GVD_phages_hit_list.txt -i path_to_GVD_database/GVDv1_viralpopulations.fna > "{0}Blast_spacers/GVD_hits.fna"'.format(rootdir)
os.system(pullseq)

#cat IMG_VR and GVD hits
cmd = 'cat {0}Blast_spacers/IMG_VR_hits.fna {0}Blast_spacers/GVD_hits.fna > {0}Blast_spacers/IMG_VR_GVD_hits.fna'.format(rootdir)
os.system(cmd)

### Drep Phages

In [11]:
makedir(rootdir + "Blast_spacers/drep_phages/")
makedir(rootdir + "Blast_spacers/drep_phages/prelim_genomes")


In [14]:
#Creating unique fastas for each genome and a text file with the paths
with open(rootdir + "Blast_spacers/drep_phages/IMG_VR_GVD_file_path.txt", "w") as file:
    for phage in SeqIO.parse(rootdir + "Blast_spacers/IMG_VR_GVD_hits.fna", "fasta"):
        #removing pipe symbols from the file names but not the fasts headers
        phage_id = phage.id.replace("|", "_")
        with open(rootdir + "Blast_spacers/drep_phages/prelim_genomes/{0}.fna".format(phage_id), "w") as fna_file:
            fna_file.write(">{0}\n{1}\n".format(phage.id, str(phage._seq)))
        file.write(rootdir + "Blast_spacers/drep_phages/prelim_genomes/{0}.fna\n".format(phage_id))

In [13]:
makedir(rootdir + "Blast_spacers/drep_phages/drep_results_99_95/")

In [None]:
#read in results
Drep_results = pd.read_csv(rootdir + "Blast_spacers/drep_phages/drep_results_99_95/data_tables/Cdb.csv")

In [None]:
#grabbing the dreped phages
drep_phage_list = []
with open(rootdir + "/Blast_spacers/drep_phages/IMG_VR_GVD_file_path.txt", "w") as text_file:
    with open(rootdir + "IMG_VR_GVD_hits_dRepped.fna", "w") as file:
        for phage in Drep_results.drop_duplicates(subset="secondary_cluster").genome:
            for record in SeqIO.parse(rootdir + "Blast_spacers/drep_phages/prelim_genomes/{0}".format(phage), "fasta"):
                file.write(">{0}\n{1}\n".format(record.id, str(record._seq)))
                text_file.write(record.id + "\n")
                drep_phage_list.append(record.id)

compile metadata for phage hits

In [None]:
#compile metadata for phage hits
#phage hits complete systems

gvd_img_hits = pd.concat([GVD_Human_Gut_spacer_df,IMG_VR_spacer_df_no_euk])
gvd_img_hits["host"] = gvd_img_hits["host"].fillna("none")
gvd_img_hits = gvd_img_hits.rename(columns={"qseqid":"phage", "sseqid":"spacer_ID", "mismatch":"cpr_spacer_mismatch", "qlen":"phage_length", "slen":"spacer_length", "evalue":"evalue_cpr_spacer_blast", "Coverage":"Spacer_coverage", "host":"IMG_VR_predicted_host"})

gvd_img_hits_sub = gvd_img_hits[["phage", "spacer_ID", "cpr_spacer_mismatch", "phage_length", "spacer_length", "Spacer_coverage", "IMG_VR_predicted_host"]]

In [None]:
#only look at drepped phages
gvd_img_hits_sub_drep = gvd_img_hits_sub[gvd_img_hits_sub.phage.isin(drep_phage_list)]

In [None]:
#make a spacer2scaf dict and mapping
spacer2scaf = {}
for record in SeqIO.parse(rootdir + "All_qc_spacers_derepped.fna", "fasta"):
    scaffold_name = record.id.split("_spacer")[0].split("_region")[0]
    spacer2scaf[record.id] = scaffold_name

gvd_img_hits_sub_drep["cpr_spacer_scaffold"] = gvd_img_hits_sub_drep.spacer_ID.map(spacer2scaf)

In [None]:
#Creating dictionary to match SGA scaffolds to a bin name
scaf2bin_dic = {}
for file in glob.glob(rootdir + "All_genomes/*fna"):
    name = file.replace(rootdir + "All_genomes/", "").replace(".fna", "")
    for record in SeqIO.parse(open(file), "fasta"):
        scaf2bin_dic[record.description.split(" ")[0]] = name

In [None]:
# adding in CPR bin and cpr tax

#maping our assembled scaffolds to the publically avalible scaffolds
assembled2public = {}
assembled_scaffolds_df = pd.read_csv(rootdir + "metadata/merged_public_assembled_scaffolds.csv")
for key, row in assembled_scaffolds_df.iterrows():
    assembled2public[row["ccf_scaffold_name"]] = row["assembled_scaffold_name"]

gvd_img_hits_sub_drep = gvd_img_hits_sub_drep.reset_index(drop=True)
gvd_img_hits_sub_drep["public_cpr_scaffold"] = ""
for indA in gvd_img_hits_sub_drep.index:
    if gvd_img_hits_sub_drep["cpr_spacer_scaffold"][indA] in assembled2public.keys():
        gvd_img_hits_sub_drep["public_cpr_scaffold"][indA] = assembled2public[gvd_img_hits_sub_drep["cpr_spacer_scaffold"][indA]]
    else:
        gvd_img_hits_sub_drep["public_cpr_scaffold"][indA] = gvd_img_hits_sub_drep["cpr_spacer_scaffold"][indA]

#match scaffold with bin name
gvd_img_hits_sub_drep["bin"] = gvd_img_hits_sub_drep.public_cpr_scaffold.map(scaf2bin_dic)

#bin2tax mapping
bin_2_tax_df = pd.read_csv(rootdir + "metadata/bin2tax.tsv", "\t", names=["bin", "tax"])
bin2tax_dict = {}
for key, row in bin_2_tax_df.iterrows():
    bin2tax_dict[row["bin"]] = row["tax"]

gvd_img_hits_sub_drep["cpr_tax"] = gvd_img_hits_sub_drep.bin.map(bin2tax_dict)

## SGA Prophages

In [15]:
makedir(rootdir + "VIBRANT_Prophages")

In [None]:
#subsetting vibrant lysogenic results to derepped genomes
drep_clusters = pd.read_csv(rootdir + "metadata/drep_df.csv")
drep_clusters = drep_clusters.drop_duplicates(subset=["secondary_cluster"])
drep_genomes = drep_clusters.genome.str.replace(".fna", "")

non_in_bins = []
with open(rootdir + "VIBRANT_Prophages/drep_lysogenic_genes.faa", "w") as file:
    for record in SeqIO.parse(open(rootdir + "VIBRANT_Prophages/VIBRANT_ALL_genome_proteins/VIBRANT_results_ALL_genome_proteins/ALL_genome_proteins.phages_lysogenic.faa"), "fasta"):
        bin = scaf2bin_dic[record.id.split("-see_note")[0].rsplit("_", 1)[0].rsplit("_fragment", 1)[0]]
        if bin in list(drep_genomes):
            file.write(">{0}\n{1}\n".format(record.id, str(record._seq)))
        else:
            non_in_bins.append(bin)


## Circularizaing Phages predicted by spacer matching via Vibrant

In [16]:
makedir(rootdir + "VIBRANT_circularization/")

In [None]:
#read in results
vibrant_circularization_df = pd.read_csv(rootdir + "VIBRANT_circularization/VIBRANT_results_VIBRANT_circularization/VIBRANT_complete_circular_VIBRANT_circularization.tsv", sep="\t", names=["phage", "lytic_or_lysogenic", "circular"])

## Identify likely genetic code of SGA phages

In [17]:
makedir(rootdir + "determining_phage_genetic_code/")
makedir(rootdir + "determining_phage_genetic_code/drepped_genomes/")

In [None]:
#Creating unique fastas for each drepped genome
for phage in SeqIO.parse(rootdir + "IMG_VR_GVD_hits_dRepped.fna", "fasta"):
    #removing pipe symbols from the file names but not the fasts headers
    phage_id = phage.id.replace("|", "_")
    with open(rootdir + "/determining_phage_genetic_code/drepped_genomes/{0}.fna".format(phage_id), "w") as fna_file:
        fna_file.write(">{0}\n{1}\n".format(phage.id, str(phage._seq)))

In [None]:
# run prodigal varying code
for genome in glob.glob(rootdir + "/determining_phage_genetic_code/drepped_genomes/*.fna"):
    scaflen = [len(record.seq) for record in SeqIO.parse(open(genome), "fasta")][0]
    if scaflen >20000:
        for code in ["11", "25"]:
            prodigal = "prodigal -p single -m -a %s -d %s -g %s -i %s" %(genome.replace("fna", code + ".faa"), genome.replace("fna", code + ".genes"), code, genome)
            sp.call(prodigal, shell=True)
    else:
        prodigal = "prodigal -p meta -m -a %s -d %s -i %s -o %s" %(genome.replace("fna", "meta.faa"), genome.replace("fna", "meta.genes"), genome, genome.replace("fna", "meta.gbk"))
        sp.call(prodigal, shell=True)

In [None]:
#get correct phage name
mod_phage_name2phagename_dict = {}
for phage in glob.glob(rootdir + "/determining_phage_genetic_code/drepped_genomes/*fna"):
    for record in SeqIO.parse(phage, "fasta"):
        mod_phage_name2phagename_dict[(os.path.basename(phage).replace(".fna", ""))] = record.id

In [None]:
# read results back into df
coding_info = defaultdict(list)
for genome in glob.glob(rootdir + "/determining_phage_genetic_code/drepped_genomes/*fna"):
    scaflen = [len(record.seq) for record in SeqIO.parse(open(genome), "fasta")][0]
    for pred in glob.glob(genome.replace(".fna", "*genes")):
        if "meta" in pred:
            code = open(pred.replace("genes", "gbk")).readlines()[0].rsplit(";", 2)[1].split("=")[1]
        else: code = os.path.basename(pred).rsplit(".", 2)[1]
        nuclens = [len(record.seq) for record in SeqIO.parse(open(pred), "fasta")]
        protlens = [len(record.seq) for record in SeqIO.parse(open(pred.replace("genes", "faa")), "fasta")]
        coding_info["phage"].append(mod_phage_name2phagename_dict[os.path.basename(genome).replace(".fna", "")])
        coding_info["scaflen"].append(scaflen)
        coding_info["code"].append(code)
        coding_info["median_orf_len"].append(np.median(nuclens))
        coding_info["median_prot_len"].append(np.median(protlens))
        coding_info["coding_density"].append(sum(nuclens)/float(scaflen))
coding_df = pd.DataFrame(coding_info)
coding_df.head()

In [None]:
#comparing code 11 and code 25 for phages larger than 20 kb
code_11_code_25_dict = defaultdict(list)
for phage in coding_df.phage.unique():
    sub_df = coding_df[coding_df.phage == phage]

    if 11 in list(sub_df.code.astype(int)) and 25 in list(sub_df.code.astype(int)):
        code_11_code_25_dict["phage"].append(phage)

        for key, row in sub_df.iterrows():
            if int(row["code"]) == 11:
                code_11_code_25_dict["code_11_CD"].append(row["coding_density"])
            if int(row["code"]) == 25:
                code_11_code_25_dict["code_25_CD"].append(row["coding_density"])

code_11_code_25_df = pd.DataFrame(code_11_code_25_dict)
code_11_code_25_df["delta_11_to_25"] = code_11_code_25_df.code_25_CD.astype(float) - code_11_code_25_df.code_11_CD.astype(float)

#save these data
code_11_code_25_df.to_csv(rootdir + "/determining_phage_genetic_code/IMG_VR_GVD_code_11_code_25.csv", index=False)

### Likely Genetic Code of SGA Prophages

In [21]:
makedir(rootdir + "determining_phage_genetic_code/prophages")
makedir(rootdir + "determining_phage_genetic_code/prophages/prophage_genomes/")

In [None]:
#read in Vibrant prophage coordinates
vibrant_prophage_df = pd.read_csv(rootdir + "VIBRANT_Prophages/VIBRANT_ALL_database_proteins/VIBRANT_results_ALL_genome_proteins/VIBRANT_integrated_prophage_coordinates_ALL_genome_proteins.tsv", sep="\t")

#adding bin name
vibrant_prophage_df["bin"] = vibrant_prophage_df["scaffold"].map(scaf2bin_dic)


#adding SGA taxonomy
bin_2_tax_df = pd.read_csv("/groups/banfield/users/ajaffe/cpr-dpann/crossenv/genomes/metadata/bin2tax.tsv", "\t", names=["bin", "tax"])
vibrant_prophage_df = vibrant_prophage_df.merge(bin_2_tax_df, how="left", on="bin")


In [None]:
#grabbing the prophage_scaffold
for key, row in vibrant_prophage_df.iterrows():
    scaf_bin = rootdir + "All_genomes/" + row["bin"] + ".fna"
    for record in SeqIO.parse(open(scaf_bin), "fasta"):
        if record.id == row["scaffold"]:
            phage_seq = record._seq
            with open(rootdir + "determining_phage_genetic_code/prophages/prophage_genomes/" + row["scaffold"] + ".fna", "w") as file:
                file.write(">" + row['scaffold'] + "\n" + str(phage_seq) + "\n")

In [24]:
makedir(rootdir + "determining_phage_genetic_code/prophages/prodigal_predictions/")

In [None]:
# run prodigal varying code, only on scaffolds > 20 kb
for genome in glob.glob(rootdir + "determining_phage_genetic_code/prophages/prophage_genomes/*.fna"):
    output = genome.replace("prophage_genomes", "prodigal_predictions")
    scaflen = [len(record.seq) for record in SeqIO.parse(open(genome), "fasta")][0]
    if scaflen >20000:
        for code in ["11", "15", "25"]:
            prodigal = "prodigal -p single -m -a %s -d %s -g %s -i %s" %(output.replace("fna", code + ".faa"), output.replace("fna", code + ".genes"), code, genome)
            sp.call(prodigal, shell=True)

In [None]:
#grabbing the coding density of the phages in different codes (not the coding density of the entire scaffold)
coding_info = defaultdict(list)
for key, row in vibrant_prophage_df.iterrows():
    scaffold = row["scaffold"]
    phage_start = row["nucleotide start"]
    phage_end = row["nucleotide stop"]

    #grabbing the genes in different codes that are within the prophage region
    for pred in glob.glob(rootdir + "determining_phage_genetic_code/prophages/prodigal_predictions/" + scaffold + "*genes"):
        code = os.path.basename(pred).rsplit(".", 2)[1]
        nuclens = []
        for record in SeqIO.parse(open(pred), "fasta"):
            start = int(record.description.split(" # ")[1])
            end = int(record.description.split(" # ")[2])
            #check if gene is in the prophage region
            if start >= phage_start and end <= phage_end:
                nuclens.append(len(record._seq))
        coding_info["fragment"].append(row["fragment"])
        coding_info["code"].append(code)
        #computing coding_density by comparing to the vibrant predicted nucleotide length
        coding_info["coding_density"].append(sum(nuclens)/float(row["nucleotide length"]))
coding_df = pd.DataFrame(coding_info)

In [None]:
#making coding density dicts and merging
prophage_code_11_cd = {}
prophage_code_25_cd = {}
for key, row in coding_df.iterrows():
    if int(row["code"]) == 11:
        prophage_code_11_cd[row["fragment"]] = row["coding_density"]
    if int(row["code"]) == 25:
        prophage_code_25_cd[row["fragment"]] = row["coding_density"]

vibrant_prophage_df["code_11_cd"] = vibrant_prophage_df.fragment.map(prophage_code_11_cd)
vibrant_prophage_df["code_25_cd"] = vibrant_prophage_df.fragment.map(prophage_code_25_cd)

#getting the delta CD
delta_cd_dict = {}
for key, row in vibrant_prophage_df.iterrows():
    delta_cd = row["code_25_cd"] - row["code_11_cd"]
    delta_cd_dict[row["fragment"]] = delta_cd

vibrant_prophage_df["delta_11_25"] = vibrant_prophage_df.fragment.map(delta_cd_dict)

In [None]:
#save
vibrant_prophage_df.to_csv(rootdir + "determining_phage_genetic_code/Prophage_cpr_code_11_code_25_coding_densities.csv", index=False)

## Pvog annotation of phage genes in code 11 and code 25

In [25]:
makedir(rootdir + "determining_phage_genetic_code/pvog_annotations/")

In [None]:
#cat all code 11 protein calls and code 25 protein calls
cat_11 = "cat {0}determining_phage_genetic_code/drepped_genomes/*.11.faa > {0}determining_phage_genetic_code/pvog_annotations/phage_code_11.faa".format(rootdir)
cat_25 = "cat {0}determining_phage_genetic_code/drepped_genomes/*.25.faa > {0}determining_phage_genetic_code/pvog_annotations/phage_code_11.faa"
os.system(cat_11)
os.system(cat_25)

In [26]:
#hmm parse function
def parse_hmm(result_table):
    temp = {}
    count = 0
    # parse each result file using searchio
    for result in SearchIO.parse(result_table, "hmmer3-tab"):
        for item in result.hits:
            temp[count] = {"gene": item.id, "score": item.bitscore, "eval": item.evalue, "pvog_query": result.id}
            count += 1
    return(pd.DataFrame.from_dict(temp, orient="index"))

In [None]:
#parse the pvog hmm results
pvog_code_11_hmm_df = parse_hmm(rootdir + "/determining_phage_genetic_code/pvog_annotations/phage_code_11_pvog_all.csv")
pvog_code_25_hmm_df = parse_hmm(rootdir + "/determining_phage_genetic_code/pvog_annotations/phage_code_25_pvog_all.csv")

In [None]:
#getting phage cpr tax
phage_targeting_tax_df = gvd_img_hits_sub_drep[["phage", "cpr_tax"]]
phage_targeting_tax_df = phage_targeting_tax_df.rename(columns={"tax":"cpr_tax"})

In [None]:
#reading in the meta data for each pvog accession
pvog_all_tables_df = pd.read_csv("/groups/banfield/projects/multienv/cpr/2020/tm7_sr1_gracili/Env_CPR_Alex/pVOG_HMM_Profiles/pvog_all_tables.csv")
#making a smaller pvog metadata table for merging
pvog_all_tables_df_sub = pvog_all_tables_df[["pvog_query", "pvog_def"]]

In [None]:
#filtering pvog results based on evalue and dropping duplicates (code 11)
pvog_code_11_hmm_df_duplicates_dropped = pvog_code_11_hmm_df[pvog_code_11_hmm_df["eval"] < 0.00001]
pvog_code_11_hmm_df_duplicates_dropped = pvog_code_11_hmm_df_duplicates_dropped.sort_values(by=["eval"]).drop_duplicates(subset=["gene"])
pvog_code_11_hmm_df_duplicates_dropped["phage"] = pvog_code_11_hmm_df_duplicates_dropped["gene"].apply(lambda x: x.rsplit("_",1)[0])
pvog_code_11_hmm_df_duplicates_dropped = pvog_code_11_hmm_df_duplicates_dropped.merge(phage_targeting_tax_df, how="left", on="phage")
pvog_code_11_hmm_df_duplicates_dropped = pvog_code_11_hmm_df_duplicates_dropped.merge(pvog_all_tables_df_sub, how="left", on=["pvog_query"])
pvog_code_11_hmm_df_duplicates_dropped = pvog_code_11_hmm_df_duplicates_dropped.sort_values(by=["eval"]).drop_duplicates(subset=["gene"])
pvog_code_11_hmm_df_duplicates_dropped["code"] = 11
pvog_code_11_hmm_df_duplicates_dropped["gene"] = pvog_code_11_hmm_df_duplicates_dropped["gene"].apply(lambda x: x + "_code_11")

#save
pvog_code_11_hmm_df_duplicates_dropped.to_csv(rootdir + "determining_phage_genetic_code/pvog_annotations/pvog_code_11_hmm_df_duplicates_dropped.csv", index=False)

In [None]:
#filtering pvog results based on evalue and dropping duplicates (code 25)
pvog_code_25_hmm_df_duplicates_dropped = pvog_code_25_hmm_df[pvog_code_25_hmm_df["eval"] < 0.00001]
pvog_code_25_hmm_df_duplicates_dropped = pvog_code_25_hmm_df_duplicates_dropped.sort_values(by=["eval"]).drop_duplicates(subset=["gene"])
pvog_code_25_hmm_df_duplicates_dropped["phage"] = pvog_code_25_hmm_df_duplicates_dropped["gene"].apply(lambda x: x.rsplit("_",1)[0])
pvog_code_25_hmm_df_duplicates_dropped = pvog_code_25_hmm_df_duplicates_dropped.merge(phage_targeting_tax_df, how="left", on="phage")
pvog_code_25_hmm_df_duplicates_dropped = pvog_code_25_hmm_df_duplicates_dropped.merge(pvog_all_tables_df_sub, how="left", on=["pvog_query"])
pvog_code_25_hmm_df_duplicates_dropped = pvog_code_25_hmm_df_duplicates_dropped.sort_values(by=["eval"]).drop_duplicates(subset=["gene"])
pvog_code_25_hmm_df_duplicates_dropped["code"] = 25
pvog_code_25_hmm_df_duplicates_dropped["gene"] = pvog_code_25_hmm_df_duplicates_dropped["gene"].apply(lambda x: x + "_code_25")

#save
pvog_code_25_hmm_df_duplicates_dropped.to_csv(rootdir + "determining_phage_genetic_code/pvog_annotations/pvog_code_25_hmm_df_duplicates_dropped.csv", index=False)

## Finding in-frame stop codons

In [None]:
#code 25 gene files for cpr phage
phage_genes_25_dict = defaultdict(list)
for file in glob.glob(rootdir + "determining_phage_genetic_code/drepped_genomes/*.25.genes"):
    for record in SeqIO.parse(file, "fasta"):
        phage_genes_25_dict["phage_gene"].append(record.id + "_code_25")
        phage_genes_25_dict["phage"].append(str(record.id).rsplit("_",1)[0])
        phage_genes_25_dict["NA_sequence"].append(str(record._seq))
        phage_genes_25_dict["Stop_codon"].append(str(record._seq[-3:]))
        phage_genes_25_dict["orf_length"].append(len(record._seq))
        phage_genes_25_dict["start"].append(record.description.split("#")[1])
        phage_genes_25_dict["end"].append(record.description.split("#")[2])
        phage_genes_25_dict["strand"].append(record.description.split("#")[3])
        phage_genes_25_dict["code"].append(25)
phage_genes_25_df = pd.DataFrame(phage_genes_25_dict)

In [None]:
#code 11 gene files for cpr phage
phage_genes_11_dict = defaultdict(list)
for file in glob.glob(rootdir + "determining_phage_genetic_code/drepped_genomes/*.11.genes"):
    for record in SeqIO.parse(file, "fasta"):
        phage_genes_11_dict["phage_gene"].append(record.id + "_code_11")
        phage_genes_11_dict["phage"].append(str(record.id).rsplit("_",1)[0])
        phage_genes_11_dict["NA_sequence"].append(str(record._seq))
        phage_genes_11_dict["Stop_codon"].append(str(record._seq[-3:]))
        phage_genes_11_dict["orf_length"].append(len(record._seq))
        phage_genes_11_dict["start"].append(record.description.split("#")[1])
        phage_genes_11_dict["end"].append(record.description.split("#")[2])
        phage_genes_11_dict["strand"].append(record.description.split("#")[3])
        phage_genes_11_dict["code"].append(11)
phage_genes_11_df = pd.DataFrame(phage_genes_11_dict)

In [None]:
#concat tables
phage_genes_11_and_25 = pd.concat([phage_genes_25_df,phage_genes_11_df])

In [None]:
#add in cpr taxnomomy
phage_genes_11_and_25 = phage_genes_11_and_25.merge(phage_targeting_tax_df, how="left", on="phage")
phage_genes_11_and_25 = phage_genes_11_and_25.drop_duplicates(subset = "phage_gene")

In [None]:
#grab pvog results
pvog_code_11_25_hmm_df_duplicates_dropped = pd.concat([pvog_code_25_hmm_df_duplicates_dropped,pvog_code_11_hmm_df_duplicates_dropped])

gene2pvog = {}
for key,row in pvog_code_11_25_hmm_df_duplicates_dropped.iterrows():
    gene2pvog[row["gene"]] = row["pvog_def"]

#add in pvog results
phage_genes_11_and_25["pvog_def"] = phage_genes_11_and_25["phage_gene"].map(gene2pvog)
phage_genes_11_and_25 = phage_genes_11_and_25.fillna("unknown")

#save
phage_genes_11_and_25.to_csv(rootdir + "determining_phage_genetic_code/pvog_annotations/phage_genes_with_pvog_code_11_code_25.csv")

In [None]:
#finding inframe stop codons
stop_codons = ["TAG", "TAA", "TGA"]

#finding stop codons in each orf
stop_codon_localization_dict = defaultdict(list)
for key, row in phage_genes_11_and_25.iterrows():
    seq = row["NA_sequence"]
    seq_len = len(seq)
    gene = row["gene"]
    gene_start = float(row["start"])
    gene_end = float(row["end"])
    strand = int(row["strand"])
    #looking at each codon in the reading frame
    count = 0
    for start in range(0,seq_len,3):
        stop = start + 3
        codon = seq[start:stop]
        #pulling out TGA codons
        for stop_codon in stop_codons:
            if codon == stop_codon:
                codon_start = gene_start + start
                codon_end = gene_end - stop
                count += 1
                TGA_header = gene + "_" + "code_" + str(row["code"]) + "_" + stop_codon + "_" + str(count)
                #creating a "localization percentage" to quantify where on the protein the TGA is, the higher the percentage the closer to the end of the protein
                TGA_percentage_along_protein = stop/(seq_len)*100
                NA_to_end_of_protein = seq_len - stop
                codons_to_end_of_protein = NA_to_end_of_protein/3
                stop_codon_localization_dict["stop_codon_number"].append(TGA_header)
                stop_codon_localization_dict["stop_codon"].append(stop_codon)
                stop_codon_localization_dict["phage_gene"].append(gene)
                stop_codon_localization_dict["phage_gene_start"].append(gene_start)
                stop_codon_localization_dict["phage_gene_end"].append(gene_end)
                stop_codon_localization_dict["phage"].append(row["phage"])
                stop_codon_localization_dict["cpr_tax"].append(row["cpr_tax"])
                stop_codon_localization_dict["code"].append(row["code"])
                stop_codon_localization_dict["strand"].append(row["strand"])
                if strand == 1:
                    codon_start = gene_start + start
                    codon_end = gene_start + stop
                    stop_codon_localization_dict["start"].append(codon_start)
                    stop_codon_localization_dict["end"].append(codon_end)
                if strand == -1:
                    codon_start = gene_start + (seq_len - stop)
                    codon_end = gene_start + (seq_len - start)
                    stop_codon_localization_dict["start"].append(codon_start)
                    stop_codon_localization_dict["end"].append(codon_end)
stop_codon_localization_df = pd.DataFrame(stop_codon_localization_dict)

#save
stop_codon_localization_df.to_csv(rootdir + "determining_phage_genetic_code/pvog_annotations/stop_codon_localization_df.csv", index=False)

## Gene sharing network of phages

In [29]:
makedir(rootdir + "Gene_sharing_networks")
makedir(rootdir + "Gene_sharing_networks/phage_gene_sharing_network/")

In [None]:
#grab earth virome sr1 phages Paez-Espino et al. 2016
earth_virome_host_metadata_df = pd.read_csv(rootdir + "metadata/Earth_virome_host_metadata.csv")
earth_virome_host_metadata_df_sr1 = earth_virome_host_metadata_df[earth_virome_host_metadata_df.fillna("none").Host.str.contains("Sr1", case=False)]
earth_virome_host_metadata_df_sr1["TaxonID"] = earth_virome_host_metadata_df_sr1["ALL (125,842) mVCs (TaxonID_ScaffoldID)"].apply(lambda x: x.split("_", 1)[0])
earth_virome_host_metadata_df_sr1["Scaffold_ID"] = earth_virome_host_metadata_df_sr1["ALL (125,842) mVCs (TaxonID_ScaffoldID)"].apply(lambda x: x.split("_", 1)[1])

with open(rootdir + "Gene_sharing_networks/phage_gene_sharing_network/earth_virome_SR1_phages.fna", "w") as file:
    for record in SeqIO.parse("path_to_earth_virome_database/earth_virome_database/mVGs_sequences_v2.fna", "fasta"):
        for scaffold in earth_virome_host_metadata_df_sr1["Scaffold_ID"].unique():
            if scaffold in record.id:
                file.write(">{0}\n{1}\n".format(scaffold, str(record._seq)))

#prodigal code 25
cmd = "prodigal -i {0}Gene_sharing_networks/phage_gene_sharing_network/earth_virome_SR1_phages.fna -a {0}Gene_sharing_networks/phage_gene_sharing_network/earth_virome_SR1_phages.faa -g 25".format(rootdir)
os.system(cmd)

In [None]:
#extract OVD saccharibacteria phages Li et al. 2022

#metadata
ovd_host_df = pd.read_csv(rootdir + "metadata/OVD-info.csv")
saccharibacteria_phage_ovd_list = ovd_host_df[ovd_host_df.Host_taxonomy.str.contains("Saccharimonadia", case=False)]["vOTU ID"].unique()

#grab genomes
with open(rootdir + "Gene_sharing_networks/phage_gene_sharing_network/OVD-Saccharibacteria_phages.fna", "w") as file:
    for record in SeqIO.parse("path_to_OVD_database/OVD-genomes.fa", "fasta"):
        if record.id in saccharibacteria_phage_ovd_list:
            count += 1
            file.write(">{0}\n{1}\n".format(record.id, str(record._seq)))

#prodigal of OVD phages
cmd = "prodigal -i {0}Gene_sharing_networks/phage_gene_sharing_network/OVD-Saccharibacteria_phages.fna -a {0}Gene_sharing_networks/phage_gene_sharing_network/OVD-Saccharibacteria_phages.faa -g 11".format(rootdir)
os.system(cmd)


IMG VR Gracilibacteria phages and Borges et al. 2021 Absconditabacteria phages extracted manually (available in the metadata folder)

In [None]:
#grabbing proteins from phages identified in this study
#getting code 25 for gra and abs phages, code 11 for sac phages
for key, row in gvd_img_hits_sub.iterrows():
    if row['cpr_tax'] == "Saccharibacteria":
        phage = row["phage"].replace("|","_")
        cat = 'cat {0}determining_phage_genetic_code/drepped_genomes/<phage>.11.faa >> {0}Gene_sharing_networks/phage_gene_sharing_network/CPR_phage.faa'.replace("<phage>", phage).format(rootdir)
        os.system(cat)
        #meta phages
        cat = 'cat {0}determining_phage_genetic_code/drepped_genomes/<phage>.meta.faa >> /{0}Gene_sharing_networks/phage_gene_sharing_network/CPR_phage.faa'.replace("<phage>", phage).format(rootdir)
        os.system(cat)
    if (row["cpr_tax"] == "Gracilibacteria" or row["cpr_tax"] == "Absconditabacteria"):
        phage = row["phage"].replace("|","_")
        cat = 'cat {0}determining_phage_genetic_code/drepped_genomes/<phage>.25.faa >> {0}Gene_sharing_networks/phage_gene_sharing_network/CPR_phage.faa'.replace("<phage>", phage).format(rootdir)
        os.system(cat)
        #meta phages
        cat = 'cat {0}determining_phage_genetic_code/drepped_genomes/<phage>.meta.faa >> {0}Gene_sharing_networks/phage_gene_sharing_network/CPR_phage.faa'.replace("<phage>", phage).format(rootdir)
        os.system(cat)

In [None]:
#cat all phages together
cat = 'cat {0}Gene_sharing_networks/phage_gene_sharing_network/CPR_phage.faa {0}metadata/Borges_abs_phages.faa {0}metadata/img_vr_gra_phages.faa {0}VIBRANT_Prophages/drep_lysogenic_genes.faa {0}Gene_sharing_networks/phage_gene_sharing_network/OVD-Saccharibacteria_phages.faa {0}Gene_sharing_networks/phage_gene_sharing_network/earth_virome_SR1_phages.faa > {0}Gene_sharing_networks/phage_gene_sharing_network/all_phages_for_network.faa'.format(rootdir)
os.system(cat)

grabbing keywords for vcontact 2

In [None]:
phage_dict = defaultdict(list)
for record in SeqIO.parse(rootdir + "Gene_sharing_networks/phage_gene_sharing_network/all_phages_for_network.faa", "fasta"):
    phage_dict["protein_id"].append(record.name)
    phage_dict["contig_id"].append(record.name.rsplit("_",1)[0].rsplit("_fragment",1)[0])
phage_df = pd.DataFrame(phage_dict) 

In [None]:
cpr_keywords = {}
#phages identified in this study
for key, row in gvd_img_hits_sub_drep.iterrows():
    cpr_keywords[row["phage"]] = row["cpr_tax"]

In [None]:
#borges et al. keywords
borges_dict = {}
for record in SeqIO.parse(rootdir + "metdata/Borges_abs_phages.faa", "fasta"):
    borges_dict[record.name.rsplit("_",1)[0]] = "Borges_code_25_ABS"

In [None]:
#prophage key words
vibrant_prophage_keywords = {}
for record in SeqIO.parse(rootdir + "VIBRANT_Prophages/drep_lysogenic_genes.faa", "fasta"):
    vibrant_prophage_keywords[record.id.rsplit("-see",1)[0].rsplit("_",1)[0].rsplit("_fragment",1)[0]] = bin2tax_dict[scaf2bin_dic[record.id.rsplit("_",1)[0].rsplit("_fragment",1)[0]]] + "_prophage"

In [None]:
#grab ovd key words
ovd_dict = {}
for record in SeqIO.parse(rootdir + "Gene_sharing_networks/phage_gene_sharing_network/OVD-Saccharibacteria_phages.faa", "fasta"):
    ovd_dict[record.name.rsplit("_",1)[0]] = "OVD_Saccharibacteria"

In [None]:
#grab img vr gra keywords
imgvr_gra_dict = {}
for record in SeqIO.parse(rootdir + "metadata/img_vr_gra_phages.faa", "fasta"):
    imgvr_gra_dict[record.name.rsplit("_",1)[0]] = "IMG_VR_Gra"

In [None]:
#grab earth virome sr1 phages keywords
earth_virome_dict = {}
for record in SeqIO.parse(rootdir + "Gene_sharing_networks/phage_gene_sharing_network/earth_virome_SR1_phages.faa", "fasta"):
    earth_virome_dict[record.name.rsplit("_",1)[0]] = "Earth_virome_sr1"

In [None]:
cpr_all_keywords = {}
cpr_all_keywords.update(cpr_keywords)
cpr_all_keywords.update(borges_dict)
cpr_all_keywords.update(vibrant_prophage_keywords)
cpr_all_keywords.update(ovd_dict)
cpr_all_keywords.update(imgvr_gra_dict)
cpr_all_keywords.update(earth_virome_dict)

In [None]:
#map keywords
phage_df["host_tax"] = phage_df["contig_id"].map(cpr_all_keywords)

#save (for vContact 2 colors)
phage_df[["contig_id","host_tax"]].drop_duplicates(subset="contig_id").to_csv(rootdir + "Gene_sharing_networks/phage_gene_sharing_network/cytoscape_colors.csv", index=False)

#save genes to genome file
phage_df.to_csv(rootdir + "Gene_sharing_networks/phage_gene_sharing_network/gene2genome.csv", index=False)

In [30]:
makedir(rootdir + "Gene_sharing_networks/phage_gene_sharing_network/vContact2_output")

## Gene Sharing network with the CPR bacteria themselves

In [32]:
makedir(rootdir + "Gene_sharing_networks/phage_and_cpr_bacteria_gene_sharing_network/")

In [None]:
#cat the cpr phages with the cpr bacteria
cat = 'cat {0}/Gene_sharing_networks/phage_gene_sharing_network/all_phages_for_network.faa {0}ALL_genome_proteins.faa > {0}Gene_sharing_networks/phage_and_cpr_bacteria_gene_sharing_network/phages_with_cpr_bacteria.faa'.format(rootdir)
os.system(cat)

In [None]:
#make cpr bactera gene2genome
cpr_scaf_bin_tax_dict = defaultdict(list)
for record in SeqIO.parse(rootdir + "ALL_genome_proteins.faa", 'fasta'):
    cpr_scaf_bin_tax_dict["protein_id"].append(record.id)
    cpr_scaf_bin_tax_dict["contig_id"].append(scaf2bin_dic[record.id.rsplit("_", 1)[0]])
    cpr_scaf_bin_tax_dict["host_tax"].append(bin2tax_dict[scaf2bin_dic[record.id.rsplit("_", 1)[0]]] + "_cpr_genome")
cpr_scaf_bin_tax_df = pd.DataFrame(cpr_scaf_bin_tax_dict)

In [None]:
phage_cpr_gene2genome_df = pd.concat([phage_df, cpr_scaf_bin_tax_df])

#save
phage_cpr_gene2genome_df.rename(columns={"host_tax":"keywords"}).to_csv("{0}Gene_sharing_networks/phage_and_cpr_bacteria_gene_sharing_network/gene2genome.csv", index=False)

In [33]:
makedir(rootdir + "Gene_sharing_networks/phage_and_cpr_bacteria_gene_sharing_network/vContact2_output")

## Phage gene taxonomy predictions

In [28]:
makedir(rootdir + "phage_gene_taxonomy/")

In [None]:
fasta = rootdir + "Gene_sharing_networks/phage_gene_sharing_network/CPR_phage.faa"


In [None]:
# split into multiple diamond jobs (script)
with open(rootdir + "Scripts/run_diamond.sh", "w") as out:
    
    proteins = [record for record in SeqIO.parse(open(fasta), "fasta")]
    n = round(len(proteins)/9)
    
    for i, cursor in enumerate(range(0, len(proteins),n)):
        # first write out protein file
        with open(rootdir + "gene_taxonomy/block" + str(i) + ".faa", "w") as block:
            for record in proteins[cursor:cursor+n]:
                block.write(">%s\n%s\n" %(record.description, str(record.seq)))
        call = "sbatch -J dmnd" + str(i) + " --wrap 'diamond blastp -d /groups/banfield/users/clarelou/databases/UniRef100/uniref100.translated.dmnd " + \
            "-q %s -o %s --threads 48 -b8 -c1'" %(rootdir + "gene_taxonomy/block" + str(i) + ".faa", rootdir + "gene_taxonomy/matches" + str(i) + ".faa")
        out.write(call + "\n")

In [None]:
# concatenate + collect taxonomy
dmnd = pd.concat(skbio.io.read(item, format="blast+6", into=pd.DataFrame, default_columns=True) for \
            item in glob.glob(rootdir + "phage_gene_taxonomy/matches*"))
# compute coverage
faalens = {record.description.split(" ")[0]: len(record.seq) for record in SeqIO.parse(open(fasta), "fasta")}
dmnd["qlen"] = dmnd["qseqid"].map(faalens)
dmnd["qcov"] = dmnd.apply(lambda x: (x["qend"]-x["qstart"])/x["qlen"], axis=1)
# choose best hits for each
dmnd = dmnd.sort_values(["bitscore", "qcov"], ascending=[False,False]).drop_duplicates("qseqid")
# filter for min cov /eval
dmnd = dmnd[(dmnd["evalue"]<1e-6) & (dmnd["qcov"]>0.70)]
dmnd.head()

In [None]:
with open(rootdir + "phage_gene_taxonomy/uniref_search.sh", "w") as out:
    
    accs = []
    for item in dmnd["sseqid"].unique():
        if item.split("_")[-2] != "None":
            accs.append(item.split("_")[-2])
        else: accs.append(item.split("_")[-1])
    accs_unique = [item for item in set(accs) if item != "N/A"]
    
    # call maxes out at 1000, so adjust parts accordingly
    n = math.ceil(len(accs_unique)/13)
    for i in range(0, len(accs_unique),n):
        out.write("efetch -db taxonomy -mode xml -id %s > %s\n" %(",".join(accs_unique[i:i + n]), \
            rootdir + "phage_gene_taxonomy/uniref_results_" + str(int(i/n)) + ".xml"))

In [None]:
lineage_info = defaultdict(list)

for xml in glob.glob(rootdir + "phage_gene_taxonomy/*xml"):
    
    for block in BeautifulSoup(open(xml), "xml").findAll('Taxon'):
            
        lineage, phylum, species = "None", "None", "None"

        if block.find("Lineage"):
            lineage = block.find("Lineage").string

        for level in block.findAll("Taxon"):
            if level.find("Rank").string=="phylum":
                phylum = level.find("ScientificName").string
                
        for level in block.findAll("Taxon"):
            if level.find("Rank").string=="species":
                species = level.find("ScientificName").string

        lineage_info["taxid"].append(block.find("TaxId").string)
        lineage_info["lineage"].append(lineage)
        lineage_info["phylum"].append(phylum)
        lineage_info["species"].append(species)

lineage_df = pd.DataFrame(lineage_info).query("lineage!='None'").drop_duplicates(["taxid", "phylum"])
lineage_df.head()

In [None]:
# add in
dmnd["taxid"] = dmnd["sseqid"].apply(lambda x: x.split("_")[-2] if x.split("_")[-2] != 'None' else x.split("_")[-1])
dmnd["scaffold"] = dmnd["qseqid"].apply(lambda x: "_".join(x.split("_")[:-1]))
dmnd = dmnd.merge(lineage_df, how="left", on="taxid").fillna("None")
dmnd.head()

In [None]:
orf_counts = {}

for record in SeqIO.parse(fasta, "fasta"):
    scaf = "_".join(record.description.split(" ")[0].split("_")[:-1])
    if scaf not in orf_counts:
        orf_counts[scaf] = 1
    else: orf_counts[scaf] +=1
    
len(orf_counts.keys())

In [None]:
dmnd["cpr_tax"] = dmnd["qseqid"].map(bin2tax_dict)

In [None]:
tax_info = defaultdict(list)

for scaf in dmnd["scaffold"].unique():
    
    subtable = dmnd[dmnd["scaffold"]==scaf].groupby(["scaffold", "phylum"], \
        as_index=False).aggregate({"qseqid":"count"}).sort_values("qseqid", ascending=False)
    subtable["total_orfs"] = orf_counts[subtable["scaffold"].iloc[0]]
    subtable["perc_orfs"] = subtable.apply(lambda x: x["qseqid"]/x["total_orfs"], axis=1)
    sorted_table = subtable.sort_values("perc_orfs", ascending=False)
    tax_info["bin"].append(sorted_table["scaffold"].iloc[0])
    tax_info["phylum_winner"].append(sorted_table["phylum"].iloc[0])
    tax_info["phylum_winner_perc"].append(sorted_table["perc_orfs"].iloc[0])

tax_info_df = pd.DataFrame(tax_info)
tax_info_df

In [None]:
tax_info_df["cpr_tax"] = tax_info_df['bin'].map(bin2tax_dict)

In [None]:
#save
tax_info_df.to_csv(rootdir + "phage_gene_taxonomy/diamond_tax_call_results.csv", index=False)

## Host Range of CPR phages

In [None]:
#function to pull down ncbi metadata
import time
from Bio import Entrez
from bs4 import BeautifulSoup
Entrez.email = "To_fill_in"
def get_ncbi_metadata(accession, mode):
    # dont annoy ncbi
    time.sleep(0.1)
    try:
        # get genbank id
        handle = Entrez.esearch(db='nucleotide', term=accession, RetMax=1)
        result = Entrez.read(handle)
        # get metadata document
        handle = Entrez.efetch(db="nuccore", id=result["IdList"][0], retmode="xml")
        soup = BeautifulSoup(handle, "lxml")
        if mode == "def":
            return soup.find("gbseq_definition").text
        elif mode == "tax":
            return soup.find("gbseq_taxonomy").text
        else: print("Not a valid mode.")
    except: return "None"

In [35]:
makedir(rootdir + "phage_host_range/")
makedir(rootdir + "phage_host_range/blast_db/")

In [36]:
#making blast database with IMG/VR CPR predicted hosts 
#first copy
cp = "cp {0}IMG_VR_GVD_hits_dRepped.fna {0}phage_host_range/blast_db/IMG_VR_GVD_hits_dRepped.fna".format(rootdir)
os.system(cp)

#make blast db
makeblastdb = "makeblastdb -in {0}phage_host_range/blast_db/IMG_VR_GVD_hits_dRepped.fna -dbtype nucl".format(rootdir)
os.system(makeblastdb)

256

In [None]:
def get_spacer_hits(input_df):
    df = skbio.io.read(input_df, format="blast+6", into=pd.DataFrame, default_columns=False,  columns=['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'qlen', 'slen'])
    df = df[df["mismatch"] <= 1]
    df["Coverage"] = df['length']/df['qlen']
    df = df[df["Coverage"] >= 0.95]
    return df

In [37]:
makedir(rootdir + "phage_host_range/blast_output/")

Earth Virome Blast Paez-Espino et al., 2016

In [None]:
# Earth Virome Blast Paez-Espino et al. 2016
# spacers avaliable in metadata directory
blastn = 'blastn -task "blastn-short" -query {0}metdata/Earth_Virome_Project_ALL_spacers_fasta.fna -db {0}phage_host_range/blast_db/IMG_VR_GVD_hits_dRepped.fna -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen" -out {0}phage_host_range/blast_output/ALL_results_Earth_Virome_Spacers_to_IMG_VR_CPR_predicted_hosts.csv -evalue 0.003 -word_size 7 -gapopen 10 -gapextend 2 -penalty -1 -max_target_seqs 1000 -num_threads 16'.format(rootdir)
os.system(blastn)

In [None]:
#grab earth virome host tax id to host species
earth_virome_tax_dict = {}
earth_virome_tax_df = pd.read_table(rootdir + "metdata/Earth_Virome_spacer_tax.csv", sep=";")
earth_virome_tax_df["tax_id"] = earth_virome_tax_df["Query HOST (TaxID_____ ScafID_____ CRISPR algorithm_____ CRISPR Locus number_____ spacer location)"].apply(lambda x: x.split("____")[0])
earth_virome_tax_df = earth_virome_tax_df.drop_duplicates(subset="tax_id")
for key, row  in earth_virome_tax_df.iterrows():
    earth_virome_tax_dict[row["tax_id"]] = row["Host_species"]

In [None]:
#parse earth virome results
Earth_Virome_spacers_to_IMG_df = get_spacer_hits(rootdir + "phage_host_range/blast_output/ALL_results_Earth_Virome_Spacers_to_IMG_VR_CPR_predicted_hosts.csv ")
Earth_Virome_spacers_to_IMG_df["Tax_ID"] = Earth_Virome_spacers_to_IMG_df["qseqid"].apply(lambda x: x.split("_____")[0])
Earth_Virome_spacers_to_IMG_df["tax"] = Earth_Virome_spacers_to_IMG_df.Tax_ID.map(earth_virome_tax_dict)
Earth_Virome_spacers_to_IMG_df.to_csv(rootdir + 'phage_host_range/blast_output/HITS_Earth_Virome_Spacers_to_IMG_VR_CPR_predicted_hosts.csv', index=False)

Couvin et al., 2018

In [None]:
#blasting CCF spacer database to the CPR hits
blastn = "blastn -task 'blastn-short' -query {0}metadata/CCF_20190618_spacer_34.fasta -db {0}phage_host_range/blast_db/IMG_VR_GVD_hits_dRepped.fna -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen' -out {0}phage_host_range/blast_output/ALL_results_CCF_spacer_db_to_IMG_VR_CPR_predicted_hosts.csv -evalue 0.003 -word_size 7 -gapopen 10 -gapextend 2 -penalty -1 -max_target_seqs 1000 -num_threads 16".format(rootdir)
os.system(blastn)

In [None]:
#parse CCF blast results
CCF_spacer_to_IMG_df = get_spacer_hits(rootdir + "phage_host_range/blast_output/ALL_results_CCF_spacer_db_to_IMG_VR_CPR_predicted_hosts.csv")
CCF_spacer_to_IMG_df["def"] = CCF_spacer_to_IMG_df["qseqid"].apply(lambda x: get_ncbi_metadata(x, "def"))
CCF_spacer_to_IMG_df["tax"] = CCF_spacer_to_IMG_df["qseqid"].apply(lambda x: get_ncbi_metadata(x, "tax"))
CCF_spacer_to_IMG_df.to_csv(rootdir + 'phage_host_range/blast_output/HITS_CCF_spacer_db_to_IMG_VR_CPR_predicted_hosts.csv', index=False)

Shmakov et al., 2017

In [None]:
#terminal command to blast the Koonin spacer db
blastn = "blastn -task 'blastn-short' -query {0}metadata/Koonin_Supplementary_material_spacers.fna -db {0}phage_host_range/blast_db/IMG_VR_GVD_hits_dRepped.fna -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen' -out {0}phage_host_range/blast_output/ALL_results_Koonin_spacer_db_to_IMG_VR_CPR_predicted_hosts.csv -evalue 0.003 -word_size 7 -gapopen 10 -gapextend 2 -penalty -1 -max_target_seqs 1000 -num_threads 16".format(rootdir)
os.system(blastn)

In [None]:
#parse Shmakov et al spacer blasts
Koonin_spacer_to_IMG_df = get_spacer_hits(rootdir + "phage_host_range/blast_output/ALL_results_Koonin_spacer_db_to_IMG_VR_CPR_predicted_hosts.csv")
Koonin_spacer_to_IMG_df["Accession"] = Koonin_spacer_to_IMG_df["qseqid"].apply(lambda x: x.split("_")[0])
Koonin_spacer_to_IMG_df["def"] = Koonin_spacer_to_IMG_df["Accession"].apply(lambda x: get_ncbi_metadata(x, "def"))
Koonin_spacer_to_IMG_df["tax"] = Koonin_spacer_to_IMG_df["Accession"].apply(lambda x: get_ncbi_metadata(x, "tax"))
Koonin_spacer_to_IMG_df.to_csv('{0}phage_host_range/blast_output/HITS_Koonin_spacer_db_to_IMG_VR_CPR_predicted_hosts.csv', index=False)

Biswas et al., 2016

In [None]:
#terminal command to blast the CRISPRbank spacer db
blastn = "blastn -task 'blastn-short' -query {0}metadata/CRISPRBankSpacers_4_95_2555_100_bacteria_refseq_nr.fa -db {0}phage_host_range/blast_db/IMG_VR_GVD_hits_dRepped.fna -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen' -out {0}phage_host_range/blast_output/ALL_results_CRISPRbank_db_to_IMG_VR_CPR_predicted_hosts.csv -evalue 0.003 -word_size 7 -gapopen 10 -gapextend 2 -penalty -1 -max_target_seqs 1000 -num_threads 16".format(rootdir)
os.system(blastn)

In [None]:
CRISPRbank_spacer_to_IMG_df = get_spacer_hits(rootdir + "phage_host_range/blast_output/ALL_results_CRISPRbank_db_to_IMG_VR_CPR_predicted_hosts.csv")
CRISPRbank_spacer_to_IMG_df["Accession"] = CRISPRbank_spacer_to_IMG_df["qseqid"].apply(lambda x: x.split("|")[0])
CRISPRbank_spacer_to_IMG_df["def"] = CRISPRbank_spacer_to_IMG_df["Accession"].apply(lambda x: get_ncbi_metadata(x, "def"))
CRISPRbank_spacer_to_IMG_df["tax"] = CRISPRbank_spacer_to_IMG_df["Accession"].apply(lambda x: get_ncbi_metadata(x, "tax"))
CRISPRbank_spacer_to_IMG_df.to_csv(rootdir + 'phage_host_range/blast_output/HITS_CRISPRbank_db_to_IMG_VR_CPR_predicted_hosts.csv', index=False)

### Actinobacteria spacer extraction

In [39]:
makedir(rootdir + "phage_host_range/actinobacteria_spacers/")
makedir(rootdir + "phage_host_range/actinobacteria_spacers/CCF_output")

In [None]:
makedir(rootdir + "phage_host_range/actinobacteria_spacers/all_spacers/")

In [None]:
for sample in glob.glob(rootdir + 'phage_host_range/actinobacteria_spacers/CCF_output/result.json'):
        sample_name = sample.split('/')[10]
        with open(sample, 'r') as f:
            abs12_json = json.load(f)
        for abs12 in abs12_json['Sequences']:
            for abs12_a in abs12['Crisprs']:
                if abs12_a["Evidence_Level"] == 4 or abs12_a["Evidence_Level"] == 3:
                    for abs12_b in abs12_a['Regions']:
                        if "Spacer" in abs12_b["Type"]:
                            fasta = open(rootdir + 'phage_host_range/actinobacteria_spacers/all_spacers/' + sample_name + "_" + str(abs12_b["Start"]) + '_CCF_spacers.fna', "w")
                            header = (">" + abs12['Version'] + "_" + str(abs12_b["Start"]))
                            sequence = (abs12_b['Sequence'])
                            fasta.write(header + "\n" + sequence + "\n")
                            fasta.close()

blast the actino spacers

In [None]:
#blasting to IMG_VR CPR hits
blastn = "blastn -task 'blastn-short' -query {0}phage_host_range/actinobacteria_spacers/all_3_4_spacers.fna -db {0}phage_host_range/blast_db/IMG_VR_GVD_hits_dRepped.fna -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen' -out {0}phage_host_range/actinobacteria_spacers/ALL_results_Actinobacteria_db_to_IMG_VR_CPR.csv -evalue 0.003 -word_size 7 -gapopen 10 -gapextend 2 -penalty -1 -max_target_seqs 1000 -num_threads 16".format(rootdir)
os.system(blastn)

In [None]:
#parse blast results
Actinobacteria_db_1_spacers_to_IMG_df = get_spacer_hits(rootdir + "phage_host_range/actinobacteria_spacers/ALL_results_Actinobacteria_db_to_IMG_VR_CPR.csv ")
Actinobacteria_db_1_spacers_to_IMG_df["Accession"] = Actinobacteria_db_1_spacers_to_IMG_df["qseqid"].apply(lambda x: x.split("_")[0])
Actinobacteria_db_1_spacers_to_IMG_df["def"] = Actinobacteria_db_1_spacers_to_IMG_df["Accession"].apply(lambda x: get_ncbi_metadata(x, "def"))
Actinobacteria_db_1_spacers_to_IMG_df["tax"] = Actinobacteria_db_1_spacers_to_IMG_df["Accession"].apply(lambda x: get_ncbi_metadata(x, "tax"))
Actinobacteria_db_1_spacers_to_IMG_df.to_csv(rootdir + "phage_host_range/actinobacteria_spacers/HITS_results_Actinobacteria_db_to_IMG_VR_CPR.csv", index=False)