In [1]:
import glob
import os
import shutil
import subprocess
import pandas as pd
import time
import numpy as np
from pathlib import Path
import matplotlib
%matplotlib inline
from config import * #config file for paths of output and input files

In [None]:
# Keep only the Vaxign rows whose predicted localization is the cell wall or
# the extracellular space and that have fewer than two trans-membrane helices,
# then save that shortlist as CSV.
vaxign_dir = os.path.join(sharepath, "Bovine_isolates", "Reverse_vaccinology")
rv_CC1 = pd.read_excel(
    os.path.join(vaxign_dir, "vaxgin_result_B133_Y02_CC1.xlsx"),
    index_col="Protein Accession",
)
is_surface_exposed = rv_CC1["Localization"].isin(["Cellwall", "Extracellular"])
has_few_helices = rv_CC1["Trans-membrane helices"] < 2
rv_B133_CC1_extracellwall = rv_CC1.loc[is_surface_exposed & has_few_helices]
rv_B133_CC1_extracellwall.to_csv(
    os.path.join(vaxign_dir, "rv_B133_CC1_extracellwall.csv")
)

In [None]:
# Pick the protein sequence of every shortlisted Vaxign accession from the
# matching Prokka .faa file and collect them in a single FASTA file.
#
# The shortlist is read with index_col="Protein Accession", so the accessions
# live in the index; indexing them as a column raises KeyError. Fall back to
# the column only if the table was loaded without that index.
if rv_B133_CC1_extracellwall.index.name == "Protein Accession":
    accessions = rv_B133_CC1_extracellwall.index
else:
    accessions = rv_B133_CC1_extracellwall["Protein Accession"]

prokka_dir = os.path.join(sharepath, "Bovine_isolates", "prokka")
extracellwall_fasta = os.path.join(sharepath, "Bovine_isolates", "Reverse_vaccinology",
                                   "B133_extracellwall_protein.fasta")

# The output is opened once, in write mode, so re-running the cell rebuilds
# the file instead of appending a second copy of every sequence.
with open(extracellwall_fasta, "w") as b:
    for f in accessions:
        # The text before the first "_0" of the accession is the .faa file stem.
        l = f.split("_0")[0]
        print("picking", f, "from", l+".faa")
        result = subprocess.run(["samtools", "faidx", os.path.join(prokka_dir, l+".faa"), f], stdout=b)
        if result.returncode != 0:
            # Report the failure instead of silently leaving a gap in the FASTA.
            print("WARNING: samtools faidx failed for", f)

In [None]:
# Extract the lines flagged "Probable ANTIGEN" from the VaxiJen result file,
# echo them, and write them (one per line) to a new text file.
vaxijen_dir = os.path.join(sharepath, "Bovine_isolates", "Reverse_vaccinology")

with open(os.path.join(vaxijen_dir, "B083_extracellwall_VaxiJenresult.txt"), "r") as openfile:
    # Iterating a file already yields one line at a time, so no extra split on
    # "\n" is needed; only the trailing newline is removed.
    antigen_lines = [line.rstrip("\n") for line in openfile if "Probable ANTIGEN" in line]

# The output is opened once, in write mode, so re-running the cell regenerates
# the file instead of appending the same hits again.
with open(os.path.join(vaxijen_dir, "B083_antigen.txt"), "w") as f:
    for part in antigen_lines:
        print(part)
        f.write(part+"\n")

In [None]:
# Extract the protein records named in CC1_antigen.txt from the Prokka .faa
# file(s) matching *B003_F02_Y13_CC1.faa, using "seqkit grep" on the ID prefix.
antigen_dir = os.path.join(sharepath, "Bovine_isolates", "Reverse_vaccinology")

# The .faa lookup does not depend on the antigen line, so it is done once.
filenamepath = glob.glob(os.path.join(sharepath, "Bovine_isolates",
                                      "prokka", "*B003_F02_Y13_CC1.faa"))

# NOTE(review): append mode is kept because the generic output name may be
# shared between runs; delete the file first if a clean rebuild is wanted.
with open(os.path.join(antigen_dir, "CC1_antigen.txt"), 'r') as f, \
        open(os.path.join(antigen_dir, "vaccine_protein.fasta"), "a") as t:
    for line in f:
        fields = line.split(None, 1)
        if not fields:
            continue  # blank line
        if ">" not in fields[0]:
            # The name is taken from after ">" in the first token; without it
            # there is nothing to look up.
            print("skipping line without '>' identifier:", line.rstrip("\n"))
            continue
        name = fields[0].split(">")[1]
        for a in filenamepath:
            subprocess.run(["seqkit", "grep", "-r", "-p", "^"+name, a], stdout=t)

In [None]:
# Extract the nucleotide sequences of the vaccine candidates named in
# B083_antigen.txt from the Prokka .ffn file(s) matching *B083_F63_Y18_CC1.ffn.
candidate_dir = os.path.join(sharepath, "Bovine_isolates", "Reverse_vaccinology")

# The .ffn lookup does not depend on the antigen line, so it is done once.
filenamepath = glob.glob(os.path.join(sharepath, "Bovine_isolates",
                                      "prokka", "*B083_F63_Y18_CC1.ffn"))

# The output is opened once, in write mode, so re-running the cell rebuilds
# the FASTA instead of appending every candidate a second time.
with open(os.path.join(candidate_dir, "B083_antigen.txt"), 'r') as f, \
        open(os.path.join(candidate_dir, "B083_vaccine_candidates.fasta"), "w") as t:
    for line in f:
        fields = line.split(None, 1)
        if not fields:
            continue  # blank line
        if ">" not in fields[0]:
            # The name is taken from after ">" in the first token; without it
            # there is nothing to look up.
            print("skipping line without '>' identifier:", line.rstrip("\n"))
            continue
        name = fields[0].split(">")[1]
        for a in filenamepath:
            subprocess.run(["seqkit", "grep", "-r", "-p", "^"+name, a], stdout=t)
                 

In [None]:
# Convert the whitespace-delimited vaccine-name list into a two-column CSV:
# the first token of each line becomes column 1 and the rest of the line
# becomes column 2. No header row is written.
names_dir = os.path.join(sharepath, "Bovine_isolates", "Reverse_vaccinology")

records = []
with open(os.path.join(names_dir, "B083_vaccine_names.txt")) as f:
    for line in f:
        fields = line.split(None, 1)
        if not fields:
            continue  # blank line
        name1 = fields[0]
        # A line with a single token gets an empty second column instead of
        # raising IndexError.
        name2 = fields[1].rstrip("\n") if len(fields) > 1 else ""
        records.append((name1, name2))

# to_csv quotes any field containing a comma, so descriptions with commas no
# longer spill into extra columns; writing once in the default "w" mode also
# keeps the file from growing when the cell is re-run.
pd.DataFrame(records, columns=["name1", "name2"]).to_csv(
    os.path.join(names_dir, "B083_vaccine_names.csv"), header=False, index=False
)

In [None]:
# Run ARIBA on the bovine reads against the vaccine-candidate FASTA file,
# summarise the per-isolate reports and prepare the Phandango input files.
ariba_ref_dir = os.path.join(sharepath, "Bovine_isolates", "Reverse_vaccinology")
tsv = os.path.join(ariba_ref_dir, "B133_vaccine_report.tsv")
fasta = os.path.join(ariba_ref_dir, "B133_vaccine_candidates.fasta")
ref = os.path.join(ariba_ref_dir, "B133_vaccine_report")
output = os.path.join(workpath, "ARIBA_B133_vaccine")
namelist = pd.read_csv(os.path.join(paperpath, "data", "sbhmlst.csv"), index_col="Assembly")
subprocess.run(["ariba", "prepareref", "-f", fasta, "-m", tsv, ref])

# Each "ariba run" writes to <output>/<labname>; make sure the parent exists.
os.makedirs(output, exist_ok=True)

count = 0
r1_files = glob.glob(os.path.join(readpath, "reads", "Bovine_processed", "*R1*"))
for r1 in r1_files:
    print(time.ctime())
    start = time.time()
    count += 1
    read_dir, r1_name = os.path.split(r1)
    # Swap R1 -> R2 in the file name only, so an "R1" appearing in a directory
    # name of the path is left untouched.
    r2 = os.path.join(read_dir, r1_name.replace("R1", "R2"))
    # The second "_"-separated field of the read file name is looked up in the
    # "Assembly" index to get the isolate's "newname".
    # NOTE(review): the original took path component 5 (r1.split("/")[5]);
    # this assumes that component was the file name - confirm.
    labname = namelist.loc[r1_name.split("_")[1], "newname"]
    print(count, "mapping", labname)
    subprocess.run(["ariba", "run", ref, r1, r2, os.path.join(output, labname)])
    end = time.time()
    print((end-start)/60, "mins")

# Summarise all per-isolate reports. Paths are built from `output` rather than
# a hard-coded absolute home directory so the cell works wherever workpath is.
summary_prefix = os.path.join(output, "B133_vaccine")
report_files = sorted(glob.glob(os.path.join(output, "*", "report.tsv")))
subprocess.run(["ariba", "summary", summary_prefix] + report_files)

outfile = os.path.join(workpath, "ARIBA_B133_vaccine")
a_s = pd.read_csv(os.path.join(outfile, "B133_vaccine.csv"))
# "name" holds the path of each report.tsv; keep only the name of the directory
# that contains it, independent of how deep workpath is.
a_s["name"] = a_s["name"].map(lambda report: os.path.basename(os.path.dirname(report)))
a_s = a_s.replace("yes", "1")
a_s = a_s.replace("no", "0")
# Drop every column whose name matches "colour".
a_s = a_s.drop(columns=a_s.filter(regex='colour').columns)
a_s.to_csv(os.path.join(outfile, "out_vaccine.phandango.csv"), index=False)


# Removing "/report.tsv" from the ariba summary output tree.
# NOTE(review): only the "/report.tsv" suffix is stripped here, whereas the CSV
# above keeps just the directory name - confirm the tree labels match the CSV.
tree = os.path.join(outfile, "B133_vaccine.phandango.tre")
new_tree = os.path.join(outfile, "out_vaccine.phandango.tree")
with open(tree) as f:
    data = f.read()

with open(new_tree, "w") as f:
    f.write(data.replace("/report.tsv", ""))

In [None]:
# Select the proteins with an identical value in every row of the ARIBA
# summary: a numeric column whose standard deviation is 0 is constant, and
# every numeric column with a different standard deviation is dropped.
# NOTE(review): a column that is 0 in every row also has std 0 and is kept;
# filter on the value itself if only all-1 (present everywhere) is wanted.
B003_vaccine = pd.read_csv(os.path.join(rvpath, "B003_majorCC_vaccine.phandango.csv"))
# numeric_only=True keeps string columns (e.g. the isolate name) out of the
# computation; without it pandas >= 2.0 raises TypeError on such columns.
column_std = B003_vaccine.std(numeric_only=True)
varying_columns = column_std[column_std != 0].index
B003_selected = B003_vaccine.drop(columns=varying_columns)
B003_selected.to_csv(os.path.join(rvpath, "B003_selected_vaccine.csv"))

In [None]:
# Pick the protein sequences of the selected vaccine candidates, using the
# column names of the selected-vaccine CSV (from the third column onward) as
# sequence IDs.
B083_vac = pd.read_csv(os.path.join(rvpath, "B083_selected_vaccine.csv"))
faa_dir = os.path.join(sharepath, "Bovine_isolates", "prokka")

# The output is opened once, in write mode, so re-running the cell rebuilds
# the FASTA instead of appending every sequence a second time.
with open(os.path.join(rvpath, "B083_selected_vaccine_protein.fasta"), "w") as c:
    for b in B083_vac.columns[2:]:
        # The text before the first "_0" of the column name is the .faa stem.
        l = b.split("_0")[0]
        print("picking", b, "from", l+".faa")
        result = subprocess.run(["samtools", "faidx", os.path.join(faa_dir, l+".faa"), b], stdout=c)
        if result.returncode != 0:
            # Report the failure instead of silently leaving a gap in the FASTA.
            print("WARNING: samtools faidx failed for", b)
    

In [None]:
# Pick the nucleotide sequences of the selected vaccine candidates from the
# Prokka .ffn files, using the column names of the selected-vaccine CSV (from
# the third column onward) as ID prefixes for "seqkit grep".
B083_vac = pd.read_csv(os.path.join(rvpath, "B083_selected_vaccine.csv"))

# The output is opened once, in write mode, so re-running the cell rebuilds
# the FASTA instead of appending every sequence a second time.
with open(os.path.join(rvpath, "B083_selected_vaccine_nuc.fasta"), "w") as t:
    for b in B083_vac.columns[2:]:
        # The text before the first "_0" of the column name is the .ffn stem.
        l = b.split("_0")[0]
        s = os.path.join(sharepath, "Bovine_isolates", "prokka", l+".ffn")
        print("picking", b, "from", l+".ffn")
        subprocess.run(["seqkit", "grep", "-r", "-p", "^"+b, s], stdout=t)

In [None]:
# Run blastp with the shortlisted proteins as query against every Prokka .faa
# file, writing one tabular (outfmt 6) result file per .faa file.
protein = glob.glob(os.path.join(sharepath, "Bovine_isolates", "prokka", "*.faa"))
query = os.path.join(sharepath, "Bovine_isolates", "Reverse_vaccinology", "Shortlist_proteins.fasta")
output = os.path.join(sharepath, "Bovine_isolates", "Reverse_vaccinology", "protein_conservation")

blast_tsv_dir = os.path.join(output, "TSV_files")
os.makedirs(blast_tsv_dir, exist_ok=True)

for p in protein:
    # File name up to its first "." - taken from the base name so the result
    # does not depend on how many directories sharepath contains.
    # NOTE(review): the original used p.split("/")[5]; this assumes that path
    # component was the file name - confirm.
    filename = os.path.basename(p).split(".")[0]
    # "with" closes the result file even if blastp cannot be started.
    with open(os.path.join(blast_tsv_dir, filename+"_blastp.tsv"), "w") as f:
        subprocess.run(["blastp", "-query", query, "-subject", p, "-outfmt", "6"], stdout=f)
    

In [None]:
# Select the blastp hits with pct_identity between 90 and 100 (inclusive) from
# every per-isolate result file and combine them into one TSV.

output_file = glob.glob(os.path.join(output, "TSV_files", "*.tsv"))
blast_columns = ["query_id", "subject_id", "pct_identity", "aln_length", "n_of_mismatches",
                 "gap_openings", "q_start", "q_end", "s_start", "s_end", "e_value", "bit_score"]

filtered_hits = []
for o in output_file:
    # A 0-byte result file (blastp reported nothing) makes read_csv raise
    # EmptyDataError, so such files are skipped.
    if os.path.getsize(o) == 0:
        continue
    hits = pd.read_csv(o, sep="\t", header=None, names=blast_columns)
    hits = hits[hits['pct_identity'].between(90, 100)]
    if not hits.empty:
        filtered_hits.append(hits)

if filtered_hits:
    df = pd.concat(filtered_hits, ignore_index=True)
else:
    df = pd.DataFrame(columns=blast_columns)

# Written once (default "w" mode): appending inside the loop repeated the
# header row for every input file and duplicated all hits on each re-run.
# The index column is still written so the file keeps its leading unnamed
# column.
df.to_csv(os.path.join(output, "all_proteins_blast90.tsv"), sep="\t")

In [None]:
#extracting protein sequences from faa file of prokka using locus_tag from csv file
def protein(name):
    """Append the Prokka protein sequences that matched query `name` to a FASTA file.

    Reads all_proteins_blast90.tsv from the module-level `output` directory,
    takes the `subject_id` values of the rows whose `query_id` equals `name`,
    and extracts each of them with "samtools faidx" from the .faa file whose
    stem is the text before the first "_0" of the subject ID. The sequences
    are appended to <name>.fasta in the protein_conservation/Alignment folder.

    Parameters
    ----------
    name : str
        Query ID to select in the `query_id` column; also the output file stem.

    Notes
    -----
    The file is opened in append mode, so calling the function twice for the
    same `name` adds the sequences twice.
    """
    df = pd.read_csv(os.path.join(output, "all_proteins_blast90.tsv"), sep="\t")
    # The table can hold several rows for the same query/subject pair; each
    # subject is extracted only once so the FASTA has no duplicate records.
    subject_ids = df.loc[df["query_id"] == name, "subject_id"].drop_duplicates()
    alignment_fasta = os.path.join(sharepath, "Bovine_isolates", "Reverse_vaccinology",
                                   "protein_conservation", "Alignment", name+".fasta")
    with open(alignment_fasta, "a") as c:
        for b in subject_ids:
            l = b.split("_0")[0]
            #print("picking", b, "from", l+".faa")
            subprocess.run(["samtools", "faidx", os.path.join(sharepath, "Bovine_isolates", "prokka", l+".faa"), b], stdout=c)
    