In [None]:
import glob
import os
import shutil
import subprocess
import pandas as pd
import time
import numpy as np
from pathlib import Path
import matplotlib
%matplotlib inline
from config import * #config file for paths of output and input files

In [None]:
# Assembly of multiple isolate reads using shovill.
# For every R1 file under <readpath>/reads/Bovine_processed, the mate file is located by
# swapping "_R1." for "_R2." in the file name, and shovill writes the assembly to
# <workpath>/Bovine_isolates/Assembly/<labname>.
r1_files = glob.glob(os.path.join(readpath, "reads", "Bovine_processed", "*_R1.fastq*"))
count = 0
for count, r1 in enumerate(r1_files, start=1):
    start = time.time()
    # Work on the file name only, so the result does not depend on how deep `readpath`
    # is and a "_R1." occurring in a directory name is never rewritten.
    read_dir, r1_name = os.path.split(r1)
    r2 = os.path.join(read_dir, r1_name.replace("_R1.", "_R2."))
    # The lab name is the second "_"-separated field of the read file name.
    labname = r1_name.split("_")[1]
    if not os.path.exists(r2):
        print(count, "skipping", labname, "- missing mate file", r2)
        continue
    print(count, "assembling", labname)
    result = subprocess.run(["shovill", "--outdir", os.path.join(workpath, "Bovine_isolates", "Assembly", labname), "--R1", r1, "--R2", r2])
    if result.returncode != 0:
        # Report failures instead of silently moving on to the next isolate.
        print("WARNING: shovill exited with code", result.returncode, "for", labname)
    end = time.time()
    print((end - start) / 60, "mins")

In [22]:
# Analysing the quality of contigs using QUAST.
# One QUAST report directory is written per assembly, named after the assembly's folder.
outdir = os.path.join(workpath, "Bovine_QUAST")
count = 0
bovine_contigs = glob.glob(os.path.join(workpath, "Bovine_isolates", "Assembly", "*", "*contigs.fa"))
for count, b in enumerate(bovine_contigs, start=1):
    # The isolate name is the directory that holds the contigs file; taking it from the
    # path structure avoids a fixed "/"-index that breaks when `workpath` has a different depth.
    quastname = os.path.basename(os.path.dirname(b))
    print(count, "processing quality of:", quastname)
    result = subprocess.run(["quast.py", "-o", os.path.join(outdir, quastname), b])
    if result.returncode != 0:
        print("WARNING: quast.py exited with code", result.returncode, "for", quastname)
    

In [None]:
# MLST identification of isolates using the mlst tool (T. Seemann).
# The shell glob passes the contigs.fa of every assembly folder to `mlst`; the --csv output
# is redirected into Bovine_mlst.csv.
# NOTE(review): both paths are hard-coded absolute paths rather than built from the config
# variables used by the Python cells, and the redirect requires the MLST/ folder to exist.
!mlst --csv   /home/jabin/Documents/Bovine_isolates/Assembly/*/contigs.fa > /home/jabin/Documents/Bovine_isolates/MLST/Bovine_mlst.csv

In [None]:
# Annotating assembly contigs using prokka.
# Each assembly is annotated under the name found in the "New_name" column of
# bovinemlst_rename.csv (indexed by "Isolate"); that name is used as both --prefix and --locustag.
# Relies on `bovine_contigs` created by the QUAST cell.
# NOTE(review): --outdir is relative to the current working directory, while other cells
# read prokka output from <sharepath>/Bovine_isolates/prokka - confirm they are the same place.
bovine_name = pd.read_csv(os.path.join(sharepath, "Bovine_isolates", "MLST", "bovinemlst_rename.csv"), index_col="Isolate")
count = 0
for count, b in enumerate(bovine_contigs, start=1):
    print(time.ctime())
    start = time.time()
    # Isolate = name of the folder holding the contigs file (independent of path depth).
    isolate = os.path.basename(os.path.dirname(b))
    prokkaname = bovine_name.loc[isolate, "New_name"]
    print(count, "annotating", prokkaname)
    result = subprocess.run(["prokka", "--outdir", "Bovine_isolates/prokka/", "--prefix", prokkaname, "--locustag", prokkaname, "--force", b])
    if result.returncode != 0:
        print("WARNING: prokka exited with code", result.returncode, "for", prokkaname)
    end = time.time()
    print((end - start) / 60, "mins")

In [None]:
# Spa type identification using the spa_typing tool.
# get_spa_type.py is run from inside its own folder; the notebook's working directory is
# switched to `homedir` afterwards, even when the loop fails part-way.
spa_tool_dir = "/home/jabin/spa_typing/"  # renamed from `dir`, which shadowed the builtin
homedir = "/home/jabin/research/notebooks/"

# Not used by this cell; kept in case other cells rely on them.
r = "/home/jabin/Documents/spa_type/sparepeats.fasta"
o = "/home/jabin/Documents/spa_type/spatypes.txt"

os.chdir(spa_tool_dir)
try:
    bovine_contigs = glob.glob(os.path.join(sharepath, "Bovine_isolates", "Assembly", "*", "*contigs.fa"))
    bovine_name = pd.read_csv(os.path.join(paperpath, "data", "bovinenewnamemlst.csv"), index_col="Assembly")
    spa_results_path = os.path.join(sharepath, "Bovine_isolates", "spa", "spa_results.txt")

    lines = []
    for b in bovine_contigs:
        # Assembly name = folder holding the contigs file (independent of path depth).
        assembly = os.path.basename(os.path.dirname(b))
        labname = bovine_name.loc[assembly, "newname"]
        print("spatyping", labname)
        data = subprocess.run(["python2", "get_spa_type.py", "-f", b], stdout=subprocess.PIPE, encoding='utf-8')
        # Exactly two lines are expected on stdout: a header line and one result line.
        output = data.stdout.splitlines()
        if len(output) != 2:
            print("WARNING: unexpected get_spa_type.py output for", labname, "- skipped")
            continue
        header, result = output
        if not lines:
            # The header is written once, taken from the first successful run.
            lines.append(header + '\n')
        lines.append(labname + '\t' + result + '\n')
        # Rewritten after every isolate so partial results survive an interrupted run.
        with open(spa_results_path, "w") as f:
            f.writelines(lines)
finally:
    os.chdir(homedir)

In [None]:
# Pangenome analysis of all bovine genomes using Roary on the prokka GFF files.
# Roary options: -f output directory, -e --mafft core-gene alignment built with MAFFT,
# -r R plots, -p 4 threads, -cd 100 a gene must be present in 100% of isolates to be core.
# NOTE(review): input and output locations are hard-coded absolute paths.
!roary -f /home/jabin/Documents/Bovine_isolates/roary_bovine_STname/ -e --mafft -r -p 4 -cd 100 /media/sf_dataJabin/Bovine_isolates/prokka/*.gff

In [None]:
#running snippy on reads using Staph_ST1 as reference and running core on the results.
outdir="/home/jabin/Documents/Bovine_isolates/snippy_CC1" #output file
os.mkdir(outdir)
ref=os.path.join(sharepath, "references", "Staph_ST1.gbk") #reference path
for m in mlst_CC1["Assembly"]:
    CC1_r1=glob.glob(os.path.join(readpath, "reads","*", "processed_"+m+"_R1.fastq.gz")) #read path
    for r1 in CC1_r1:
        r2=r1.replace("R1", "R2")
        labname=mlst_data["newname"][r1.split("/")[5].split("_")[1]]
        subprocess.run(["snippy", "--outdir", os.path.join(outdir,labname), "--ref", ref, "--R1", r1, "--R2", r2])


!snippy-core --prefix core --ref /media/sf_dataJabin/references/Staph_ST1.gbk /home/jabin/Documents/Bovine_isolates/snippy_CC1/*

In [None]:
# Creating the input file for GenAlEx AMOVA from the snippy core SNP table.
# Cells equal to A/C/G/T are recoded as 1/2/3/4, the recoded table is saved, and the saved
# file is read back without a header row so that the transposed copy keeps the column names.
snippy_cc1_dir = os.path.join(readpath, "Bovine_isolates", "snippy_results", "snippy_CC1")
amova_path = os.path.join(snippy_cc1_dir, "core_snippyCC1_amova.tab")
amova_trans_path = os.path.join(snippy_cc1_dir, "core_snippyCC1_amova_trans.tab")

snp = pd.read_csv(os.path.join(snippy_cc1_dir, "core_snippyCC1.tab"), sep="\t")

snp_change = snp
for base, code in (("A", "1"), ("C", "2"), ("G", "3"), ("T", "4")):
    snp_change = snp_change.replace(base, code)
snp_change.to_csv(amova_path, sep="\t", index=False)

amova_transposed = pd.read_csv(amova_path, header=None, sep="\t").T
amova_transposed.to_csv(amova_trans_path, sep="\t", header=False, index=False)

In [None]:
# Running Roary on the bovine isolates of one farm (GFF files in Onefarm_bovine_gff).
# Roary options: -f output directory, -e --mafft core-gene alignment built with MAFFT,
# -r R plots, -p 4 threads, -cd 100 a gene must be present in 100% of isolates to be core.
!roary -f /home/jabin/Documents/Bovine_isolates/roary_bovine_onefarm_STname -e --mafft -r -p 4 -cd 100 /media/sf_PhD_data/Comparative/gff_files/Onefarm_bovine_gff/*.gff

In [None]:
# Running Roary on all CC1 bovine isolates (GFF files in CC1_bovine_gff).
# Roary options: -f output directory, -e --mafft core-gene alignment built with MAFFT,
# -r R plots, -p 4 threads, -cd 100 a gene must be present in 100% of isolates to be core.
!roary -f /home/jabin/Documents/Bovine_isolates/roary_bovine_CC1all_STname -e --mafft -r -p 4 -cd 100 /media/sf_PhD_data/Comparative/gff_files/CC1_bovine_gff/*.gff

In [None]:
# Running Roary on the CC1 isolates of one farm (GFF files in CC1_onefarm_bovine_gff).
# Roary options: -f output directory, -e --mafft core-gene alignment built with MAFFT,
# -r R plots, -p 4 threads, -cd 100 a gene must be present in 100% of isolates to be core.
!roary -f /home/jabin/Documents/Bovine_isolates/roary_bovine_CC1_STname -e --mafft -r -p 4 -cd 100 /media/sf_PhD_data/Comparative/gff_files/CC1_onefarm_bovine_gff/*.gff

In [None]:
# Running Roary on all CC97 bovine isolates (GFF files in CC97_bovine_gff).
# Roary options: -f output directory, -e --mafft core-gene alignment built with MAFFT,
# -r R plots, -p 4 threads, -cd 100 a gene must be present in 100% of isolates to be core.
!roary -f /home/jabin/Documents/Bovine_isolates/roary_bovine_CC97all_STname -e --mafft -r -p 4 -cd 100 /media/sf_PhD_data/Comparative/gff_files/CC97_bovine_gff/*.gff

In [None]:
# Running Roary on all CC151 bovine isolates (GFF files in CC151_bovine_gff).
# Roary options: -f output directory, -e --mafft core-gene alignment built with MAFFT,
# -r R plots, -p 4 threads, -cd 100 a gene must be present in 100% of isolates to be core.
!roary -f /home/jabin/Documents/Bovine_isolates/roary_bovine_CC151all_STname -e --mafft -r -p 4 -cd 100 /media/sf_PhD_data/Comparative/gff_files/CC151_bovine_gff/*.gff

In [None]:
# Running Roary on all CC8 bovine isolates (GFF files in CC8_bovine_gff).
# Roary options: -f output directory, -e --mafft core-gene alignment built with MAFFT,
# -r R plots, -p 4 threads, -cd 100 a gene must be present in 100% of isolates to be core.
!roary -f /home/jabin/Documents/Bovine_isolates/roary_bovine_CC8all_STname -e --mafft -r -p 4 -cd 100 /media/sf_PhD_data/Comparative/gff_files/CC8_bovine_gff/*.gff

In [None]:
# Running AgrVATE on bovine isolates, one assembly at a time.
# subprocess.run waits for each job to finish, so later cells only see complete results and
# the machine is not flooded with one agrvate process per isolate (the previous Popen calls
# started every job at once and never waited for them).
bovine_isolates = glob.glob(os.path.join(workpath, "contigs_STname", "*.fa"))
# `outdir` is where later cells look for the AgrVATE results.
# NOTE(review): agrvate is started without an explicit working directory or output option,
# so presumably it writes into the notebook's current directory - confirm results end up in `outdir`.
outdir = "/home/jabin/Documents/Bovine_isolates/AgrVATE_results"
for b in bovine_isolates:
    result = subprocess.run(["agrvate", "-i", b])
    if result.returncode != 0:
        print("WARNING: agrvate exited with code", result.returncode, "for", b)

In [None]:
# Collect every .fna file located one folder below the AgrVATE results directory.
agr_fna_pattern = os.path.join(outdir, "*", "*.fna")
agr_operon = glob.glob(agr_fna_pattern)

In [None]:
# Sanity check: number of .fna paths currently held in agr_operon.
len(agr_operon)

In [None]:
# Extracting protein sequences of the pangenome genes from prokka output using samtools.
# Every header of Roary's pan_genome_reference.fa starts with a locus tag; the matching
# protein is pulled with `samtools faidx` from <sharepath>/Bovine_isolates/prokka/<isolate>.faa.
pan_reference = "/home/jabin/Documents/Bovine_isolates/roary_bovine_STname/pan_genome_reference.fa"
protein_fasta = os.path.join("/home", "jabin", "Documents", "Bovine_isolates", "bovine_pan_genome_protein.fasta")

with open(pan_reference) as pg:
    # Keep only FASTA header lines, without the leading ">" and the trailing newline.
    headers = [ln[1:].strip() for ln in pg if ln.startswith(">")]

# Opened once in write mode: re-running the cell regenerates the file instead of appending
# a second copy of every sequence to it.
with open(protein_fasta, "w") as f:
    for header in headers:
        if not header:
            continue
        # First whitespace-separated token is the locus tag (last item if comma-separated).
        locus_tag = header.split(" ")[0].split(",")[-1]
        # The .faa file is named after everything before the final "_<number>" of the locus
        # tag; splitting from the right also works for isolate names that contain "_0".
        isolate = locus_tag.rsplit("_", 1)[0]
        result = subprocess.run(["samtools", "faidx", os.path.join(sharepath, "Bovine_isolates", "prokka", isolate + ".faa"), locus_tag], stdout=f)
        if result.returncode != 0:
            print("WARNING: samtools faidx failed for", locus_tag)

In [None]:
# Inspect the full list of paths currently held in agr_operon.
agr_operon

In [None]:
# Extracting agr_operon sequences from the AgrVATE fasta files and renaming each header to
# "<isolate>_<agr group>" using seqkit; all records are collected in agr_concat.fasta.
mlstdata = pd.read_csv(os.path.join(paperpath, "data", "sbhmlst.csv"), index_col="newname")  # loaded but not used in this cell
file_dir = "/home/jabin/Documents/Bovine_isolates/AgrVATE_results/"
agr_operon = glob.glob(os.path.join(file_dir, "*/*.fna*"))
agr_csv = pd.read_csv(os.path.join(file_dir, "Bovine_agr.csv"), sep="\t", index_col="filename")
# Write mode: re-running the cell rebuilds the file instead of appending duplicate records.
with open(os.path.join(file_dir, "agr_concat.fasta"), "w") as f:
    for a in agr_operon:
        # Results folder name up to the first "-" is the isolate's key in Bovine_agr.csv.
        filename = os.path.basename(os.path.dirname(a)).split("-")[0]
        agr_name = agr_csv.loc[filename, "agr_group"]
        comb_name = filename + "_" + agr_name
        # seqkit grep | seqkit replace, run to completion before the next file starts so
        # records from different isolates cannot interleave in the shared output file.
        ex = subprocess.Popen(["seqkit", "grep", "-r", "-p", "^contig", a], stdout=subprocess.PIPE)
        try:
            renamed = subprocess.run(["seqkit", "replace", "-p", ".+", "-r", comb_name], stdin=ex.stdout, stdout=f)
        finally:
            # Release the pipe and reap the first process.
            ex.stdout.close()
            ex.wait()
        if ex.returncode != 0 or renamed.returncode != 0:
            print("WARNING: seqkit failed for", a)