### Diversity of prophages in KASPAH

In [1]:
# jupyter setup
# (re)load IPython's autoreload extension; mode 2 reloads all modules before
# each cell is executed, so edits to scripts/utils are picked up without
# restarting the kernel
%reload_ext autoreload
%autoreload 2

# import modules
import warnings
import pandas as pd
from pathlib import Path
from Bio import Phylo
from scripts.utils import preprocessing, run_ANImm, get_phariants, run_mashtree
from scripts.utils import tree2clades, table2genbank, run_easyfig
# suppress every Python warning for the rest of the session
warnings.filterwarnings('ignore')

In [2]:
# paths
# root folder holding both input databases and the PHAGES-DB working directory
input_dir = '/home/MCB/jkoszucki/phagedb'

# input databases (located below input_dir)
prophages_dir = Path(input_dir) / 'PROPHAGES-DB-1Aug2022/prophages'
inphared_dir = Path(input_dir) / 'INPHARED-DB-1Aug2022/INPHARED-DB-1Aug2022-KLEBSIELLA'

# plain-string locations: working directory, ANImm checkout, PHROGs annotation table
phrogs_annot_table = '/home/MCB/jkoszucki/phagedb/upgraded_phrog_annot_v3.tsv'
animm_dir = '/home/MCB/jkoszucki/Code/ANImm'
work_dir = '/home/MCB/jkoszucki/phagedb/PHAGES-DB'

# params
# wGRR cut-off handed to get_phariants when clustering phages
wgrr_threshold = 0.85

# Run

In [3]:
### preprocessing of PROPHAGE-DB-1Aug2022 & INPHARED-DB-1Aug2022-KLEBSIELLA
# Integrate both input databases into one phage database folder;
# the ANImm, phariants and mashtree cells below take `phagedb_dir` as input.
phagedb_dir = Path(work_dir) / '0_phagedb'
preprocessing(
    inphared_dir,
    prophages_dir,
    phrogs_annot_table,
    phagedb_dir,
)

Records saved successfully :) 
ORFs copied successfully :)
Nothing to do in table2genbank :)
To force rerun delete /home/MCB/jkoszucki/phagedb/PHAGES-DB/0_phagedb/phages.gb
Metadata unified and copied successfully :) 
Functiona annotation input tables merged and saved :)


In [4]:
### calculate wGRR
# results directory handed to ANImm; the phariants cell reads wgrr.csv from it
animm_results_dir = Path(work_dir) / '1_ANImm'
process = run_ANImm(
    animm_dir,
    phagedb_dir,
    animm_results_dir,
)

ANImm already done! To rerun delete folder: /home/MCB/jkoszucki/phagedb/PHAGES-DB/1_ANImm


In [5]:
### phariants from wGRR
# MCL community detection

# annotation input comes from the phage database folder ...
annot_input = Path(phagedb_dir) / 'annot_input.txt'
# ... while the wGRR table and the phariants table live in the ANImm results folder
wgrr = Path(animm_results_dir) / 'wgrr.csv'
phariants = Path(animm_results_dir) / 'phariants.tsv'

phariants_df = get_phariants(
    wgrr,
    annot_input,
    phariants,
    wgrr_threshold,
)

Check! In some case I can loose singletons here!
Done! With wGRR treshold = 0.85 we have 696 phage clusters :)

In [6]:
### mashtree

# folder for the mashtree output, located below the working directory
tree_dir = Path(work_dir) / '2_mashtree'
# the value returned by run_mashtree is kept in `cmd`
cmd = run_mashtree(
    phagedb_dir,
    tree_dir,
)

Just run the command in bash. Problem with conda env AGAIN : / 

source ~/.bashrc; conda activate mashtree; mashtree.pl /home/MCB/jkoszucki/phagedb/PHAGES-DB/0_phagedb/split_records/fasta/* >> /home/MCB/jkoszucki/phagedb/PHAGES-DB/2_mashtree/tree.newick; conda activate mybase;


In [142]:
### get 'clades' based on mashtree

# Every path is derived from directories defined in earlier cells
# (`work_dir`, `animm_results_dir`), so the cell does not rely on leftover
# kernel state and survives Restart & Run All.
tree_dir = Path(work_dir, '2_mashtree')
# tree.newick is the file the mashtree command writes into tree_dir
tree_path = Path(tree_dir, 'tree.newick')
# same phariants.tsv path that is passed to get_phariants
phariants = Path(animm_results_dir, 'phariants.tsv')
# NOTE(review): clades.tsv is placed next to the tree it is derived from;
# confirm this is the intended location.
clades = Path(tree_dir, 'clades.tsv')

# number of clusters requested from tree2clades
n_clusters = 25

phage_clusters_df = tree2clades(tree_path, phariants, clades, n_clusters, kmeans_show=False)

Everything is good :) Cluster tree with kmeans method (n_cluster: 25).
Done! Phages grouped using mashtree! :)

In [143]:
### table2genbank
# from functional annotation table (nr 5 proteins) generate genbank file

# NOTE(review): `tables` and `records` are not assigned in any cell visible in
# this notebook; they must already exist in the kernel. Confirm where they
# are defined before relying on Restart & Run All.
phrog_annot_table = '/Users/januszkoszucki/MGG Dropbox/Janusz Koszucki/Databases/phrog_annot_v3_upgraded.tsv'

# annotation table in, FASTA in, GenBank file as the third path argument
annot_table = Path(tables) / 'annot_table.tsv'
phages_fasta = Path(records) / 'prophages.fasta'
genbank = Path(records) / 'prophages.gb'

annot_df = table2genbank(
    annot_table,
    phages_fasta,
    genbank,
    phrog_annot_table,
)

Nothing to be done :)
To force rerun delete /Users/januszkoszucki/Work/Results/prophage-diversity/PROPHAGES-DB-1Aug2022/2_records/prophages.gb


In [147]:
### get easyfig figures
# annotated : )

# `clades` (path to clades.tsv) is reused from the tree2clades cell above
# instead of being rebuilt here.
# NOTE(review): `tables` and `records` are not assigned in any cell visible in
# this notebook; they must already exist in the kernel. Confirm where they
# are defined before relying on Restart & Run All.
prophages = Path(tables, 'prophages.tsv')
phages_genbank_dir = Path(records, 'prophages_genbank')

# Raw string: the backslashes in front of the spaces are kept verbatim
# (shell-style escaping of the path). In a normal string literal '\ ' is an
# invalid escape sequence and triggers a warning; the resulting value is the same.
easyfig = r'/Users/januszkoszucki/MGG\ Dropbox/Janusz\ Koszucki/Work/Apps/Easyfig.py'
# columns passed to run_easyfig as `annotate_columns`
annotate_columns = ['K_locus', 'ST', 'phageID', 'genetic_localisation']
leg_name = 'structural'

results_dir, process = run_easyfig(
    work_dir,
    clades,
    prophages,
    phages_genbank_dir,
    easyfig,
    leg_name=leg_name,
    annotate_columns=annotate_columns,
)

Nothing to do :)
To force rerun delete /Users/januszkoszucki/Work/Results/prophage-diversity/PROPHAGES-DB-1Aug2022/easyfig/annotated
