### Diversity of prophages in KASPAH

In [5]:
# jupyter setup
%reload_ext autoreload
%autoreload 2

# import modules
import warnings
import pandas as pd
from pathlib import Path
from Bio import Phylo
from scripts.utils import preprocessing, coGRR, get_phariants, run_mashtree
from scripts.utils import tree2clades, table2genbank, between_phariants_easyfig
from scripts.utils import stGRR
warnings.filterwarnings('ignore')

In [19]:
# --- input databases ---
input_dir = '/home/MCB/jkoszucki/phagedb'
inphared_dir = Path(input_dir) / 'INPHARED-DB-1Aug2022/INPHARED-DB-1Aug2022-KLEBSIELLA'
prophages_dir = Path(input_dir) / 'PROPHAGES-DB-1Aug2022/prophages'

# --- working directory, ANImm checkout and PHROG annotation table ---
work_dir = '/home/MCB/jkoszucki/phagedb/PHAGES-DB'
animm_dir = '/home/MCB/jkoszucki/Code/ANImm'
phrogs_annot_table = '/home/MCB/jkoszucki/Code/phage-diversity/other/upgraded_phrog_annot_v3.tsv'
# font_path='other/arial.ttf'

# --- parameters ---
# wGRR cut-off passed to get_phariants further down
wgrr_threshold = 0.95

# Run

In [20]:
### preprocessing of PROPHAGE-DB-1Aug2022 & INPHARED-DB-1Aug2022-KLEBSIELLA
# integrate both inputs into one folder inside the working directory
phagedb_dir = Path(work_dir) / '0_phagedb'
preprocessing(
    inphared_dir,
    prophages_dir,
    phrogs_annot_table,
    phagedb_dir,
)

Preprocessing alread done! To rerun delete folder: /home/MCB/jkoszucki/phagedb/PHAGES-DB/0_phagedb


In [21]:
### calculate wGRR
# coGRR gets the ANImm location, the phage database and this output folder
animm_results_dir = Path(work_dir) / '1_cophariants'
process = coGRR(
    animm_dir,
    phagedb_dir,
    animm_results_dir,
)

ANImm already done! To rerun delete folder: /home/MCB/jkoszucki/phagedb/PHAGES-DB/1_cophariants


In [22]:
### phariants from wGRR
# MCL community detection

# output table of phage clusters
phariants = Path(animm_results_dir) / 'phariants.tsv'
# inputs: wGRR table and the annotation list from the phage database
wgrr = Path(animm_results_dir) / 'wgrr.csv'
annot_input = Path(phagedb_dir) / 'annot_input.txt'

phariants_df = get_phariants(
    wgrr,
    annot_input,
    phariants,
    wgrr_threshold,
)

Check! In some case I can loose singletons here!
Done! With wGRR treshold = 0.95 we have 964 phage clusters :)

In [17]:
### mashtree
# the local machine needs a lot of memory: the tree is first written locally and then copied
tree_dir = Path(work_dir) / '2_mashtree'
cmd = run_mashtree(
    phagedb_dir,
    tree_dir,
)

Just run the command in bash. Problem with conda env AGAIN : / 

source ~/.bashrc; conda activate mashtree; mashtree.pl /home/MCB/jkoszucki/phagedb/PHAGES-DB/0_phagedb/split_records/fasta/* >> /home/MCB/jkoszucki/phagedb/PHAGES-DB/2_mashtree/tree.newick; conda activate mybase;


In [26]:
### get 'clades' based on mashtree (for visualisation purposes)
# JPEG files have size limits, and genomes are easier to inspect once they are clustered

# number of k-means clusters requested from tree2clades
n_clusters = 40

# mashtree folder and the newick tree inside it
tree_dir = Path(work_dir) / '2_mashtree'
tree_path = tree_dir / 'tree.newick'
# phariants table and the clades table next to the tree
phariants = Path(animm_results_dir) / 'phariants.tsv'
clades = tree_dir / 'clades.tsv'

phage_clusters_df = tree2clades(
    tree_path,
    phariants,
    clades,
    n_clusters,
    kmeans_show=False,
)

Everything is good :) Cluster tree with kmeans method (n_cluster: 40).
Done! Phages grouped using mashtree! :)

In [27]:
### get easyfig figures between phariants of each representative phage from cluster
# annotated : )

# Easyfig script, resolved to an absolute path from the current working directory
easyfig = Path('other/Easyfig.py').resolve()

# tables and GenBank records consumed by the figure generation
clades = Path(tree_dir) / 'clades.tsv'
phages = Path(phagedb_dir) / 'phages.tsv'
phages_genbank_dir = Path(phagedb_dir) / 'split_records/genbank'

# NOTE(review): annotate_columns is defined here but not passed to the call below — confirm whether it should be
annotate_columns = ['K_locus', 'ST', 'phageID', 'genetic_localisation']
leg_name = 'structural'

results_dir, process = between_phariants_easyfig(
    work_dir,
    clades,
    phages,
    phages_genbank_dir,
    easyfig,
    leg_name=leg_name,
)

Fonth path for annotation of easyfig schemes: /home/MCB/jkoszucki/Code/phage-diversity/other/arial.ttf
Generating easyfig figures for 40 clusters :) 
Done! :)


In [18]:
### get structural clusters
# Run stGRR on the phage database, restricted to the categories listed below.

# categories handed to stGRR as the structural ones
struct_categories = ['head and packaging', 'connector', 'tail']
# output folder for the structural run, inside the working directory
struct_dir = Path(work_dir, '3_struc_phariants')

# Fix: the call used the undefined name `structural_categories`, which raises
# NameError on a fresh kernel; the list is defined above as `struct_categories`.
process = stGRR(animm_dir, phagedb_dir, struct_dir, struct_categories)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns  # fix: was misspelled `searborn`, which fails at import time

# Load the whole-genome (coGRR) and structural (stGRR) result tables for comparison.
# `cogrr` and `strugrr` were never defined in the notebook, so they are set here.

# same wgrr.csv that the phariants step reads from the coGRR output folder
cogrr = Path(animm_results_dir, 'wgrr.csv')
# NOTE(review): assumes stGRR writes a wgrr.csv into struct_dir the way coGRR does — TODO confirm the file name
strugrr = Path(struct_dir, 'wgrr.csv')

# keep both tables in named variables instead of discarding the parsed frames
cogrr_df = pd.read_csv(cogrr)
strugrr_df = pd.read_csv(strugrr)

# preview only; avoids dumping the full table into the notebook output
strugrr_df.head()