# Compare GenBank phages to prophages

The GenBank data comes from [the Cook et al. paper](https://www.biorxiv.org/content/10.1101/2021.05.01.442102v1), in particular its [supplemental material](https://www.biorxiv.org/content/10.1101/2021.05.01.442102v1.supplementary-material).

```
Cook R, Brown N, Redgwell T, Rihtman B, Barnes M, Stekel DJ, Clokie M, Hobman J, Jones M, Millard AD. 2021. INfrastructure for a PHAge REference Database: Identification of large-scale biases in the current collection of phage genomes. bioRxiv.
```

In [1]:
# A lot of this is not used, but we import it so we have it later!
import os
import sys
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from matplotlib.ticker import FuncFormatter

import pandas as pd
import seaborn as sns
import numpy as np

import math
import re
import string

from PhiSpyAnalysis import theils_u, DateConverter, printmd
from PhiSpyAnalysis import read_phages, read_gtdb, read_checkv, read_base_pp, read_categories, read_metadata, read_transposons

from scipy.stats import pearsonr, f_oneway
from sklearn.linear_model import LinearRegression
from sklearn import decomposition
from sklearn.ensemble import RandomForestClassifier

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd, tukeyhsd, MultiComparison
from statsmodels.multivariate.manova import MANOVA


In [4]:
# Build the set of distinct host names listed in the Cook et al. host file:
# one name per line, surrounding whitespace trimmed, blank lines ignored.
hosts = set()
with open('../data/Cook_unique_hosts.txt', 'r') as host_file:
    hosts.update(name for name in map(str.strip, host_file) if name)
hosts

{'Acaryochloris',
 'Acholeplasma',
 'Achromobacter',
 'Acidianus',
 'Acidithiobacillus',
 'Acidovorax',
 'Acinetobacter',
 'Actinomyces',
 'Actinoplanes',
 'Aeribacillus',
 'Aerococcus',
 'Aeromonas',
 'Aeropyrum',
 'Aggregatibacter',
 'Agrobacterium',
 'Alteromonas',
 'Aminobacter',
 'Anabaena',
 'Anoxybacillus',
 'Aphanizomenon',
 'Aquamicrobium',
 'Arthrobacter',
 'Arthronema',
 'Aurantimonas',
 'Azobacteroides',
 'Azospirillum',
 'Bacillus',
 'Bacilus',
 'Bacteroides',
 'Bdellovibrio',
 'Bifidobacterium',
 'Bordetella',
 'Bradyrhizobium',
 'Brevibacillus',
 'Brevibacterium',
 'Brevundimonas',
 'Brochothrix',
 'Brucella',
 'Burkholderia',
 'Buttiauxella',
 'Butyrivibrio',
 'Caldibacillus',
 'Campylobacter',
 'Caulobacter',
 'Celeribacter',
 'Cellulophaga',
 'Chlamydia',
 'Citrobacter',
 'Citromicrobium',
 'Clavibacter',
 'Clostridioides',
 'Clostridium',
 'Colwellia',
 'Corynebacterium',
 'Croceibacter',
 'Cronobacter',
 'Curvibacter',
 'Cutibacterium',
 'Cylindrospermopsis',
 'Delf

In [5]:
# Load the GTDB metadata table with the project helper `read_gtdb`
# (imported from PhiSpyAnalysis in the import cell) and display it.
# The rendered frame carries an `accession` column plus taxonomy columns
# (domain, phylum, class, order, family, genus, species); the `genus` and
# `phylum` columns are what the host names are matched against further down.
# NOTE(review): presumably one row per genome accession — confirm against
# read_gtdb, whose implementation is not visible here.
gtdb = read_gtdb()
gtdb

Unnamed: 0,accession,ambiguous_bases,checkm_completeness,checkm_contamination,checkm_marker_count,checkm_marker_lineage,checkm_marker_set_count,checkm_strain_heterogeneity,coding_bases,coding_density,...,trna_aa_count,trna_count,trna_selenocysteine_count,domain,phylum,class,order,family,genus,species
0,GB_GCA_000006155.2,1916,93.12,0.00,1171,g__Bacillus (UID902),324,0.0,4305660,80.178992,...,16,31,0,Bacteria,Firmicutes,Bacilli,Bacillales,Bacillaceae_G,Bacillus_A,Bacillus_A anthracis
1,GB_GCA_000007385.1,0,99.82,0.00,481,c__Gammaproteobacteria (UID4202),276,0.0,4190634,84.805944,...,20,53,0,Bacteria,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,Xanthomonas,Xanthomonas oryzae
2,GB_GCA_000008605.1,67,100.00,0.00,235,f__Spirochaetaceae (UID2512),124,0.0,1048744,92.155875,...,20,45,0,Bacteria,Spirochaetota,Spirochaetia,Treponematales,Treponemataceae,Treponema,Treponema pallidum
3,GB_GCA_000010565.1,0,100.00,0.63,295,p__Firmicutes (UID1022),158,0.0,2608397,86.217312,...,20,51,1,Bacteria,Firmicutes_B,Desulfotomaculia,Desulfotomaculales,Pelotomaculaceae,Pelotomaculum,Pelotomaculum thermopropionicum
4,GB_GCA_000013845.2,0,100.00,0.00,332,o__Clostridiales (UID1375),124,0.0,2428396,82.037966,...,20,95,1,Bacteria,Firmicutes_A,Clostridia,Clostridiales,Clostridiaceae,Clostridium_P,Clostridium_P perfringens
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191522,RS_GCF_902166935.1,0,99.62,0.09,1312,g__Klebsiella (UID5140),336,0.0,4894244,87.293148,...,20,87,1,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Klebsiella,Klebsiella pneumoniae
191523,RS_GCF_902166945.1,0,99.94,0.72,1162,f__Enterobacteriaceae (UID5121),336,0.0,5162721,86.540042,...,20,86,1,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Klebsiella,Klebsiella pneumoniae
191524,RS_GCF_902167295.1,0,99.51,0.00,303,p__Bacteroidetes (UID2591),203,0.0,5737447,90.122088,...,20,58,0,Bacteria,Bacteroidota,Bacteroidia,Chitinophagales,Chitinophagaceae,Chitinophaga,Chitinophaga pinensis_A
191525,RS_GCF_902167305.1,0,99.51,0.00,303,p__Bacteroidetes (UID2591),203,0.0,5738096,90.147873,...,20,58,0,Bacteria,Bacteroidota,Bacteroidia,Chitinophagales,Chitinophagaceae,Chitinophaga,Chitinophaga pinensis_A


In [None]:
# NOTE(review): this cell was left unfinished (a dangling `if`) and raised a
# SyntaxError, so the notebook could not survive Restart & Run All.
# It is completed here as a sanity check that lists the Cook et al. host names
# with no identically named genus in GTDB (e.g. misspellings, or genera that
# GTDB has renamed/suffixed) — confirm this matches the original intent.
gtdb_genera = set(gtdb['genus'])
missing_hosts = sorted(h for h in hosts if h not in gtdb_genera)
missing_hosts

In [10]:
# Tally, for each GTDB phylum, how many host genera have at least one GTDB
# genome assigned to that phylum. A host genus is counted once per distinct
# phylum it appears under; hosts with no exact genus match contribute nothing.
phyla = {}
for host_genus in hosts:
    # Distinct phyla of every GTDB row whose genus equals this host name.
    host_phyla = gtdb.loc[gtdb['genus'] == host_genus, 'phylum'].unique()
    for phylum in host_phyla:
        if phylum in phyla:
            phyla[phylum] += 1
        else:
            phyla[phylum] = 1
phyla

{'Firmicutes': 28,
 'Proteobacteria': 92,
 'Bacteroidota': 14,
 'Cyanobacteria': 10,
 'Actinobacteriota': 25,
 'Campylobacterota': 3,
 'Firmicutes_A': 7,
 'Aquificota': 1,
 'Fusobacteriota': 1,
 'Deinococcota': 2,
 'Thermotogota': 1,
 'Verrucomicrobiota_A': 1,
 'Bdellovibrionota': 1,
 'Spirochaetota': 1,
 'Myxococcota': 1}