# Merge the prophage predictions with RAST metadata

Combine RAST metadata and prophage predictions

In [1]:
import os
import sys
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
import seaborn as sns
import numpy as np

import math
import re

from scipy.stats import pearsonr, f_oneway
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd, tukeyhsd, MultiComparison
from statsmodels.multivariate.manova import MANOVA
from sklearn import decomposition
from sklearn.ensemble import RandomForestClassifier

# for parsing collection dates
from dateutil.parser import parse, ParserError
import pytz

import subprocess
import gzip


In [2]:
# this is a neat trick for getting markdown in our output
# see https://stackoverflow.com/questions/23271575/printing-bold-colored-etc-text-in-ipython-qtconsole
# for the inspiration
from IPython.display import Markdown, display
def printmd(string, color="black"):
    colorstr = "<span style='color:{}'>{}</span>".format(color, string)
    display(Markdown(colorstr))

# Read the phage data. Check the version!

In [3]:
#phagesdf = pd.read_csv("../small_data/phages_per_genome.tsv.gz", compression='gzip', header=0, delimiter="\t")
phagesdf = pd.read_csv("../data/phages_per_genome.tsv.gz", compression='gzip', header=0, delimiter="\t")
githash = subprocess.check_output(["git", "describe", "--always"]).strip().decode()
print(f"Please note that this was run with git commit {githash} that has {phagesdf.shape[0]:,} genomes parsed and {phagesdf['Total Predicted Prophages'].sum():,} total prophages")

Please note that this was run with git commit 361ddcb that has 553,082 genomes parsed and 20,946,107 total prophages


### Date conversion

This routine is used to convert the dates into something useable. Here we choose a point in time (i.e. Feb 24, 1977) and set all dates to be relative to that. We use years as our unit, but could also use seconds. ([Hint](https://pubmed.ncbi.nlm.nih.gov/870828/))


In [4]:
# Convert the collection date to a number. We take todays date, and then convert relative to that
# it allows dates pre-epoch

yr = re.compile("^[~]*(\\d{4})['s]*$") # eg ~1970's
yrrange = re.compile("^(\\d{4})\\s*.\\s*(\\d{4})$") # eg 2001-2002
myd = re.compile('^\\d{1,4}/\\d{1,2}/\\d{1,4}$') # eg 2/3/21
moyr = re.compile('^\\w{3}-\\d{2}$') # eg Dec-09 should be 01-Dec-09 else parsed as 9th Dec this year
damoyrrange = re.compile('^(\\d+\\s+\\w+\\s+\\d{4})\\s*.{1,3}?\\s*(\\d+\\s+\\w+\\s+\\d{4})$') 
modacyrrange = re.compile('^(\\w+\\s+\\d+,*\\s+\\d{4})\\s*.{1,3}?\\s*(\\w+\\s+\\d+,*\\s+\\d{4})$')
moyrrange = re.compile('^(\\w+\\s+\\d{4})\\s*.{1,3}?\\s*(\\w+\\s+\\d{4})$')
year42 = re.compile('^(\\d{4})-\\d{2}$')
lem = re.compile("^late\\s*|^early\\s*|^mid\\s*|^prior to\\s*|^before\\s*|^pre-", re.IGNORECASE)
splitseen = set()

try:
    # adate = parse("24/02/1977")
    adate = parse("1/1/0001")
except ParserError as e:
    sys.stderr.write("Error parsing adate: {e}\n")
    sys.exit(1)
    
adate = adate.replace(tzinfo=pytz.UTC)

def try_parsing(x):
    """
    Attempt to parse a date, and catch an error.
    
    If we fail, we return None, otherwise we return the years since now()
    """
    try:
        dt = parse(x)
    except:
        return None
    
    dt = dt.replace(tzinfo=pytz.UTC)
    
    if dt < adate:
        tdelt = adate - dt
        seconds = -1 * ((tdelt.days * 86400) + tdelt.seconds)
    else:
        tdelt = dt - adate
        seconds = (tdelt.days * 86400) + tdelt.seconds
    # convert seconds to years
    # then we add one because our epoch is now Jan 1, 0001
    return (seconds/31557600)+1
    

def convert_date(x, verbose=False):
    """
    Convert the date to years and fractions.
    
    We try several times, and clean it up as we go along.    
    
    :param x: the date string to convert
    :param verbose: more output information
    """
    if pd.isna(x):
        return np.nan
    
    # we need to fix this before trying to parse
    m = moyr.match(x)
    if m:
        x = '01-' + x

    # can we parse this date? If so, lets do it and return the value
    attempt = try_parsing(x)
    if attempt:
        if verbose:
            sys.stderr.write("Parsed at step 1\n")
        return attempt
    orix = x
    
    if x.lower() in ['restricted access', 'none', 'not collected', 'not applicable', 
                     'not available: not collected', 'unspecified']:
        return np.nan
   
    # a few one off cases that are just easier to fix
    if 'May 2015-Nov 2015' == x:
        x = 'May 2015'
    
    if '1954-65' == x:
        x = '01 January 1954'
        
    if 'Jul-00' == x:
        x = 'Jul-2000'

    if '2015_9' == x:
        x = 'Sep-2015'
    
    if '31-Mac-2013' == x:
        x = '31-May-2013'
    
    if '2010-0916' == x:
        x = '16 Sep 2010'
     
    x = lem.sub('', x)
    
        
    if '_' in x:
        x = x.replace('_', '-')
        
    x = x.replace(' or earlier', '')
    x = x.replace('collected in the ', '')
        
    # some regular expressions of variants of day month year - day month year ranges. We choose 1
    m = yrrange.match(x)
    if m:
        x = '01 January ' + m.groups()[1]

    m = yr.match(x)
    if m:
        x = '01 January ' + m.groups()[0]

    m = year42.match(x)
    if m:
        x = '01 January ' + m.groups()[0]
            
    m = modacyrrange.match(x)
    if m:
        x = m.groups()[1]
        
    m = damoyrrange.match(x)
    if m:
        x = m.groups()[1]
    
    m = moyrrange.match(x)
    if m:
        x = '01 ' + m.groups()[1]
        
    
        
    # can we parse this date? If so, lets do it and return the value
    attempt = try_parsing(x)
    if attempt:
        if verbose:
            sys.stderr.write(f"Parsed at step 2. Now {orix} is {x}\n")
        return attempt
          
    if '/' in x:
        if x not in splitseen:
            # sys.stderr.write(f"Splitting {x}\n")
            splitseen.add(x)
        p = x.split('/')
        x = p[1]
   
    # can we parse this date? If so, lets do it and return the value
    attempt = try_parsing(x)
    if attempt:
        if verbose:
            sys.stderr.write(f"Parsed at step 3. Now {orix} is {x}\n")
        return attempt
   
    if x.endswith('-00'):
        x = x.replace('-00', '-2000')

    # can we parse this date? If so, lets do it and return the value
    attempt = try_parsing(x)
    if attempt:
        if verbose:
            sys.stderr.write(f"Parsed at step 4. Now {orix} is {x}\n")
        return attempt

    
    sys.stderr.write(f"can't parse |{x}| from |{orix}|\n")
            
    return np.nan

### Read the data file

The metadata is from PATRIC. The phage data is from us.

**NOTE:** Some of the PATRIC data refers to specific chromosomes/fragments in the GenBank file (e.g. plasmid, chromosome), but that may not equate to our predictions, because we have used the whole GenBank file. The PATRIC metadata is redundant for many fields, and so we just keep the first entry for each NCBI Assembly.

In [5]:
# the full data set. Don't try this at home!
# metadf = pd.read_csv("../small_data/patric_genome_metadata.tsv.gz", compression='gzip', header=0, delimiter="\t")
metadf = pd.read_csv("../data/patric_genome_metadata.tsv.gz", compression='gzip', header=0, delimiter="\t")
metadf['isolation_date'] = metadf.collection_date.apply(convert_date)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
can't parse |1999 and 2000| from |1999 and 2000|
can't parse |0-Apr-2008| from |0-Apr-2008|
can't parse |CGMCC No.5392| from |CGMCC No.5392|
can't parse |CGMCC No.5392| from |CGMCC No.5392|


## Filter for only the first of each genome assembly

    The metadata contains multiple entries for a genome assembly if it is submitted more than once, so here we just filter for the first instance. We might think about something smarter,  but this seems to work

In [6]:
metadf = metadf.groupby('assembly_accession').first().reset_index()

## Clean up the data

These are specific things in the data that we need to clean/replace. Add things here as you find them!

In [7]:
metadf['isolation_country'] = metadf['isolation_country'].replace('USA', 'United States')
metadf['isolation_country'] = metadf['isolation_country'].replace('Ecully', 'France')

#### Drop Geographic Location

This is just a proxy for `isolation_country` and we don't want it duplicated.

In [8]:
metadf['geographic_location'] = metadf['geographic_location'].replace('USA', 'United States')
metadf[
    (metadf['geographic_location'].notnull()) &
    (metadf['isolation_country'].notnull()) & 
    (metadf['geographic_location'] != metadf['isolation_country'])
][['isolation_country', 'geographic_location']]

Unnamed: 0,isolation_country,geographic_location
1,Canada,Canada: Quebec
5,United States,New York State
7,United States,"West Palm Beach, Florida"
10,Denmark,Denmark: Aarhus
22,Norway,"Kongsvinger, Norway"
...,...,...
320148,United Kingdom,"United Kingdom:London, United Kingdom"
320149,United Kingdom,"United Kingdom:Scotland, UK"
320163,United States,"USA:Muskegon County, MI"
320164,Slovenia,Slovenia: Pragersko


In [9]:
metadf = metadf.drop('geographic_location', axis=1)

## Read our categories

This is the data from Laura

In [10]:
catdf = pd.read_csv("../data/categories.tsv.gz", compression='gzip', header=0, delimiter="\t")
if 'gbff' in catdf:
    catdf = catdf.drop('gbff', axis=1)
catdf = catdf.groupby('assembly_accession').first().reset_index()
catdf

Unnamed: 0,assembly_accession,Category
0,GCA_000003645.1,food
1,GCA_000003925.1,soil
2,GCA_000003955.1,human blood
3,GCA_000005825.2,soil
4,GCA_000006155.2,human other
...,...,...
54640,GCA_902860175.1,built environment
54641,GCA_902860185.1,human other
54642,GCA_902860195.1,human other
54643,GCA_902860235.1,plant


### Make sure we have the right column

PATRIC calls the column `assembly_accession` while GTDB calls it `ncbi_genbank_assembly_accession`

In [11]:
acccol = 'assembly_accession'
metadf[acccol]

0         GCA_000003135.1
1         GCA_000003215.1
2         GCA_000003645.1
3         GCA_000003925.1
4         GCA_000003955.1
               ...       
320171    GCF_900167595.1
320172    GCF_900167605.1
320173    GCF_900167615.1
320174    GCF_900167625.1
320175    GCF_900167635.1
Name: assembly_accession, Length: 320176, dtype: object

In [12]:
catdf[acccol]

0           GCA_000003645.1
1           GCA_000003925.1
2           GCA_000003955.1
3           GCA_000005825.2
4           GCA_000006155.2
                ...        
54640       GCA_902860175.1
54641       GCA_902860185.1
54642       GCA_902860195.1
54643       GCA_902860235.1
54644    assembly_accession
Name: assembly_accession, Length: 54645, dtype: object

### Abstract out the accession and name into new columns

This will allow us to merge the data with the PATRIC and GTDB data

In [13]:
def get_acc_name(x):
    regexp = re.compile('(\w+\.\d+)_([\w\.\-]+)_genomic.gbff.gz')
    m = regexp.match(x)
    if not m:
        sys.stderr.write(f"WARNING: Regexp did not match {x}\n")
        return (None, None)
    return list(m.groups())

phagesdf = pd.concat([pd.DataFrame.from_records(phagesdf['Contig'].apply(get_acc_name), columns=[acccol, 'Name']), phagesdf], axis=1)
phagesdf

Unnamed: 0,assembly_accession,Name,Contig,Genome length,Contigs,Phage Contigs,Total Predicted Prophages,Kept,No phage genes,Not enough genes,bp prophage
0,GCA_000003135.1,ASM313v1,GCA_000003135.1_ASM313v1_genomic.gbff.gz,2396359,114,10,16,2,1,13,48916
1,GCA_000003645.1,ASM364v1,GCA_000003645.1_ASM364v1_genomic.gbff.gz,5269725,1,1,31,1,10,20,40297
2,GCA_000003925.1,ASM392v1,GCA_000003925.1_ASM392v1_genomic.gbff.gz,5561906,1,1,38,6,13,19,268081
3,GCA_000003955.1,ASM395v1,GCA_000003955.1_ASM395v1_genomic.gbff.gz,5790501,1,1,46,6,11,29,166286
4,GCA_000005825.2,ASM582v2,GCA_000005825.2_ASM582v2_genomic.gbff.gz,4249248,3,3,33,3,9,21,93416
...,...,...,...,...,...,...,...,...,...,...,...
553077,GCA_902860175.1,LMG_5997,GCA_902860175.1_LMG_5997_genomic.gbff.gz,7197255,38,21,33,2,14,17,69051
553078,GCA_902860185.1,LMG_6103,GCA_902860185.1_LMG_6103_genomic.gbff.gz,6497464,13,8,22,0,10,12,0
553079,GCA_902860195.1,LMG_7053,GCA_902860195.1_LMG_7053_genomic.gbff.gz,6702936,200,148,33,1,11,21,12819
553080,GCA_902860205.1,LMG_6001,GCA_902860205.1_LMG_6001_genomic.gbff.gz,6320373,36,19,35,2,21,12,41572


### Merge the dataframes

First, select some columns we want to keep from PATRIC, and then merge the data frames. We write this to a tsv file for Laura to append isolation information to.

In [14]:
interesting_cols = [acccol, 'isolation_site', 'isolation_country', 'latitude', 'longitude', 'altitude', 'depth',
                    'other_environmental', 'host_name', 'host_gender', 'host_age', 'host_health', 
                    'body_sample_site', 'body_sample_subsite', 'other_clinical', 'gram_stain', 'cell_shape',
                    'motility', 'sporulation', 'temperature_range', 'optimal_temperature', 'salinity',
                    'oxygen_requirement', 'habitat', 'disease', 'isolation_date']

few_interesting_cols = [acccol,  'isolation_country',  'gram_stain',  
                    'motility', 'sporulation', 'temperature_range', 'habitat', 'disease', 'isolation_date']


tempdf = metadf[interesting_cols]
# tempdf = metadf[few_interesting_cols]
temp1 = pd.merge(tempdf, catdf, how='left', left_on=acccol, right_on=acccol)
# phagemeta = pd.merge(tempdf, phagesdf, how='inner', left_on=acccol, right_on=acccol)
# phagemeta.to_csv(os.path.join('results', 'example_isolations.tsv'), sep='\t')

phagemeta = pd.merge(temp1, phagesdf, how='right', left_on=acccol, right_on=acccol)


# Define the columns

These are for the columns we are going to use

In [15]:
begdata = 1 # ignore the first column which is the assembly name
# enddata = 9 # this is if we are using a smaller dataset
enddata = 30 # this is all the metadata
phagemeta.iloc[:,begdata:enddata]

Unnamed: 0,isolation_site,isolation_country,latitude,longitude,altitude,depth,other_environmental,host_name,host_gender,host_age,...,optimal_temperature,salinity,oxygen_requirement,habitat,disease,isolation_date,Category,Name,Contig,Genome length
0,,,,,,,,"Human, Homo sapiens",,,...,-,,Anaerobic,Host-associated,,,,ASM313v1,GCA_000003135.1_ASM313v1_genomic.gbff.gz,2396359
1,,,,,,,,,,,...,-,,Aerobic,Multiple,Food poisoning,,food,ASM364v1,GCA_000003645.1_ASM364v1_genomic.gbff.gz,5269725
2,,,,,,,,,,,...,-,,,,,,soil,ASM392v1,GCA_000003925.1_ASM392v1_genomic.gbff.gz,5561906
3,,Iceland,,,,,,"Human, Homo sapiens",,,...,-,,Aerobic,Multiple,Food poisoning,,human blood,ASM395v1,GCA_000003955.1_ASM395v1_genomic.gbff.gz,5790501
4,,United States,,,,,,,,,...,-,,Facultative,Terrestrial,,,soil,ASM582v2,GCA_000005825.2_ASM582v2_genomic.gbff.gz,4249248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
553077,,,,,,,,,,,...,,,,,,1999.956879,built environment,LMG_5997,GCA_902860175.1_LMG_5997_genomic.gbff.gz,7197255
553078,,United Kingdom,,,,,,,,,...,,,,,,1999.956879,human other,LMG_6103,GCA_902860185.1_LMG_6103_genomic.gbff.gz,6497464
553079,,United Kingdom,,,,,,,,,...,,,,,,1999.956879,human other,LMG_7053,GCA_902860195.1_LMG_7053_genomic.gbff.gz,6702936
553080,,,,,,,,,,,...,,,,,,1999.956879,,LMG_6001,GCA_902860205.1_LMG_6001_genomic.gbff.gz,6320373


## Sanity check

This is just to make sure that we are all working on the same data. Here, we make a temporary data frame that has the number of contigs and the number of basepairs in the genome, and use those to compare to see if we think we are working on the same thing.

This is tricky because PATRIC has duplicates in the data, thus `GCA_000005825.2` appears three times in the PATRIC metadata, twice for the plasmids and once for the assembly.

We can maybe merge on two fields:

```

comparison = pd.merge(metadf[[acccol, 'contigs', 'genome_length']],
                      phagesdf[[acccol, 'Name', 'Contigs', 'Genome length']],
                      how='right', left_on=[acccol, 'genome_length'], right_on=[acccol, 'Genome length'])
```

and then look for entries in phagesdf that are missing from the merged (or just use the entries in the merged throughout).

In [16]:
def are_same(x):
    # currently fixing contig counts, so when that is done delete the next line and uncomment the one after
    return x['genome_length'] == x['Genome length']
    # return x['genome_length'] == x['Genome length'] and x['contigs'] == x['Contigs']


comparison = pd.merge(metadf[[acccol, 'contigs', 'genome_length']],
                      phagesdf[[acccol, 'Name', 'Contigs', 'Genome length']],
                      how='right', left_on=acccol, right_on=acccol)

comparison['same'] = comparison.apply(are_same, axis=1)
# look for rows that are not the same
print(f"{comparison[comparison['same']==False].shape[0]:,} rows are not the same")
print(f"{comparison[comparison['same']==True].shape[0]:,} rows are the same")

444,503 rows are not the same
108,579 rows are the same


## How many NaNs are there?

In [17]:
for c in phagemeta.iloc[:,begdata:enddata].columns:
    print(f"{c}\t{phagemeta[phagemeta[c].notnull()].shape[0]}")

isolation_site	128
isolation_country	96876
latitude	2910
longitude	2909
altitude	1033
depth	8094
other_environmental	9841
host_name	59086
host_gender	4069
host_age	3610
host_health	11651
body_sample_site	2032
body_sample_subsite	0
other_clinical	11223
gram_stain	8097
cell_shape	7380
motility	5267
sporulation	3873
temperature_range	4015
optimal_temperature	10737
salinity	705
oxygen_requirement	6440
habitat	4344
disease	2096
isolation_date	90756
Category	54644
Name	553082
Contig	553082
Genome length	553082


## Encode the data

This converts every column into categories so we can (hopefully) use it in the RF models

In [18]:
pmenc = pd.DataFrame()
for c in phagemeta.iloc[:,begdata:enddata].columns:
    pmenc[c] = phagemeta[c].astype('category').cat.codes
pmenc['isolation_date'] = phagemeta['isolation_date'].fillna(-1)
pmenc.sort_values('isolation_date')

Unnamed: 0,isolation_site,isolation_country,latitude,longitude,altitude,depth,other_environmental,host_name,host_gender,host_age,...,optimal_temperature,salinity,oxygen_requirement,habitat,disease,isolation_date,Category,Name,Contig,Genome length
0,-1,-1,-1,-1,-1,-1,-1,1044,-1,-1,...,1,-1,3,14,-1,-1.000000,-1,105501,0,95431
357518,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1.000000,-1,485889,357518,226581
357517,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1.000000,-1,485885,357517,188813
357516,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1.000000,-1,485887,357516,190524
357515,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1.000000,-1,485883,357515,334948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503279,-1,224,-1,-1,-1,104,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,2020.063655,46,55559,503279,456799
503278,-1,224,-1,-1,-1,104,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,2020.063655,46,55558,503278,466952
503277,-1,224,-1,-1,-1,104,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,2020.063655,46,55557,503277,456794
503275,-1,224,-1,-1,-1,104,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,2020.063655,46,55554,503275,461313


## Random Forest

Can we identify which features best predict the number of phages predicted. Note we should compare total/kept/etc

First, an RF with all the data. You might want to skip this!

In [None]:
clf = RandomForestClassifier(random_state=42, n_estimators=1000, bootstrap=True, n_jobs=-1, oob_score=True)
rf = clf.fit(pmenc, phagemeta.Kept.values.ravel())
print(rf)

## Important features

What are the most important features for predicting the number of prophages in a genome?

In [None]:
fi = pd.DataFrame(zip(pmenc.columns, rf.feature_importances_), columns=['feature', 'importance'])
impdf = fi.sort_values('importance', ascending=False)
# impdf['Number Not Null'] = phagemeta[phagemeta[impdf['feature']].notnull()].shape[0]
impdf

In [None]:
def number_not_null(x):
    return phagemeta[phagemeta[x.feature].notnull()].shape[0]

fi = pd.DataFrame(zip(pmenc.columns, rf.feature_importances_), columns=['feature', 'importance'])
impdf = fi.sort_values('importance', ascending=False)
impdf["Number of observations"] = impdf.apply(number_not_null, axis=1)
impdf

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(11, 8))
ax = sns.scatterplot(x='feature', y='importance', data=impdf, ax=ax)
plt.xticks(rotation=45,  horizontalalignment='right')
fig.savefig(f"images/rf_importance.svg")

In [None]:
printmd("**Random Forest output**", "blue")
printmd(f"There were {rf.n_features_} _features_ (columns in the dataframe), and {rf.n_classes_} _classes_ (unique categories)")
printmd(f"The OOB error was {rf.oob_score_}")

# Exploring the data

Lets take a look at some of these important things.

## Isolation date of the phage

Note that above we do a lot of work to create the `isolation_date` field, which is now the year of isolation as a decimal so that it plots out nicely! 


In [None]:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(11, 8))
ax = sns.scatterplot(x='isolation_date', y='Kept', data=phagemeta[phagemeta['isolation_date'] != -1], ax=ax, marker='+', color='k')
ax.set_ylabel("Number of phages predicted")
ax.set_xlabel("Isolation date")
#ax.axhline(y=0, color='k')
ax.spines['bottom'].set_position('zero')
ax.spines['top'].set_color('none')
ax.spines['right'].set_color('none')


fig.savefig(f"images/phages_vs_ya.svg")

## Isolation Country

Here we look at isolation country. Note that earlier we fixed a few countries (e.g. USA -> United States), etc.

Note that we have kept true zero counts: these are genomes for which we have no prophages isolated

In [None]:
phagemeta[(phagemeta['isolation_country'] == 'United States') & (phagemeta['Kept'] == 0)][['assembly_accession', 'Kept']]

In [None]:
tmp = phagemeta[phagemeta['isolation_country'].notnull()].sort_values('Kept', ascending=False)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(11, 8))
ax = sns.violinplot(x='isolation_country', y='Kept', data=tmp, ax=ax)
ax.set_ylabel("Number of phages predicted")
ax.set_xlabel("Isolation country")
#ax.axhline(y=0, color='k')
ax.spines['bottom'].set_position('zero')
ax.spines['top'].set_color('none')
ax.spines['right'].set_color('none')
plt.xticks(rotation=45,  horizontalalignment='right')
ax.set_title('All countries sort of sorted')
fig.savefig(f"images/phages_vs_country_all.svg")

In [None]:
topnc = 50
tmp = phagemeta[phagemeta['isolation_country'].isin(phagemeta.groupby('isolation_country').count().sort_values('assembly_accession', ascending=False).head(topnc).index)]
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(11, 8))
ax = sns.violinplot(x='isolation_country', y='Kept', data=tmp, ax=ax)
ax.set_ylabel("Number of phages predicted")
ax.set_xlabel("Isolation country")
ax.spines['bottom'].set_position('zero')
ax.spines['top'].set_color('none')
ax.spines['right'].set_color('none')
ax.set_title(f'Top {topnc} countries with the most genome isolates')
plt.xticks(rotation=45,  horizontalalignment='right')

fig.savefig(f"images/phages_vs_country_top.svg")

In [None]:
topnc = 100
tmp = phagemeta[phagemeta['isolation_country'].notnull()].sort_values('Kept', ascending=False)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(11, 8))
sortedorder = phagemeta.groupby('isolation_country')["Kept"].median().sort_values(ascending=False).index[0:topnc]
ax = sns.violinplot(x='isolation_country', y='Kept', data=tmp, ax=ax, order=sortedorder)
ax.set_ylabel("Number of phages predicted")
ax.set_xlabel("Isolation country")
#ax.axhline(y=0, color='k')
ax.spines['bottom'].set_position('zero')
ax.spines['top'].set_color('none')
ax.spines['right'].set_color('none')
ax.set_title(f'Top {topnc} genomes by median number of prophages')
plt.xticks(rotation=45,  horizontalalignment='right')

fig.savefig(f"images/phages_vs_country_100_median.svg")

In [None]:
phagemeta.groupby('isolation_country')["Kept"].median().sort_values(ascending=False).index[0:50]

In [None]:
topnc = 50
genomen = 5 
tmpc = pd.DataFrame(phagemeta.groupby('isolation_country').count()['assembly_accession'] > genomen)
tmpc[tmpc['assembly_accession']].index
tmp = phagemeta[phagemeta['isolation_country'].isin(tmpc[tmpc['assembly_accession']].index)]

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(11, 8))
sortedorder = tmp.groupby('isolation_country')["Kept"].median().sort_values(ascending=False).index[0:topnc]
ax = sns.violinplot(x='isolation_country', y='Kept', data=tmp, ax=ax, order=sortedorder)
ax.set_ylabel("Number of phages predicted")
ax.set_xlabel("Isolation country")
#ax.axhline(y=0, color='k')
ax.spines['bottom'].set_position('zero')
ax.spines['top'].set_color('none')
ax.spines['right'].set_color('none')
ax.set_title(f'Top {topnc} countries with more than {genomen} genomes\nSorted by median prophage abundance')
plt.xticks(rotation=45,  horizontalalignment='right')

fig.savefig(f"images/phages_vs_country_{topnc}_genomes_{genomen}_phages_median.svg")

In [None]:
topnc = 50
genomen = 5 
tmpc = pd.DataFrame(phagemeta.groupby('isolation_country').count()['assembly_accession'] > genomen)
tmpc[tmpc['assembly_accession']].index
tmp = phagemeta[phagemeta['isolation_country'].isin(tmpc[tmpc['assembly_accession']].index)]

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(11, 8))
sortedorder = tmp.groupby('isolation_country')["Kept"].sum().sort_values(ascending=False).index[0:topnc]
ax = sns.violinplot(x='isolation_country', y='Kept', data=tmp, ax=ax, order=sortedorder)
ax.set_ylabel("Number of phages predicted")
ax.set_xlabel("Isolation country")
#ax.axhline(y=0, color='k')
ax.spines['bottom'].set_position('zero')
ax.spines['top'].set_color('none')
ax.spines['right'].set_color('none')
ax.set_title(f'Top {topnc} countries with more than {genomen} genomes\nSorted by total prophage abundance')
plt.xticks(rotation=45,  horizontalalignment='right')

fig.savefig(f"images/phages_vs_country_{topnc}_genomes_{genomen}_phages_sum.svg")

## Host Name

This is generally where the bacteria was isolated from

In [None]:
pd.unique(phagemeta[phagemeta['host_name'].notnull()]['host_name'])