# Merge the prophage predictions with RAST metadata

Combine RAST metadata and prophage predictions

In [1]:
import os
import sys
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
import seaborn as sns
import numpy as np

import math
import re

from scipy.stats import pearsonr, f_oneway
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd, tukeyhsd, MultiComparison
from statsmodels.multivariate.manova import MANOVA
from sklearn import decomposition
from sklearn.ensemble import RandomForestClassifier

import subprocess
import gzip


In [2]:
# this is a neat trick for getting markdown in our output
# see https://stackoverflow.com/questions/23271575/printing-bold-colored-etc-text-in-ipython-qtconsole
# for the inspiration
from IPython.display import Markdown, display
def printmd(string, color="black"):
    colorstr = "<span style='color:{}'>{}</span>".format(color, string)
    display(Markdown(colorstr))

### Read the data file

The metadata is from PATRIC. The phage data is from us.

**NOTE:** Some of the PATRIC data refers to specific chromosomes/fragments in the GenBank file (e.g. plasmid, chromosome), but that may not equate to our predictions, because we have used the whole GenBank file. The PATRIC metadata is redundant for many fields, and so we just keep the first entry for each NCBI Assembly.

In [3]:
# the full data set. Don't try this at home!
# metadf = pd.read_csv("../small_data/patric_genome_metadata.tsv.gz", compression='gzip', header=0, delimiter="\t")
metadf = pd.read_csv("../data/patric_genome_metadata.tsv.gz", compression='gzip', header=0, delimiter="\t")
metadf


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,genome_id,genome_name,organism_name,taxon_id,genome_status,strain,serovar,biovar,pathovar,mlst,...,motility,sporulation,temperature_range,optimal_temperature,salinity,oxygen_requirement,habitat,disease,comments,additional_metadata
0,469009.4,"""'Brassica napus' phytoplasma strain TW1""",,469009,WGS,TW1,,,,,...,,,,,,,,,Genome sequence of a strain of bacteria that c...,sample_type:metagenomic assembly;collected_by:...
1,1309411.5,"""'Deinococcus soli' Cha et al. 2014 strain N5""",,1309411,Complete,N5,,,,,...,,,,,,,,,Genome sequencing of a Gamma-Radiation-Resista...,sample_type:bacterial
2,1123738.3,"""'Echinacea purpurea' witches'-broom phytoplas...",,1123738,WGS,NCHU2014,,,,,...,,,,C,,,,,'Echinacea purpurea' witches'-broom phytoplasm...,lab_host:Catharanthus roseus
3,551115.6,"""'Nostoc azollae' 0708""",'Nostoc azollae' 0708,551115,Complete,708,,,,,...,Yes,,Mesophilic,-,,Aerobic,Multiple,,"Nostoc azollae 0708. Nostoc azollae 0708, also...",
4,1856298.3,"""'Osedax' symbiont bacterium Rs2_46_30_T18 str...",,1856298,WGS,Rs2_46_30_T18,,,,,...,,,,,,,,,"In this study, we simulate the Deepwater Horiz...",sample_type:metagenomic assembly
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433517,1131286.3,zeta proteobacterium SCGC AB-137-J06,zeta proteobacterium SCGC AB-137-J06,1131286,WGS,SCGC AB-137-J06,,,,,...,,,,,,,,,Single cell genome sequencing of biomineralizi...,
433518,1131287.3,zeta proteobacterium SCGC AB-602-C20,zeta proteobacterium SCGC AB-602-C20,1131287,WGS,SCGC AB-602-C20,,,,,...,,,,,,,,,Single cell genome sequencing of biomineralizi...,
433519,1131288.3,zeta proteobacterium SCGC AB-602-E04,zeta proteobacterium SCGC AB-602-E04,1131288,WGS,SCGC AB-602-E04,,,,,...,,,,,,,,,Single cell genome sequencing of biomineralizi...,
433520,1131289.3,zeta proteobacterium SCGC AB-604-B04,zeta proteobacterium SCGC AB-604-B04,1131289,WGS,SCGC AB-604-B04,,,,,...,,,,,,,,,Single cell genome sequencing of biomineralizi...,


### Make sure we have the right column

PATRIC calls the column `assembly_accession` while GTDB calls it `ncbi_genbank_assembly_accession`

In [4]:
acccol = 'assembly_accession'
metadf[acccol]

0         GCA_003181115.1
1         GCF_001007995.1
2         GCF_001307505.1
3         GCA_000196515.1
4         GCA_002163025.1
               ...       
433517    GCA_000379245.1
433518    GCA_000379345.1
433519    GCA_000379265.1
433520    GCA_000379205.1
433521    GCA_000372125.1
Name: assembly_accession, Length: 433522, dtype: object

### Abstract out the accession and name into new columns

This will allow us to merge the data with the PATRIC and GTDB data

In [5]:
def get_acc_name(x):
    regexp = re.compile('(\w+\.\d+)_([\w\.\-]+)_genomic.gbff.gz')
    m = regexp.match(x)
    if not m:
        sys.stderr.write(f"WARNING: Regexp did not match {x}\n")
        return (None, None)
    return list(m.groups())

#phagesdf = pd.read_csv("../small_data/phages_per_genome.tsv.gz", compression='gzip', header=0, delimiter="\t")
phagesdf = pd.read_csv("../data/phages_per_genome.tsv.gz", compression='gzip', header=0, delimiter="\t")
phagesdf = pd.concat([pd.DataFrame.from_records(phagesdf['Contig'].apply(get_acc_name), columns=[acccol, 'Name']), phagesdf], axis=1)
phagesdf

Unnamed: 0,assembly_accession,Name,Contig,Genome length,Contigs,Total Predicted Prophages,Kept,No phage genes,Not enough genes,bp prophage
0,GCA_000003135.1,ASM313v1,GCA_000003135.1_ASM313v1_genomic.gbff.gz,2396359,10,16,2,1,13,48916
1,GCA_000003645.1,ASM364v1,GCA_000003645.1_ASM364v1_genomic.gbff.gz,5269725,1,31,1,10,20,40297
2,GCA_000003925.1,ASM392v1,GCA_000003925.1_ASM392v1_genomic.gbff.gz,5561906,1,38,6,13,19,268081
3,GCA_000003955.1,ASM395v1,GCA_000003955.1_ASM395v1_genomic.gbff.gz,5790501,1,46,6,11,29,166286
4,GCA_000005825.2,ASM582v2,GCA_000005825.2_ASM582v2_genomic.gbff.gz,4249248,3,33,3,9,21,93416
...,...,...,...,...,...,...,...,...,...,...
399577,GCA_902860175.1,LMG_5997,GCA_902860175.1_LMG_5997_genomic.gbff.gz,7197255,21,33,2,14,17,69051
399578,GCA_902860185.1,LMG_6103,GCA_902860185.1_LMG_6103_genomic.gbff.gz,6497464,8,22,0,10,12,0
399579,GCA_902860195.1,LMG_7053,GCA_902860195.1_LMG_7053_genomic.gbff.gz,6702936,148,33,1,11,21,12819
399580,GCA_902860205.1,LMG_6001,GCA_902860205.1_LMG_6001_genomic.gbff.gz,6320373,19,35,2,21,12,41572


### Merge the dataframes

First, select some columns we want to keep from PATRIC, and then merge the data frames. We write this to a tsv file for Laura to append isolation information to.

In [6]:
interesting_cols = [acccol, 'isolation_site', 'isolation_source', 'isolation_comments', 'collection_date',
                    'isolation_country', 'geographic_location', 'latitude', 'longitude', 'altitude', 'depth',
                    'other_environmental', 'host_name', 'host_gender', 'host_age', 'host_health', 
                    'body_sample_site', 'body_sample_subsite', 'other_clinical', 'gram_stain', 'cell_shape',
                    'motility', 'sporulation', 'temperature_range', 'optimal_temperature', 'salinity',
                    'oxygen_requirement', 'habitat', 'disease']

tempdf = metadf[interesting_cols]
phagemeta = pd.merge(tempdf, phagesdf, how='right', left_on=acccol, right_on=acccol)
phagemeta.to_csv(os.path.join('results', 'example_isolations.tsv'), sep='\t')

In [7]:
phagemeta

Unnamed: 0,assembly_accession,isolation_site,isolation_source,isolation_comments,collection_date,isolation_country,geographic_location,latitude,longitude,altitude,...,disease,Name,Contig,Genome length,Contigs,Total Predicted Prophages,Kept,No phage genes,Not enough genes,bp prophage
0,GCA_000003135.1,,,,,,,,,,...,,ASM313v1,GCA_000003135.1_ASM313v1_genomic.gbff.gz,2396359,10,16,2,1,13,48916
1,GCA_000003645.1,,cream cheese,isolated from cream cheese,,,,,,,...,Food poisoning,ASM364v1,GCA_000003645.1_ASM364v1_genomic.gbff.gz,5269725,1,31,1,10,20,40297
2,GCA_000003925.1,,soil,isolated from soil,,,,,,,...,,ASM392v1,GCA_000003925.1_ASM392v1_genomic.gbff.gz,5561906,1,38,6,13,19,268081
3,GCA_000003955.1,,human blood,isolated from human blood in Iceland,,Iceland,Iceland,,,,...,Food poisoning,ASM395v1,GCA_000003955.1_ASM395v1_genomic.gbff.gz,5790501,1,46,6,11,29,166286
4,GCA_000005825.2,,soil,isolated from soil in New York State by alkali...,,United States,New York State,,,,...,,ASM582v2,GCA_000005825.2_ASM582v2_genomic.gbff.gz,4249248,3,33,3,9,21,93416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403548,GCA_902860175.1,,Laboratory sink,,0000,,,,,,...,,LMG_5997,GCA_902860175.1_LMG_5997_genomic.gbff.gz,7197255,21,33,2,14,17,69051
403549,GCA_902860185.1,,Human,,0000,United Kingdom,United Kingdom,,,,...,,LMG_6103,GCA_902860185.1_LMG_6103_genomic.gbff.gz,6497464,8,22,0,10,12,0
403550,GCA_902860195.1,,Human,,0000,United Kingdom,United Kingdom,,,,...,,LMG_7053,GCA_902860195.1_LMG_7053_genomic.gbff.gz,6702936,148,33,1,11,21,12819
403551,GCA_902860205.1,,,,0000,,,,,,...,,LMG_6001,GCA_902860205.1_LMG_6001_genomic.gbff.gz,6320373,19,35,2,21,12,41572


In [8]:
phagemeta.iloc[:,2:29]

Unnamed: 0,isolation_source,isolation_comments,collection_date,isolation_country,geographic_location,latitude,longitude,altitude,depth,other_environmental,...,gram_stain,cell_shape,motility,sporulation,temperature_range,optimal_temperature,salinity,oxygen_requirement,habitat,disease
0,,,,,,,,,,,...,+,Rod,No,No,Mesophilic,-,,Anaerobic,Host-associated,
1,cream cheese,isolated from cream cheese,,,,,,,,,...,+,Rod,Yes,Yes,Mesophilic,-,,Aerobic,Multiple,Food poisoning
2,soil,isolated from soil,,,,,,,,,...,+,Rod,No,Yes,,-,,,,
3,human blood,isolated from human blood in Iceland,,Iceland,Iceland,,,,,,...,+,Rod,Yes,Yes,Mesophilic,-,,Aerobic,Multiple,Food poisoning
4,soil,isolated from soil in New York State by alkali...,,United States,New York State,,,,,,...,,Bacilli,Yes,,Mesophilic,,,Facultative,Terrestrial,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403548,Laboratory sink,,0000,,,,,,,,...,,,,,,,,,,
403549,Human,,0000,United Kingdom,United Kingdom,,,,,,...,,,,,,,,,,
403550,Human,,0000,United Kingdom,United Kingdom,,,,,,...,,,,,,,,,,
403551,,,0000,,,,,,,,...,,,,,,,,,,


In [9]:
phagemeta['isolation_source'].astype('category').cat.codes

0           -1
1         5261
2         8897
3         6481
4         8897
          ... 
403548    2025
403549    1762
403550    1762
403551      -1
403552     115
Length: 403553, dtype: int16

In [10]:
phagemeta.iloc[:,34:35]

Unnamed: 0,Kept
0,2
1,1
2,6
3,6
4,3
...,...
403548,2
403549,0
403550,1
403551,2


## Encode the data

This converts every column into categories so we can (hopefully) use it in the RF models

In [11]:
pmenc = pd.DataFrame()
for c in phagemeta.iloc[:,2:29].columns:
    pmenc[c] = phagemeta[c].astype('category').cat.codes
pmenc

Unnamed: 0,isolation_source,isolation_comments,collection_date,isolation_country,geographic_location,latitude,longitude,altitude,depth,other_environmental,...,gram_stain,cell_shape,motility,sporulation,temperature_range,optimal_temperature,salinity,oxygen_requirement,habitat,disease
0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,0,56,5,3,8,1,-1,3,14,-1
1,5261,1990,-1,-1,-1,-1,-1,-1,-1,-1,...,0,56,9,8,8,1,-1,1,23,159
2,8897,2657,-1,-1,-1,-1,-1,-1,-1,-1,...,0,56,5,8,-1,1,-1,-1,-1,-1
3,6481,2208,-1,93,2666,-1,-1,-1,-1,-1,...,0,56,9,8,8,1,-1,1,23,159
4,8897,2688,-1,228,4088,-1,-1,-1,-1,-1,...,-1,13,9,-1,8,-1,-1,5,37,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403548,2025,-1,0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
403549,1762,-1,0,227,7171,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
403550,1762,-1,0,227,7171,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
403551,-1,-1,0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


## Random Forest

Can we identify which features best predict the number of phages predicted. Note we should compare total/kept/etc

In [12]:
# which data frame are we going to use here.
# df = scdnonzero
df = phagemeta

In [None]:
clf = RandomForestClassifier(random_state=42, n_estimators=1000, bootstrap=True, n_jobs=-1)
rf = clf.fit(pmenc, phagemeta.iloc[:,34:35].values.ravel())
print(rf)

In [None]:
fi = pd.DataFrame(zip(df.iloc[:,1:].columns, rf.feature_importances_), columns=['genome', 'importance'])
fi.sort_values('importance', ascending=False).head(5)