# Plot the bray curtis PCoA before and after removing the 15 blooming bacteria
## for figure 2A,B
First load the AG, PGP, UK-Twins and Whole grain feces studies

Then randomly sample 200 samples (or less if don't have >200 samples) in order to have approx. same amount from each study

Join to a single experiment (while remembering from which experiment each sample came)

Save this experiment before, and after removing the 15 candidate blooming bacteria

Create a mapping file also containing the total bloom bacteria associated reads per sample

Then rarify and calculate the PCoA. Should be done externally using the qiime commands:

```
# rarify to 5k reads/sample
single_rarefaction.py -i combined.new.small.biom -o combined.new.small.sub5k.biom -d 5000
single_rarefaction.py -i combined.new.small.nobloom.biom -o combined.new.small.nobloom.sub5k.biom -d 5000
# calculate bray-curtis distance
beta_diversity.py -i combined.new.small.nobloom.sub5k.biom -o distmat -m bray_curtis
beta_diversity.py -i combined.new.small.sub5k.biom -o distmat -m bray_curtis
# and do the PCoA
principal_coordinates.py -i distmat/bray_curtis_combined.new.small.sub5k.txt -o pcoa.bloom.bc.txt
principal_coordinates.py -i distmat/bray_curtis_combined.new.small.nobloom.sub5k.txt -o pcoa.nobloom.bc.txt
```

Finally, Load the resulting PCoA into the notebook and plot in Emperor

# Setup

In [1]:
# plots inside the notebook
%matplotlib notebook

# load modules used in the analysis
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sn
# heatsequer module is available from:
# https://github.com/amnona/heatsequer
import heatsequer as hs

because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



('loading cooldb',)
('cooldb loaded',)
('loading bactdb',)
('bactdb loaded',)
('loading supercooldb',)
('supercooldb loaded',)


In [2]:
# set display to full width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Loading all experimental data

## american gut

In [3]:
ag=hs.load('../data/ag-6-2016.clean.min100.withtax.biom','../data/map.ag-6-2016.txt')

('Loading biom table data/ag-6-2016.clean.min100.withtax.biom',)
5ffd56560ea3217b52a6076d5dd5d2f6
('Loading mapping file data/map.ag-6-2016.txt',)
('number of samples in map is 9919',)
('number of samples in table is 9919',)
('removing 0 samples',)
('deleted. number of samples in table is now 9919',)
('number of samples in mapping file is now 9919',)
('Samples with 0 reads: 1214',)


In [4]:
# get only fecal
agf=hs.filtersamples(ag,'ENV_MATTER','ENVO:feces')

('7286 Samples left',)


In [5]:
# convert date to number
agf=hs.convertdatefield(agf,'COLLECTION_DATE','date-numeric','%m/%d/%Y')

('1 conversions failed',)


In [6]:
# and sort by collection date
agf=hs.sortsamples(agf,'date-numeric',numeric=True)

In [7]:
# keep only samples with >1000 reads
agf=hs.filterorigreads(agf,1000)

('7073 Samples left',)


## UK twins study

In [8]:
twins=hs.load('../data/twins2.clean.min10.withtax.biom','../data/map.twins2.txt')

('Loading biom table data/twins2.clean.min10.withtax.biom',)
d16fcd8fbbae6b25cbd464ab73950ba1
('Loading mapping file data/map.twins2.txt',)
('number of samples in map is 1081',)
('number of samples in table is 1046',)
('removing 0 samples',)
('deleted. number of samples in table is now 1046',)
('removing 35 samples from mapping file',)
('number of samples in mapping file is now 1046',)
('Samples with 0 reads: 29',)


In [9]:
# keep only high freq. enough bacteria
twinsf=hs.clusterbacteria(twins,50)

('2564 Bacteria left',)


In [10]:
# keep only samples with >1000 reads
twins=hs.filterorigreads(twins,1000)

('1017 Samples left',)


## PGP

In [11]:
pgp=hs.load('../data/pgp.baylor.clean.withtax.biom','../data/map.pgp.txt')

('Loading biom table data/pgp.baylor.clean.withtax.biom',)
7ca155612b152a9b7b20540489072e8e
('Loading mapping file data/map.pgp.txt',)
('number of samples in map is 429',)
('number of samples in table is 437',)
('Table sample Fermanagh.Mouth not found in mapping file',)
('Table sample Elmira.Stool not found in mapping file',)
('Table sample Erlandson.Stool not found in mapping file',)
('Table sample Innkeeper.Forehead not found in mapping file',)
('Table sample Tewmac.Mouth not found in mapping file',)
('Table sample Pinrock.Rightpalm not found in mapping file',)
('Table sample Lippokd.Forehead not found in mapping file',)
('Table sample Delmore.Leftpalm not found in mapping file',)
('Table sample Goddu.Forehead not found in mapping file',)
('Table sample Menekish.Stool not found in mapping file',)
('removing 10 samples',)
('deleted. number of samples in table is now 427',)
('removing 2 samples from mapping file',)
('number of samples in mapping file is now 427',)
('Samples with 0 read

In [12]:
# only fecal samples
pgpf=hs.filtersamples(pgp,'#SampleID','Stool',exact=False)

('80 Samples left',)


In [13]:
# keep only samples with >1000 reads
pgpf=hs.filterorigreads(pgpf,1000)

('79 Samples left',)


## Ercolini whole grain feces

In [14]:
erc=hs.load('../data/ercolini.feces.clean.withtax.biom','../data/map.ercolini.txt')

('Loading biom table data/ercolini.feces.clean.withtax.biom',)
2ad21c8bdf0b14e5c60dc29494838d15
('Loading mapping file data/map.ercolini.txt',)
('number of samples in map is 96',)
('number of samples in table is 93',)
('removing 0 samples',)
('deleted. number of samples in table is now 93',)
('removing 3 samples from mapping file',)
('number of samples in mapping file is now 93',)
('Samples with 0 reads: 4',)


In [15]:
# only fecal samples
ercf=hs.filtersamples(erc,'env_matter','ENVO:feces')

('89 Samples left',)


In [16]:
# keep only samples with >1000 reads
ercf=hs.filterorigreads(ercf,1000)

('88 Samples left',)


# Analysis
## look at how filtering improves pcoa similarity

## Create small even mixture biom table from all experiments

In [17]:
np.random.seed(2016)
samplesperexp=200

In [25]:
# load the blooming bacteria list
bloomseqs,bloomnames=hs.readfastaseqs('../data/newbloom.all.fa')

In [19]:
# randomly select 200 AG samples
small=hs.randomsplit(agf,samplesperexp)[0]

In [20]:
# add PGP samples (only 79 since this is all PGP contains)
tt=hs.randomsplit(pgpf,samplesperexp)[0]
small=hs.joinexperiments(small,tt)

('Less samples (79) than requested (200)',)


In [21]:
# add Whole grain feces samples (only 88 since this is all Whole grain feces contains)
tt=hs.randomsplit(ercf,samplesperexp)[0]
small=hs.joinexperiments(small,tt)

('Less samples (88) than requested (200)',)


In [22]:
# add randomly selected 200 UK-Twins samples
tt=hs.randomsplit(twinsf,samplesperexp)[0]
small=hs.joinexperiments(small,tt)

In [23]:
# remove low freq. sOTUs (less than 10 reads total)
small=hs.filterminreads(small,10)

('3783 Bacteria left',)


In [24]:
# keep only the original experiment mapping field (to make things faster)
hs.filtermapfields(small,['origexp'],inplace=True)

<heatsequer.experiment.expclass.Experiment at 0x117743898>

In [26]:
hs.savetobiom(small,'../data/combined.new.small.biom',useorigreads=True)

('10 Commands saved to file combined.new.small.biom.commands.txt',)
('table saved to file combined.new.small.biom',)


In [27]:
# filter blooming bacteria
smallnobloom=hs.filterseqs(small,bloomseqs,exclude=True)

In [28]:
hs.savetobiom(smallnobloom,'../data/combined.new.small.nobloom.biom',useorigreads=True)

('11 Commands saved to file combined.new.small.nobloom.biom.commands.txt',)
('table saved to file combined.new.small.nobloom.biom',)


## add bloom level as a mapping field

In [29]:
# add a new field to mapping file
smallf=hs.addmapfield(small,'bloomlevel',0)

In [30]:
# and for each sample put the number of blooming bacteria reads in this field
tt=hs.filterseqs(small,bloomseqs)
for idx,csamp in enumerate(tt.samples):
    smallf.smap[csamp]['bloomlevel']=str(np.round(np.sum(tt.data[:,idx]))+1)

In [31]:
# save the mapping file with the 'bloomlevel' field containing the total amount of bloom bacteria
hs.savemap(smallf,'../data/combined.new.small.withbloomlevel.map.txt')

## Process the biom tables using qiime:
```
single_rarefaction.py -i combined.new.small.biom -o combined.new.small.sub5k.biom -d 5000
single_rarefaction.py -i combined.new.small.nobloom.biom -o combined.new.small.nobloom.sub5k.biom -d 5000

# get bray curtis and binary jaccard distances
beta_diversity.py -i combined.new.small.nobloom.sub5k.biom -o distmat -m bray_curtis,binary_jaccard
beta_diversity.py -i combined.new.small.sub5k.biom -o distmat -m bray_curtis,binary_jaccard


# pcoa
principal_coordinates.py -i distmat/bray_curtis_combined.new.small.sub5k.txt -o pcoa.bloom.bc.txt
principal_coordinates.py -i distmat/bray_curtis_combined.new.small.nobloom.sub5k.txt -o pcoa.nobloom.bc.txt
principal_coordinates.py -i distmat/binary_jaccard_combined.new.small.sub5k.txt -o pcoa.bloom.bj.txt
principal_coordinates.py -i distmat/binary_jaccard_combined.new.small.nobloom.sub5k.txt -o pcoa.nobloom.bj.txt
```

## plot emperor pcoa (in notebook!)

In [32]:
# load some emperor goodies needed for in notebook pcoa plotting
from emperor import Emperor, nbinstall

from emperor.qiime_backports.parse import parse_mapping_file
from emperor.qiime_backports.format import format_mapping_file

from skbio import OrdinationResults
from skbio.io.util import open_file
import pandas as pd

In [33]:
# a function to load a mapping file to a pandas dataframe
def load_mf(fn):
    with open_file(fn) as f:
        mapping_data, header, _ = parse_mapping_file(f)
        _mapping_file = pd.DataFrame(mapping_data, columns=header)
        _mapping_file.set_index('SampleID', inplace=True)
    return _mapping_file


In [34]:
# load the mapping file with the bloom level field
pmf=load_mf('../data/combined.new.small.withbloomlevel.map.txt')

In [35]:
nbinstall()


In [36]:
# load the pcoa result bray curtis with the blooming bacteria (no filtering)
resbloom = OrdinationResults.read('../data/pcoa/pcoa.bloom.bc.txt')

In [37]:
# load the pcoa result bray curtis without the blooming bacteria (with filtering)
resnobloom = OrdinationResults.read('../data/pcoa/pcoa.nobloom.bc.txt')

In [38]:
# create the with the blooming bacteria (no filtering) emperor data
x = Emperor(resbloom, pmf, remote=False)

In [39]:
# need to convert to int (otherwise problem with emperor size from field)
x.mf.bloomlevel=x.mf.bloomlevel.astype(float).astype(int)

In [40]:
# and plot (with the blooming bacteria (no filtering))
# need to choose origexp as color field, bloomlevel as size
x

<emperor.core.Emperor at 0x121daa7f0>

In [41]:
# create the without the blooming bacteria (with filtering) emperor data
xnobloom = Emperor(resnobloom, pmf, remote=False)

In [42]:
# need to convert to int (otherwise problem with emperor size from field)
xnobloom.mf.bloomlevel=xnobloom.mf.bloomlevel.astype(float).astype(int)

In [44]:
# and plot (without the blooming bacteria (with filtering))
# need to choose origexp as color field, bloomlevel as size
xnobloom

<emperor.core.Emperor at 0x121a45940>