# Goal

* Making heatmap showing SV abundances (post-filtering) in each host
* `host (species-level) vs microbe (SV-level)`

# Var

In [158]:
# output directory for this analysis (passed to make_dir() and setwd() below)
work_dir <- '/ebio/abt3_projects/Georg_animal_feces/data/host_specific/clado_map/'

# serialized phyloseq object (read with readRDS)
physeq_file <- '/ebio/abt3_projects/Georg_animal_feces/data/16S/LLA/phyloseq//physeq_SpecD.RDS'

# host cladogram (read with ape::read.tree)
clado_file <- '/ebio/abt3_projects/Georg_animal_feces/data/animal/phylogeny/cladogram//unified_metadata_20171206_r5k_SpecD-rn.nwk'

# minimum fraction of samples in which an SV must be present (> 0) to be kept
sparsity_cutoff <- 0.05

# Init

In [159]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(stringr)
library(ape)
library(phyloseq)
source('/ebio/abt3_projects/Georg_animal_feces/code/init.R') 

In [160]:
# make_dir() is not defined in this notebook; presumably it comes from the
# sourced init.R and creates the directory when missing -- TODO confirm
make_dir(work_dir)
# NOTE: changes the working directory for every subsequent cell
setwd(work_dir)

Directory already exists: /ebio/abt3_projects/Georg_animal_feces/data/host_specific/clado_map/ 


# Load

## host cladogram

In [161]:
# read the host cladogram; its tip labels are compared against sample names later
host_clado <- read.tree(file = clado_file)
host_clado


Phylogenetic tree with 131 tips and 58 internal nodes.

Tip labels:
	Eurasian.Beaver, Alpine.Marmot, Eurasian.Red.Squirrel, European.Ground.Squirrel, Long.tailed.Field.Mouse, Yellow.necked.Field.Mouse, ...

Unrooted; includes branch lengths.

## physeq

In [165]:
# load the serialized phyloseq object
physeq <- readRDS(file = physeq_file)
physeq

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 19190 taxa and 131 samples ]
sample_data() Sample Data:       [ 131 samples by 100 sample variables ]
tax_table()   Taxonomy Table:    [ 19190 taxa by 7 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 19190 tips and 18939 internal nodes ]

### filtering

In [166]:
# no-genus designation
# genera = physeq %>% tax_table %>% as.data.frame %>% .$Genus %>% unique %>% sort %>% as.vector
# to_rm = c('unclassified', 'uncultured', 'unidentified', 'Incertae Sedis', '') 
# to_rm = c(to_rm, genera[grepl('uncultured', genera)])
# to_rm

In [167]:
# per-sample conversion of counts to percent of the sample total
to_percent <- function(x) x / sum(x) * 100
# TRUE when a taxon is present (> 0) in at least `sparsity_cutoff` of samples
is_prevalent <- function(x) sum(x > 0) / length(x) >= sparsity_cutoff

physeq <- transform_sample_counts(physeq, to_percent)
#physeq <- subset_taxa(physeq, !Genus %in% to_rm)
# prune = TRUE: return the pruned phyloseq object rather than a logical vector
physeq <- filter_taxa(physeq, is_prevalent, prune = TRUE)
physeq

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 186 taxa and 131 samples ]
sample_data() Sample Data:       [ 131 samples by 100 sample variables ]
tax_table()   Taxonomy Table:    [ 186 taxa by 7 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 186 tips and 185 internal nodes ]

In [168]:
# sample metadata as a plain data.frame (via matrix, so all columns share one type)
metadata <- as.data.frame(as.matrix(sample_data(physeq)))
# status() is not defined in this notebook; presumably from the sourced init.R
status(metadata)

“Setting class(x) to NULL;   result will no longer be an S4 object”

[1] 131 100


Unnamed: 0,BarcodeSequence,LinkerPrimerSequence,Primer,ReversePrimer,Barcode_ID,sample_number,host_subj_id,host_common_name,scientific_name,scientific_name_subspecies,⋯,PCR_dil_qPCR_2,Ct_A_2,Ct_B_2,Amount_SYBR_copies_in_5_ul_A_2,Amount_SYBR_copies_in_5_ul_B_2,Amount_geometric_mean_SYBR_2,Copies_in_5_ul_PCR_template_2,Copies_per_100_ul_extract_2,wellplate_Illumina,wellplate_position_Illumina
X1.Eurasian.Beaver,ACTCACGGTATG,CATGCTGCCTCCCGTAGGAGT,GCCTCCCTCGCGCCATCAG,AGAGTTTGATCCTGGCTCAG,338R_BC0049,1,1,Eurasian.Beaver,Castor fiber,,⋯,,,,,,,,,2,B12
X109.Red.Fox,ACTGTCGAAGCT,CATGCTGCCTCCCGTAGGAGT,GCCTCCCTCGCGCCATCAG,AGAGTTTGATCCTGGCTCAG,338R_BC0057,109,109,Red.Fox,Vulpes vulpes,,⋯,16.0,15.01,,2.340E+7X,,,374000000.0,7490000000.0,4,F2
X116.Common.Kestrel,ACACGGTGTCTA,CATGCTGCCTCCCGTAGGAGT,GCCTCCCTCGCGCCATCAG,AGAGTTTGATCCTGGCTCAG,338R_BC0011,116,116,Common.Kestrel,Falco tinnunculus,,⋯,16.0,20.79,,5.210E+5X,,,8340000.0,167000000.0,1,C11


# Getting occurrence of SVs in hosts

In [175]:
# otu format: abundance table as a plain data.frame (rows = SVs, columns = samples)
otu <- physeq %>%
    otu_table %>%
    as.matrix %>%
    as.data.frame

# move the SV IDs out of the row names into a regular column
otu$OTU <- rownames(otu)
# seq_len() stays valid for an empty table, whereas 1:nrow() would yield c(1, 0)
rownames(otu) <- seq_len(nrow(otu))

# long format: one row per (SV, sample) pair
otu <- otu %>%
    gather(sample, count, -OTU) %>%
    # strip the leading "<letter><digits>." prefix from sample names (e.g. "X109.")
    mutate(sample = gsub('^[A-Z][0-9]+b*\\.', '', sample)) %>%
    # values were converted to percentages during filtering, hence the name
    rename('count_perc' = count)

otu %>% status

[1] 24366     3


OTU,sample,count_perc
368bd5c2255f3a5ce0fef06ca6de1d04,Eurasian.Beaver,0
779324e5648460c5a67e577c45131b42,Eurasian.Beaver,0
d3d51a026150fca2a3400c59a6b311cb,Eurasian.Beaver,0


In [176]:
# adding taxonomy: one row per SV, all ranks except Species
tax <- physeq %>%
    tax_table %>%
    as.matrix %>%
    as.data.frame %>%
    dplyr::select(-Species)

tax$OTU <- rownames(tax)
# seq_len() stays valid for an empty table, whereas 1:nrow() would yield c(1, 0)
rownames(tax) <- seq_len(nrow(tax))

# joining abundances with taxonomy and building a readable "<Genus>_SV<n>" label.
# Uninformative genus names fall back to Family, then Order, then Class.
otu <- otu %>%
    inner_join(tax, c('OTU'='OTU')) %>%
    arrange(Domain, Phylum, Class, Order, Family, Genus) %>%
    mutate(Genus = Genus %>% as.character,
           Family = Family %>% as.character,
           Order = Order %>% as.character,
           Class = Class %>% as.character,
           Genus = ifelse(Genus == '', Family, Genus),
           # '[Ii]ncertae' matches the 'Incertae Sedis' placeholder; the former
           # pattern 'insertia' could never match it
           Genus = ifelse(grepl('uncultured|[Ii]ncertae|unclassified', Genus), Family, Genus),
           Genus = ifelse(grepl('uncultured rumen|unclassified', Genus), Order, Genus),
           Genus = ifelse(grepl('uncultured bacterium|unclassified', Genus), Class, Genus),
           # numeric SV ID = position of the OTU hash among the sorted factor levels
           SV = paste0('SV', OTU %>% as.factor %>% as.numeric)) %>%
    unite(Genus_SV, Genus, SV)

# transforming values: log10 of percent abundance; zeros (-Inf after log10)
# are replaced by the smallest finite log10 value in the table
otu <- otu %>%
    mutate(count_perc = log10(count_perc),
           count_perc = ifelse(is.infinite(count_perc), NA, count_perc),
           count_perc = ifelse(is.na(count_perc),
                               min(count_perc, na.rm=TRUE),
                               count_perc))

# the taxonomy table is no longer needed
tax <- NULL

# status
otu$OTU %>% unique %>% length %>% print
otu$count_perc %>% summary %>% print
otu %>% status

[1] 186
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 -1.699  -1.699  -1.699  -1.613  -1.699   1.963 
[1] 24366     9


OTU,sample,count_perc,Domain,Phylum,Class,Order,Family,Genus_SV
368bd5c2255f3a5ce0fef06ca6de1d04,Eurasian.Beaver,-1.69897,Archaea,Euryarchaeota,Methanobacteria,Methanobacteriales,Methanobacteriaceae,Methanobrevibacter_SV50
779324e5648460c5a67e577c45131b42,Eurasian.Beaver,-1.69897,Archaea,Euryarchaeota,Methanobacteria,Methanobacteriales,Methanobacteriaceae,Methanobrevibacter_SV92
d3d51a026150fca2a3400c59a6b311cb,Eurasian.Beaver,-1.69897,Archaea,Euryarchaeota,Methanobacteria,Methanobacteriales,Methanobacteriaceae,Methanobrevibacter_SV150


In [177]:
# rows per genus label (SV suffix stripped). The column is named Genus_SV;
# `otu$Genus` only worked through `$` partial matching on a data.frame.
gsub('_SV.+', '', otu$Genus_SV) %>% table

.
              Acinetobacter                   Aeromonas 
                        917                         262 
                Akkermansia                   Alistipes 
                        131                         786 
                Anaerovorax                Arthrobacter 
                        131                         131 
                Bacteroides                   Bilophila 
                        786                         131 
                    Blautia               Cetobacterium 
                        131                         131 
            Christensenella         Christensenellaceae 
                        131                        1179 
                Citrobacter               Clostridiales 
                        262                         131 
Clostridium sensu stricto 1                 Coprococcus 
                       1048                         262 
              Desulfovibrio                       Dorea 
                        131  

In [178]:
# 16S tree of the retained SVs
tree <- phy_tree(physeq)

# lookup table OTU ID -> Genus_SV label, keyed by row name
tmp <- as.data.frame(dplyr::distinct(otu, OTU, Genus_SV))
rownames(tmp) <- tmp$OTU

# put the lookup rows in tip order, then swap the OTU IDs for the labels
tmp <- tmp[tree$tip.label, ]
tree$tip.label <- tmp$Genus_SV

# status
tree


Phylogenetic tree with 186 tips and 185 internal nodes.

Tip labels:
	Methanobrevibacter_SV50, Methanobrevibacter_SV92, Methanobrevibacter_SV150, Methanobrevibacter_SV19, Akkermansia_SV12, Pseudomonas_SV60, ...
Node labels:
	0.435, 0.627, 0.783, 0.889, 0.863, 0.812, ...

Rooted; includes branch lengths.

# Formatting for iTOL

In [179]:
# wide table: one row per sample, one column per Genus_SV label
otu_m <- dplyr::select(otu, Genus_SV, sample, count_perc)
otu_m <- as.data.frame(spread(otu_m, Genus_SV, count_perc))

# sample names become row names instead of a column
rownames(otu_m) <- otu_m$sample
otu_m$sample <- NULL

# columns in order of first appearance in the long table
sv_order <- unique(otu$Genus_SV)
otu_m <- otu_m[, sv_order]

# status
print(summary(rowSums(otu_m)))
status(otu_m)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 -316.0  -310.5  -303.9  -300.1  -297.7  -232.2 
[1] 131 186


Unnamed: 0,Methanobrevibacter_SV50,Methanobrevibacter_SV92,Methanobrevibacter_SV150,Methanobrevibacter_SV19,Arthrobacter_SV29,Bacteroides_SV93,Bacteroides_SV78,Bacteroides_SV167,Bacteroides_SV38,Bacteroides_SV56,⋯,Acinetobacter_SV105,Acinetobacter_SV136,Acinetobacter_SV99,Pseudomonas_SV60,Pseudomonas_SV142,Pseudomonas_SV132,Mollicutes_SV114,Mollicutes_SV68,RF9_SV131,Akkermansia_SV12
Aesculapian.Snake,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,⋯,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897
African.Buffalo,-1.69897,-1.221849,-1.69897,-1.221849,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,⋯,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897
African.Bush.Elephant,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,⋯,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897,-1.69897


In [180]:
# sample overlap check: both differences should print character(0)
# cladogram tips that have no row in the heatmap matrix
print(setdiff(host_clado$tip.label, rownames(otu_m)))
# heatmap rows that have no tip in the cladogram
print(setdiff(rownames(otu_m), host_clado$tip.label))

character(0)
character(0)


In [181]:
# iTOL heatmap dataset file for all SVs
itol_htmp_file <- file.path(work_dir, 'SpecD-SV_htmp.txt')

# field labels: spaces replaced because the file is space-separated
labs <- paste0('FIELD_LABELS ',
               paste(gsub(' ', '_', colnames(otu_m)), collapse=' '),
               '\n')

# header; this first cat() (no append) creates/overwrites the file
cat('DATASET_HEATMAP\n',
    'SEPARATOR SPACE\n',
    'DATASET_LABEL SV\n',
    'COLOR #ff0000\n',
    'FIELD_TREE ',
    sep='', file=itol_htmp_file)
# SV tree goes on the FIELD_TREE line
# NOTE(review): the labels are appended right after write.tree(); confirm the
# FIELD_TREE line is newline-terminated in the written file
write.tree(tree, file=itol_htmp_file, append=TRUE)
cat(labs, 'DATA\n', sep='', file=itol_htmp_file, append=TRUE)

# data rows: sample name followed by one value per SV
write.table(otu_m, file=itol_htmp_file, append=TRUE, sep=' ',
            quote=FALSE, col.names=FALSE)
cat(paste('File written:', itol_htmp_file, '\n'))

File written: /ebio/abt3_projects/Georg_animal_feces/data/host_specific/clado_map//SpecD-SV_htmp.txt 


## Just methanogens

In [None]:
# ordering matrix: one row per sample, one column per Genus_SV label.
# Spreading by Genus_SV (not the OTU hash) is required so that the
# '^Methano' column filter below has taxon names to match against.
otu_m <- otu %>%
    dplyr::select(Genus_SV, sample, count_perc) %>%
    spread(Genus_SV, count_perc) %>%
    as.data.frame

rownames(otu_m) <- otu_m$sample
otu_m$sample <- NULL

# reordering (drop = FALSE keeps a data.frame even with a single column)
otu_m <- otu_m[, unique(otu$Genus_SV), drop = FALSE]

# filtering: keep only SVs whose label starts with 'Methano'
otu_m <- otu_m[, grepl('^Methano', colnames(otu_m)), drop = FALSE]

# status
rowSums(otu_m) %>% summary %>% print
otu_m %>% status

In [None]:
# iTOL heatmap dataset file for the methanogen subset (no FIELD_TREE line)
itol_htmp_file <- file.path(work_dir, 'SpecD-SV_htmp-Meth.txt')

# field labels: spaces replaced because the file is space-separated
labs <- paste0('FIELD_LABELS ',
               paste(gsub(' ', '_', colnames(otu_m)), collapse=' '),
               '\n')

# header + labels; this cat() (no append) creates/overwrites the file
cat('DATASET_HEATMAP\n',
    'SEPARATOR SPACE\n',
    'DATASET_LABEL SV-meth\n',
    'COLOR #ff0000\n',
    labs,
    'DATA\n',
    sep='', file=itol_htmp_file)

# data rows: sample name followed by one value per column of otu_m
write.table(otu_m, file=itol_htmp_file, append=TRUE, sep=' ',
            quote=FALSE, col.names=FALSE)
cat(paste('File written:', itol_htmp_file, '\n'))

# sessionInfo()

In [None]:
# record R version and loaded package versions for reproducibility
sessionInfo()