-
Notifications
You must be signed in to change notification settings - Fork 3
/
build_phyloseq.R
70 lines (57 loc) · 2.31 KB
/
build_phyloseq.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# Script to build phyloseq object
library(stringr)
library(phyloseq)
library(reshape2)
# Get command line arguments
args = commandArgs(trailingOnly=TRUE)
# If two arguments not provided, return an error
if (length(args) < 5) {
stop("must specify:
1: tax data (e.g., nw_tophits.tsv);
2: sample data (e.g., mapping_file.txt);
3: OTU table (.tsv);
4: duplicate reference taxa names (e.g., ITS2db_trimmed_notuniques_otus.txt)
5: output .RData filename", call.=FALSE)
}
# Import taxonomic assignment data from nw
read.nw <- function(file) {
nw <- read.table(file, quote="", stringsAsFactors=FALSE)
nw <- nw[order(nw$otu),]
return(nw)
}
tax <- read.nw(args[1])
# Deal with identical taxonomic assignments (because some reference sequences are not unique...)
# Create names for groups of identical sequences based on members of group...
ident <- readLines(args[4])
ident <- gsub("denovo[0-9]*\t", "", ident)
ident <- strsplit(ident, split="\t")
ident2 <- lapply(ident, str_match_all, pattern="[A-I]{1}[0-9]{1,3}.*[_/]")
ident2 <- lapply(ident2, function(g) gsub(pattern="\\..*_$", "", x=unlist(g)))
ident2 <- lapply(ident2, function(g) gsub(pattern="_$", "", g))
ident2 <- lapply(ident2, function(g) unlist(strsplit(g, split="/")))
subtypes <- lapply(ident2, function(x) levels(factor(unlist(x))))
subtypes <- lapply(subtypes, function(s) paste(paste(s, collapse="/"), "_multiple", sep=""))
names(ident) <- subtypes
ident <- melt(do.call(rbind, ident))
ident <- unique(ident[order(ident[,1]), c(3,1)])
# Replace any sequence name in taxonomy assignment that is a member of a group of identical sequences with the name of the group
collapse.idents <- function(df) {
within(df, {
for (i in 1:nrow(ident)) {
hit <- gsub(ident[i,1], ident[i,2], hit)
}
})
}
tax <- collapse.idents(tax)
tax <- with(tax, tax[order(otu, -sim), ])
tax <- tax[!duplicated(tax$otu), ]
rownames(tax) <- tax$otu
# Import sample data
sam <- sample_data(read.delim(args[2], sep="\t", header=T, row.names=1))
# Import OTU tables
otu <- otu_table(read.table(args[3], header=T, check.names=F, row.names=1,
skip=1, sep="\t", comment.char=""), taxa_are_rows=T)
# Build phyloseq objects
phy <- phyloseq(otu, tax_table(as.matrix(tax)), sam)
# Save phyloseq object
save(phy, file=file.path(args[5]))