/
panaroo_2_pagoo.R
173 lines (144 loc) · 5.72 KB
/
panaroo_2_pagoo.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#' @name panaroo_2_pagoo
#' @title Read panaroos's output into a pagoo's R6 class object
#' @description This function handle conversion of \href{https://gtonkinhill.github.io/panaroo/#/}{panaroo}'s output files into
#' a pagoo R6 class object. It takes the "gene_presence_absence.csv" file and
#' (optionally but recommended) gff input file paths, and returns an object of
#' class \code{\link[pagoo]{PgR6MS}} (or \code{\link[pagoo]{PgR6M}} if left
#' empty the \code{gffs} argument). Panaroo identifies some genes with unusual
#' lengths tagging them with 'stop', 'length', or 'refound' labels. In the
#' current version, this function discards those genes.
#'
#' @param gene_presence_absence_csv \code{character}, path to the
#' "gene_presence_absence.csv" file. (Do not confuse with the file with the
#' same name but with \code{.Rtab} extension).
#' @param gffs A \code{character} vector with paths to original gff files used
#' as roary's input. Typically the return value of \code{list.files()} function.
#' This parameter is optional but highly recommended if you want to manipulate
#' sequences.
#' @param sep \code{character}. Default: \code{"__"} (two underscores). See
#' \link[pagoo]{PgR6MS} for a more detail argument description.
#' @return A pagoo's R6 class object. Either \link[pagoo]{PgR6M}, if \code{gffs}
#' argument is left empty, or \link[pagoo]{PgR6MS} if path to gff files is
#' provided.
#' @references
#' Tonkin-Hill, G., MacAlasdair, N., Ruis, C. et al. Producing polished
#' prokaryotic pangenomes with the Panaroo pipeline. Genome Biol 21, 180 (2020).
#' https://doi.org/10.1186/s13059-020-02090-4
#' @examples
#' \dontrun{
#' gffs <- list.files(path = "path/to/gffs/",
#' pattern = "[.]gff$",
#' full.names = TRUE)
#' gpa_csv <- "path/to/gene_presence_absence.csv"
#'
#' library(pagoo)
#' pg <- panaroo_2_pagoo(gene_presence_absence_csv = gpa_csv,
#' gffs = gffs)
#' }
#' @importFrom utils read.csv
#' @importFrom Biostrings DNAStringSetList
#' @importFrom S4Vectors mcols
#' @importFrom stats setNames
#' @importFrom magrittr `%>%`
#' @export
panaroo_2_pagoo <- function(gene_presence_absence_csv, gffs, sep = '__'){
message('Reading csv file (panaroo).')
df <- read.csv(gene_presence_absence_csv,
header = TRUE,
sep = ',',
stringsAsFactors = FALSE,
check.names = FALSE,
colClasses = "character")
message('Processing csv file.')
infocols <- match(c("Gene", "Non-unique Gene name", "Annotation"), colnames(df), nomatch = 0)
df[, -infocols] <- lapply(df[, -infocols], strsplit, ";")
# Identify weird genes:
dfrm <- apply(df[, -infocols], 2,
# Look for weird genes:
function(x){
grep("(_refound|_stop|_len)_*", x)
}
) %>% {
# Create a data.frame with the coordinates of the match in the original
# data.frame:
data.frame(
columns = mapply(rep, names(.), lengths(.)) %>% unlist(use.names = FALSE),
rows = unlist(., use.names = FALSE)
)
} %>%
# Add the index of the match (It could be a paralogue):
{
idx <- mapply(function(row, column, x){
`[[`(x, row, column) %>% grep("(_refound|_stop|_len)_*", .)
}, row = .$rows, column = .$columns,
MoreArgs = list(x=df[, -infocols]))
.$index <- idx
.
}
# Remove those weird genes from the data.frame:
if (nrow(dfrm)) paste0("Removing ", nrow(dfrm), " genes tagged as 'refound', 'stop', and/or 'length' by panaroo.") %>%
message()
for (r in seq_len(nrow(dfrm))){
COL <- dfrm$columns[[r]]
ROW <- dfrm$rows[[r]]
INDEX <- dfrm$index[[r]]
df[[COL]][[ROW]] <- df[[COL]][[ROW]][-INDEX]
}
# remove clusters without genes (seems to be a panaroo's bug)
emptyclus <- which(rowSums(do.call(cbind, lapply(df[,- infocols], lengths))) == 0)
if (length(emptyclus)){
df <- df[-emptyclus, ,drop=FALSE]
}
cluster_meta <- df[, c('Gene', 'Annotation')]
colnames(cluster_meta) <- c('cluster', 'Annotation')
dims <- dim(df)
df <- df[,-infocols]
lp <- lapply(df, setNames, cluster_meta$cluster)
lths <- lapply(lp, lengths)
sums <- sapply(lths, sum)
mm <- cbind(
unlist(lp, use.names = FALSE),
unlist(lapply(lths, function(x){
unlist(rep(names(x), x), use.names = FALSE)
}), use.names = FALSE),
rep(names(sums), sums)
)
mm <- as.data.frame(mm)
colnames(mm) <- c('gene', 'cluster', 'org')
if(missing(gffs)){
message('Loading PgR6M class object.')
pg <- PgR6M$new(data = mm,
cluster_meta = cluster_meta,
sep = sep)
}else{
names(gffs) <- sub('[.]gff3*$', '', basename(gffs))
seqs <- lapply(gffs, function(x){
message(paste('Reading gff file', x))
sq <- read_gff(x)
#There are some tRNAs genes that are not listed in
# the gene_presence_absence.csv file
sq <- sq[which(names(sq) %in% mm$gene)]
sq
})
# Get gene information from gffs
mcls <- lapply(seqs, mcols)
mcls <- lapply(names(mcls), function(x){
mcls[[x]]$org <- x
mcls[[x]]
})
# Selected columns
cols <- c('seqid', 'type', 'start', 'end', 'strand', 'product', 'org', 'locus_tag')
mcls <- lapply(mcls, function(x) x[ , cols])
# Match !
mcls <- do.call(rbind, mcls)
ma <- match(paste(mm$org, mm$gene, sep=sep), paste(mcls$org, mcls$locus_tag, sep = sep))
mm <- cbind(mm, mcls[ma, c('seqid', 'type', 'start', 'end', 'strand', 'product')])
message('Loading PgR6MS class object.')
pg <- PgR6MS$new(data = mm,
cluster_meta = cluster_meta,
sequences = DNAStringSetList(seqs),
sep = sep)
}
message('Done.')
return(pg)
}