-
Notifications
You must be signed in to change notification settings - Fork 0
/
coil_processing.R
147 lines (127 loc) · 5.43 KB
/
coil_processing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# Script to identify indels and stop codons in COI sequences using coil
# Requires an ESV table in .xlsx with BOLDigger taxonomy results
# Note: the script leaves non-animal sequences untouched, only animal sequences are checked
library(seqinr)
library(readxl)
library(openxlsx)
library(dplyr)
library(coil)
# Options
## Give path to ESV table with BOLDigger taxonomy
esv_file = "/Users/christopherhempel/Desktop/RSDE COI water project/apscale/rsde-coi-water-otu_97_apscale/rsde-coi-water-otu_97_apscale_ESV_table_with_BOLDigger.xlsx"
## If you want to visually inspect sequences flagged by coil, set this to FALSE
# Otherwise, if TRUE, flagged seqs will be dropped automatically
auto_drop = TRUE
# ESVs or OTUs?
unit="ESV"
# Read in df
df <- read_excel(esv_file)
# Fix capitalization issue of ranks if ranks are not capitalized
if ("phylum" %in% colnames(df)) {
# Code to execute when ranks are capitalized
colnames(df)[colnames(df) == "phylum"] <- "Phylum"
colnames(df)[colnames(df) == "class"] <- "Class"
colnames(df)[colnames(df) == "order"] <- "Order"
colnames(df)[colnames(df) == "family"] <- "Family"
}
# Function to identify the genetic code of the lowest rank that is recognized by coil's function which_translate_table
genetic_code_lowest_rank <- function(row) {
result <- "Not identifiable"
for (col in rev(names(row))) {
value <- row[[col]]
if (is.na(value)) {
next # Skip processing if value is NA
}
if (value=="No Match") {
break # Skip processing if value is NA
}
tryCatch({
result <- which_trans_table(value)
break # Break out of the loop if processing succeeds
}, error = function(e) {
# Continue to the next column if an error occurs
warning(paste("Error for taxon", value, ":", e$message))
}, warning = function(w) {
# Continue to the next column if a warning occurs
message(paste("Warning for taxon", value, ":", w$message))
})
}
return(result)
}
# Apply the process_row function to each row of the dataframe
df$genetic_code <- apply(df[c("Phylum", "Class", "Order", "Family")], 1, genetic_code_lowest_rank)
# Create flag if coil can't process sequence (= not reliable)
# (likely because it's no animal sequenced due to no genetic code being found)
df <- df %>%
mutate(not_reliable = genetic_code == "Not identifiable")
# Turn unidentifiable genetic codes
df <- df %>%
mutate(genetic_code = ifelse(genetic_code == "Not identifiable", 0, genetic_code))
# Genetic code needs to be in format integer
df$genetic_code <- as.integer(df$genetic_code)
##### Formatting done
##### Start coil
# For primers mlCO1intF and jgHCO2198R, start position is 343/115 (nt/aa) and end is 657/218 (nt/aa)
# See coil vignette for more details
nt_start = 346
nt_end = 657
aa_start = 116
aa_end = 218
#Subsetting the PHHM to target length
meta_nt_phmm = subsetPHMM(nt_coi_PHMM, start = nt_start, end = nt_end)
meta_aa_phmm = subsetPHMM(aa_coi_PHMM, start = aa_start, end = aa_end)
# TO DO CUT DOWN DF
df_reliable = df[!df$not_reliable, ]
df_unreliable = df[df$not_reliable, ]
# Running coil
# Note: I turned on triple translate just in case the PHMM subsetting is not 100% accurate
# Runs longer but is safer
coil_result_df = flatten_coi5p(
lapply(1:length(df_reliable$ID), function(i){
coi5p_pipe(df_reliable$Seq[i],
name = df_reliable$ID[i],
trans_table = df_reliable$genetic_code[i],
nt_PHMM = meta_nt_phmm,
aa_PHMM = meta_aa_phmm,
triple_translate=TRUE)
})
)
# Concatenating df and coil output
df_with_coil = cbind(df_reliable, coil_result_df)
# Create coil flag columns that flags sequences that contain either an indel or stop codon
df_with_coil$coil_flag <- df_with_coil$indel_likely | df_with_coil$stop_codons
df_with_coil$coil_flag <- ifelse(df_with_coil$coil_flag == TRUE, "CONTAINS INDEL OR STOP CODON", NA)
# Concatenate dfs
df_unreliable <- df_unreliable %>%
rename_all(~ ifelse(. %in% colnames(df_with_coil), ., NA))
concatenated_df <- bind_rows(df_with_coil, df_unreliable)
# Sort df by ID names
concatenated_df <- concatenated_df %>%
arrange(as.numeric(gsub(paste0(unit, "_"), "", ID)))
if (auto_drop) {
# Set outfile name
outfile <- sub("\\.xlsx$", "_coil_filtered.xlsx", esv_file)
# Drop rows that are flagged
num_dropped = sum(!is.na(concatenated_df$coil_flag))
paste("Dropping", num_dropped, "flagged sequences.")
concatenated_df = subset(concatenated_df, is.na(coil_flag))
# Drop coil-related columns
concatenated_df = select(concatenated_df, -genetic_code:-coil_flag)
write.xlsx(concatenated_df, file = outfile)
} else {
# Set outfile name
outfile <- sub("\\.xlsx$", "_with_coilflag.xlsx", esv_file)
# Cut down df
concatenated_df = select(concatenated_df, -genetic_code:-stop_codons)
# Write the df to an Excel file and highlight flagged sequences in red
wb <- createWorkbook() # create a workbook
addWorksheet(wb, "Sheet") #add a worksheet to the workbook
writeData(wb, "Sheet", concatenated_df) # write data into the worksheet of the workbook
highlight_style <- createStyle(fontColour = "#000000", bgFill = "#FF0000") # create highlight style
# Apply style to cells containing flag
conditionalFormatting(wb, "Sheet", cols = 1:ncol(concatenated_df),
rows = 1:nrow(concatenated_df), rule = "CONTAINS INDEL OR STOP CODON",
style = highlight_style,
type = "contains")
saveWorkbook(wb, outfile, overwrite = TRUE)
}