Skip to content

Commit

Permalink
Move chem16S::calc_metrics() to calc.metrics()
Browse files Browse the repository at this point in the history
  • Loading branch information
jedick committed Mar 2, 2024
1 parent db4e503 commit a5cc89d
Show file tree
Hide file tree
Showing 9 changed files with 348 additions and 13 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Date: 2024-03-01
Date: 2024-03-02
Package: canprot
Version: 1.1.2-27
Version: 1.1.2-28
Title: Chemical Analysis of Proteins
Authors@R: c(
person("Jeffrey", "Dick", email = "j3ffdick@gmail.com", role = c("aut", "cre"),
Expand Down
4 changes: 3 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ export(
# 20240228 moved from CHNOSZ
"read.fasta", "count.aa", "aasum",
# 20240301
"V0", "add.cld"
"V0", "add.cld", "pV0",
# 20240302 moved from chem16S
"HC", "NC", "OC", "SC", "calc.metrics"
)

# Imports from default packages
Expand Down
2 changes: 1 addition & 1 deletion R/add.cld.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,5 @@ add.cld <- function(datlist, bp, dx = NULL, dy = NULL) {
# Add to plot
text((1:ngroup) + dx, bp$stats[4, ] + dy, letters)
# Return dx, dy, and letters
list(dx = dx, dy = dy, letters = letters)
invisible(list(dx = dx, dy = dy, letters = letters))
}
52 changes: 52 additions & 0 deletions R/calc.metrics.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# canprot/calc.metrics.R
# Calculate selected chemical metrics for proteins
# 20191027 initial version as canprot/metrics.R
# 20230704 adapted for chem16S/calc_metrics.R
# 20240302 moved to canprot
calc.metrics <- function(AAcomp, metrics = c("Zc", "nO2", "nH2O")) {

## Define objects used in various calculations
# The number of C in each amino acid residue; calculated in CHNOSZ:
# nC_AA <- sapply(makeup(info(info(aminoacids("")))$formula), "[", "C")
# nC_AA <- nC_AA
# names(nC_AA) <- aminoacids(3)
nC_AA <- c(Ala = 3, Cys = 3, Asp = 4, Glu = 5, Phe = 9, Gly = 2, His = 6,
Ile = 6, Lys = 6, Leu = 6, Met = 5, Asn = 4, Pro = 5, Gln = 5,
Arg = 6, Ser = 3, Thr = 4, Val = 5, Trp = 11, Tyr = 9)
# Identify columns with 3-letter abbreviations for the amino acids
isAA <- tolower(colnames(AAcomp)) %in% tolower(names(nC_AA))
iAA <- match(tolower(colnames(AAcomp)[isAA]), tolower(names(nC_AA)))

values <- lapply(metrics, function(metric) {

if(metric == "Zc") {
Zc(AAcomp)
} else if(metric == "nH2O") {
nH2O(AAcomp)
} else if(metric == "nO2") {
nO2(AAcomp)
} else if(metric == "GRAVY") {
GRAVY(AAcomp)
} else if(metric == "pI") {
pI(AAcomp)
} else if(metric == "MW") {
MW(AAcomp)
} else if(tolower(metric) %in% c("length", "plength")) {
plength(AAcomp)
} else if(metric %in% c("H/C", "H_C", "HC")) {
HC(AAcomp)
} else if(metric %in% c("N/C", "N_C", "NC")) {
NC(AAcomp)
} else if(metric %in% c("O/C", "O_C", "OC")) {
OC(AAcomp)
} else if(metric %in% c("S/C", "S_C", "SC")) {
SC(AAcomp)
} else stop(paste0("'", metric, "' is not an available metric"))

})

values <- do.call(cbind, values)
colnames(values) <- metrics
as.data.frame(values)

}
3 changes: 2 additions & 1 deletion R/cplab.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ cplab <- list(
DnC = quote(Delta*italic(n)[C] * "/AA"),
DnN = quote(Delta*italic(n)[N] * "/AA"),
DnS = quote(Delta*italic(n)[S] * "/AA"),
V0 = quote(list(italic("V") * degree, "cm" ^ 3 ~ "mol" ^ -1)),
V0 = quote("Volume per residue (cm" ^ 3 ~ "mol" ^ -1 * ")"),
pV0 = quote("Volume per protein (cm" ^ 3 ~ "mol" ^ -1 * ")"),
DV0 = quote(list(Delta * italic("V") * degree, "cm" ^ 3 ~ "mol" ^ -1)),
nAA = quote(italic(n)[AA]),
DnAA = quote(Delta*italic(n)[AA]),
Expand Down
142 changes: 134 additions & 8 deletions R/metrics.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# Carbon oxidation state 20180228
# An unused second argument (...) is provided for flexible do.call() constructions
Zc <- function(AAcomp, ...) {
# The number of carbons of the amino acids
# The number of carbon atoms in each amino acid
nC_AA <- c(Ala = 3, Cys = 3, Asp = 4, Glu = 5, Phe = 9, Gly = 2, His = 6,
Ile = 6, Lys = 6, Leu = 6, Met = 5, Asn = 4, Pro = 5, Gln = 5,
Arg = 6, Ser = 3, Thr = 4, Val = 5, Trp = 11, Tyr = 9)
Expand All @@ -28,7 +28,7 @@ Zc <- function(AAcomp, ...) {
Zctot / nCtot
}

# Stoichiometric hydration state 20181228
# Per-residue stoichiometric hydration state 20181228
# Add 'terminal_H2O' argument 20221018
nH2O <- function(AAcomp, basis = "QEC", terminal_H2O = 0) {
if(basis == "QEC") {
Expand Down Expand Up @@ -64,7 +64,7 @@ nH2O <- function(AAcomp, basis = "QEC", terminal_H2O = 0) {
nH2O / rowSums(AAcomp[, isAA, drop = FALSE])
}

# Stoichiometric oxidation state 20201016
# Per-residue stoichiometric oxidation state 20201016
nO2 <- function(AAcomp, basis = "QEC", ...) {
if(basis == "QEC") {
# To get the number of O2 in reactions to form amino acid residues from the "QEC" basis:
Expand Down Expand Up @@ -142,7 +142,7 @@ pI <- function(AAcomp, terminal_H2O = 1, ...) {
apply(myAA, 1, onepI)
}

# Molecular weight 20200501
# Per-residue molecular weight 20200501
MW <- function(AAcomp, terminal_H2O = 0, ...) {
# Mass per residue:
# MW_AA <- sapply(CHNOSZ::makeup(info(aminoacids(""))), mass) - mass("H2O")
Expand All @@ -154,7 +154,7 @@ MW <- function(AAcomp, terminal_H2O = 0, ...) {
Thr = 101.10508, Val = 99.13256, Trp = 186.2132, Tyr = 163.17596
)
# Find columns with names for the amino acids
isAA <- colnames(AAcomp) %in% names(MW_AA)
isAA <- tolower(colnames(AAcomp)) %in% tolower(names(MW_AA))
iAA <- match(colnames(AAcomp)[isAA], names(MW_AA))
# Calculate total MW of residues in each protein
MW <- rowSums(t(t(AAcomp[, isAA, drop = FALSE]) * MW_AA[iAA]))
Expand All @@ -166,7 +166,7 @@ MW <- function(AAcomp, terminal_H2O = 0, ...) {
MW / rowSums(AAcomp[, isAA, drop = FALSE])
}

# Volume 20240301
# Per-residue volume 20240301
V0 <- function(AAcomp, terminal_H2O = 0, ...) {
# Volume per residue using group contributions from Dick et al., 2006:
# i.e. residue = [sidechain group] + [backbone group]
Expand All @@ -178,7 +178,7 @@ V0 <- function(AAcomp, terminal_H2O = 0, ...) {
Ser = 53.338, Thr = 70.326, Val = 83.575, Trp = 136.341, Tyr = 117.2
)
# Find columns with names for the amino acids
isAA <- colnames(AAcomp) %in% names(V0_AA)
isAA <- tolower(colnames(AAcomp)) %in% tolower(names(V0_AA))
iAA <- match(colnames(AAcomp)[isAA], names(V0_AA))
# Calculate total V0 of residues in each protein
V0 <- rowSums(t(t(AAcomp[, isAA, drop = FALSE]) * V0_AA[iAA]))
Expand All @@ -190,18 +190,144 @@ V0 <- function(AAcomp, terminal_H2O = 0, ...) {
V0 / rowSums(AAcomp[, isAA, drop = FALSE])
}

# Per-protein volume 20240301
pV0 <- function(AAcomp, terminal_H2O = 0, ...) {
# Volume per residue using group contributions from Dick et al., 2006:
# i.e. residue = [sidechain group] + [backbone group]
# V0_AA <- info(info(paste0("[", aminoacids(3), "]")))$V + info(info("[UPBB]"))$V
# names(V0_AA) <- aminoacids(3)
V0_AA <- c(Ala = 53.16, Cys = 66.209, Asp = 67.412, Glu = 82.917, Phe = 114.841,
Gly = 35.902, His = 92.049, Ile = 98.5, Lys = 101.344, Leu = 100.496,
Met = 98.128, Asn = 70.122, Pro = 75.345, Gln = 86.374, Arg = 132.121,
Ser = 53.338, Thr = 70.326, Val = 83.575, Trp = 136.341, Tyr = 117.2
)
# Find columns with names for the amino acids
isAA <- tolower(colnames(AAcomp)) %in% tolower(names(V0_AA))
iAA <- match(colnames(AAcomp)[isAA], names(V0_AA))
# Calculate total V0 of residues in each protein
V0 <- rowSums(t(t(AAcomp[, isAA, drop = FALSE]) * V0_AA[iAA]))
# Add volume of H2O for each pair of terminal groups
# V0_H2O <- info(info("[AABB]"))$V - info(info("[UPBB]"))$V
V0_H2O <- 7.289
V0 + terminal_H2O * V0_H2O
}

# Protein length 20200501
plength <- function(AAcomp, ...) {
AA_names <- c(
"Ala", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile", "Lys", "Leu", "Met",
"Asn", "Pro", "Gln", "Arg", "Ser", "Thr", "Val", "Trp", "Tyr"
)
# Find columns with names for the amino acids
isAA <- colnames(AAcomp) %in% AA_names
isAA <- tolower(colnames(AAcomp)) %in% tolower(AA_names)
# Sum amino acid counts to get protein length
rowSums(AAcomp[, isAA])
}

# H/C (H:C ratio) 20230707
HC <- function(AAcomp, ...) {
# The number of H in each amino acid residue; calculated in CHNOSZ:
# nH_AA <- sapply(makeup(info(info(aminoacids("")))$formula), "[", "H")
# nH_AA <- nH_AA - 2 # Take H-OH off of amino acids to make residues
# names(nH_AA) <- aminoacids(3)
nH_AA <- c(Ala = 5, Cys = 5, Asp = 5, Glu = 7, Phe = 9, Gly = 3, His = 7,
Ile = 11, Lys = 12, Leu = 11, Met = 9, Asn = 6, Pro = 7, Gln = 8,
Arg = 12, Ser = 5, Thr = 7, Val = 9, Trp = 10, Tyr = 9)
# The number of carbon atoms in each amino acid
nC_AA <- c(Ala = 3, Cys = 3, Asp = 4, Glu = 5, Phe = 9, Gly = 2, His = 6,
Ile = 6, Lys = 6, Leu = 6, Met = 5, Asn = 4, Pro = 5, Gln = 5,
Arg = 6, Ser = 3, Thr = 4, Val = 5, Trp = 11, Tyr = 9)
# Find columns with names for the amino acids
isAA <- tolower(colnames(AAcomp)) %in% tolower(names(nH_AA))
iAA <- match(tolower(colnames(AAcomp)[isAA]), tolower(names(nH_AA)))
# Count the number of C in all residues
numC <- t(t(AAcomp[, isAA, drop = FALSE]) * nC_AA[iAA])
# Count the number of H in all residues
numH <- t(t(AAcomp[, isAA, drop = FALSE]) * nH_AA[iAA])
# Calculate the total number of H and C, then the overall H/C
Htot <- rowSums(numH)
Ctot <- rowSums(numC)
Htot / Ctot
}

# N/C (N:C ratio) 20230707
NC <- function(AAcomp, ...) {
# The number of N in each amino acid residue; calculated in CHNOSZ:
# nN_AA <- sapply(makeup(info(info(aminoacids("")))$formula), "[", "N")
# names(nN_AA) <- aminoacids(3)
nN_AA <- c(Ala = 1, Cys = 1, Asp = 1, Glu = 1, Phe = 1, Gly = 1, His = 3,
Ile = 1, Lys = 2, Leu = 1, Met = 1, Asn = 2, Pro = 1, Gln = 2,
Arg = 4, Ser = 1, Thr = 1, Val = 1, Trp = 2, Tyr = 1)
# The number of carbon atoms in each amino acid
nC_AA <- c(Ala = 3, Cys = 3, Asp = 4, Glu = 5, Phe = 9, Gly = 2, His = 6,
Ile = 6, Lys = 6, Leu = 6, Met = 5, Asn = 4, Pro = 5, Gln = 5,
Arg = 6, Ser = 3, Thr = 4, Val = 5, Trp = 11, Tyr = 9)
# Find columns with names for the amino acids
isAA <- tolower(colnames(AAcomp)) %in% tolower(names(nN_AA))
iAA <- match(tolower(colnames(AAcomp)[isAA]), tolower(names(nN_AA)))
# Count the number of C in all residues
numC <- t(t(AAcomp[, isAA, drop = FALSE]) * nC_AA[iAA])
# Count the number of N in all residues
numN <- t(t(AAcomp[, isAA, drop = FALSE]) * nN_AA[iAA])
# Calculate the total number of N and C, then the overall N/C
Ntot <- rowSums(numN)
Ctot <- rowSums(numC)
Ntot / Ctot
}

# O/C (O:C ratio) 20230707
OC <- function(AAcomp, ...) {
# The number of O in each amino acid residue; calculated in CHNOSZ:
# nO_AA <- sapply(makeup(info(info(aminoacids("")))$formula), "[", "O")
# nO_AA <- nO_AA - 1 # Take H-OH off of amino acids to make residues
# names(nO_AA) <- aminoacids(3)
nO_AA <- c(Ala = 1, Cys = 1, Asp = 3, Glu = 3, Phe = 1, Gly = 1, His = 1,
Ile = 1, Lys = 1, Leu = 1, Met = 1, Asn = 2, Pro = 1, Gln = 2,
Arg = 1, Ser = 2, Thr = 2, Val = 1, Trp = 1, Tyr = 2)
# The number of carbon atoms in each amino acid
nC_AA <- c(Ala = 3, Cys = 3, Asp = 4, Glu = 5, Phe = 9, Gly = 2, His = 6,
Ile = 6, Lys = 6, Leu = 6, Met = 5, Asn = 4, Pro = 5, Gln = 5,
Arg = 6, Ser = 3, Thr = 4, Val = 5, Trp = 11, Tyr = 9)
# Find columns with names for the amino acids
isAA <- tolower(colnames(AAcomp)) %in% tolower(names(nO_AA))
iAA <- match(tolower(colnames(AAcomp)[isAA]), tolower(names(nO_AA)))
# Count the number of C in all residues
numC <- t(t(AAcomp[, isAA, drop = FALSE]) * nC_AA[iAA])
# Count the number of O in all residues
numO <- t(t(AAcomp[, isAA, drop = FALSE]) * nO_AA[iAA])
# Calculate the total number of O and C, then the overall O/C
Otot <- rowSums(numO)
Ctot <- rowSums(numC)
Otot / Ctot
}

# S/C (S:C ratio) 20230707
SC <- function(AAcomp, ...) {
# The number of S in each amino acid residue; calculated in CHNOSZ:
# nS_AA <- sapply(makeup(info(info(aminoacids("")))$formula), "[", "S")
# nS_AA[is.na(nS_AA)] <- 0
# names(nS_AA) <- aminoacids(3)
nS_AA <- c(Ala = 0, Cys = 1, Asp = 0, Glu = 0, Phe = 0, Gly = 0, His = 0,
Ile = 0, Lys = 0, Leu = 0, Met = 1, Asn = 0, Pro = 0, Gln = 0,
Arg = 0, Ser = 0, Thr = 0, Val = 0, Trp = 0, Tyr = 0)
# The number of carbon atoms in each amino acid
nC_AA <- c(Ala = 3, Cys = 3, Asp = 4, Glu = 5, Phe = 9, Gly = 2, His = 6,
Ile = 6, Lys = 6, Leu = 6, Met = 5, Asn = 4, Pro = 5, Gln = 5,
Arg = 6, Ser = 3, Thr = 4, Val = 5, Trp = 11, Tyr = 9)
# Find columns with names for the amino acids
isAA <- tolower(colnames(AAcomp)) %in% tolower(names(nS_AA))
iAA <- match(tolower(colnames(AAcomp)[isAA]), tolower(names(nS_AA)))
# Count the number of C in all residues
numC <- t(t(AAcomp[, isAA, drop = FALSE]) * nC_AA[iAA])
# Count the number of S in all residues
numS <- t(t(AAcomp[, isAA, drop = FALSE]) * nS_AA[iAA])
# Calculate the total number of S and C, then the overall S/C
Stot <- rowSums(numS)
Ctot <- rowSums(numC)
Stot / Ctot
}


#########################
### UNEXPORTED OBJECT ###
### ( used in pI() ) ###
Expand Down
83 changes: 83 additions & 0 deletions inst/tinytest/test-calc.metrics.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
## Tests for Zc, nO2, and nH2O adapted from canprot/tests/test-metrics.R on 20230704

info <- "Results are as expected for Zc, nO2, and nH2O"

## Calculate metrics for a few proteins the "long way" (using functions in CHNOSZ)
#library(CHNOSZ)
#basis(c("glutamine", "glutamic acid", "cysteine", "H2O", "O2"))
#Zc.ref <- ZC(protein.formula(1:6))
#nO2.ref <- protein.basis(1:6)[, "O2"] / protein.length(1:6)
## NOTE: subtract 1 so as exclude terminal groups from calculation of nH2O
#nH2O.ref <- (protein.basis(1:6)[, "H2O"] - 1) / protein.length(1:6)

Zc.ref <- c(-0.11633875106929, -0.0272787757817698, -0.195689166193988, -0.0492957746478873, -0.170212765957447, 0.0163132137030995)
nO2.ref <- c(-0.699539170506912, -0.522294022617124, -0.81466049382716, -0.574137931034483, -0.716346153846154, -0.471317829457364)
nH2O.ref <- c(-1.17465437788018, -0.881098546042003, -0.941666666666667, -0.955172413793103, -0.730769230769231, -0.886821705426357)

# Calculate metrics using calc.metrics() function in chem16S
AAcomp <-
structure(list(protein = c("O08452", "AMY", "AMYA", "BPT1", "CYC",
"LYSC"), organism = c("PYRFU", "BACSU", "PYRFU", "BOVIN", "BOVIN",
"CHICK"), ref = c("UniProt", "UniProt", "UniProt", "UniProt",
"UniProt", "UniProt"), abbrv = c("O08452", "P00691", "P49067",
"P00974", "P62894", "P00698"), chains = c(1L, 1L, 1L, 1L, 1L,
1L), Ala = c(28, 49, 26, 6, 6, 12), Cys = c(5, 1, 2, 6, 2, 8),
Asp = c(33, 44, 35, 2, 3, 7), Glu = c(23, 23, 66, 2, 9, 2
), Phe = c(20, 20, 37, 4, 4, 3), Gly = c(45, 51, 44, 6, 14,
12), His = c(12, 16, 14, 0, 3, 1), Ile = c(25, 35, 41, 2,
6, 6), Lys = c(19, 30, 48, 4, 18, 6), Leu = c(27, 36, 59,
2, 6, 8), Met = c(4, 10, 12, 1, 2, 2), Asn = c(21, 54, 24,
3, 5, 14), Pro = c(20, 23, 28, 4, 4, 2), Gln = c(7, 29, 15,
1, 3, 3), Arg = c(14, 24, 35, 6, 2, 11), Ser = c(21, 55,
33, 1, 1, 10), Thr = c(16, 45, 12, 3, 8, 7), Val = c(31,
32, 59, 1, 3, 6), Trp = c(26, 14, 17, 0, 1, 6), Tyr = c(37,
28, 41, 4, 4, 3)), row.names = c(NA, 6L), class = "data.frame")

metrics <- calc.metrics(AAcomp)

# Perform the tests
expect_equivalent(metrics$Zc, Zc.ref, info = info)
expect_equivalent(metrics$nO2, nO2.ref, info = info)
expect_equivalent(metrics$nH2O, nH2O.ref, info = info)

## Tests for H/C, N/C, O/C, and S/C added on 20230707

AAcomp <-
structure(list(protein = c("LYSC", "RNAS1", "AMYA", "CSG"), organism = c("CHICK",
"BOVIN", "PYRFU", "HALJP"), ref = c("UniProt", "UniProt", "UniProt",
"UniProt"), abbrv = c("P00698", "P61823", "P49067", "Q9C4B4"),
chains = c(1L, 1L, 1L, 1L), Ala = c(12, 12, 26, 61), Cys = c(8,
8, 2, 0), Asp = c(7, 5, 35, 122), Glu = c(2, 5, 66, 86),
Phe = c(3, 3, 37, 20), Gly = c(12, 3, 44, 78), His = c(1,
4, 14, 4), Ile = c(6, 3, 41, 47), Lys = c(6, 10, 48, 4),
Leu = c(8, 2, 59, 47), Met = c(2, 4, 12, 0), Asn = c(14,
10, 24, 51), Pro = c(2, 4, 28, 29), Gln = c(3, 7, 15, 21),
Arg = c(11, 4, 35, 19), Ser = c(10, 15, 33, 74), Thr = c(7,
10, 12, 79), Val = c(6, 9, 59, 66), Trp = c(6, 0, 17, 2),
Tyr = c(3, 6, 41, 18)), row.names = c(6L, 9L, 3L, 14L), class = "data.frame")


# library(CHNOSZ)
# pf <- as.data.frame(protein.formula(AAcomp))
# pf$H <- pf$H - 2 # Remove terminal H-OH
# pf$O <- pf$O - 1 # Remove terminal H-OH
# HCref <- pf$H / pf$C
# OCref <- pf$O / pf$C
# NCref <- pf$N / pf$C
# SCref <- pf$S / pf$C

HC.ref <- c(1.56117455138662, 1.57739130434783, 1.50964265456608, 1.53856636685745)
OC.ref <- c(0.300163132137031, 0.333913043478261, 0.276517300056721, 0.405287544289997)
NC.ref <- c(0.314845024469821, 0.297391304347826, 0.250992626205332, 0.264649768329245)
SC.ref <- c(0.0163132137030995, 0.0208695652173913, 0.00397050482132728, 0)

metrics <- calc.metrics(AAcomp, c("HC", "OC", "NC", "SC"))
expect_equivalent(metrics$HC, HC.ref)
expect_equivalent(metrics$NC, NC.ref)
expect_equivalent(metrics$OC, OC.ref)
expect_equivalent(metrics$SC, SC.ref)

# Test for length added 20240302
length.ref <- c(129, 124, 648, 828)
length.calc <- calc.metrics(AAcomp, "Length")[, 1]
expect_equal(length.calc, length.ref)
Loading

0 comments on commit a5cc89d

Please sign in to comment.