# Extracting multihit nonsynonymous mutations from filtered variants

In [11]:
# Read in the filtered variant calls (tab-separated text with a header row)

variants <- read.table(
  "../../data/deep_seq/filtered_variant_data.txt",
  sep = "\t",
  header = TRUE # spelled out: `T` is an ordinary variable that can be reassigned
)

In [12]:
# Subset only the variants whose effect category is "nonsynonymous".
# `%in%` never returns NA, so rows with a missing VARIANT_EFFECT_CAT are
# dropped; with `==` they would come back as rows filled entirely with NA.

nonsynonymous <- variants[variants$VARIANT_EFFECT_CAT %in% "nonsynonymous", ]
head(nonsynonymous)

Unnamed: 0,SAMPLE,TIME,ANTIBIOTIC,IMMIGRATION,REPLICATE,SPECIES,CONTIG,CONTIG_LENGTH,POSITION,REF_ALLELE,⋯,VARIANT_EFFECT_CAT,VARIANT_IMPACT,GENE,GENEID,NA_CHANGE,AA_CHANGE,VARIANT_FREQUENCY,MEDIAN_COVERAGE,AMPLICON_READS,DISTANCE_FROM_PREV
139,T8_AB4_I1_REP2,8,4,1,2,HAMBI_1972,ABNHBPOL_11,154390,38145,T,⋯,nonsynonymous,MODERATE,vgrG1_2,ABNHBPOL_02456,c.340T>G,p.Ser114Ala,0.5116279,181,137,40
140,T8_AB4_I1_REP3,8,4,1,3,HAMBI_1972,ABNHBPOL_11,154390,38145,T,⋯,nonsynonymous,MODERATE,vgrG1_2,ABNHBPOL_02456,c.340T>G,p.Ser114Ala,0.6190476,184,95,40
141,T8_AB16_I0_REP1,8,16,0,1,HAMBI_1972,ABNHBPOL_11,154390,38145,T,⋯,nonsynonymous,MODERATE,vgrG1_2,ABNHBPOL_02456,c.340T>G,p.Ser114Ala,0.375,25,4,40
142,T8_AB16_I1_REP3,8,16,1,3,HAMBI_1972,ABNHBPOL_11,154390,38145,T,⋯,nonsynonymous,MODERATE,vgrG1_2,ABNHBPOL_02456,c.340T>G,p.Ser114Ala,0.75,23,8,40
143,T12_AB4_I1_REP2,12,4,1,2,HAMBI_1972,ABNHBPOL_11,154390,38145,T,⋯,nonsynonymous,MODERATE,vgrG1_2,ABNHBPOL_02456,c.340T>G,p.Ser114Ala,0.625,469,1129,40
144,T12_AB4_I1_REP3,12,4,1,3,HAMBI_1972,ABNHBPOL_11,154390,38145,T,⋯,nonsynonymous,MODERATE,vgrG1_2,ABNHBPOL_02456,c.340T>G,p.Ser114Ala,0.5421687,393,734,40


In [13]:
# Number of variants: every variant contributes two rows (one observation per
# time point), so the row count is halved

0.5 * nrow(nonsynonymous) # 588 nonsynonymous variants

In [14]:
# Number of possible coding sequences
#
# The CDS count table is tab-separated and has no header row, so the column
# names are assigned after reading.

cds_numbers <- read.table(
  "../../data/deep_seq/cds_counts.txt",
  sep = "\t",
  header = FALSE # spelled out: `F` is an ordinary variable that can be reassigned
)
colnames(cds_numbers) <- c("SPECIES", "CDS_COUNT")
head(cds_numbers)

# Sum the CDS counts over the species that appear in the nonsynonymous variants
sum(cds_numbers$CDS_COUNT[cds_numbers$SPECIES %in% nonsynonymous$SPECIES]) # 58,220 coding sequences

SPECIES,CDS_COUNT
HAMBI_6,5945
HAMBI_97,3137
HAMBI_105,5087
HAMBI_216,4825
HAMBI_262,3231
HAMBI_403,4905


In [16]:
# Null model for multiple hits in one coding sequence.
# n = 58,220 unique coding sequences (items); m = 588 of them are selected.
# The expected number of items appearing k times is n * dbinom(k, m, 1 / n);
# each value printed below is that expectation divided by m.

n_cds <- 58220
n_variants <- 588
p_single_cds <- 1 / n_cds

# probability of one hit:
n_cds * dbinom(1, size = n_variants, prob = p_single_cds) / n_variants # 0.989968125847872
# two hits:
n_cds * dbinom(2, size = n_variants, prob = p_single_cds) / n_variants # 0.00499073575527493
# three hits:
n_cds * dbinom(3, size = n_variants, prob = p_single_cds) / n_variants # 1.67446546808378e-05
# four hits:
n_cds * dbinom(4, size = n_variants, prob = p_single_cds) / n_variants # 4.2063686203345e-08

# The probability of two hits in the same gene is below alpha 0.01

# Based on the literature, we know that the rpsL gene is the most common target of streptomycin resistance mutations,
# so let's select it in addition to all genes with two or more hits

# Count the rows (variant observations) recorded under each gene name.
# NOTE(review): rows are pooled by GENE alone, so identically named genes from
# different SPECIES are counted together - confirm this is intended.
nonsynonymous$COUNT <- 1
hit_counts <- aggregate(COUNT ~ GENE, data = nonsynonymous, FUN = sum)

# There is a variant call for two time points per population, so two hits equal four occurrences

x <- hit_counts[hit_counts$COUNT >= 4, ]

# Keep the multi-hit genes plus rpsL. A single `%in%` test never yields NA, so
# rows with a missing GENE are dropped; combining `%in%` with `== "rpsL"` would
# return such rows as rows filled entirely with NA.
multihit <- nonsynonymous[nonsynonymous$GENE %in% c(as.character(x$GENE), "rpsL"), ]

# Number of nonsynonymous variants left

nrow(multihit) / 2 # 547 (i.e. most of the 588 variants occur in genes with more than one hit)
# NOTE(review): the original note cited both 547 and 577/588 - confirm the figure against the actual output

In [18]:
# Both time points are kept for every variant; recode COUNT so that an
# observation with a variant frequency of exactly 0 counts as 0 and any other
# observation counts as 1, then report how many rows were recoded to 0

multihit[["COUNT"]] <- with(multihit, ifelse(VARIANT_FREQUENCY == 0, 0, 1))
nrow(multihit[multihit[["COUNT"]] == 0, ])

In [19]:
# Save the multi-hit nonsynonymous variants as a tab-separated file without row names

write.table(
  multihit,
  file = "../../data/deep_seq/multihit_nonsynonymous_variant_data.txt",
  sep = "\t",
  row.names = FALSE
)