# Extracting multihit nonsynonymous mutations from filtered variants

In [1]:
# Read in data

variants = read.table("../../../data/deep_seq/filtered_variant_data.txt", 
                      sep = "\t", 
                      header = T)

In [2]:
# Subset only genes with nonsynonymous mutations

nonsynonymous = variants[variants$VARIANT_EFFECT_CAT == "nonsynonymous",]
head(nonsynonymous)

Unnamed: 0,SAMPLE,TIME,ANTIBIOTIC,IMMIGRATION,REPLICATE,SPECIES,CONTIG,CONTIG_LENGTH,POSITION,REF_ALLELE,⋯,VARIANT_EFFECT_CAT,VARIANT_IMPACT,GENE,GENEID,NA_CHANGE,AA_CHANGE,VARIANT_FREQUENCY,MEDIAN_COVERAGE,AMPLICON_READS,DISTANCE_FROM_PREV
139,T8_AB4_I1_REP2,8,4,1,2,HAMBI_1972,ABNHBPOL_11,154390,38145,T,⋯,nonsynonymous,MODERATE,vgrG1_2,ABNHBPOL_02456,c.340T>G,p.Ser114Ala,0.5116279,181,137,40
140,T8_AB4_I1_REP3,8,4,1,3,HAMBI_1972,ABNHBPOL_11,154390,38145,T,⋯,nonsynonymous,MODERATE,vgrG1_2,ABNHBPOL_02456,c.340T>G,p.Ser114Ala,0.6190476,184,95,40
141,T8_AB16_I0_REP1,8,16,0,1,HAMBI_1972,ABNHBPOL_11,154390,38145,T,⋯,nonsynonymous,MODERATE,vgrG1_2,ABNHBPOL_02456,c.340T>G,p.Ser114Ala,0.375,25,4,40
142,T8_AB16_I1_REP3,8,16,1,3,HAMBI_1972,ABNHBPOL_11,154390,38145,T,⋯,nonsynonymous,MODERATE,vgrG1_2,ABNHBPOL_02456,c.340T>G,p.Ser114Ala,0.75,23,8,40
143,T12_AB4_I1_REP2,12,4,1,2,HAMBI_1972,ABNHBPOL_11,154390,38145,T,⋯,nonsynonymous,MODERATE,vgrG1_2,ABNHBPOL_02456,c.340T>G,p.Ser114Ala,0.625,469,1129,40
144,T12_AB4_I1_REP3,12,4,1,3,HAMBI_1972,ABNHBPOL_11,154390,38145,T,⋯,nonsynonymous,MODERATE,vgrG1_2,ABNHBPOL_02456,c.340T>G,p.Ser114Ala,0.5421687,393,734,40


In [3]:
# Number of variants (two observations for each, one per time point)

nrow(nonsynonymous)/2 # 588 nonsynonymous variants

In [4]:
# Number of possible coding sequences

cds_numbers = read.table("../../../data/deep_seq/cds_counts.txt", 
                      sep = "\t", 
                      header = F)
colnames(cds_numbers) = c("SPECIES", "CDS_COUNT")
head(cds_numbers)
sum(cds_numbers$CDS_COUNT[cds_numbers$SPECIES %in% nonsynonymous$SPECIES]) # 58,220 coding sequences

SPECIES,CDS_COUNT
HAMBI_6,5945
HAMBI_97,3137
HAMBI_105,5087
HAMBI_216,4825
HAMBI_262,3231
HAMBI_403,4905


In [5]:
library(stringi)
genes = stri_rand_strings(58220,50, pattern = "[A-Za-z0-9]")
head(genes)
length(unique(genes))

gene_sample_list = list()
for(i in 1:10000){
    gene_sample = sample(genes, size = 588, replace =T)
    hits = as.data.frame(table(gene_sample))
    a = length(hits$gene_sample[hits$Freq == 1])
    b = length(hits$gene_sample[hits$Freq == 2])
    c = length(hits$gene_sample[hits$Freq == 3])
    d = length(hits$gene_sample[hits$Freq == 4])
    e = length(hits$gene_sample[hits$Freq == 5])
    hits_summary = data.frame(one_hit = a, two_hits = b, three_hits = c, four_hits = d, five_hits = e)
    gene_sample_list[[i]] = hits_summary
}

gene_hit_summary = Reduce(function(...) merge(..., all = T), gene_sample_list)
head(gene_hit_summary)

colMeans(gene_hit_summary)

one_hit,two_hits,three_hits,four_hits,five_hits
566,11,0,0,0
568,10,0,0,0
569,8,1,0,0
570,9,0,0,0
571,7,1,0,0
572,8,0,0,0


In [6]:
nonsynonymous$COUNT = 1
hit_counts = aggregate(COUNT ~ GENE, data = nonsynonymous, FUN = function(x) {sum(x)})
head(hit_counts)                        
                          
# total number of nonsynonymous mutations
length(hit_counts$COUNT) # 74
# hit in two or more populations
sum(hit_counts$COUNT[hit_counts$COUNT > 4]) # 1092 mutations (expected 5)
length(hit_counts$COUNT[hit_counts$COUNT > 4]) # 47 genes (expected 5)
# hit in three or more populations
length(hit_counts$COUNT[hit_counts$COUNT > 6]) # 39 (expected < 1)
                          
# We would like to test these observations against the null hypothesis that those genes 
# recurrently mutated happen by chance. This model is equivalent to the case of comparing biased 
# and unbiased multinomial sampling of balls of different colour from an urn. 
# Out of all coding genes in the all the genomes, 
# we draw mutations from the multinomial distribution with replacement, 
# based on the number of observed nonsynonymous coding mutations in each population. 
# If these 588 mutations were randomly distributed over the 58,220 coding genes in the genomes, 
# we would expect only 5 genes mutated in two or more populations. 
# In total, there were 1092 coding nonsynonymous mutations across 47 genes 
# independently mutated in two or more populations.
# Therefore, we focus on multi-hit genes which are independently mutated in two or more populations, 
# which are putatively beneficial.

GENE,COUNT
ABNHBPOL_00951,98
ABNHBPOL_04066,40
CKLFCLNC_01286,18
CKLFCLNC_02614,10
CKLFCLNC_02634,20
CKLFCLNC_03362,12


In [7]:
hit_counts

GENE,COUNT
ABNHBPOL_00951,98
ABNHBPOL_04066,40
CKLFCLNC_01286,18
CKLFCLNC_02614,10
CKLFCLNC_02634,20
CKLFCLNC_03362,12
CKLFCLNC_03364,2
CKLFCLNC_05792,6
CKLFCLNC_06220,2
CKLFCLNC_06346,2


In [8]:
# There are 58,220 unique coding sequences (items, n) and we are selecting 588 of them (m)
# The expected number of items appearing k times is therefore given by n * dbinom(k, m, 1 / n)

# probability of one hit:
58220 * dbinom(1, 588, 1 / 58220) / 588 # 0.989968125847872
# two hits:
58220 * dbinom(2, 588, 1 / 58220) / 588 # 0.00499073575527493
# three hits:
58220 * dbinom(3, 588, 1 / 58220) / 588 # 1.67446546808378e-05
# four hits:
58220 * dbinom(4, 588, 1 / 58220) / 588 # 4.2063686203345e-08

# The probability of two hits in the same gene is below alpha 0.01

# Based on literature we know that the rpsL gene is the most common target of streptomycin resistance mutations
# so let's select it in in addition to all genes with two or more hits

nonsynonymous$COUNT = 1
hit_counts = aggregate(COUNT ~ GENE, data = nonsynonymous, FUN = function(x) {sum(x)})

# There is a variant call for two time points per population, so two hits equal four occurrences

x = hit_counts[hit_counts$COUNT >= 4,]

multihit = nonsynonymous[nonsynonymous$GENE %in% x$GENE | nonsynonymous$GENE == "rpsL",]

# Number of nonsynonymous variants left

nrow(multihit)/2 # 547 (i.e. most variants, 577/588, occur in genes with more than one hit)

In [9]:
# Convert counts where both time points are included to actual counts

multihit$COUNT = ifelse(multihit$VARIANT_FREQUENCY == 0, 0, 1)
nrow(multihit[multihit$COUNT == 0,])

In [10]:
# Write out

write.table(multihit, row.names = FALSE, "../../../data/deep_seq/multihit_nonsynonymous_variant_data.txt", sep = "\t")