# Extracting multihit nonsynonymous mutations from filtered variants

In [1]:
# Read in the downsampled, filtered variant table (tab-delimited, first line is the header)
# TRUE is spelled out because the shorthand T is an ordinary variable that can be reassigned

variants = read.table("../../../data/deep_seq/downsampled_filtered_variant_data.txt",
                      sep = "\t",
                      header = TRUE)

In [2]:
# Keep only the variants whose effect category is "nonsynonymous"
# which() turns the comparison into row positions, so a row with a missing (NA)
# category is left out; a bare logical index would return it as an all-NA row

nonsynonymous = variants[which(variants$VARIANT_EFFECT_CAT == "nonsynonymous"), ]
head(nonsynonymous)

Unnamed: 0,SAMPLE,TIME,ANTIBIOTIC,IMMIGRATION,REPLICATE,SPECIES,CONTIG,CONTIG_LENGTH,POSITION,REF_ALLELE,⋯,VARIANT_EFFECT_CAT,VARIANT_IMPACT,GENE,GENEID,NA_CHANGE,AA_CHANGE,VARIANT_FREQUENCY,MEDIAN_COVERAGE,AMPLICON_READS,DISTANCE_FROM_PREV
1,T12_AB16_I0_REP3,12,16,0,3,HAMBI_1972,ABNHBPOL_1,404185,130054,G,⋯,nonsynonymous,MODERATE,ABNHBPOL_00101,ABNHBPOL_00101,c.653C>A,p.Thr218Asn,0.5,439,550,65183
2,T8_AB4_I1_REP1,8,4,1,1,HAMBI_1972,ABNHBPOL_1,404185,204875,G,⋯,nonsynonymous,MODERATE,hscA,ABNHBPOL_00169,c.1013G>T,p.Arg338Leu,0.6666667,163,169,74821
116,T8_AB4_I0_REP1,8,4,0,1,HAMBI_1972,ABNHBPOL_1,404185,28988,C,⋯,nonsynonymous,MODERATE,carB,ABNHBPOL_00028,c.1113C>A,p.Asn371Lys,0.6666667,200,98,0
119,T12_AB0_I0_REP2,12,0,0,2,HAMBI_1972,ABNHBPOL_1,404185,372096,G,⋯,nonsynonymous,MODERATE,trxC,ABNHBPOL_00315,c.112C>A,p.Gln38Lys,0.5,569,2768,6573
120,T8_AB4_I1_REP2,8,4,1,2,HAMBI_1972,ABNHBPOL_1,404185,59337,A,⋯,nonsynonymous,MODERATE,bepC,ABNHBPOL_00049,c.1199A>G,p.Glu400Gly,0.6666667,181,137,30349
132,T8_AB4_I0_REP3,8,4,0,3,HAMBI_1972,ABNHBPOL_1,404185,64871,CCAACGATGATG,⋯,nonsynonymous,HIGH,cya,ABNHBPOL_00052,c.840_850delCAACGATGATGinsGGTGAGCGACGTCAAC,p.Asn281fs,0.8333333,268,129,395


In [3]:
# Row count of the nonsynonymous table (some variants take up two rows, one per time point)

dim(nonsynonymous)[1] # 2295 nonsynonymous variants

In [4]:
# Number of possible coding sequences
# The counts file is tab-delimited and has no header row, so the two columns
# are named here after reading
# FALSE is spelled out because the shorthand F is an ordinary variable that can be reassigned

cds_numbers = read.table("../../../data/deep_seq/cds_counts.txt",
                      sep = "\t",
                      header = FALSE)
colnames(cds_numbers) = c("SPECIES", "CDS_COUNT")
head(cds_numbers)

# Total coding sequences, restricted to the species that appear in the nonsynonymous table
sum(cds_numbers$CDS_COUNT[cds_numbers$SPECIES %in% nonsynonymous$SPECIES]) # 53,916 coding sequences

SPECIES,CDS_COUNT
HAMBI_6,5945
HAMBI_97,3137
HAMBI_105,5087
HAMBI_216,4825
HAMBI_262,3231
HAMBI_403,4905


In [5]:
# There are 53,916 unique coding sequences (items, n) and we are selecting 2295 of them (m)
# The expected number of items appearing k times is therefore given by n * dbinom(k, m, 1 / n)
# n and m are derived from the tables instead of being typed in, so the
# calculation cannot drift away from the data it describes

n_cds = sum(cds_numbers$CDS_COUNT[cds_numbers$SPECIES %in% nonsynonymous$SPECIES]) # 53,916
n_variants = nrow(nonsynonymous) # 2295

# Each line below is the expected number of coding sequences with exactly k hits,
# divided by the number of variants
# NOTE(review): the notebook treats these values as hit probabilities - confirm this reading

# one hit:
n_cds * dbinom(1, n_variants, 1 / n_cds) / n_variants # 0.95834440511755
# two hits:
n_cds * dbinom(2, n_variants, 1 / n_cds) / n_variants # 0.0203880373304244
# three hits:
n_cds * dbinom(3, n_variants, 1 / n_cds) / n_variants # 0.000289033785271033
# four hits:
n_cds * dbinom(4, n_variants, 1 / n_cds) / n_variants # 3.07180485876475e-06

# The value for three hits in the same gene is below alpha 0.01
# Let's use three as the criterion instead of two, because two hits can result
# from the same variant occurring in two time points

# Based on literature we know that the rpsL gene is the most common target of streptomycin resistance mutations
# so let's select it in addition to all genes with three or more hits

# Every row counts as one hit; summing per gene gives the number of hits per gene
# NOTE(review): hits are pooled by GENE alone, so rows sharing a gene name are
# counted together regardless of SPECIES - confirm this is intended
nonsynonymous$COUNT = 1
hit_counts = aggregate(COUNT ~ GENE, data = nonsynonymous, FUN = sum)

x = hit_counts[hit_counts$COUNT >= 3,]

# A single %in% never yields NA, so a row with a missing GENE is dropped;
# combining %in% with `== "rpsL"` through `|` would return it as an all-NA row
# as.character() keeps the gene names intact should GENE have been read as a factor
multihit = nonsynonymous[nonsynonymous$GENE %in% c(as.character(x$GENE), "rpsL"),]

# Number of nonsynonymous variants left

nrow(multihit) # 1764 (i.e. most variants, 1764/2295, occur in genes with three or more hits, or in rpsL)

In [6]:
# Both time points of a variant are present as rows; replace the placeholder
# count with 1 where the variant frequency is non-zero and 0 where it is zero

multihit$COUNT = ifelse(multihit$VARIANT_FREQUENCY != 0, 1, 0)
dim(multihit[multihit$COUNT == 0, ])[1]
head(multihit)

Unnamed: 0,SAMPLE,TIME,ANTIBIOTIC,IMMIGRATION,REPLICATE,SPECIES,CONTIG,CONTIG_LENGTH,POSITION,REF_ALLELE,⋯,VARIANT_IMPACT,GENE,GENEID,NA_CHANGE,AA_CHANGE,VARIANT_FREQUENCY,MEDIAN_COVERAGE,AMPLICON_READS,DISTANCE_FROM_PREV,COUNT
132,T8_AB4_I0_REP3,8,4,0,3,HAMBI_1972,ABNHBPOL_1,404185,64871,CCAACGATGATG,⋯,HIGH,cya,ABNHBPOL_00052,c.840_850delCAACGATGATGinsGGTGAGCGACGTCAAC,p.Asn281fs,0.8333333,268,129,395,1
199,T12_AB128_I1_REP2,12,128,1,2,HAMBI_1972,ABNHBPOL_11,154390,37915,G,⋯,MODERATE,vgrG1_2,ABNHBPOL_02456,c.110G>A,p.Ser37Asn,0.5714286,326,578,16,1
200,T12_AB128_I1_REP3,12,128,1,3,HAMBI_1972,ABNHBPOL_11,154390,37915,GC,⋯,MODERATE,vgrG1_2,ABNHBPOL_02456,c.110_111delGCinsAT,p.Ser37Asn,0.6666667,450,848,16,1
201,T12_AB16_I0_REP1,12,16,0,1,HAMBI_1972,ABNHBPOL_11,154390,37915,G,⋯,MODERATE,vgrG1_2,ABNHBPOL_02456,c.110G>A,p.Ser37Asn,0.5833333,344,631,16,1
202,T12_AB16_I0_REP2,12,16,0,2,HAMBI_1972,ABNHBPOL_11,154390,37915,G,⋯,MODERATE,vgrG1_2,ABNHBPOL_02456,c.110G>A,p.Ser37Asn,0.6666667,262,374,16,1
203,T12_AB16_I1_REP1,12,16,1,1,HAMBI_1972,ABNHBPOL_11,154390,37915,G,⋯,MODERATE,vgrG1_2,ABNHBPOL_02456,c.110G>A,p.Ser37Asn,0.8571429,341,579,16,1


In [7]:
# Save the multihit table as tab-separated text, leaving out the row names

write.table(multihit,
            file = "../../../data/deep_seq/downsampled_multihit_nonsynonymous_variant_data.txt",
            sep = "\t",
            row.names = FALSE)