In [6]:
import json
import shlex

import fastq2matrix as fm
import numpy as np
import pathogenprofiler as pp
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import norm
from sklearn.mixture import GaussianMixture

In [25]:
# Results JSON whose 'dr_variants' entries are classified further down.
json_file = 'test_data/ERR6635386.results.json'
# VCF the allele-frequency mixture model is fitted on (previous input kept
# below for reference).
# NOTE(review): the active VCF path and the JSON path appear to name different
# samples (ERR6634978-ERR6635032 vs ERR6635386) - confirm this is intended.
#vcf_file = 'test_data/ERR6635386.vcf.gz'
vcf_file = 'test/ERR6634978-ERR6635032-5050.vcf.gz'

In [26]:
def plot_gm(gm,data):
    """Overlay the fitted Gaussian components on a histogram of the observations.

    Parameters
    ----------
    gm : fitted mixture model
        Must expose ``means_`` and ``covariances_`` indexable as
        ``means_[k][0]`` and ``covariances_[k][0][0]``. One curve is drawn per
        entry of ``means_``, so models with any number of components work.
    data : sequence of one-element sequences
        Observations such as ``[[0.42], [0.61]]``; the first element of each
        entry goes into the histogram.

    The histogram is drawn against the primary y axis and every component
    density against the secondary y axis, evaluated on 100 evenly spaced
    points in [0, 1]. Nothing is returned; the figure is displayed with
    ``fig.show()``.
    """
    grid = np.linspace(0, 1, 100)
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(
        go.Histogram(x=[obs[0] for obs in data], name="Observed"),
        secondary_y=False,
    )
    for k in range(len(gm.means_)):
        mu = gm.means_[k][0]
        # Variance of component k sits at [k][0][0]; take the root to get
        # the standard deviation expected by norm.pdf.
        sd = np.sqrt(gm.covariances_[k][0][0])
        fig.add_trace(
            go.Scatter(x=grid, y=norm.pdf(grid, mu, sd), mode='lines', name="Component %d" % k),
            secondary_y=True,
        )
    fig.update_yaxes(title_text="Count", secondary_y=False)
    fig.update_yaxes(title_text="Density", secondary_y=True)
    fig.show()

In [27]:
def vcf_to_mix_model(vcf_file,plot=False,tail_cutoff=0.05,title="AF Histogram",return_freqs = False):
    """Fit a two-component Gaussian mixture to alternate-allele frequencies from a VCF.

    The VCF is streamed through ``bcftools view -c 1 -m2 -M2 -T ^new_exclusion.bed``
    and ``bcftools query``; for every emitted line the allelic depths (AD) are
    turned into an alternate-allele frequency ``AD[1] / sum(AD)``.

    Parameters
    ----------
    vcf_file : str or path-like
        VCF to read. It is shell-quoted before being placed in the command.
    plot : bool
        When true, call ``plot_gm`` with the fitted model and the frequencies.
    tail_cutoff : float
        Sites with frequency below ``tail_cutoff`` or above ``1 - tail_cutoff``
        are dropped before fitting.
    title : str
        Accepted for backward compatibility; not used by this function.
    return_freqs : bool
        When true, also return the retained sites.

    Returns
    -------
    The fitted ``GaussianMixture``, or ``(gm, sites)`` when ``return_freqs`` is
    true, where ``sites`` is a list of ``(position, [frequency])`` tuples.

    Raises
    ------
    ValueError
        If fewer than two sites survive filtering, since a two-component
        mixture cannot be fitted to them.
    """
    # The command runs through a shell, so quote the path to survive spaces
    # and metacharacters.
    cmd = "bcftools view -c 1 -m2 -M2 -T ^new_exclusion.bed %s | bcftools query -f '%%POS\\t%%REF\\t%%ALT[\\t%%GT\\t%%AD\\n]'" % shlex.quote(str(vcf_file))
    pos = []
    freqs = []
    for l in pp.cmd_out(cmd):
        row = l.strip().split()
        if len(row) < 5:
            # Blank or truncated line: no AD column to read.
            continue
        try:
            ads = [int(x) for x in row[4].split(",")]
        except ValueError:
            # Missing depths (e.g. ".") cannot give a frequency.
            continue
        depth = sum(ads)
        if len(ads) < 2 or depth == 0:
            # Zero coverage would otherwise divide by zero.
            continue
        alt_af = ads[1]/depth
        if alt_af>1-tail_cutoff or alt_af<tail_cutoff:
            continue
        pos.append(int(row[0]))
        freqs.append([alt_af])

    if len(freqs) < 2:
        raise ValueError(
            "Only %d usable site(s) left in %s after filtering; need at least 2 to fit a 2-component mixture"
            % (len(freqs), vcf_file)
        )

    gm = GaussianMixture(n_components=2, random_state=0).fit(freqs)
    if plot:
        plot_gm(gm,freqs)
    if return_freqs:
        return (gm,list(zip(pos,freqs)))
    else:
        return gm



In [28]:
def assign_variant_to_distrib(gm,freq,cutoff=0.95):
    """Return the mixture component a frequency belongs to, or None if unsure.

    ``gm`` must provide ``predict_proba`` and ``predict``; both are called on
    the single observation ``[[freq]]``. The predicted component is returned
    only when its membership probability is strictly greater than ``cutoff``;
    otherwise the result is ``None``.
    """
    sample = [[freq]]
    posterior = gm.predict_proba(sample)[0]
    label = gm.predict(sample)
    # `label` is a length-1 array, so indexing with it yields a length-1 array.
    confidence = posterior[label][0]
    if not confidence > cutoff:
        return None
    return label[0]

In [29]:
# Fit the mixture on the configured VCF, show the diagnostic plot, and keep
# the (position, [frequency]) pairs for the cells below.
gm, freqs = vcf_to_mix_model(
    vcf_file,
    tail_cutoff=0.05,
    return_freqs=True,
    plot=True,
)



Running command:
set -u pipefail; bcftools view -c 1 -m2 -M2 -T ^new_exclusion.bed test/ERR6634978-ERR6635032-5050.vcf.gz | bcftools query -f '%POS\t%REF\t%ALT[\t%GT\t%AD\n]'


In [30]:
# Compare mixtures with 1 to 4 components by their BIC on the same data.
f = np.asarray([af for _, af in freqs])
components = [1, 2, 3, 4]
bic = []
for i in components:
    fitted = GaussianMixture(n_components=i, random_state=0).fit(f)
    bic.append(fitted.bic(f))

px.line(x=components,y=bic)

In [32]:
# Retained sites whose alternate-allele frequency is below 0.3.
[(site, af) for site, af in freqs if af[0] < 0.3]

[(24698, [0.22580645161290322]),
 (71584, [0.1323529411764706]),
 (257788, [0.12389380530973451]),
 (263149, [0.28695652173913044]),
 (336557, [0.29577464788732394]),
 (500523, [0.10948905109489052]),
 (595432, [0.1276595744680851]),
 (598020, [0.10638297872340426]),
 (598029, [0.1059322033898305]),
 (598043, [0.11440677966101695]),
 (598076, [0.2607003891050584]),
 (598095, [0.19913419913419914]),
 (598104, [0.16883116883116883]),
 (598111, [0.1592920353982301]),
 (598117, [0.12437810945273632]),
 (598128, [0.115]),
 (663410, [0.14606741573033707]),
 (663418, [0.15476190476190477]),
 (663429, [0.15053763440860216]),
 (888774, [0.2764976958525346]),
 (1094472, [0.23076923076923078]),
 (1121074, [0.28225806451612906]),
 (1224331, [0.10714285714285714]),
 (1224340, [0.11304347826086956]),
 (1224347, [0.1]),
 (1414629, [0.27314814814814814]),
 (1414653, [0.29333333333333333]),
 (1415097, [0.2898550724637681]),
 (1416222, [0.24890829694323144]),
 (1416232, [0.25877192982456143]),
 (1443428

In [44]:
# Load the results JSON; the context manager closes the file handle, which the
# previous bare open() call left dangling.
with open(json_file) as fh:
    json_results = json.load(fh)

# Buckets for the variants: mixture component 0, component 1, and those that
# assign_variant_to_distrib could not place (it returned None).
# NOTE(review): which component corresponds to which strain is not fixed by
# this code - check the fitted means before interpreting the labels.
strain0 = []
strain1 = []
strainU = []

for var in json_results['dr_variants']:
    cluster = assign_variant_to_distrib(gm,var['freq'])
    if cluster == 0:
        strain0.append(var)
    elif cluster == 1:
        strain1.append(var)
    else:
        strainU.append(var)

In [45]:
# Gene, change and frequency of the variants assigned to component 0.
[tuple(hit[key] for key in ('gene','change','freq')) for hit in strain0]


[('gyrA', 'p.Ala90Val', 0.3160569105691057),
 ('rpoB', 'p.His445Asp', 0.13383458646616542),
 ('rpsL', 'p.Lys43Arg', 0.103125),
 ('rrs', 'n.514A>C', 0.43105446118192353),
 ('rrs', 'n.1401A>G', 0.4260355029585799),
 ('fabG1', 'c.-15C>T', 0.22884012539184953),
 ('pncA', 'p.Leu172Pro', 0.43089430894308944),
 ('folC', 'p.Glu40Gly', 0.12627291242362526),
 ('embB', 'p.Met306Val', 0.4157416750756811)]

In [46]:
# Gene, change and frequency of the variants assigned to component 1.
[tuple(hit[key] for key in ('gene','change','freq')) for hit in strain1]


[('rpoB', 'p.Ser450Leu', 0.6067567567567568)]

In [47]:
# Gene, change and frequency of the variants left unassigned.
[tuple(hit[key] for key in ('gene','change','freq')) for hit in strainU]


[('katG', 'p.Ser315Thr', 0.8132596685082873)]