In [1]:
from __future__ import print_function
import pysam
import pandas as pd
import os
import math
import click
import sys

In [8]:
mil = 1000000
hund = 100

def populate_phylum_dict(pname):
    with open(pname) as f:
        data = pd.read_table(f).set_index('EMBL ID').drop(["Strain/species details"], axis=1).to_dict()["Phylum"]
    id2phlm = {}
    for k,v in data.items():
        nk = k.split(".")[0]
        id2phlm[nk] = v
        if (nk == "CM000636"):
            id2phlm["CP006835"] = v
        elif v == "Rhizobium_Bradyrhizobium":
            id2phlm[nk] = "Proteobacteria"
        elif v == "Pathogens":
            id2phlm[nk] = "Proteobacteria"
    id2phlm["Rose"] = "Rose"
    id2phlm["Eukaryotes"] = "Eukaryotes"
    del data
    return id2phlm

def get_ref_id(aln):
    # get id for the mapped reference
    rname = aln.reference_name

    euList = ["Arabidopsis", "Human", "Lizard", "Chicken", "Eagle", "Turtle", "Yeast"]
    for eu in euList:
        if eu in rname:
            return "Eukaryotes"

    if "Rose" in rname:
        return "Rose"
    try:
        return aln.reference_name.split("|")[1]
    except:
        return aln.reference_name.split(".")[0]

def get_query_id(aln):
    qname = aln.query_name

    # get ground truth id
    if "Eukaryotes" in qname:
        return "Eukaryotes"
    elif "Rose" in qname:
        return "Rose"

    qId = qname.split('-')[0]
    if "|" in qId:
        qId = qId.split("|")[1]
    elif "_" in qId:
        qId = qId.split("_")[0]

    return qId

def print_details(qId, rIds, aln):
    print ("ALIGNMENT", file=sys.stderr)
    print (aln, file=sys.stderr )
    print ("PHYLA", file=sys.stderr)
    print ("QUERY:\t" + qId + "\tMAPPINGS:", end="\t", file=sys.stderr)
    for rId in rIds:
        print ( rId , end="\t", file=sys.stderr)
    print ("\n", file=sys.stderr)

def parse_fq(rname):
    totReads = 0
    TN = 0
    reads_list = []
    with open(rname) as f:
        for line in f:
            # counting total reads
            totReads += 1
            # Progress Monitoring
            if(totReads % mil == 0):
                print ("\r Done reading {} Million reads from FASTQ.".format(int(round(totReads)/1000000)), end="")
                sys.stdout.flush()

            #extracting relevant part of read
            read = line.strip().replace("/1","").replace("@","")

            if "Random" in read:
                TN += 1

            #making a list of read id
            reads_list.append(read)

            # skip next 4 lines
            for _ in range(3):
                f.next()

    if len(reads_list) != len(set(reads_list)):
        print ("ERROR: Repeating reads found")
        exit(1)
    return totReads, TN, reads_list

def print_stats(singCount, totCount, totReads, TP, FP, TN, orphanCount):
    mmCount = round(totCount - singCount)
    unmapCount = totReads - totCount
    FN = totReads - TP - FP - TN
    cwd = os.getcwd()

    sen = TP/float(TP+FN)
    spec = TN/float(TN+FP)
    ppv = TP/float(TP+FP)
    npv = TN/float(TN+FN)
    mcc = ((TP*TN)-(FP*FN)) / math.sqrt( (TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)  )
    stText = "\n\n" + \
    "====================================================================================\n" + \
    "Total Number of reads: {0} ({1:.2f}M)\n".format(totReads, totReads/mil)+ \
    "Number of Unmapped reads: {0} ({1:.2f}M, {2:.2f}%)\n".format(unmapCount, unmapCount/mil, unmapCount*hund/totReads)+ \
    "Number of Mapped reads {0}({1:.2f}M, {2:.2f}%)\n".format(totCount, totCount/mil, totCount*hund/totReads)+ \
    "\n\n"+ \
    "============================ OUT OF MAPPED READS ===================================\n"+ \
    "Number of Singly Mapped reads: {0} ({1:.2f}M, {2:.2f}%)\n".format(singCount, singCount/mil, singCount*hund/totReads)+ \
    "Number of Multimapped reads: {0} ({1:.2f}M, {2:.2f}%)\n".format(mmCount, mmCount/mil, mmCount*hund/totReads)+ \
    "Number of Orphaned (Ignored)ALIGNMENTS (Should be significantly low): {}\n".format(orphanCount)+ \
    "====================================================================================\n"+ \
    "\n ===================== \n ACCURACY METRIC \n =====================\n"+ \
    "Number of True positives(TP) reads: {0} ({1:.2f}M, {2:.2f}%)\n".format(TP, TP/mil, TP*hund/totReads)+ \
    "Number of False Negatives(FN) reads: {0} ({1:.2f}M, {2:.2f}%)\n".format(FN, FN/mil, FN*hund/totReads)+ \
    "Number of False positives(FP) reads: {0} ({1:.2f}M, {2:.2f}%)\n".format(FP, FP/mil, FP*hund/totReads)+ \
    "Number of True Negatives(TN) reads: {0} ({1:.2f}M, {2:.2f}%)\n".format(TN, TN/mil, TN*hund/totReads)+ \
	"\n ===================== \n SECONDARY ACCURACY METRIC \n =====================\n"+ \
    "Senstivity: {}\n".format(sen)+ \
    "Specificity: {}\n".format(spec)+ \
    "Precision: {}\n".format(ppv)+ \
    "Neg Pred. Value: {}\n".format(npv)+ \
    "MCC: {}\n".format(mcc)+ \
    "====================================================================================\n\n\n\n"

    filename = cwd + "/report.txt"
    with open(filename, 'w') as f:
        f.write(stText)
    return sen, spec, ppv, npv, mcc

def perform_counting(fname, totReads, TN, reads_list, id2phlm):
    with pysam.AlignmentFile(fname) as f:
        TP = 0
        FP = 0
        totCount = 0.0
        singCount = 0
        orphanCount = 0
        for aln in f:
            #get mate of the read
            mate_aln = f.next()

            # count total Number of reads
            totCount += 1

            # get number of alignments
            n_alns = aln.get_tag('NH')

            # for singly mapped reads only
            if n_alns == 1:
                # Increment the single count
                singCount += 1

            # Ignoring Orphan alignments for now
            if(aln.reference_name != mate_aln.reference_name):
                orphanCount += 1
                print ("WARNING: ORPHANS Detected statistics Neess to be re-evaluated")
                continue

            # Progress Monitoring
            if(round(totCount) % mil == 0):
                print ("\r Done reading {} Million reads from BAM....".format(int(round(totCount)/1000000)), end="")
                sys.stdout.flush()

            qId = get_query_id(aln)

            # list of all alignments
            rIds = [get_ref_id(aln)]

            # iterate over all alignments
            for _ in range(1, n_alns):
                aln = f.next()
                mate_aln = f.next()

                # Ignoring Orphan alignments for now
                if(aln.reference_name != mate_aln.reference_name):
                    orphanCount += 1
                else:
                    rIds.append(get_ref_id(aln))

            # skip the whole alignment list of it's a Random read
            if "Random" in aln.query_name:
                TN -= 1
                FP += 1
                continue

            plist = set([])
            try:
                qId_plm = id2phlm[qId]
                for rId in rIds:
                    rId_plm = id2phlm[rId]
                    plist.add( rId_plm )
                    if len(plist) > 1:
                        break
            except:
                print (qId, rIds)
                print_details(qId_plm, plist, aln)
                break

            if(len(plist) == 1 and list(plist)[0] == qId_plm):
                TP += 1
            else:
#                 print_details(qId_plm, plist, aln)
                FP += 1
    return singCount, totCount, TP, FP, TN, orphanCount


def get_stats(fq, sam):
    pname = "/mnt/scratch2/avi/meta-map/reads/meta/s2.tsv"

    # populate orgaism id to phylum dictionary
    id2phlm = populate_phylum_dict(pname)

    # parse the fastq file for TN calculations
    totReads, TN, reads_list = parse_fq(fq)

    # Parse the BAM and perform the counting
    singCount, totCount, TP, FP, TN, orphanCount = perform_counting(sam, totReads, TN, reads_list, id2phlm)

    # Calculate the stats and print it
    sen, spec, ppv, npv, mcc = print_stats(singCount, totCount, totReads, TP, FP, TN, orphanCount)
    print ("\noutput written to {}".format(os.getcwd()+"/reports.txt"))

    return sen, spec, ppv, npv, mcc

 Done reading 20 Million reads from BAM....

Total Number of reads: 28912773 (28.00M)
Number of Unmapped reads: 8164378.0 (8.16M, 28.24%)
Number of Mapped reads 20748395.0(20.75M, 71.76%)


Number of Singly Mapped reads: 17442310 (17.00M, 60.00%)
Number of Multimapped reads: 3306085.0 (3.31M, 11.43%)
Number of Orphaned (Ignored)ALIGNMENTS (Should be significantly low): 0

 ACCURACY METRIC 
Number of True positives(TP) reads: 20713041 (20.00M, 71.00%)
Number of False Negatives(FN) reads: 2381814 (2.00M, 8.00%)
Number of False positives(FP) reads: 35354 (0.00M, 0.00%)
Number of True Negatives(TN) reads: 5782564 (5.00M, 20.00%)

 ACCURACY METRIC 
Senstivity: 0.896868198566
Specificity: 0.993923255708
Precision: 0.998296060972
Neg Pred. Value: 0.708267549592
MCC: 0.793347859511


In [10]:
sen, spec, ppv, npv, mcc = get_stats("../reads/A1_1.fastq", "../pipeline/output/A1.sam")

 Done reading 20 Million reads from BAM....
output written to /mnt/scratch2/avi/meta-map/src-py/reports.txt


In [11]:
puffMap = (sen, spec, ppv, mcc)

In [12]:
# puffMap = (sen, spec, ppv, mcc)
CLARK = (1.0000, 0.8081, 0.9528, 0.8775)
OneCodex = (0.9197, 1.0000, 1.0000, 0.8342)
Kraken = (0.8984, 1.0000, 1.0000, 0.7993)
MG_RAST = (0.7903, 0.9930, 0.9978, 0.6515)
MEGAN = (0.5622, 0.9904, 0.9957, 0.4459)
GOTTCHA = (0.5388, 1.0000, 1.0000, 0.4352)
genomata = (0.4651, 0.9869, 0.9929, 0.3756)
LMAT = (0.6442, 0.7360, 0.9052, 0.3089)
taxator_tk = (0.5577, 0.7657, 0.8707, 0.2845)
MetaPhlan = (0.0604, 1.0000, 1.0000, 0.1126)
MetaPhyler = (0.0057, 0.9999, 0.9949, 0.0331)
mOTU = (0.0020, 1.0000, 1.0000, 0.0201)
QIIME = (0.0005, 1.0000, 0.9981, 0.0100)
EBI = (0.0006, 0.9984, 0.5884, -0.0146)

In [16]:
%matplotlib inline



In [18]:
from ggplot import *

In [13]:
df = pd.DataFrame([puffMap, taxator_tk, QIIME, OneCodex, mOTU, MG_RAST, MetaPhlan, MetaPhyler, MEGAN, LMAT, Kraken, GOTTCHA, genomata, EBI, CLARK])

In [14]:
df.columns = ['Senstivity', 'Specificity', 'Precision', 'Matthews correlation coefficient ']

In [15]:
df.index = ["PuffMap", "taxator_tk", "QIIME", "OneCodex", "mOTU", "MG_RAST", "MetaPhlan", "MetaPhyler", "MEGAN", "LMAT", "Kraken", "GOTTCHA", "genomata", "EBI", "CLARK"]

In [16]:
# df.to_csv('stats.tsv')

In [17]:
df

Unnamed: 0,Senstivity,Specificity,Precision,Matthews correlation coefficient
PuffMap,0.896868,0.993923,0.998296,0.793348
taxator_tk,0.5577,0.7657,0.8707,0.2845
QIIME,0.0005,1.0,0.9981,0.01
OneCodex,0.9197,1.0,1.0,0.8342
mOTU,0.002,1.0,1.0,0.0201
MG_RAST,0.7903,0.993,0.9978,0.6515
MetaPhlan,0.0604,1.0,1.0,0.1126
MetaPhyler,0.0057,0.9999,0.9949,0.0331
MEGAN,0.5622,0.9904,0.9957,0.4459
LMAT,0.6442,0.736,0.9052,0.3089
