In [1]:
import pysam
import pandas as pd
import numpy as np
import sys
from __future__ import print_function
import cPickle
import math

In [2]:
with open("../reads/meta/s2.tsv") as f:
    data = pd.read_table(f).set_index('EMBL ID').drop(["Strain/species details"], axis=1).to_dict()["Phylum"]
id2phlm = {}
for k,v in data.items():
    nk = k.split(".")[0]
    id2phlm[nk] = v
    if (nk == "CM000636"):
        id2phlm["CP006835"] = v
    elif v == "Rhizobium_Bradyrhizobium":
        id2phlm[nk] = "Proteobacteria"
    elif v == "Pathogens":
        id2phlm[nk] = "Proteobacteria"
del data

In [3]:
mil = 1000000
hund = 100

def get_ref_id(aln):
    # get id for the mapped reference
    try:
        return aln.reference_name.split("|")[1]
    except:
        return aln.reference_name.split(".")[0]
    
def print_details(qId, rIds, aln):
    print("PHYLA")
    print ("QUERY:\t" + qId + "\tMAPPINGS:\t")
    for rId in rIds:
        print ( rId , end="\t")
    print ("\n")


def get_stats(fname, rname):
    totReads = 0
    TN = 0
    FN = 0
    reads_list = []
    roseCount = 0
    euCount = 0
    with open(rname) as f:        
        for line in f:
            # counting total reads
            totReads += 1
            # Progress Monitoring
            if(totReads % mil == 0):
                print ("\r Done reading {} Million reads.".format(int(round(totReads)/1000000)), end="")
            
            #extracting relevant part of read
            read = line.strip().replace("/1","").replace("@","")
            
            if "Random" in read:
                TN += 1
            if "Eukaryotes" in read:
                euCount += 1
            if "Rose" in read:
                roseCount += 1
            
            #making a list of read id
            reads_list.append(read)
            
            # skip next 4 lines
            for _ in range(3):
                f.next()
    
    if len(reads_list) != len(set(reads_list)):
        print ("ERROR: Repeating reads found")
        return 0

# ++++++++++++++++++++++++++++++++++++++++++++++++++++
# Repeating Block
# ++++++++++++++++++++++++++++++++++++++++++++++++++++
#     print("\nSaving Pickle")
#     with open(r"reads_list.pickle", "wb") as f:
#         cPickle.dump(reads_list, f)
#     print("Done Saving Pickle")
# ++++++++++++++++++++++++++++++++++++++++++++++++++++

# ++++++++++++++++++++++++++++++++++++++++++++++++++++
# Repeating Block
# ++++++++++++++++++++++++++++++++++++++++++++++++++++
#     print("Reading Pickle")
#     with open(r"reads_list.pickle", "rb") as f:
#         reads_list = cPickle.load(f)
#     totReads = len(reads_list)
# ++++++++++++++++++++++++++++++++++++++++++++++++++++
    
    with pysam.AlignmentFile(fname) as f:
        TP = 0
        FP = 0
        totCount = 0.0
        singCount = 0
        orphanCount = 0
        skipCount = 0.0
        for aln in f:
            #get mate of the read
            mate_aln = f.next()
            
            # count total Number of reads
            totCount += 1
            
            # get number of alignments
            n_alns = aln.get_tag('NH')
            
            # for singly mapped reads only
            if n_alns == 1:
                # Increment the single count
                singCount += 1
            
            #ignoring Rose Sequence
            if "Rose" in aln.query_name or "Eukaryotes" in aln.query_name:
                skipCount += 1.0/n_alns
                continue
            
            # Ignoring Orphan alignments for now
            if(aln.reference_name != mate_aln.reference_name):
                orphanCount += 1
                print ("WARNING: ORPHANS Detected statistics Neess to be re-evaluated")
                continue
            
            # Progress Monitoring
            if(round(totCount) % mil == 0):
                print ("\r Done reading {} Million reads.".format(int(round(totCount)/1000000)), end="")

            # get ground truth id
            qId = aln.query_name.split('-')[0]
            if "|" in qId:
                qId = qId.split("|")[1]
            elif "_" in qId:
                qId = qId.split("_")[0]
                
            # list of all alignments
            rIds = [get_ref_id(aln)]

            # iterate over all alignments
            for _ in range(1, n_alns):
                aln = f.next()
                mate_aln = f.next()

                # Ignoring Orphan alignments for now
                if(aln.reference_name != mate_aln.reference_name):
                    orphanCount += 1
                else:
                    rIds.append(get_ref_id(aln))
            
            # skip the whole alignment list of it's a Random read
            if "Random" in aln.query_name:
                TN -= 1
                FP += 1
                continue
                
            flag = False
            plist = set([])
            try:
                qId_plm = id2phlm[qId]
                for rId in rIds:
                    rId_plm = id2phlm[rId]
                    plist.add( rId_plm )
                    if len(plist) > 1:
                        break
            except:
                print_details(qId_plm, rIds, aln)

            if(len(plist) == 1 and list(plist)[0] == qId_plm):
                TP += 1
            else:
#                 print_details(qId, rId, aln)
                FP += 1
    
    mmCount = round(totCount - singCount)
    unmapCount = totReads - totCount
    FN = totReads - TP - FP - TN - roseCount - euCount
    sen = TP/float(TP+FN)
    spec = TN/float(TN+FP)
    ppv = TP/float(TP+FP)
    npv = TN/float(TN+FN)
    mcc = ((TP*TN)-(FP*FN)) / math.sqrt( (TP+FP)*(TP+FN)*(TN+FP)*(TN+FN) )
    print ("\n")
    print ("====================================================================================")
    print ("Total Number of reads: {0} ({1:.2f}M)".format(totReads, totReads/mil))    
    print ("Number of Unmapped reads: {0} ({1:.2f}M, {2:.2f}%)".format(unmapCount, unmapCount/mil, unmapCount*hund/totReads))    
    print ("Number of Mapped reads {0}({1:.2f}M, {2:.2f}%)".format(totCount, totCount/mil, totCount*hund/totReads))
    print ("\n")
    print ("============================ OUT OF MAPPED READS ===================================")
    print ("Number of Singly Mapped reads: {0} ({1:.2f}M, {2:.2f}%)".format(singCount, singCount/mil, singCount*hund/totReads))
    print ("Number of Multimapped reads: {0} ({1:.2f}M, {2:.2f}%)".format(mmCount, mmCount/mil, mmCount*hund/totReads))
    print ("Number of Mapped but Skipped reads: {0} ({1:.2f}M, {2:.2f}%)".format(skipCount, skipCount/mil, skipCount*hund/totReads))
    print ("Number of Orphaned (Ignored)ALIGNMENTS (Should be significantly low): {}".format(orphanCount))
    print ("====================================================================================")
    print ("\n ===================== \n ASSUMPTIONS \n =====================")
    print ("1: Any Multi-mapped read has the Original Phyla in ATLEAST 1 alignment")
    print ("2: Don't know what to do with Rose Sequence Ignoring for now")
    print ("3: No reference of Eukaryotes added")
    print ("OVERALL: Atmost 10% Reads could have been mapped more.")
    print ("Eukaryotes Counts: {0}({1:.2f}%)".format(euCount, euCount*hund/totReads))
    print ("Rose Counts: {0}({1:.2f}%)".format(roseCount, roseCount*hund/totReads))
    print ("====================================================================================")
    print ("\n ===================== \n ACCURACY METRIC \n =====================")
    print ("Number of True positives(TP) reads: {0} ({1:.2f}M, {2:.2f}%)".format(TP, TP/mil, TP*hund/totReads))
    print ("Number of False Negatives(FN) reads: {0} ({1:.2f}M, {2:.2f}%)".format(FN, FN/mil, FN*hund/totReads))
    print ("Number of False positives(FP) reads: {0} ({1:.2f}M, {2:.2f}%)".format(FP, FP/mil, FP*hund/totReads))
    print ("Number of True Negatives(TN) reads: {0} ({1:.2f}M, {2:.2f}%)".format(TN, TN/mil, TN*hund/totReads))
    print ("\n ===================== \n ACCURACY METRIC \n =====================")
    print ("Senstivity: {}".format(sen))
    print ("Specificity: {}".format(spec))
    print ("Precision: {}".format(ppv))
    print ("Neg Pred. Value: {}".format(npv))
    print ("MCC: {}".format(mcc))
    print ("====================================================================================")
    return TP,FP,FN,TN
            
TP,FP,FN,TN = get_stats("../bam/A1.sam", "../reads/A1_1.fastq")

 Done reading 19 Million reads.

Total Number of reads: 28912773 (28.00M)
Number of Unmapped reads: 9577588.0 (9.58M, 33.13%)
Number of Mapped reads 19335185.0(19.34M, 66.87%)


Number of Singly Mapped reads: 16087082 (16.00M, 55.00%)
Number of Multimapped reads: 3248103.0 (3.25M, 11.23%)
Number of Mapped but Skipped reads: 43.0 (0.00M, 0.00%)
Number of Orphaned (Ignored)ALIGNMENTS (Should be significantly low): 0

 ASSUMPTIONS 
1: Any Multi-mapped read has the Original Phyla in ATLEAST 1 alignment
2: Don't know what to do with Rose Sequence Ignoring for now
3: No reference of Eukaryotes added
OVERALL: Atmost 10% Reads could have been mapped more.
Eukaryotes Counts: 1445638(4.00%)
Rose Counts: 1445638(4.00%)

 ACCURACY METRIC 
Number of True positives(TP) reads: 19299621 (19.00M, 66.00%)
Number of False Negatives(FN) reads: 903982 (0.00M, 3.00%)
Number of False positives(FP) reads: 35330 (0.00M, 0.00%)
Number of True Negatives(TN) reads: 5782564 (5.00M, 20.00%)


In [5]:
sen = TP/float(TP+FN)
spec = TN/float(TN+FP)
ppv = TP/float(TP+FP)
npv = TN/float(TN+FN)
mcc = ((TP*TN)-(FP*FN)) / math.sqrt( (TP+FP)*(TP+FN)*(TN+FP)*(TN+FN) )
(sen, spec, ppv, mcc)

(0.9552563965942115,
 0.9939273558438844,
 0.9981727390982268,
 0.9050553804552882)

In [16]:
puffMap = (sen, spec, ppv, mcc)

In [17]:
# puffMap = (sen, spec, ppv, mcc)
taxator_tk = (0.5577, 0.7657, 0.8707, 0.2845)
QIIME = (0.0005, 1.0000, 0.9981, 0.0100)
OneCodex = (0.9197, 1.0000, 1.0000, 0.8342)
mOTU = (0.0020, 1.0000, 1.0000, 0.0201)
MG_RAST = (0.7903, 0.9930, 0.9978, 0.6515)
MetaPhlan = (0.0604, 1.0000, 1.0000, 0.1126)
MetaPhyler = (0.0057, 0.9999, 0.9949, 0.0331)
MEGAN = (0.5622, 0.9904, 0.9957, 0.4459)
LMAT = (0.6442, 0.7360, 0.9052, 0.3089)
Kraken = (0.8984, 1.0000, 1.0000, 0.7993)
GOTTCHA = (0.5388, 1.0000, 1.0000, 0.4352)
genomata = (0.4651, 0.9869, 0.9929, 0.3756)
EBI = (0.0006, 0.9984, 0.5884, -0.0146)
CLARK = (1.0000, 0.8081, 0.9528, 0.8775)

In [16]:
%matplotlib inline



In [18]:
from ggplot import *

In [18]:
df = pd.DataFrame([puffMap, taxator_tk, QIIME, OneCodex, mOTU, MG_RAST, MetaPhlan, MetaPhyler, MEGAN, LMAT, Kraken, GOTTCHA, genomata, EBI, CLARK])

In [19]:
df.columns = ['Senstivity', 'Specificity', 'Precision', 'Matthews correlation coefficient ']

In [20]:
df.index = ["PuffMap", "taxator_tk", "QIIME", "OneCodex", "mOTU", "MG_RAST", "MetaPhlan", "MetaPhyler", "MEGAN", "LMAT", "Kraken", "GOTTCHA", "genomata", "EBI", "CLARK"]

In [21]:
# df.to_csv('stats.tsv')

In [22]:
df

Unnamed: 0,Senstivity,Specificity,Precision,Matthews correlation coefficient
PuffMap,0.955256,0.993927,0.998173,0.905055
taxator_tk,0.5577,0.7657,0.8707,0.2845
QIIME,0.0005,1.0,0.9981,0.01
OneCodex,0.9197,1.0,1.0,0.8342
mOTU,0.002,1.0,1.0,0.0201
MG_RAST,0.7903,0.993,0.9978,0.6515
MetaPhlan,0.0604,1.0,1.0,0.1126
MetaPhyler,0.0057,0.9999,0.9949,0.0331
MEGAN,0.5622,0.9904,0.9957,0.4459
LMAT,0.6442,0.736,0.9052,0.3089
