### CONFIGURATION

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import numpy as np
import copy
import sys
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Hartwig imports
# The project-local modules below are resolved via this sys.path entry.
# NOTE(review): this is a hardcoded absolute path to one user's checkout — the
# notebook will fail at the imports on any machine without this layout.
sys.path.append("/Users/peterpriestley/hmf/repos/scripts/analysisscripts") 
import analyseVCF as aVCF
import venn as vn

In [None]:
# CHROM SLICING
# Exclusive lower / upper bounds applied to the `chromFrac` column by
# filterByChromFrac (rows are kept when minChromFrac < chromFrac < maxChromFrac).
minChromFrac, maxChromFrac = 0, 26

In [None]:
# BED FILE
BED_PATH = "/Users/peterpriestley/hmf/analyses/giabTruthsets/70-30mixin/"
BED_FILE_NAME = "na12878-na24385-somatic-truth-regionsSORTED.bed"

# TRUTH SET
# presumably maps the VCF sample column name to the column prefix used after loading — verify against aVCF
SAMPLE_NAMES_TRUTH = dict(NA12878='truth')
VCF_PATH_TRUTH = "/Users/peterpriestley/hmf/analyses/giabTruthsets/70-30mixin/"
VCF_FILE_NAME_TRUTH = "na12878-na24385-somatic-truth.vcf"

# COMBINED VCF CONFIG
VCF_SAMPLE = "CPCT11111111"
VCF_PATH = "/Users/peterpriestley/hmf/analyses/70-30sample/170329_StrelkaNoRepeatFilter/"
# File name pattern: <sample>R_<sample>T_merged_somatics.vcf
VCF_FILE_NAME = "{0}R_{0}T_merged_somatics.vcf".format(VCF_SAMPLE)
SAMPLE_NAMES = {
    VCF_SAMPLE + 'T.mutect': 'mutect',
    VCF_SAMPLE + 'T.freebayes': 'freebayes',
    'TUMOR.strelka': 'strelka',
    'TUMOR.varscan': 'varscan',
}

# Alternative inputs kept for reference; uncomment a VCF_FILE_NAME / SAMPLE_NAMES pair to switch.
#VCF_FILE_NAME = VCF_SAMPLE + "R_"+ VCF_SAMPLE + "T_melted.vcf"#_somatic_filtered.vcf"#
#SAMPLE_NAMES = {VCF_SAMPLE + 'T':'melted'}
#VCF_FILE_NAME = "all.somatic.indels.vcf"
#SAMPLE_NAMES = {'TUMOR':'strelka'}

### Functions

In [None]:
def filterByChromFrac(df, minFrac=None, maxFrac=None):
    """Return the rows of ``df`` whose ``chromFrac`` lies strictly between two bounds.

    Both bounds are exclusive. When a bound is omitted (``None``) the
    notebook-level ``minChromFrac`` / ``maxChromFrac`` setting is used, so
    existing single-argument calls behave exactly as before.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``chromFrac`` column.
    minFrac, maxFrac : number, optional
        Explicit lower / upper bound overriding the notebook-level setting.

    Returns
    -------
    pandas.DataFrame
        Boolean-mask selection of ``df``.
    """
    # Resolve the globals only when needed so explicit bounds work without them.
    lowerBound = minChromFrac if minFrac is None else minFrac
    upperBound = maxChromFrac if maxFrac is None else maxFrac
    return df[(df.chromFrac > lowerBound) & (df.chromFrac < upperBound)]

In [None]:
def calculateTruth(df,dfTruth):
    """Left-join caller output onto the truth set and flag each row as TP / FP.

    ``df`` and ``dfTruth`` are joined on their indexes (left join, so every row
    of ``df`` is kept); clashing truth columns get the ``_Truth`` suffix.
    Every column ending in ``allele`` that does not start with ``truth`` is
    treated as one caller, with a sibling ``<caller>indelDiff`` column.

    Added columns:
      hasTP -- some caller matches the truth (indelDiff for INDEL rows with a
               non-null truth indelDiff, allele for SNP rows).
      hasFP -- some caller made a non-empty call that differs from a non-null
               truth allele (compared on indelDiff for INDEL, allele for SNP).
      Truth -- hasTP and not hasFP.

    Returns the merged frame; the inputs are not modified.
    """
    merged = pd.merge(df, dfTruth, how='left', left_index=True, right_index=True,
                      suffixes=('', '_Truth'))
    merged['hasTP'] = False
    merged['hasFP'] = False

    # Caller prefixes, e.g. 'mutectallele' -> 'mutect' (strip the 6-char 'allele' suffix).
    callerPrefixes = [name[:-6] for name in list(merged)
                      if name.endswith('allele') and not name.startswith('truth')]

    truePositive = merged['hasTP']
    falsePositive = merged['hasFP']
    for prefix in callerPrefixes:
        callerAllele = merged[prefix + 'allele']
        callerIndelDiff = merged[prefix + 'indelDiff']
        truthAllele = merged['truthallele']
        truthIndelDiff = merged['truthindelDiff']
        isIndel = merged['variantType'] == 'INDEL'
        isSnp = merged['variantType'] == 'SNP'
        truthAlleleKnown = ~pd.isnull(truthAllele)

        indelMatch = (callerIndelDiff == truthIndelDiff) & (~pd.isnull(truthIndelDiff) & isIndel)
        snpMatch = (callerAllele == truthAllele) & isSnp
        truePositive = truePositive | indelMatch | snpMatch

        # A mismatch only counts when the caller actually called something ('' means no call)
        # and the position exists in the truth set.
        indelMismatch = (callerIndelDiff != truthIndelDiff) & isIndel \
            & (callerIndelDiff != '') & truthAlleleKnown
        snpMismatch = (callerAllele != truthAllele) & isSnp \
            & (callerAllele != '') & truthAlleleKnown
        falsePositive = falsePositive | indelMismatch | snpMismatch

    merged['hasTP'] = truePositive
    merged['hasFP'] = falsePositive
    merged['Truth'] = truePositive & (falsePositive == False)
    return merged

In [None]:
def calcuatePrecisionSensivityMatrix(df, truthDF=None):
    """Build a per-caller, per-variant-type precision / sensitivity table.

    Every column of ``df`` ending in ``allele`` (except ``truthallele``) is
    treated as one caller; a row counts as called by that caller when the
    column is not the empty string.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``variantType``, ``Truth`` and the ``<caller>allele`` columns.
    truthDF : pandas.DataFrame, optional
        Truth set with a ``variantType`` column, used for the ``truthSet``
        denominator. Defaults to the notebook-level ``dfTruth`` so existing
        single-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns variantType, caller, truthSet, truePositives, falsePositives,
        falseNegatives, precision, sensitivity; sorted by variantType, caller.
        Combinations with no calls or no truth entries are omitted. The frame
        is empty (but still has these columns) when nothing qualifies.
    """
    if truthDF is None:
        truthDF = dfTruth
    outputColumns = ['variantType','caller','truthSet','truePositives','falsePositives',
                     'falseNegatives','precision','sensitivity']
    outputdata = []
    for columnName in list(df):
        # Plain boolean logic: the previous `&` / `~` on Python bools only worked by
        # integer arithmetic accident and `~bool` is deprecated in recent Pythons.
        if not columnName.endswith('allele') or columnName.endswith('truthallele'):
            continue
        myCaller = columnName[:-6]  # strip the 'allele' suffix
        called = df[df[columnName] != '']
        for variantType in called.variantType.unique():
            calledOfType = called[called['variantType'] == variantType]
            positives = len(calledOfType)
            truePositives = len(calledOfType[calledOfType['Truth'] == True])
            truthSet = len(truthDF[truthDF['variantType'] == variantType])
            falseNegatives = truthSet - truePositives
            if positives > 0 and truthSet > 0:
                outputdata.append([variantType, myCaller, truthSet, truePositives,
                                   positives - truePositives, falseNegatives,
                                   round(truePositives / float(positives), 4),
                                   round(truePositives / float(truthSet), 4)])

    # Passing `columns=` to the constructor keeps this valid when outputdata is empty;
    # assigning `.columns` afterwards raised a length-mismatch ValueError in that case.
    outputDF = pd.DataFrame(outputdata, columns=outputColumns)
    return outputDF.sort_values(['variantType','caller'])

### Load VCFs and Prepare DF

In [None]:
## LOAD TRUTH SET VCF
bed = aVCF.loadBEDFile(BED_PATH, BED_FILE_NAME)
# Load -> restrict to the configured chromFrac window -> keep only the columns
# needed for matching -> index by (chrom, pos) so calculateTruth can join on the index.
dfTruth = (
    filterByChromFrac(
        aVCF.loadVariantsFromVCF(VCF_PATH_TRUTH, VCF_FILE_NAME_TRUTH,
                                 SAMPLE_NAMES_TRUTH, "Mix-in Truth Set", True, True, bed)
    )[['chrom','pos','variantType','ref','truthallele','truthindelDiff','bedRegion']]
    .set_index(['chrom','pos'])
)

In [None]:
# LOAD SAMPLE VCF + match to truth set
bed = aVCF.loadBEDFile(BED_PATH, BED_FILE_NAME)
# Load -> restrict to the configured chromFrac window -> index by (chrom, pos)
# -> join against dfTruth, which adds the hasTP / hasFP / Truth columns.
dfProd = calculateTruth(
    filterByChromFrac(
        aVCF.loadVariantsFromVCF(VCF_PATH, VCF_FILE_NAME, SAMPLE_NAMES, VCF_SAMPLE, False, True, bed)
    ).set_index(['chrom','pos']),
    dfTruth
)

### PRECISION + SENSITIVITY

In [None]:
# Build the per-caller / per-variant-type precision and sensitivity table from
# the truth-annotated calls; the bare name on the last line displays it.
outputDF = calcuatePrecisionSensivityMatrix(dfProd)
outputDF

In [None]:
# Row counts per `filter` value, split into (variantType, Truth) columns.
# reset_index() turns the (chrom, pos) index back into columns so 'pos' can be counted.
dftemp2 = dfProd.reset_index()
dftemp2.pivot_table(values='pos', index=['filter'], columns=['variantType','Truth'], aggfunc='count')

In [None]:
# Row counts per (consensus, filter) pair, split into (variantType, Truth) columns.
dftemp2.pivot_table(values='pos', index=['consensus','filter'], columns=['variantType','Truth'], aggfunc='count')

In [None]:
# Row counts per numCallers value for SNP and INDEL rows only, split into (variantType, Truth) columns.
dftemp2 = dfProd.reset_index()
dftemp2[dftemp2.variantType.isin(['INDEL', 'SNP'])].pivot_table(
    values='pos', index=['numCallers'], columns=['variantType','Truth'], aggfunc='count')

In [None]:
# Number of rows flagged hasFP.
dftemp2[dftemp2['hasFP'] == True].shape[0]

In [None]:
# Row counts per (hasFP, numCallers) pair, split into (variantType, Truth) columns.
dftemp2.pivot_table(values='pos', index=['hasFP','numCallers'], columns=['variantType','Truth'], aggfunc='count')

### SNP

In [None]:
# SNP-only subset of the truth-annotated calls.
snpdf = dfProd.loc[dfProd['variantType'] == 'SNP']

In [None]:
# For each caller, pass vn.venn the chromPos values of the SNP rows where that
# caller's allele column is non-empty; labels are given in the same order.
vennCallers = ['mutect','strelka','freebayes','varscan']
vn.venn([snpdf[snpdf[caller + 'allele'] != '']['chromPos'] for caller in vennCallers],
        list(vennCallers), figsize=(6,6))

### Allelic Depth

In [None]:
# Allele Freq By Caller
# One cumulative-distribution curve per Truth value, using the strelkaAF column of
# rows whose filter is 'QSI_ref'.
for truth in (True, False):
    ser = dfProd.loc[(dfProd.Truth == truth) & (dfProd['filter'] == 'QSI_ref'), 'strelkaAF'].sort_values()
    # Evenly spaced 0..1 values paired with the sorted frequencies give the empirical CDF.
    cum_dist = np.linspace(0., 1., len(ser))
    # Legend label: "<truth>: c=<non-null count> m=<median rounded to 2 d.p.>"
    curveName = "%s: c=%s m=%s" % (truth, ser.count(), round(ser.median(), 2))
    ser_cdf = pd.Series(cum_dist, index=ser, name=curveName)
    ser_cdf.plot(drawstyle='steps', legend=True,
                 title=" Allelic Frequency by Caller (AllelicFreq > 0.0)",
                 figsize=[15,6], xlim=[0,1])

In [None]:
truths = [True,False]
# One cumulative-distribution curve per (Truth value, column ending in 'strelkaQS'),
# restricted to rows whose filter is 'QSI_ref'.
# NOTE(review): the plot title still says "Allelic Frequency" although the plotted
# column is *strelkaQS — looks copy-pasted from the previous cell; confirm.
for truth in truths:
    for columnName in list(dfProd):
        if not columnName.endswith('strelkaQS'):
            continue
        ser = dfProd.loc[(dfProd.Truth == truth) & (dfProd['filter'] == 'QSI_ref'), columnName].sort_values()
        cum_dist = np.linspace(0., 1., len(ser))
        # Legend label: "<column minus 'QS'>: c=<non-null count> <truth> m=<median rounded to 2 d.p.>"
        curveName = "%s: c=%s %s m=%s" % (columnName[:-2], ser.count(), truth, round(ser.median(), 2))
        ser_cdf = pd.Series(cum_dist, index=ser, name=curveName)
        ser_cdf.plot(drawstyle='steps', legend=True,
                     title=" Allelic Frequency by Caller (AllelicFreq > 0.0)",
                     figsize=[15,6], xlim=[0,100], ylim=[0,1])

### SCRATCH

In [None]:
# First 20 INDEL rows with Truth == False that freebayes called (non-empty allele)
# and whose chromPos contains '1:'.
# `!=` replaces the Python-2-only `<>` operator, which is a SyntaxError on Python 3.
# NOTE(review): contains('1:') also matches e.g. '11:' / '21:' if chromPos is
# formatted "<chrom>:<pos>" — confirm whether only chromosome 1 was intended.
dfProd[(dfProd.variantType=='INDEL')&(dfProd.Truth==False)&(dfProd.freebayesallele!='')&(dfProd.chromPos.str.contains('1:'))].head(20)

In [None]:
# First 200 rows with Truth == False whose vennSegment contains 'mutect'.
# NOTE(review): `dftemp` is never assigned in the visible notebook (only `dftemp2`
# is) and no visible cell creates a `vennSegment` column — this cell presumably
# relies on leftover kernel state and would fail on Restart & Run All; confirm.
dftemp[(dftemp.vennSegment.str.contains('mutect'))&(dftemp.Truth==False)].head(200)

In [None]:
# Sets the pandas display.max_rows option to 500; this is a session-wide display
# setting and stays in effect for every later cell.
pd.set_option('display.max_rows', 500)
# First 40 rows with Truth == True whose vennSegment contains 'mutect' or equals 'Intersection'.
# NOTE(review): `dftemp` is never assigned in the visible notebook — presumably
# leftover kernel state; confirm before relying on this cell.
dftemp[((dftemp.vennSegment.str.contains('mutect'))|(dftemp.vennSegment=='Intersection'))&(dftemp.Truth==True)].head(40)

In [None]:
# First 40 rows of dftemp2 with Truth == True whose vennSegment contains 'mutect'
# or equals 'Intersection'.
# NOTE(review): no visible cell adds a `vennSegment` column — presumably it comes
# from the aVCF loader; verify, otherwise this raises AttributeError.
dftemp2[((dftemp2.vennSegment.str.contains('mutect'))|(dftemp2.vennSegment=='Intersection'))&(dftemp2.Truth==True)].head(40)

In [None]:
# Row count of dftemp.
# NOTE(review): `dftemp` is never assigned in the visible notebook — presumably
# leftover kernel state; this raises NameError on a fresh kernel. Confirm.
len(dftemp)

In [None]:
# Row count of dftemp2 (the truth-annotated calls with the index reset to columns).
dftemp2.shape[0]