### CONFIGURATION

In [None]:
import os
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import numpy as np
import copy
import sys
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Hartwig imports
sys.path.append("/Users/peterpriestley/hmf/repos/scripts/analysisscripts") 
import analyseVCF as aVCF
import venn as vn

In [None]:
#CHROM SLICING
# Lower and upper bounds on the `chromFrac` column, read by filterByChromFrac().
# Both comparisons there are strict, so rows exactly on a bound are dropped.
# The numbers in the trailing comments and the commented pair below look like
# alternative (narrower) windows - presumably from earlier runs; confirm before reuse.
minChromFrac = 0#17.443476
maxChromFrac = 26#17.491846

#minChromFrac = 17.689695
#maxChromFrac = 17.73896

In [None]:
# BED FILE
# BED_PATH / BED_FILE_NAME are passed to loadVCF(), which hands them to
# aVCF.loadBEDFile() when USE_BED is true.
BED_PATH = "/Users/peterpriestley/hmf/analyses/giabTruthsets/"
BED_FILE_NAME = "NA12878_GIAB_highconf_IllFB-IllGATKHC-CG-Ion-Solid_ALLCHROM_v3.2.2_highconf.bed"
# USE_BED selects the BED-aware branch of loadVCF(); LOAD_FULL_FILE is forwarded
# unchanged to aVCF.loadVariantsFromVCF() in that branch (its exact meaning is
# defined in analyseVCF - not visible here).
USE_BED = True
LOAD_FULL_FILE = True

# COMBINED VCF CONFIG
#VCF_SAMPLE = "12878"
#VCF_PATH = "/Users/peterpriestley/hmf/analyses/NA12878vsNA12878/Jun17/"
#VCF_PATH = "/Users/peterpriestley/hmf/analyses/fpfilterAnalysis/"
#VCF_FILE_NAME = VCF_SAMPLE + "R_"+ VCF_SAMPLE + "T_merged_somatics.vcf"
#VCF_FILE_NAME = "fpNA12878vsNA12878NORMMELT.vcf"
#SAMPLE_NAMES = {VCF_SAMPLE + 'T.mutect':'mutect', \
#                VCF_SAMPLE + 'T.freebayes':'freebayes', \
#                'TUMOR.strelka':'strelka', \
#                'TUMOR.varscan':'varscan'}

#STRELKA ONLY
# PATH is the directory searched for the "*melted.vcf" input further down.
PATH = "/Users/peterpriestley/hmf/analyses/NA12878vsNA12878/Jun17/"
#PATH = "/Users/peterpriestley/hmf/analyses/NA12878vsNA12878/Mar22/"

#VCF_FILE_NAME ="somatic.indels.vcf"
#SAMPLE_NAMES = {'TUMOR':'strelka'}
# TRUTH SET
#SAMPLE_NAMES_TRUTH = {'INTEGRATION':'truth'}
#VCF_PATH_TRUTH = "/Users/peterpriestley/hmf/analyses/giabTruthsets/"
#VCF_FILE_NAME_TRUTH = "NA12878_GIAB_highconf_IllFB-IllGATKHC-CG-Ion-Solid_ALLCHROM_v3.2.2_highconf.vcf"

#MELT PARAMS
#VCF_FILE_NAME = "GIAB12878R_GIAB12878T_melted.vcf"
#SAMPLE_NAMES = {'GIAB12878T':'melted'}

### Functions

In [None]:
def filterByChromFrac(df):
    """Return the rows of df whose chromFrac lies strictly inside the configured window.

    The window is defined by the module-level minChromFrac and maxChromFrac
    settings; rows exactly on either bound are excluded.
    """
    aboveMin = df.chromFrac > minChromFrac
    belowMax = df.chromFrac < maxChromFrac
    return df[aboveMin & belowMax]

In [None]:
def calculateTruth(df,dfTruth):
    """Left-join the truth set onto df by index and flag each row against it.

    Every column whose name ends in 'allele' is examined (this includes
    'truthallele' itself), together with its sibling '<prefix>indelDiff' column.
    Three boolean columns are added to the merged frame:

      hasTP - some examined column agrees with the truth: indelDiff equal to a
              non-null truthindelDiff on INDEL rows, or allele equal to
              truthallele on SNP rows.
      hasFP - some examined column holds a non-empty value that differs from
              the truth (indelDiff on INDEL rows, allele on SNP rows).
      Truth - hasTP and not hasFP.

    dfTruth must provide 'truthallele' and 'truthindelDiff'. Returns the merged
    frame; the frame passed in as df is left untouched.
    """
    merged = pd.merge(df, dfTruth, how='left', left_index=True, right_index=True,
                      suffixes=('', '_Truth'))
    merged['hasTP'] = False
    merged['hasFP'] = False
    alleleColumns = [name for name in list(merged) if name.endswith('allele')]
    for alleleColumn in alleleColumns:
        indelColumn = alleleColumn[:-len('allele')] + 'indelDiff'
        callerIndel = merged[indelColumn]
        callerAllele = merged[alleleColumn]
        truthIndel = merged['truthindelDiff']
        truthAllele = merged['truthallele']
        isIndel = merged['variantType'] == 'INDEL'
        isSNP = merged['variantType'] == 'SNP'

        # Agreement with the truth set for this column.
        indelMatch = (callerIndel == truthIndel) & (~pd.isnull(truthIndel) & isIndel)
        snpMatch = (callerAllele == truthAllele) & isSNP
        merged['hasTP'] = merged['hasTP'] | indelMatch | snpMatch

        # A non-empty call that disagrees with the truth set.
        indelMismatch = (callerIndel != truthIndel) & isIndel & (callerIndel != '')
        snpMismatch = (callerAllele != truthAllele) & isSNP & (callerAllele != '')
        merged['hasFP'] = merged['hasFP'] | indelMismatch | snpMismatch
    merged['Truth'] = merged['hasTP'] & (merged['hasFP'] == False)
    return merged

In [None]:
def calcuatePrecisionSensivityMatrix(df):
    """Summarise true/false positives and precision per caller and variant type.

    A caller is any column named '<caller>allele'; columns ending in '_allele'
    are skipped. A row counts as called by that caller when the column is not
    the empty string. For each caller and each variantType it called, one
    output row is produced with:

      truthSet        always 0 (placeholder)
      truePositives   called rows where df['Truth'] is True
      falsePositives  called rows minus truePositives
      falseNegatives  truthSet - truePositives (so never positive)
      precision       truePositives / called rows, rounded to 4 places
      sensitivity     always 0 (placeholder)

    Returns a DataFrame sorted by variantType then caller. When no caller
    column yields any rows, an empty frame with the same columns is returned.
    """
    outputColumns = ['variantType','caller','truthSet','truePositives',
                     'falsePositives','falseNegatives','precision','sensitivity']
    outputdata = []
    for columnName in list(df):
        # Use `and`/`not` semantics: bitwise `&`/`~` on Python bools relies on
        # integer arithmetic and `~bool` is deprecated.
        if not columnName.endswith('allele') or columnName.endswith('_allele'):
            continue
        myCaller = columnName[:-len('allele')]
        called = df[columnName] != ''
        for variantType in df[called].variantType.unique():
            calledOfType = called & (df['variantType'] == variantType)
            positives = int(calledOfType.sum())
            truePositives = int((calledOfType & (df['Truth'] == True)).sum())
            truthSet = 0
            falseNegatives = truthSet - truePositives
            if positives > 0:
                outputdata.append([variantType, myCaller, truthSet, truePositives,
                                   positives - truePositives, falseNegatives,
                                   round(truePositives / float(positives), 4), 0])

    # Naming the columns in the constructor keeps the no-rows case from
    # failing with a column-length mismatch.
    outputDF = pd.DataFrame(outputdata, columns=outputColumns)
    return outputDF.sort_values(['variantType','caller'])

In [None]:
def consensusAlt(a1,a2,a3,a4):
    """Return the first non-empty allele among a1, a2 and a3, else a4.

    a4 is returned unchanged (even if it is "") when the first three are all
    empty strings.
    """
    # `!=` works on both Python 2 and 3; the `<>` spelling is a SyntaxError on Python 3.
    for allele in (a1, a2, a3):
        if allele != "":
            return allele
    return a4

In [None]:
def loadPON(aPath,aPONFile):
    """Load a tab-separated PON file into a DataFrame of strings.

    aPath and aPONFile are joined by plain string concatenation, so aPath must
    already end with a path separator. Each line is stripped of its newline and
    split on tabs; the fields are labelled chrom, pos, ref, alt, PONCount and
    are left as strings.

    An empty file yields an empty frame that still carries those five columns.
    """
    ponColumns = ['chrom','pos','ref','alt','PONCount']
    with open(aPath + aPONFile, 'r') as f:
        myPON = [line.strip('\n').split('\t') for line in f]
    # Column names go to the constructor so that zero rows do not trigger a
    # length-mismatch error on a later `.columns` assignment.
    return pd.DataFrame(myPON, columns=ponColumns)

In [None]:
def loadPONvcf(PONFile):
    """Read a PON VCF into a DataFrame and pull the count out of INFO.

    The leading lines that start with '#' are counted so that all of them
    except the last (the '#CHROM ...' column header) are skipped. The rest is
    read as a tab-separated table with '#CHROM' and 'POS' kept as strings.

    Adds a 'PON_COUNT' column holding the second '='-separated piece of INFO
    (left as a string) and renames #CHROM/POS/REF/ALT to chrom/pos/ref/alt.
    A VCF that contains only header lines yields an empty frame.
    """
    numHeaderRows = 0
    with open(PONFile) as fp:
        # readline() returns '' at end-of-file; startswith() copes with that,
        # whereas indexing [0] raises IndexError on a header-only file.
        while fp.readline().startswith('#'):
            numHeaderRows += 1
    # Never pass a negative skip count when the file has no '#' lines at all.
    dfPON = pd.read_table(PONFile, skiprows=max(numHeaderRows - 1, 0),
                          dtype={'#CHROM':'str','POS':'str'})
    dfPON['PON_COUNT'] = dfPON['INFO'].apply(lambda info: info.split('=')[1])
    dfPON.rename(columns={'#CHROM': 'chrom', 'POS': 'pos','REF':'ref','ALT':'alt'}, inplace=True)
    return dfPON

In [None]:
def loadVCF(path,filename,sampleNames,patientID,bedPath,bedFileName):
    """Load one VCF through analyseVCF and return the variants as a DataFrame.

    With the module-level USE_BED flag set, the BED file named by
    bedPath/bedFileName is loaded first and passed to the variant loader along
    with LOAD_FULL_FILE; otherwise the variants are loaded without a BED and
    bedPath/bedFileName are ignored. The loader's output is concatenated onto
    an empty DataFrame before being returned.
    """
    emptyFrame = pd.DataFrame()
    if not USE_BED:
        variants = aVCF.loadVariantsFromVCF(path,filename,sampleNames,patientID,True)
    else:
        bed = aVCF.loadBEDFile(bedPath,bedFileName)
        variants = aVCF.loadVariantsFromVCF(path,filename,sampleNames,patientID,True,True,bed,LOAD_FULL_FILE)
    return pd.concat([emptyFrame, variants])

In [None]:
def findFiles(path,suffix):
    """Return the names of the entries in path that end with suffix, sorted.

    Only bare names are returned, not full paths. The list is sorted because
    os.listdir() gives entries in arbitrary order, which would make indexing
    the result (e.g. taking element 0) non-deterministic. An empty suffix
    matches every entry.
    """
    return sorted(name for name in os.listdir(path) if name.endswith(suffix))

### Load VCFs and Prepare DF

In [None]:
# Use the first name returned by findFiles() for files in PATH ending in
# "melted.vcf"; the [0] raises IndexError when no such file exists.
somaticVCFFilename = findFiles(PATH,"melted.vcf")[0]
patientID = "GIAB12878"

In [None]:
# Load the melted VCF, mapping sample 'GIAB12878T' to the label 'melted'
# (presumably the source of the 'melted*' column prefix used in later cells -
# confirm against analyseVCF).
df = loadVCF(PATH,somaticVCFFilename,{'GIAB12878T':'melted'},patientID,BED_PATH,BED_FILE_NAME)

### PON

In [None]:
# PON variants with chrom/pos/ref/alt columns plus a string PON_COUNT column.
dfPON = loadPONvcf("/Users/peterpriestley/hmf/analyses/PON/PON.vcf")

In [None]:
# Turn the index back into ordinary columns before joining on column names
# (assumes chrom/pos/ref come out of the index or already exist - confirm
# against analyseVCF).
df= df.reset_index()
# The PON is keyed on the alternate allele; use the melted caller's allele as that key.
df['alt'] = df['meltedallele']
df = pd.merge(df,dfPON,how='left', on=['chrom','pos','ref','alt'])
# Variants missing from the PON get a count of 0. The result is assigned back
# because fillna(inplace=True) on a column selection may act on a temporary
# copy and leave df unchanged.
df['PON_COUNT'] = df['PON_COUNT'].fillna(0)
# `errors` takes the string 'coerce' (a bare name is a NameError); counts that
# cannot be parsed become NaN, which compares as False here.
df['inPON'] = pd.to_numeric(df.PON_COUNT, errors='coerce') > 4

### PRECISION + SENSITIVITY

In [None]:
# Recompute inPON with a stricter cut-off (PON_COUNT > 6) for the tables below.
# `errors` takes the string 'coerce'; a bare `coerce` name raises NameError.
df['inPON'] = pd.to_numeric(df.PON_COUNT, errors='coerce') > 6

In [None]:
#PRECISION + SENSITIVITY by caller - PROD
# Truth is forced to False for every row (calculateTruth is not called in the
# cells shown), so the matrix reports zero true positives and a precision of 0
# for every caller; the useful output is the count of calls per caller/type.
df['Truth']=False
outputDF = calcuatePrecisionSensivityMatrix(df)#[df.dbSNP==True])
outputDF

In [None]:
# Count consensus rows per variantType, split by bedRegion and inPON.
df[df.consensus == True].pivot_table(
    values='pos',
    index=['variantType'],
    columns=['bedRegion', 'inPON'],
    aggfunc='count')

In [None]:
# Count rows per variantType, split by bedRegion and inPON, keeping rows whose
# vennSegment contains 'Intersection' OR (contains 'strelka' AND meltedQS > 20
# AND meltedAF > 0.1).
# NOTE(review): `&` binds tighter than `|`, so the QS/AF cut-offs apply only to
# the strelka branch. The grouping below spells out how the expression has
# always evaluated - confirm that this is the intended filter.
df[
    df.vennSegment.str.contains('Intersection')
    | (
        df.vennSegment.str.contains('strelka')
        & (df.meltedQS > 20)
        & (df.meltedAF > 0.1)
    )
].pivot_table(
    values='pos',
    index=['variantType'],
    columns=['bedRegion', 'inPON'],
    aggfunc='count')

In [None]:
# Count rows per variantType, split by bedRegion and inPON, keeping rows whose
# vennSegment contains 'Intersection' OR (contains 'strelka' AND meltedQS > 0).
# NOTE(review): `&` binds tighter than `|`, so the QS cut-off applies only to
# the strelka branch. The grouping below spells out how the expression has
# always evaluated - confirm that this is the intended filter.
df[
    df.vennSegment.str.contains('Intersection')
    | (
        df.vennSegment.str.contains('strelka')
        & (df.meltedQS > 0)
    )
].pivot_table(
    values='pos',
    index=['variantType'],
    columns=['bedRegion', 'inPON'],
    aggfunc='count')

In [None]:
# Count rows per inDBSNP and variantType, split by bedRegion and inPON, keeping
# rows whose vennSegment contains 'Intersection' or contains both 'strelka'
# and 'mutect'.
df[
    df.vennSegment.str.contains('Intersection')
    | (
        df.vennSegment.str.contains('strelka')
        & df.vennSegment.str.contains('mutect')
    )
].pivot_table(
    values='pos',
    index=['inDBSNP', 'variantType'],
    columns=['bedRegion', 'inPON'],
    aggfunc='count')

In [None]:
# Count all rows per variantType / numCallers / vennSegment, split by
# bedRegion and inPON.
df.pivot_table(
    values='pos',
    index=['variantType', 'numCallers', 'vennSegment'],
    columns=['bedRegion', 'inPON'],
    aggfunc='count')

In [None]:
# dftemp is another name for df (not a copy); the reset_index() step is disabled.
dftemp = df#.reset_index()
# Count consensus rows per variantType, split by inPON only.
dftemp[dftemp.consensus == True].pivot_table(
    values='pos',
    index=['variantType'],
    columns=['inPON'],
    aggfunc='count')

### INDIVIDUAL VARIANTS

In [None]:
# Allow up to 500 rows to be displayed so the variant listing below is not truncated early.
pd.set_option('display.max_rows', 500)

In [None]:
# List individual SNP rows whose vennSegment contains 'Intersection' or
# 'strelka' and that are not flagged inPON.
df[
    (
        df.vennSegment.str.contains('Intersection')
        | df.vennSegment.str.contains('strelka')
    )
    & (df.variantType == 'SNP')
    & (df.inPON == False)
]