### CONFIGURATION

In [1]:
# Dependencies used throughout the notebook.
import pandas as pd
# Raise pandas' display limits (up to 500 columns, 1000 characters wide) so the
# wide per-caller variant frames are shown without truncation.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import numpy as np
import copy
import sys
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
# Hartwig imports
# The project modules are located by appending a hard-coded directory to
# sys.path before importing them.
# NOTE(review): the path is an absolute, user-specific location; the notebook
# will not import analyseVCF / venn on another machine unless this is adjusted.
sys.path.append("/Users/peterpriestley/hmf/repos/scripts/analysisscripts") 
import analyseVCF as aVCF
import venn as vn

In [3]:
# CHROM SLICING
# Open interval (min, max) applied to the `chromFrac` column of every loaded
# frame; rows outside it are dropped.
# NOTE(review): chromFrac looks like "chromosome number + fractional position"
# (e.g. 17.70 for a locus on chromosome 17) - confirm against analyseVCF.
minChromFrac, maxChromFrac = 17.689695, 17.73896
# Alternative window kept for reference (presumably the whole genome):
#minChromFrac, maxChromFrac = 0, 26

In [4]:
# COMBINED VCF CONFIG
# Sample identifier; the merged VCF file name is derived from it
# (<sample>R_<sample>T_merged_somatics.vcf).
VCF_SAMPLE = "CPCT02010267"
VCF_PATH = "/Users/peterpriestley/hmf/analyses/cancerSample/"
VCF_FILE_NAME = "".join([VCF_SAMPLE, "R_", VCF_SAMPLE, "T_merged_somatics.vcf"])
# Maps each sample column name found in the merged VCF to the short caller
# name used as a column prefix in the loaded frames.
SAMPLE_NAMES = dict([
    (VCF_SAMPLE + 'T.mutect', 'mutect'),
    (VCF_SAMPLE + 'T.freebayes', 'freebayes'),
    ('TUMOR.strelka', 'strelka'),
    ('TUMOR.varscan', 'varscan'),
])

<h3>Load VCFs and Prepare DF</h3>

In [5]:
# LOAD PROD VCF and filter
# Load the merged production VCF, then keep only the variants whose chromFrac
# lies strictly inside the (minChromFrac, maxChromFrac) window.
# NOTE(review): the meaning of the trailing True/False flags is defined in
# analyseVCF.loadVaraintsFromVCF - confirm there before changing them.
df = aVCF.loadVaraintsFromVCF(VCF_PATH,VCF_FILE_NAME,SAMPLE_NAMES,VCF_SAMPLE,True,False)
# Take an explicit copy: a later cell adds a 'Slice' column to `df`, and
# assigning into a boolean-filtered selection raises SettingWithCopyWarning.
df = df[(df.chromFrac > minChromFrac)&(df.chromFrac < maxChromFrac)].copy()


reading vcf file: CPCT02010267R_CPCT02010267T_merged_somatics.vcf
Number variants loaded: 44651


<h3>SLICE ANALYSIS</h3>

In [10]:
# SLICE SETTINGS
# The slice VCF lives in a per-sample directory and uses the same
# sample-name -> caller mapping as the production VCF.
SLICE_VCF_PATH = "/Users/peterpriestley/hmf/slice" + VCF_SAMPLE + "/"
SLICE_VCF_FILE_NAME = "somatics.merged.vcf"
SLICE_SAMPLE_NAMES = SAMPLE_NAMES
dfSlice = aVCF.loadVaraintsFromVCF(
    SLICE_VCF_PATH,
    SLICE_VCF_FILE_NAME,
    SLICE_SAMPLE_NAMES,
    VCF_SAMPLE,
    True,
    False,
)
# Cross-reference the two call sets on chromPos:
#   dfSlice.Prod -> the slice variant is also present in the production frame
#   df.Slice     -> the production variant is also present in the slice frame
dfSlice['Prod'] = dfSlice['chromPos'].isin(df['chromPos'])
df['Slice'] = df['chromPos'].isin(dfSlice['chromPos'])
# Only now restrict the slice frame to the same chromFrac window as `df`.
dfSlice = dfSlice[(dfSlice['chromFrac'] > minChromFrac) & (dfSlice['chromFrac'] < maxChromFrac)]

reading vcf file: somatics.merged.vcf
Number variants loaded: 281


In [11]:
#PRECISION + SENSITIVITY by caller
STATS_COLUMNS = ['varType','caller','truth','TP','FP','FN','precision','sensitivity']


def _callerStats(calls, truth, confirmedColumn=None):
    """Tabulate precision and sensitivity per (variant type, caller).

    calls           -- frame with one '<caller>allele' column per caller
                       ('' means the caller made no call) and a 'variantType'
                       column.
    truth           -- frame whose row count per 'variantType' is the size of
                       the truth set for that type.
    confirmedColumn -- name of a boolean column of `calls` flagging calls that
                       are confirmed by the truth set.  None means every call
                       counts as a true positive (the calls ARE the truth).

    Returns a DataFrame with the columns listed in STATS_COLUMNS.  When no
    caller produced any call the frame is empty but still carries those
    columns.
    """
    rows = []
    for columnName in list(calls):
        if not columnName.endswith('allele'):
            continue
        callerName = columnName[:-len('allele')]
        called = calls[calls[columnName] != '']
        for variantType in called['variantType'].unique():
            calledOfType = called[called['variantType'] == variantType]
            positives = len(calledOfType)
            if positives == 0:
                continue
            if confirmedColumn is None:
                truePositives = positives
            else:
                truePositives = len(calledOfType[calledOfType[confirmedColumn] == True])
            truthSet = len(truth[truth['variantType'] == variantType])
            falseNegatives = truthSet - truePositives
            precision = round(truePositives/float(positives),4)
            # A variant type that never occurs in the truth set has no defined
            # sensitivity; report NaN instead of dividing by zero.
            if truthSet > 0:
                sensitivity = round(truePositives/float(truthSet),4)
            else:
                sensitivity = float('nan')
            rows.append([variantType, callerName, truthSet, truePositives,
                         positives-truePositives, falseNegatives,
                         precision, sensitivity])
    # Passing the column names to the constructor keeps this working when
    # `rows` is empty (assigning .columns afterwards would raise).
    return pd.DataFrame(rows, columns=STATS_COLUMNS)


# Production calls measured against themselves: every call is a true positive,
# so precision is 1.0 by construction and only sensitivity is informative.
outputDF = _callerStats(df, df)

#PRECISION + SENSITIVITY by caller - SLICE
# Slice calls measured against the production set; a slice call is a true
# positive when its 'Prod' flag is set.
outputSliceDF = _callerStats(dfSlice, df, confirmedColumn='Prod')


In [12]:
# Production statistics, ordered by variant type and then caller.
outputDF.sort_values(by=['varType', 'caller'])


Unnamed: 0,varType,caller,truth,TP,FP,FN,precision,sensitivity
2,INDEL,varscan,1,1,0,0,1.0,1.0
4,SNP,freebayes,40,7,0,33,1.0,0.175
3,SNP,mutect,40,30,0,10,1.0,0.75
0,SNP,strelka,40,8,0,32,1.0,0.2
1,SNP,varscan,40,2,0,38,1.0,0.05


In [13]:
# Slice statistics, ordered by variant type and then caller.
outputSliceDF.sort_values(by=['varType', 'caller'])

Unnamed: 0,varType,caller,truth,TP,FP,FN,precision,sensitivity
5,INDEL,freebayes,1,0,126,1,0.0,0.0
1,INDEL,strelka,1,0,6,1,0.0,0.0
2,INDEL,varscan,1,1,11,0,0.0833,1.0
6,SNP,freebayes,40,10,75,30,0.1176,0.25
4,SNP,mutect,40,22,5,18,0.8148,0.55
0,SNP,strelka,40,10,26,30,0.2778,0.25
3,SNP,varscan,40,7,6,33,0.5385,0.175


In [22]:
# First five slice variants located beyond chromFrac 17.7045.
dfSlice[dfSlice['chromFrac'] > 17.7045].head(n=5)

Unnamed: 0,chrom,pos,chromPos,chromFrac,ref,vennSegment,numCallers,variantType,variantSubType,filter,strelkaallele,strelkaallelicFreq,strelkareadDepth,strelkaqualityScore,strelkasomaticGenotype,varscanallele,varscanallelicFreq,varscanreadDepth,varscanqualityScore,varscansomaticGenotype,mutectallele,mutectallelicFreq,mutectreadDepth,mutectqualityScore,mutectsomaticGenotype,freebayesallele,freebayesallelicFreq,freebayesreadDepth,freebayesqualityScore,freebayessomaticGenotype,patientName,Prod
97,17,57231533,17:57231533,17.704863,GAAAAAAAAAAAAAAAGAAAAGAAAAGAAAAAAAAGATAATGCT,,1,INDEL,INSERT,.,,,,,,,,,,,,,,,,AAAAAAAAAAAAAAAAGAAAAGAAAAGAAAAAAAAGATAATGCT,0.75,58.0,23.1937,somatic,CPCT02010267,False
98,17,57238793,17:57238793,17.704953,T,,1,SNP,,PASS,A,0.137681,146.0,17.0,ref,,,,,,,,,,,,,,,,CPCT02010267,False
99,17,57254045,17:57254045,17.705141,CTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGAGAAAAAGTC,,1,INDEL,DELETE,.,,,,,,,,,,,,,,,,TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGAGAAAAAGTC,0.8,35.0,0.45111,somatic,CPCT02010267,False
100,17,57257880,17:57257880,17.705188,GTTTTTTTTGCC,,1,INDEL,INSERT,.,,,,,,,,,,,,,,,,GTTTTTTTTGTTTTTTTTTGCC,0.983333,75.0,42.9374,somatic,CPCT02010267,False
101,17,57263742,17:57263742,17.70526,ATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTAGACAGAATC,,1,INDEL,DELETE,.,,,,,,,,,,,,,,,,ATTTTTTTTTTTTTTTTTTTTTAGACAGAATC,1.0,40.0,20.9204,somatic,CPCT02010267,False


In [16]:
# Production variants for which mutect reported an allele.
df[df['mutectallele'] != '']

Unnamed: 0,chrom,pos,chromPos,chromFrac,ref,vennSegment,numCallers,variantType,variantSubType,filter,strelkaallele,strelkaallelicFreq,strelkareadDepth,strelkaqualityScore,strelkasomaticGenotype,varscanallele,varscanallelicFreq,varscanreadDepth,varscanqualityScore,varscansomaticGenotype,mutectallele,mutectallelicFreq,mutectreadDepth,mutectqualityScore,mutectsomaticGenotype,freebayesallele,freebayesallelicFreq,freebayesreadDepth,freebayesqualityScore,freebayessomaticGenotype,patientName,Slice
36288,17,56090088,17:56090088,17.690805,C,,1,SNP,,PASS,,,,,,,,,,,T,0.066,137,-1,ref-het,,,,,,CPCT02010267,True
36294,17,57284767,17:57284767,17.705519,G,,1,SNP,,PASS,,,,,,,,,,,A,0.085,82,-1,ref-het,,,,,,CPCT02010267,False
36295,17,57366693,17:57366693,17.706528,C,,1,SNP,,PASS,,,,,,,,,,,A,0.049,122,-1,ref-het,,,,,,CPCT02010267,True
36296,17,57570981,17:57570981,17.709044,A,,1,SNP,,PASS,,,,,,,,,,,C,0.063,97,-1,ref-het,,,,,,CPCT02010267,False
36297,17,57872376,17:57872376,17.712756,G,,1,SNP,,PASS,,,,,,,,,,,T,0.133,90,-1,ref-het,,,,,,CPCT02010267,False
36300,17,58067165,17:58067165,17.715155,A,,1,SNP,,PASS,,,,,,,,,,,G,0.061,179,-1,ref-het,,,,,,CPCT02010267,True
36301,17,58095584,17:58095584,17.715505,C,,1,SNP,,PASS,,,,,,,,,,,T,0.053,132,-1,ref-het,,,,,,CPCT02010267,True
36302,17,58137947,17:58137947,17.716027,A,,2,SNP,,PASS,G,0.042328,190.0,15.0,ref,,,,,,G,0.043,185,-1,ref-het,,,,,,CPCT02010267,True
36304,17,58241731,17:58241731,17.717305,G,,1,SNP,,PASS,,,,,,,,,,,T,0.103,108,-1,ref-het,,,,,,CPCT02010267,True
36305,17,58287434,17:58287434,17.717868,A,,2,SNP,,PASS,C,0.0503145,161.0,16.0,ref,,,,,,C,0.081,160,-1,ref-het,,,,,,CPCT02010267,False


# False Negative Analysis

In [None]:
# Independent deep copy of `bed`, handed to the unfiltered VCF loader below.
# NOTE(review): `bed` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel - confirm where it should come from.
unfilteredBed = copy.deepcopy(bed)

In [None]:
# SETTINGS
myCaller = 'varscan'
UNFILTERED_VCF_PATH = "/Users/peterpriestley/hmf/slice2-70-30/"
# caller -> (unfiltered VCF file name, sample column name inside that VCF)
UNFILTERED_SOURCES = {
    'varscan': ("varscan.snp.vcf", "TUMOR"),
    'freebayes': ("freebayes.somatic.vcf", 'CPCT11111111T'),
    'strelka': ("/strelka/results/all.somatic.snvs.vcf", "TUMOR"),
    'mutect': ("mutect.vcf", 'CPCT11111111T'),
}
# Fail loudly on an unknown caller.  Falling through silently would leave the
# file name / sample names undefined, or stale from a previous run of the cell.
if myCaller not in UNFILTERED_SOURCES:
    raise ValueError("Unsupported caller '" + str(myCaller) + "'; expected one of: "
                     + ", ".join(sorted(UNFILTERED_SOURCES)))
UNFILTERED_VCF_FILE_NAME, unfilteredSampleName = UNFILTERED_SOURCES[myCaller]
UNFILTERED_SAMPLE_NAMES = {unfilteredSampleName: myCaller}

# Load the caller's unfiltered output, flag the variants that also occur in the
# truth set (matched on chromPos), then restrict to the chromFrac window.
# NOTE(review): `dfTruth` is not defined anywhere in the visible notebook -
# confirm which frame is meant to act as the truth set here.
dfUnfiltered = aVCF.loadVaraintsFromVCF(UNFILTERED_VCF_PATH,UNFILTERED_VCF_FILE_NAME,UNFILTERED_SAMPLE_NAMES,VCF_SAMPLE,False,True,unfilteredBed)
dfUnfiltered['Truth']=dfUnfiltered.chromPos.isin(dfTruth['chromPos'])
dfUnfiltered = dfUnfiltered[(dfUnfiltered.chromFrac > minChromFrac)&(dfUnfiltered.chromFrac < maxChromFrac)]

In [None]:
# Flag each truth variant by whether the caller reported an allele for it in
# the unfiltered output, and by whether it appears in the slice at all.
calledInUnfiltered = dfUnfiltered[dfUnfiltered[myCaller+'allele'] != '']
dfTruth['inUnfiltered'+myCaller] = dfTruth['chromPos'].isin(calledInUnfiltered['chromPos'])
dfTruth['inSlice'] = dfTruth['chromPos'].isin(dfSlice['chromPos'])

In [None]:
# Count truth variants per variant type, split by whether the caller's
# unfiltered output contains them.
unfilteredFlag = 'inUnfiltered'+myCaller
dfTruth[['inSlice', 'pos', unfilteredFlag, 'variantType']].groupby(['variantType', unfilteredFlag]).count()


In [None]:
# Flag each unfiltered variant by whether the same caller reported it in the
# slice, and by whether any caller reported it in the slice.
calledInSlice = dfSlice[dfSlice[myCaller+'allele'] != '']
dfUnfiltered['inSlice'+myCaller] = dfUnfiltered['chromPos'].isin(calledInSlice['chromPos'])
dfUnfiltered['inSlice'] = dfUnfiltered['chromPos'].isin(dfSlice['chromPos'])

In [None]:
# Break the unfiltered calls down by variant type, slice membership, truth
# membership, filter value and the caller's somatic genotype.
sliceFlag = 'inSlice'+myCaller
genotypeColumn = myCaller+'somaticGenotype'
breakdownKeys = ['variantType', sliceFlag, 'Truth', 'filter', genotypeColumn]
dfUnfiltered[[sliceFlag, 'pos', 'Truth', 'variantType', 'filter', genotypeColumn]].groupby(breakdownKeys).count()


In [None]:

# Positions of SNPs that are in the truth set and were reported by the caller
# in the slice, excluding rows whose filter value is 'PETE'.
dfUnfiltered.loc[
    (dfUnfiltered['Truth'] == True)
    & (dfUnfiltered['inSlice'+myCaller] == True)
    & (dfUnfiltered['variantType'] == 'SNP')
    & (dfUnfiltered['filter'] != 'PETE'),
    'pos',
]

In [None]:
# FALSE NEGATIVES IN UNFILTERED
# SNPs present in the unfiltered output that the caller did not report in the
# slice, excluding somatic genotype '2' and filter 'str10'; highest quality
# score first.
dfUnfiltered[
    (dfUnfiltered['inSlice'+myCaller] == False)
    & (dfUnfiltered[myCaller+'somaticGenotype'] != '2')
    & (dfUnfiltered['variantType'] == 'SNP')
    & (dfUnfiltered['filter'] != 'str10')
].sort_values(by=[myCaller+'qualityScore'], ascending=False)

In [None]:
# Spot check: slice rows at position 38906100 (pos is compared as a string).
dfSlice[dfSlice['pos'] == '38906100'].head(n=10)

In [None]:
# FALSE POSITIVES
# SNPs the caller reported in the slice that are absent from the truth set;
# highest quality score first.
dfUnfiltered[
    (dfUnfiltered['inSlice'+myCaller] == True)
    & (dfUnfiltered['variantType'] == 'SNP')
    & (dfUnfiltered['Truth'] == False)
].sort_values(by=[myCaller+'qualityScore'], ascending=False)

In [None]:
# Disabled: share of variants per numCallers within each variant type.
#df_pivot = df[['numCallers','pos','variantType']].groupby(['variantType','numCallers',]).agg('count')
#df_pivot.groupby(level=0).transform(lambda x: x/x.sum())

<h3>SNP</h3>

In [None]:
# SNP-only subset of the production frame.
snpdf = df[df['variantType'] == 'SNP']

In [None]:
# Number of SNPs per Truth value.
# NOTE(review): no visible cell adds a 'Truth' column to `df` (only 'Slice');
# confirm the intended column before running.
snpdf[['Truth', 'pos']].groupby(['Truth']).count()

In [None]:
# SNP counts: rows = number of callers agreeing, columns = Truth value.
pd.pivot_table(
    snpdf,
    values='pos',
    index=['numCallers'],
    columns=['Truth'],
    aggfunc='count',
)

In [None]:
# Four-way Venn diagram of the SNP positions reported by each caller.
snpVennCallers = ['mutect', 'strelka', 'freebayes', 'varscan']
snpVennSets = [snpdf[snpdf[callerName+'allele'] != '']['chromPos'] for callerName in snpVennCallers]
vn.venn(snpVennSets, snpVennCallers, figsize=(6,6))

### Indel Venn

In [None]:
# INDEL-only subset of the production frame.
indeldf = df[df['variantType'] == 'INDEL']

In [None]:
# INDEL counts: rows = (number of callers, venn segment), columns = Truth value.
pd.pivot_table(
    indeldf,
    values='pos',
    index=['numCallers', 'vennSegment'],
    columns=['Truth'],
    aggfunc='count',
)

In [None]:
# Three-way Venn diagram of the INDEL positions reported by each caller
# (mutect is not included for INDELs).
indelVennCallers = ['strelka', 'freebayes', 'varscan']
indelVennSets = [indeldf[indeldf[callerName+'allele'] != '']['chromPos'] for callerName in indelVennCallers]
vn.venn(indelVennSets, indelVennCallers, figsize=(6,6))

### Read Depth

In [None]:
# For every '<caller>allelicFreq' column, draw histograms of the non-empty
# values, one panel per Truth value.
freqColumns = [candidate for candidate in list(df) if candidate.endswith('allelicFreq')]
for columnName in freqColumns:
    df[df[columnName] != ''].hist(column=columnName, by="Truth", bins=40, figsize=(10,5))
        

In [None]:
# freebayes allelic-frequency histograms per Truth value, restricted to rows
# that carry a freebayes read depth.
df[df['freebayesreadDepth'] != ''].hist(column="freebayesallelicFreq", by="Truth", bins=30)

<h3>Allelic Depth</h3>

In [None]:
# Allele frequency by caller: one empirical CDF per caller over the SNPs the
# caller reported with an allelic frequency above zero.  The legend label
# shows the count (c) and median (m) of the plotted values.
for columnName in list(snpdf):
    if not columnName.endswith('allele'):
        continue
    callerName = columnName[:-6]
    freqColumn = callerName + 'allelicFreq'
    selected = snpdf[(snpdf[columnName] != '') & (snpdf[freqColumn] > 0.0)]
    ser = selected[freqColumn].sort_values()
    cum_dist = np.linspace(0., 1., len(ser))
    label = callerName+": c="+str(ser.count())+" m="+str(round(ser.median(),2))
    ser_cdf = pd.Series(cum_dist, index=ser, name=label)
    ser_cdf.plot(drawstyle='steps', legend=True,
                 title=" Allelic Frequency by Caller (AllelicFreq > 0.0)",
                 figsize=[15,6])

In [None]:
# BY CALLER BY TRUTH
# One empirical CDF of allelic frequency per (Truth value, caller) pair; the
# legend label shows the count (c) and median (m) of the plotted values.
truths = snpdf.Truth.unique()
for truth in truths:
    truthRows = snpdf[snpdf.Truth == truth]
    for columnName in list(snpdf):
        if not columnName.endswith('allele'):
            continue
        callerName = columnName[:-6]
        ser = truthRows[truthRows[columnName] != ''][callerName + 'allelicFreq'].sort_values()
        cum_dist = np.linspace(0., 1., len(ser))
        label = callerName+" "+str(truth)+": c="+str(ser.count())+" m="+str(round(ser.median(),2))
        ser_cdf = pd.Series(cum_dist, index=ser, name=label)
        ser_cdf.plot(drawstyle='steps', legend=True,
                     title=" Allelic Frequency by Caller and Truth",
                     figsize=[15,6])

In [None]:
#  BY VENN SEGMENT - Single Caller - FALSE POSITIVES
# One empirical CDF of the caller's allelic frequency per venn segment, over
# the SNPs the caller reported that are not in the truth set.  The legend label
# shows the count (c) and median (m) of the plotted values.
myCaller = 'freebayes'
callerFalsePositives = snpdf[(snpdf[myCaller+'allele'] != '') & (snpdf.Truth == False)]
for vennSegment in callerFalsePositives.vennSegment.unique():
    ser = callerFalsePositives[callerFalsePositives.vennSegment == vennSegment][myCaller+'allelicFreq']
    ser = ser.sort_values()
    # Repeat the largest value once at the end of the series.  This is done by
    # concatenation: assigning to ser[len(ser)] is a label-based write and
    # overwrites an existing row whenever the frame's index already contains
    # that integer label, instead of appending.
    ser = pd.concat([ser, ser.iloc[-1:]])
    cum_dist = np.linspace(0.,1.,len(ser))
    ser_cdf = pd.Series(cum_dist, index=ser,name=vennSegment+": c="+str(ser.count())+" m="+str(round(ser.median(),2)))
    ser_cdf.plot(drawstyle='steps',legend=True,title=myCaller+" FP by Venn Segment",figsize=[15,6])

### Allelic Depth Scatter Plot

In [None]:
# Scatter of the allelic frequencies two callers report for the variants they
# both called; both axes are fixed to [0, 1].
caller1 = 'varscan'
caller2 = 'strelka'
tempdf = df[(df[caller1+'allele'] != '') & (df[caller2+'allele'] != '')]
tempdf.head()  # not the cell's last expression, so nothing is displayed
tempdf.plot.scatter(
    x=caller1+'allelicFreq',
    y=caller2+'allelicFreq',
    figsize=[6,6],
    title="Comparison of "+caller1+" and "+caller2+" Allelic Frequency",
    xlim=[0,1],
    ylim=[0,1],
)

<h3>RAINBOW CHARTS</h3>

In [None]:
## RAINBOW
# Allelic frequency against chromFrac for the caller's chromosome-22 variants
# whose Truth flag is True.
# NOTE(review): with the chromFrac window configured at the top of the
# notebook (a chromosome-17 region) this selection is presumably empty.
caller = 'freebayes'
rainbowRows = df[(df[caller+'allele'] != '') & (df['Truth'] == True) & (df['chrom'] == '22')]
plt.scatter(rainbowRows['chromFrac'], rainbowRows[caller+'allelicFreq'],
            s=10, c='b', marker="s")

In [None]:
## RAINBOW
# Allelic frequency against chromFrac for the caller's chromosome-22 variants
# whose Truth flag is False.
# NOTE(review): with the chromFrac window configured at the top of the
# notebook (a chromosome-17 region) this selection is presumably empty.
caller = 'freebayes'
rainbowRows = df[(df[caller+'allele'] != '') & (df['Truth'] == False) & (df['chrom'] == '22')]
plt.scatter(rainbowRows['chromFrac'], rainbowRows[caller+'allelicFreq'],
            s=10, c='b', marker="s")

<h3>Overlapping Analysis</h3>

In [None]:
#NEEDS REFACTORING TO SUPPORT NEW DF FORMAT
# The code below is disabled by wrapping it in a string literal.  It sorts the
# variants by chromFrac, pairs each row with the row two positions later, and
# keeps the rows whose reference allele is longer than the distance to that
# later row (i.e. overlapping variants).
# NOTE(review): it uses DataFrame.sort (removed from current pandas in favour
# of sort_values) and the columns alleleTumor1 / alleleTumor2, which do not
# appear in the frames displayed earlier in this notebook.
'''
sorteddf = df.sort(['chromFrac'], ascending=1)
sorteddf['nextRef'] = sorteddf['ref'].shift(-2)
sorteddf['nextPos'] = sorteddf['pos'].shift(-2)
sorteddf['nextChrom'] = sorteddf['chrom'].shift(-2)
sorteddf['nextVT'] = sorteddf['variantType'].shift(-2)
sorteddf['nextAlleleTumor1'] = sorteddf['alleleTumor1'].shift(-2)
sorteddf['nextAlleleTumor2'] = sorteddf['alleleTumor2'].shift(-2)
sorteddf['nextVennSegment'] = sorteddf['vennSegment'].shift(-2)
sorteddf['lengthRef'] = sorteddf['ref'].str.len()
sorteddf['Dist2Next'] = -sorteddf['pos'].astype(int).diff(-2)

overlapdf = sorteddf[(sorteddf['lengthRef']+0>sorteddf['Dist2Next']) & (sorteddf['Dist2Next']>0)]
overlapdf.head(20)
'''

In [None]:
#NEEDS REFACTORING TO SUPPORT NEW DF FORMAT
# The code below is disabled by wrapping it in a string literal.  It narrows
# `overlapdf` (built by the disabled overlap-detection cell) to INDELs whose
# venn segment mentions varscan but is not one of the single-caller segments
# listed, and whose paired row has the same variant type, then counts the
# non-null values per selected column.
'''
filtereddf = overlapdf[(~overlapdf.vennSegment.isin(['freebayes','varscan','strelka1'])) \
                       & (overlapdf.vennSegment.str.contains('varscan')) \
                       #& (~overlapdf.nextVennSegment.str.contains('varscan')) \
                       & (overlapdf.variantType == 'INDEL') \
                       #& (overlapdf.variantSubType == 'DELETE')
                       & (overlapdf.variantType == overlapdf.nextVT)]
filtereddf[['nextVennSegment','vennSegment','variantType','nextVT','Dist2Next','lengthRef','ref','alleleTumor2', \
            'nextRef','nextAlleleTumor2','variantSubType','chrom','pos']].count()
'''