### CONFIGURATION

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 200)
pd.set_option('display.width', 1000)
import numpy as np
import os
import copy
import sys
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Hartwig imports
sys.path.append("/Users/peterpriestley/hmf/repos/scripts/analysisscripts") 
import analyseVCF as aVCF
import venn as vn

In [None]:
# DATA ROOT
# Every data location below hangs off one root directory. Set the HMF_ROOT
# environment variable to relocate the data without editing the notebook; the
# default reproduces the original hard-coded locations exactly.
HMF_ROOT = os.environ.get("HMF_ROOT", "/Users/peterpriestley/hmf").rstrip("/")

# BED FILE
# USE_BED: when True the loading cells read the BED file and pass it to the VCF loader.
# LOAD_NON_BED_MUTATIONS: forwarded to the VCF loader, only when USE_BED is True.
USE_BED = False
LOAD_NON_BED_MUTATIONS = False
BED_PATH = HMF_ROOT + "/analyses/giabTruthsets/"
BED_FILE_NAME = "NA12878_GIAB_highconf_IllFB-IllGATKHC-CG-Ion-Solid_ALLCHROM_v3.2.2_highconf.bed"

# TRUTH SET
# Maps the sample name in the VCF to a short label; presumably the label becomes
# the column prefix (e.g. truthSVStartMin) - TODO confirm against analyseVCF.
SAMPLE_NAMES_TRUTH = {'SPIKEIN':'truth'}
VCF_PATH_TRUTH = HMF_ROOT + "/analyses/dreamChallenge/"
VCF_FILE_NAME_TRUTH = "synthetic.challenge.set3.tumor.20pctmasked.truth.vcf"

# COMBINED VCF CONFIG
#VCF_SAMPLE = "IS3.snv.indel.sv"
VCF_PATH = HMF_ROOT + "/analyses/dreamChallenge/sv_analysis/"
# NOTE: the cell that loads all samples reassigns VCF_FILE_NAME for each file it reads.
VCF_FILE_NAME = "somaticSV.vcf"
#SAMPLE_NAMES = {'G15512.prenormal.sorted': 'normal', 'IS3.snv.indel.sv':'tumor'}
SAMPLE_NAMES = {'IS3.snv.indel.sv':'tumor'}

In [None]:
# SPECIFY SAMPLES
# Collect every file in VCF_PATH whose name ends in "somaticSV.vcf".
# os.listdir returns entries in arbitrary order, so the names are sorted to make
# the load order (and therefore the row order of the combined frame) reproducible.
VCF_SAMPLES = sorted(
    fileName for fileName in os.listdir(VCF_PATH) if fileName.endswith("somaticSV.vcf")
)
VCF_SAMPLES

### Functions

In [None]:
def calculateTruth(sv_caller,sv_type,chrom,startMin,startMax,lenMin,lenMax):
    """Count the rows of the module-level ``dfTruth`` that overlap one SV call.

    A truth row matches when it is on ``chrom`` and its start interval
    (truthSVStartMin..truthSVStartMax) overlaps the call's start interval; for
    most types its length interval (truthSVLenMin..truthSVLenMax) must overlap
    the call's length interval as well. Caller/type specific offsets:

    * MANTA 'INS': lower start bound is moved down by half of ``lenMax``;
      start window padded by 300; length is not compared.
    * 'BND' / 'TRA': start overlap only and no padding; for DELLY the lower
      start bound is moved down by 100.
    * DELLY 'DUP' / 'INV': start window widened by 200 on both sides, lenMin
      lowered by 150 and lenMax raised by 125, then the general rule applies.
    * DELLY 'DEL': startMax raised by 200 and lenMin lowered by 150, then the
      general rule applies.
    * General rule: start and length windows are both padded by 300.

    Returns the non-null count of the 'id' column over the matching rows.
    """
    on_chrom = dfTruth.chrom == chrom
    from_delly = sv_caller == "DELLY"

    def starts_within(lower, upper):
        # Interval-overlap test between the truth start range and [lower, upper].
        return (dfTruth.truthSVStartMax >= lower) & (dfTruth.truthSVStartMin <= upper)

    # MANTA insertions: offset the start, ignore the length.
    if sv_caller == "MANTA" and sv_type == 'INS':
        shifted_start = startMin - 0.5 * (lenMax)
        matches = dfTruth[on_chrom & starts_within(shifted_start - 300, startMax + 300)]
        return matches['id'].count()

    # Breakends / translocations: unpadded start overlap only.
    if sv_type in ('BND', 'TRA'):
        lower = startMin - 100 if from_delly else startMin
        matches = dfTruth[on_chrom & starts_within(lower, startMax)]
        return matches['id'].count()

    # DELLY-specific widening before the general comparison.
    if from_delly:
        if sv_type in ('DUP', 'INV'):
            lenMin, lenMax = lenMin - 150, lenMax + 125
            startMin, startMax = startMin - 200, startMax + 200
        elif sv_type == 'DEL':
            lenMin = lenMin - 150
            startMax = startMax + 200

    lengths_overlap = (dfTruth.truthSVLenMax >= lenMin - 300) & (dfTruth.truthSVLenMin <= lenMax + 300)
    matches = dfTruth[on_chrom & starts_within(startMin - 300, startMax + 300) & lengths_overlap]
    return matches['id'].count()
def calculateTruthSubVariantType(sv_caller,sv_type,chrom,startMin,startMax,lenMin,lenMax):
    """Return the largest ``variantSubType`` among truth rows overlapping a call.

    Rows of the module-level ``dfTruth`` are selected when they are on ``chrom``
    and their start interval overlaps [startMin, startMax] (no padding). Two
    adjustments are applied first: for MANTA 'INS' the lower bound is moved
    down by half of ``lenMax``; for DELLY 'DEL' the upper bound is raised by
    200. ``lenMin`` is accepted for signature parity but not used.

    The result is ``Series.max()`` over the selected 'variantSubType' values.
    """
    window_start, window_end = startMin, startMax
    if sv_caller == "MANTA" and sv_type == 'INS':
        # MANTA insertions carry no usable length, so only the start is offset.
        window_start = startMin - 0.5 * (lenMax)
    elif sv_caller == "DELLY" and sv_type == 'DEL':
        window_end = startMax + 200

    overlapping = dfTruth[
        (dfTruth.chrom == chrom)
        & (dfTruth.truthSVStartMax >= window_start)
        & (dfTruth.truthSVStartMin <= window_end)
    ]
    return overlapping['variantSubType'].max()

In [None]:
def calcuatePrecisionSensivityMatrix(df):
    """Summarise precision and sensitivity per SV sub-type.

    Parameters
    ----------
    df : pandas.DataFrame
        Called variants with a 'variantSubType' column and a 'truth' column.
        'truth' may be boolean or the number of truth-set rows the call
        matched; any value greater than zero counts as a true positive (a call
        overlapping two truth variants is still a correct call).

    Returns
    -------
    pandas.DataFrame
        One row per sub-type present in ``df``, sorted by 'variantSubType',
        with columns variantSubType, truthSet, truePositives, falsePositives,
        falseNegatives, precision, sensitivity. An empty frame with those
        columns is returned when ``df`` has no calls.

    Notes
    -----
    The truth-set size is read from the module-level ``dfTruth`` (rows with
    variantType "SV" and the same variantSubType). Sensitivity divides by
    max(truthSet, 1) so a sub-type absent from the truth set yields 0.0 rather
    than a division by zero.
    """
    columns = ['variantSubType','truthSet','truePositives','falsePositives',
               'falseNegatives','precision','sensitivity']
    isMatched = df['truth'] > 0
    outputdata = []
    for variantSubType in df['variantSubType'].unique():
        isSubType = df['variantSubType'] == variantSubType
        positives = int(isSubType.sum())
        if positives == 0:
            # Only reachable for a NaN sub-type, which never compares equal to itself.
            continue
        truePositives = int((isSubType & isMatched).sum())
        truthSet = len(dfTruth[(dfTruth.variantType == "SV") & (dfTruth.variantSubType == variantSubType)])
        falseNegatives = truthSet - truePositives
        outputdata.append([variantSubType, truthSet, truePositives, positives - truePositives, falseNegatives,
                           round(truePositives / float(positives), 4),
                           round(truePositives / float(max(truthSet, 1)), 4)])

    # Passing the column names to the constructor keeps an empty result well-formed.
    outputDF = pd.DataFrame(outputdata, columns=columns)
    return outputDF.sort_values(['variantSubType'])

### Load VCFs and Prepare DF

In [None]:
## LOAD TRUTH SET VCF
# With USE_BED the BED file is loaded and handed to the VCF loader together with
# LOAD_NON_BED_MUTATIONS; otherwise the loader is called without a BED filter.
if USE_BED:
    bed = aVCF.loadBEDFile(BED_PATH,BED_FILE_NAME)
    dfTruth = aVCF.loadVariantsFromVCF(VCF_PATH_TRUTH,VCF_FILE_NAME_TRUTH,SAMPLE_NAMES_TRUTH,"Mix-in Truth Set",True,True,bed,LOAD_NON_BED_MUTATIONS)
else:
    dfTruth = aVCF.loadVariantsFromVCF(VCF_PATH_TRUTH,VCF_FILE_NAME_TRUTH,SAMPLE_NAMES_TRUTH,"Mix-in Truth Set",True,False)
# Keep structural variants only and drop the "IGN" and "MSK" sub-types.
# `!=` replaces the `<>` operator, which is a SyntaxError on Python 3.
dfTruth = dfTruth[(dfTruth.variantType=="SV")&(dfTruth.variantSubType!="IGN")&(dfTruth.variantSubType!="MSK")]

In [None]:
# Number of truth-set SV records per (variantType, variantSubType); 'ref' is the counted column.
dfTruth.loc[dfTruth.variantType == 'SV', ['variantType', 'variantSubType', 'ref']] \
    .groupby(['variantType', 'variantSubType']) \
    .count()

In [None]:
#Load all VCF_SAMPLES to DF
# Each file is loaded into its own frame and the frames are concatenated once at
# the end, instead of re-copying a growing frame on every iteration.
sampleFrames = []
for VCF_SAMPLE in VCF_SAMPLES:
    # NOTE: this overwrites the VCF_FILE_NAME configuration value, as the original loop did.
    VCF_FILE_NAME = VCF_SAMPLE
    if USE_BED:
        bed = aVCF.loadBEDFile(BED_PATH,BED_FILE_NAME)
        sampleFrames.append(aVCF.loadVariantsFromVCF(VCF_PATH,VCF_FILE_NAME,SAMPLE_NAMES,VCF_SAMPLE,True,True,bed,LOAD_NON_BED_MUTATIONS))
    else:
        sampleFrames.append(aVCF.loadVariantsFromVCF(VCF_PATH,VCF_FILE_NAME,SAMPLE_NAMES,VCF_SAMPLE,True,False))
if not sampleFrames:
    # Without this guard the failure surfaces later as an obscure missing-attribute error.
    raise ValueError("No VCF files to load: VCF_SAMPLES is empty (check VCF_PATH: %s)" % VCF_PATH)
dfProd = pd.concat(sampleFrames)
# Keep structural variants only and drop the "IGN" sub-type.
# `!=` replaces the `<>` operator, which is a SyntaxError on Python 3.
dfProd = dfProd[(dfProd.variantType=="SV")&(dfProd.variantSubType!="IGN")]

In [None]:
# Number of loaded SV calls per (variantType, variantSubType); 'ref' is the counted column.
dfProd.loc[:, ['variantType', 'variantSubType', 'ref']] \
    .groupby(['variantType', 'variantSubType']) \
    .count()

In [None]:
# Caller that produced the loaded VCFs; selects the caller-specific offsets used
# by calculateTruth / calculateTruthSubVariantType.
SV_CALLER = 'MANTA'
#SV_CALLER = 'DELLY'

# Per-call inputs shared by both annotations below.
svCallColumns = [dfProd[columnName] for columnName in
                 ('variantSubType','chrom','tumorSVStartMin','tumorSVStartMax','tumorSVLenMin','tumorSVLenMax')]

# otypes is given explicitly because np.vectorize otherwise infers the output
# dtype from the FIRST row only: an unmatched first call (NaN) would fix a float
# dtype and later string results would raise, and an empty frame raises outright.
# 'truth' is the number of truth-set rows each call matched.
dfProd['truth'] = np.vectorize(calculateTruth, otypes=[int])(SV_CALLER, *svCallColumns)
# With a string output type, calls without any truth match get the label 'nan',
# which keeps them visible as their own column in the pivot tables.
dfProd['truthVariantSubType'] = np.vectorize(calculateTruthSubVariantType, otypes=[str])(SV_CALLER, *svCallColumns)

### PRECISION + SENSITIVITY

In [None]:
##### NOTES
# Don't display duplicate BNDs
# Delly INS???

In [None]:
#PRECISION + SENSITIVITY by caller - PROD
outputDF = calcuatePrecisionSensivityMatrix(dfProd)
outputDF

In [None]:
# Call counts per called sub-type (rows) against match count and matched truth sub-type (columns).
dfProd[dfProd.variantType == 'SV'].pivot_table(
    values='id',
    index=['variantSubType'],
    columns=['truth', 'truthVariantSubType'],
    aggfunc='count',
).fillna("")

In [None]:
# Call counts per called sub-type (rows) against match count and matched truth sub-type (columns).
# NOTE(review): this repeats the same pivot as the preceding cell - presumably kept to
# re-display the table after re-running with another SV_CALLER; confirm or remove.
pd.pivot_table(dfProd[dfProd.variantType=='SV'], values='id', index=['variantSubType'], columns=['truth','truthVariantSubType'], aggfunc='count').fillna("")

In [None]:
# Inspect the first 50 calls of sub-type DUP.
dfProd.loc[dfProd.variantSubType == 'DUP'].head(50)

In [None]:
# Inspect the first two truth records with chromFrac above 2.65.
dfTruth.loc[dfTruth.chromFrac > 2.65].head(2)

### BND mate positions and duplicates

In [None]:
def findMatchingBNDChromPos(df):
    """Add a 'matchingBNDChromPos' column holding the text between the first two ']' of a BND allele.

    For rows whose 'variantSubType' is "BND", the 'tumorallele' string is
    searched for a ']' ... ']' pair and the text between the first two
    brackets (e.g. "8:128753374" for "]8:128753374]T") is stored. All other
    rows, BND alleles without a ']', and alleles with an opening ']' but no
    closing one get an empty string.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'variantSubType' and 'tumorallele' columns.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``; the column is added in place.
    """
    def _bracketedPosition(subType, allele):
        # Text between the first two ']' of a BND allele, or "" when not applicable.
        if subType != "BND":
            return ""
        opening = allele.find("]")
        if opening < 0:
            return ""
        closing = allele.find("]", opening + 1)
        if closing < 0:
            # Unterminated bracket: slicing up to -1 would silently return a
            # truncated, meaningless fragment, so report no mate instead.
            return ""
        return allele[opening + 1:closing]

    # One pass over the two columns instead of repeated per-row iloc lookups.
    df['matchingBNDChromPos'] = [
        _bracketedPosition(subType, allele)
        for subType, allele in zip(df['variantSubType'], df['tumorallele'])
    ]
    return df
        

In [None]:
# Add the 'matchingBNDChromPos' column to dfProd. The helper writes the column
# into the frame it is given and returns that same frame.
dfProd = findMatchingBNDChromPos(dfProd)

In [None]:
# Number of BND calls for which a mate position was extracted.
# `!=` replaces the `<>` operator, which is a SyntaxError on Python 3.
len(dfProd[(dfProd.variantSubType=='BND')&(dfProd.matchingBNDChromPos!='')])

In [None]:
# Truth match count of the call(s) whose extracted mate position is 8:128753374.
dfProd.loc[dfProd.matchingBNDChromPos == '8:128753374', 'truth']

In [None]:
# Flag calls that share their extracted mate position with at least one other call.
# NOTE(review): the original line read `= AND (dfProd.duplicated(...))`, i.e. the
# left operand of the condition was missing and the cell raised a NameError.
# Restricting to rows with a non-empty mate position is an assumption (otherwise
# every row with an empty 'matchingBNDChromPos' would be flagged) - confirm.
dfProd["duplicate"] = (dfProd.matchingBNDChromPos != '') & \
    (dfProd.duplicated(subset=['matchingBNDChromPos'],keep=False))

In [None]:
# Call counts per chromosome (rows) against match count and called sub-type (columns).
dfProd[dfProd.variantType == 'SV'].pivot_table(
    values='id',
    index=['chrom'],
    columns=['truth', 'variantSubType'],
    aggfunc='count',
).fillna("")

In [None]:
# Calls other than BND that matched no truth-set record.
# `!=` replaces the `<>` operator, which is a SyntaxError on Python 3.
dfProd[(dfProd.variantSubType!='BND')&(dfProd.truth==0)]