### CONFIGURATION

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 200)
pd.set_option('display.width', 20)
import numpy as np
import os
import copy
import sys
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from IPython.core.display import display, HTML
# Inject CSS so that elements with the .container class are rendered at 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Hartwig imports
sys.path.append("/Users/peterpriestley/hmf/repos/scripts/analysisscripts") 
import analyseVCF as aVCF
import venn as vn

In [None]:
# LOAD BED
# BED settings read by loadVCF; the commented-out assignments are alternative BED locations/files.
#BED_PATH = "/Users/peterpriestley/hmf/analyses/giabTruthsets/"
#BED_FILE_NAME = "NA12878_GIAB_highconf_IllFB-IllGATKHC-CG-Ion-Solid_ALLCHROM_v3.2.2_highconf.bed"
BED_PATH = "/Users/peterpriestley/hmf/analyses/slices/"  
#BED_FILE_NAME = "HMF_Slicing.bed"
BED_FILE_NAME = "cosmicCancerCensus.bed"
# When True, loadVCF loads the BED file and passes it to aVCF.loadVariantsFromVCF.
USE_BED = True
# Forwarded by loadVCF as the last argument of aVCF.loadVariantsFromVCF (only when USE_BED is True).
LOAD_FULL_FILE = True

### Functions

In [None]:
def findFiles(path,suffix):
    """Return the names of the entries in `path` whose name ends with `suffix`.

    Only the entry names are returned, not full paths. The result is sorted
    because os.listdir() yields entries in arbitrary order and callers take
    the first match, which should be reproducible between runs.

    An empty suffix matches every entry.
    """
    return sorted(name for name in os.listdir(path) if name.endswith(suffix))

In [None]:
def AFByPosPlot(pos,AF,startChartPos,endChartPos):
    """Scatter-plot AF against position and show the figure.

    pos, AF                     -- x and y values handed to plt.scatter.
    startChartPos, endChartPos  -- x-axis limits; the y-axis is fixed to 0..1.
    """
    plt.figure(figsize=[18,5])
    plt.scatter(pos, AF)
    # The on/off flag is passed positionally: its keyword name was `b` in older
    # Matplotlib and `visible` in newer releases, so positional works in both.
    plt.grid(True, which='both', color='0.65',linestyle='-')
    plt.axis([startChartPos, endChartPos,0, 1.0])
    plt.show()

In [None]:
def loadVCF(path,filename,sampleNames,patientID):
    """Load the variants of one VCF into a DataFrame via aVCF.loadVariantsFromVCF.

    When USE_BED is set, the BED file named by BED_PATH / BED_FILE_NAME is
    loaded first and handed to the loader together with LOAD_FULL_FILE;
    otherwise the loader is called without any BED arguments.
    """
    if not USE_BED:
        variants = aVCF.loadVariantsFromVCF(path,filename,sampleNames,patientID,True)
    else:
        bedRegions = aVCF.loadBEDFile(BED_PATH,BED_FILE_NAME)
        variants = aVCF.loadVariantsFromVCF(path,filename,sampleNames,patientID,True,True,bedRegions,LOAD_FULL_FILE)
    # The result is built by concatenating the loaded variants onto an empty frame.
    return pd.concat([pd.DataFrame(), variants])

### Load VCFs and Prepare DF

In [None]:
# Directory that is searched for the sample's somatic SV VCF.
PATH = "/Users/peterpriestley/hmf/analyses/v1.12samples/CPCT02010506/"
# Sample identifier; used as the sampleNames key (mapped to 'tumor') and as patientID when loading.
patientID = "CPCT02010506T"

In [None]:
# Pick the first file in PATH whose name ends in "somaticSV.vcf".
svVCFFilename = findFiles(PATH,"somaticSV.vcf")[0]
# print(...) with %-formatting emits the same text under Python 2 and Python 3,
# whereas the bare `print x, y` statement is a syntax error on Python 3.
print("SV = %s" % (svVCFFilename,))
print("patient = %s" % (patientID,))

In [None]:
# Load the SV calls, labelling the patient's sample as 'tumor'.
dfSV = loadVCF(PATH,svVCFFilename,{patientID:'tumor'},patientID)

In [None]:
# Count of 'id' per variantSubType (rows) and variantType (columns), restricted to
# rows whose sub-type is not "IGN" and whose tumorAF is at least 0.12.
# `!=` replaces the `<>` operator, which no longer exists in Python 3.
pd.pivot_table(
    dfSV[(dfSV.variantSubType != "IGN") & (dfSV.tumorAF >= 0.12)],
    values='id',
    index=['variantSubType'],
    columns=['variantType'],
    aggfunc='count'
).fillna("")

### SLICING_REGION_VARIANTS

In [None]:
# Rows with a non-empty bedRegion whose sub-type is not "IGN".
# `!=` replaces the `<>` operator, which no longer exists in Python 3.
dfSV[(dfSV.bedRegion != "") & (dfSV.variantSubType != "IGN")][[
    'chrom','pos','chromFrac','variantSubType','bedRegion',
    'tumorallele','tumorAF','tumorSGT',
    'tumorSVLenMin','tumorSVLenMax','tumorSVStartMin','tumorSVStartMax'
]]

### TRANSLOCATIONS

In [None]:
def findendBreakpointPos():
    """Placeholder: does nothing and returns None.

    The author's note for this stub reads "For del, ins, inv"; presumably it
    is meant to derive the end breakpoint position for those variant types.
    TODO: confirm the intent and implement.
    """
    return None


In [None]:
def findMatchingBNDChromPos(df):
    """Add a 'BNDMatchPos' column holding the text between the first pair of "]" in a BND allele.

    For rows whose 'variantSubType' is "BND", the 'tumorallele' string is
    searched for a "]...]" pair and the text between the two brackets is
    stored. All other rows, and BND alleles without a complete "]...]" pair,
    get an empty string.

    The column is added to `df` in place and the same frame is returned.

    NOTE(review): only the "]" bracket form is recognised; alleles written
    with "[" brackets are left blank - confirm whether that is intended.
    """
    def _textBetweenBrackets(allele):
        # Text between the first "]" and the next "]", or "" when either is missing.
        opening = allele.find("]")
        if opening < 0:
            return ""
        closing = allele.find("]", opening + 1)
        if closing < 0:
            # Without this guard the -1 from find() would be used as the slice
            # end and silently drop the last character of the allele.
            return ""
        return allele[opening + 1:closing]

    # Built positionally in a single pass, so the frame's index labels do not matter.
    df['BNDMatchPos'] = [
        _textBetweenBrackets(allele) if subType == "BND" else ""
        for subType, allele in zip(df['variantSubType'], df['tumorallele'])
    ]
    return df

In [None]:
# Adds the 'BNDMatchPos' column to dfSV.
dfSV = findMatchingBNDChromPos(dfSV)

In [None]:
# BND rows with tumorAF of at least 0.08, shown with BNDMatchPos and the tumor columns.
dfSV[(dfSV.variantSubType == "BND") & (dfSV.tumorAF >= 0.08)][[
    'chrom','pos','chromFrac','variantSubType','bedRegion','BNDMatchPos',
    'tumorallele','tumorAF','tumorDP','tumorSGT',
    'tumorSVLenMin','tumorSVLenMax','tumorSVStartMin','tumorSVStartMax'
]]

In [None]:
# NOTE(review): ad-hoc scratch calculation; its purpose is not recorded here -
# presumably a manual frequency check. Confirm or remove.
11/180.0

In [None]:
# Plot one cumulative tumorAF step-line per variant sub-type.
for variantSubType in dfSV.variantSubType.unique():
    ser = dfSV.loc[dfSV.variantSubType == variantSubType, "tumorAF"].sort_values()
    # Evenly spaced cumulative fractions from 0 to 1, one per sorted AF value.
    cum_dist = np.linspace(0., 1., len(ser))
    ser_cdf = pd.Series(cum_dist, index=ser, name=variantSubType)
    ser_cdf.plot(
        drawstyle='steps',
        legend=True,
        title="AF",
        figsize=[13,3],
        xlim=[0.0,0.8],
        ylim=[0.0,1.0])