### CONFIGURATION

In [1]:
import os
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 200)
import numpy as np
import copy
import sys
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Hartwig imports
sys.path.append("/Users/peterpriestley/hmf/repos/scripts/analysisscripts") 
import analyseVCF as aVCF
import venn as vn

In [3]:
# Root of the local HMF working tree. The default is the original hard-coded
# location; set the HMF_ROOT environment variable to run this notebook on
# another machine without editing the cell.
HMF_ROOT = os.environ.get("HMF_ROOT", "/Users/peterpriestley/hmf").rstrip("/")
ANALYSES_PATH = HMF_ROOT + "/analyses/"

# NOTE: every *_PATH below keeps its trailing "/". The health-check cell builds
# its file name by plain string concatenation (HEALTH_CSV_PATH + HEALTH_CSV_FILENAME).

# VCF: directory scanned for "*sliced.vcf" sample files
VCF_PATH = ANALYSES_PATH + "ensembleRuleTesting/"

# BED: regions handed to the VCF loader when USE_BED is True
# (alternative region sets are kept commented out for quick switching)
#BED_PATH = ANALYSES_PATH + "giabTruthsets/"
#BED_FILE_NAME = "NA12878_GIAB_highconf_IllFB-IllGATKHC-CG-Ion-Solid_ALLCHROM_v3.2.2_highconf.bed"
BED_PATH = ANALYSES_PATH + "slices/"
#BED_FILE_NAME = "cosmicCancerCensus.bed"
BED_FILE_NAME = "CPCT_Slicing.bed"
USE_BED = True

# HEALTH CHECKS: CSV merged onto the variants by patient name
HEALTH_CSV_PATH = ANALYSES_PATH + "healthCheckAnalysis/"
HEALTH_CSV_FILENAME = "healthCheckFile.csv"

In [46]:
# SPECIFY SAMPLES
# Every file in VCF_PATH whose name ends in "sliced.vcf" is treated as a sample.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the sample order (and hence the row order of the combined frame) reproducible.
VCF_SAMPLES = sorted(
    file_name for file_name in os.listdir(VCF_PATH)
    if file_name.endswith("sliced.vcf")
)
# To analyse a single sample, override the list, e.g.:
#VCF_SAMPLES = ['160903_HMFregCPCT_FR10302737_FR10304566_CPCT02070012.filtered_variants_snpEff_snpSift_Cosmicv76_GoNLv5_sliced.vcf']

<h3>LOAD VCF</h3>

In [5]:
# Load all VCF_SAMPLES into one DataFrame (df).
#
# The BED regions are the same for every sample, so the file is read once up
# front instead of once per VCF.
# NOTE(review): assumes aVCF.loadVariantsFromVCF does not modify `bed` - confirm.
bed = aVCF.loadBEDFile(BED_PATH, BED_FILE_NAME) if (USE_BED and VCF_SAMPLES) else None

# Per-sample frames are collected and concatenated once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
sample_frames = []
for VCF_SAMPLE in VCF_SAMPLES:
    VCF_FILE_NAME = VCF_SAMPLE
    # Characters 40..51 of the file name are used as the patient name
    # (e.g. "CPCT02070012"); "R"/"T" suffixes map to the normal/tumor samples.
    patient_name = VCF_SAMPLE[40:52]
    SAMPLE_NAMES = {patient_name + 'R': 'normal', patient_name + 'T': 'tumor'}
    if USE_BED:
        sample_frames.append(
            aVCF.loadVariantsFromVCF(VCF_PATH, VCF_FILE_NAME, SAMPLE_NAMES, patient_name, True, True, bed, False)
        )
    else:
        sample_frames.append(
            aVCF.loadVariantsFromVCF(VCF_PATH, VCF_FILE_NAME, SAMPLE_NAMES, patient_name, True)
        )

# pd.concat raises on an empty list, so fall back to an empty frame when no
# sample file was found.
df = pd.concat(sample_frames) if sample_frames else pd.DataFrame({})

reading BED file
Bed File Loaded
reading vcf file: 160903_HMFregCPCT_FR10302737_FR10304566_CPCT02070012.filtered_variants_snpEff_snpSift_Cosmicv76_GoNLv5_sliced.vcf
Number variants loaded: 67
reading BED file
Bed File Loaded
reading vcf file: 160903_HMFregCPCT_FR10302740_FR10304570_CPCT02050049.filtered_variants_snpEff_snpSift_Cosmicv76_GoNLv5_sliced.vcf
Number variants loaded: 65
reading BED file
Bed File Loaded
reading vcf file: 160905_HMFregCPCT_FR10301943_FR12251966_CPCT02160001.filtered_variants_snpEff_snpSift_Cosmicv76_GoNLv5_sliced.vcf
Number variants loaded: 73
reading BED file
Bed File Loaded
reading vcf file: 160905_HMFregCPCT_FR10302743_FR10304571_CPCT02050050.filtered_variants_snpEff_snpSift_Cosmicv76_GoNLv5_sliced.vcf
Number variants loaded: 57
reading BED file
Bed File Loaded
reading vcf file: 160905_HMFregCPCT_FR12244549_FR12251919_CPCT02160002.filtered_variants_snpEff_snpSift_Cosmicv76_GoNLv5_sliced.vcf
Number variants loaded: 72
reading BED file
Bed File Loaded
reading

In [6]:
# LOAD HEALTH_CHECKS
# Attach the tumor percentage, tumor type and pipeline version from the
# health-check CSV to every variant row, matched on patientName.
health_columns = ['TUMOR_PERCENTAGE', 'TUMOR_TYPE', 'PIPELINE_VERSION']

hc = pd.read_csv(HEALTH_CSV_PATH + HEALTH_CSV_FILENAME)
hc = hc[['SAMPLE'] + health_columns]
# Only tumor rows: the 13th character of SAMPLE is "T" for tumor samples.
# .copy() makes the filtered frame independent, so the SAMPLE rewrite below is
# not an assignment on a slice (which triggers SettingWithCopyWarning).
hc = hc[hc['SAMPLE'].str[12] == "T"].copy()
# Strip the suffix so SAMPLE matches the 12-character patientName in df.
hc['SAMPLE'] = hc['SAMPLE'].str[:12]
hc = hc.set_index(['SAMPLE'])
# NOTE(review): if the CSV holds several tumor rows for one patient, the left
# merge below repeats that patient's variant rows - confirm SAMPLE is unique.

# Drop health columns left over from an earlier run of this cell; otherwise a
# re-run merges them a second time and yields *_x / *_y duplicates.
df = df.loc[:, ~df.columns.isin(health_columns)]
df = pd.merge(df, hc, how='left', left_on='patientName', right_index=True)
# Patients without a health-check row (and any other missing value in the
# frame) are shown as "Unknown".
df = df.fillna("Unknown")

<h3>High Level Stats</h3>

### LOH variants

In [47]:
# Rows that have a genotype in the normal sample and a different (or missing)
# genotype in the tumor, counted per consensus flag with an "All" margin.
# Uses != rather than the Python 2-only <> operator, so the cell also parses
# under Python 3; no backslash is needed inside the open parenthesis.
pd.pivot_table(df[(df.normalSGT != "") & (df.normalSGT != df.tumorSGT)],
               values='pos', index=['chromPos','patientName','annGene','normalSGT','tumorSGT'], columns=['consensus'], aggfunc='count',margins=True).fillna("")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,consensus,False,True,All
chromPos,patientName,annGene,normalSGT,tumorSGT,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10:89720633,CPCT02040035,PTEN,0/1,,1.0,,1.0
10:89720633,CPCT02050048,PTEN,0/1,,1.0,,1.0
10:89720633,CPCT02060018,PTEN,0/1,,1.0,,1.0
11:534242,CPCT02010393,HRAS,0/1,1/1,,1.0,1.0
13:32914236,CPCT02020357,BRCA2,0/1,,1.0,,1.0
14:105246325,CPCT02010393,AKT1,0/1,1/1,1.0,,1.0
14:105246407,CPCT02160003,AKT1,0/1,1/1,1.0,,1.0
14:105246565,CPCT02160003,AKT1,0/1,,1.0,,1.0
14:105258892,CPCT02010393,AKT1,0/1,,1.0,,1.0
17:37868189,CPCT02020293,ERBB2,0/1,,1.0,,1.0


### HIGH and MODERATE impact NON-missense

In [44]:
# Variants with no normal genotype whose worst impact is HIGH or MODERATE and
# whose effect list does not mention "missense", counted per consensus flag.
df[
    df['annWorstImpact'].isin(['MODERATE', 'HIGH'])
    & ~df['annAllEffects'].str.contains("missense")
    & (df['normalSGT'] == "")
].pivot_table(
    values='pos',
    index=['chromPos', 'patientName', 'annGene', 'annWorstEffect', 'annWorstImpact', 'ref', 'normalallele'],
    columns=['consensus'],
    aggfunc='count',
    margins=True,
).fillna("")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,consensus,False,All
chromPos,patientName,annGene,annWorstEffect,annWorstImpact,ref,normalallele,Unnamed: 7_level_1,Unnamed: 8_level_1
10:89692784,CPCT02140013,PTEN,disruptive_inframe_insertion,MODERATE,T,,1.0,1.0
10:89717731,CPCT02060015,PTEN,frameshift_variant,HIGH,T,,1.0,1.0
10:89720741,CPCT02140015,PTEN,stop_gained,HIGH,C,,1.0,1.0
10:89720798,CPCT02160001,PTEN,frameshift_variant,HIGH,GTACT,,1.0,1.0
16:68856020,CPCT02160001,CDH1,stop_gained,HIGH,C,,1.0,1.0
17:7573976,CPCT02060019,TP53,stop_gained,HIGH,T,,1.0,1.0
17:7576851,CPCT02140015,TP53,splice_donor_variant&intron_variant,HIGH,A,,1.0,1.0
17:7576855,CPCT02010397,TP53,stop_gained&splice_region_variant,HIGH,G,,1.0,1.0
17:7577085,CPCT02030261,TP53,stop_gained,HIGH,C,,1.0,1.0
17:7577552,CPCT02060014,TP53,frameshift_variant,HIGH,CA,,1.0,1.0


### Missense

In [33]:
# Variants with no normal genotype whose effect list mentions "missense",
# counted per consensus flag with an "All" margin.
df[
    df['annAllEffects'].str.contains("missense")
    & (df['normalSGT'] == "")
].pivot_table(
    values='pos',
    index=['chromPos', 'patientName', 'annGene', 'normalSGT', 'tumorSGT'],
    columns=['consensus'],
    aggfunc='count',
    margins=True,
).fillna("")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,consensus,False,All
chromPos,patientName,annGene,normalSGT,tumorSGT,Unnamed: 5_level_1,Unnamed: 6_level_1
10:89624305,CPCT02110007,PTEN,,0/1,1.0,1.0
10:89720677,CPCT02060015,PTEN,,0/1,1.0,1.0
12:25398284,CPCT02110006,KRAS,,0/1,1.0,1.0
13:32910968,CPCT02030261,BRCA2,,0/1,1.0,1.0
13:49033926,CPCT02020357,RB1,,0/1,1.0,1.0
16:68842328,CPCT02050048,CDH1,,0/1,1.0,1.0
17:37881000,CPCT02060019,ERBB2,,0/1,1.0,1.0
17:7577536,CPCT02110006,TP53,,0/1,1.0,1.0
17:7577538,CPCT02010390,TP53,,0/1,1.0,1.0
17:7577539,CPCT02050048,TP53,,0/1,1.0,1.0


### SUMMARY OF TYPE

In [13]:
# Cross-tabulate normal genotype (rows) against tumor genotype (columns),
# counting variants in each combination; empty cells are shown as "".
df.pivot_table(
    values='pos',
    index=['normalSGT'],
    columns=['tumorSGT'],
    aggfunc='count',
).fillna("")

tumorSGT,Unnamed: 1_level_0,0/1,1/1
normalSGT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,,67.0,7.0
0/1,22.0,1406.0,6.0
1/1,3.0,,1236.0


<h3>Allelic Depth</h3>

In [None]:
# Allele Freq By Caller
# For every "<prefix>allele" column, draw the empirical CDF of the matching
# "<prefix>AF" column, using only rows where an allele was reported.
# All curves share one explicit axes object rather than the implicit
# "current axes" state.
fig, ax = plt.subplots(figsize=[15, 6])
for columnName in list(df):
    if not columnName.endswith('allele'):
        continue
    prefix = columnName[:-6]  # column name without the trailing "allele"
    ser = df[df[columnName] != ''][prefix + 'AF'].sort_values()
    if ser.empty:
        # Nothing reported for this column; pandas raises when asked to plot
        # an empty series, so skip it.
        continue
    #ser[len(ser)] = ser.iloc[-1]
    # Evenly spaced cumulative fractions 0..1, one per sorted AF value.
    cum_dist = np.linspace(0., 1., len(ser))
    # The legend label carries the count and median AF of the plotted values.
    ser_cdf = pd.Series(cum_dist, index=ser, name=prefix + ": c=" + str(ser.count()) + " m=" + str(round(ser.median(), 2)))
    ser_cdf.plot(ax=ax, drawstyle='steps', legend=True, title=" Allelic Frequency by Caller")
ax.set_xlabel("Allele frequency")
ax.set_ylabel("Cumulative fraction of variants")
plt.show()

### SCRATCH

In [53]:
# Scratch: all chromosome 17 rows for patient CPCT02010393.
df.loc[(df['patientName'] == 'CPCT02010393') & (df['chrom'] == '17')]

Unnamed: 0,chrom,pos,chromPos,chromFrac,id,ref,vennSegment,numCallers,variantType,variantSubType,filter,bedRegion,inDBSNP,inCOSMIC,annGene,annWorstImpact,annWorstEffect,annAllEffects,consensus,tumorallele,tumorAF,tumorDP,tumorQS,tumorSGT,tumorindelDiff,normalallele,normalAF,normalDP,normalQS,normalSGT,normalindelDiff,patientName,TUMOR_PERCENTAGE,TUMOR_TYPE,PIPELINE_VERSION
53,17,7578115,17:7578115,17.093332,rs1625895,T,test,2,SNP,,PASS,TP53,True,False,TP53,LOW,sequence_feature,sequence_feature|sequence_feature|sequence_fea...,False,C,0.0866142,127.0,-1.0,0/1,,C,0.435897,39.0,-1.0,0/1,,CPCT02010393,40,Melanoma,v1.9
54,17,7578212,17:7578212,17.093333,COSM10654,G,test,1,SNP,,PASS,TP53,False,True,TP53,HIGH,stop_gained,stop_gained|stop_gained|stop_gained|stop_gaine...,False,A,0.838983,118.0,-1.0,0/1,,,,,,,,CPCT02010393,40,Melanoma,v1.9
55,17,7578645,17:7578645,17.093339,rs2909430,C,test,1,SNP,,PASS,TP53,True,False,TP53,LOW,sequence_feature,sequence_feature|sequence_feature|sequence_fea...,False,,,,,,,T,0.548387,31.0,-1.0,0/1,,CPCT02010393,40,Melanoma,v1.9
56,17,7579472,17:7579472,17.093349,rs1042522;COSM3766190,G,test,2,SNP,,PASS,TP53,True,True,TP53,MODERATE,missense_variant,missense_variant|missense_variant|missense_var...,True,C,0.0915033,153.0,-1.0,0/1,,C,0.40625,32.0,-1.0,0/1,,CPCT02010393,40,Melanoma,v1.9
57,17,37884037,17:37884037,17.46658,rs61552325;COSM5436892,C,test,1,SNP,,PASS,ERBB2,True,True,ERBB2,MODERATE,missense_variant,missense_variant|missense_variant|missense_var...,False,,,,,,,G,0.481481,27.0,-1.0,0/1,,CPCT02010393,40,Melanoma,v1.9
58,17,38556770,17:38556770,17.474865,rs471692,T,test,2,SNP,,PASS,TOP2A,True,False,TOP2A,LOW,sequence_feature,sequence_feature|intron_variant,False,C,1.0,185.0,-1.0,1/1,,C,1.0,39.0,-1.0,1/1,,CPCT02010393,40,Melanoma,v1.9
59,17,41223094,17:41223094,17.507704,rs1799966;COSM3755559,T,test,1,SNP,,PASS,BRCA1,True,True,BRCA1,MODERATE,missense_variant,missense_variant|missense_variant|missense_var...,False,,,,,,,C,0.522727,44.0,-1.0,0/1,,CPCT02010393,40,Melanoma,v1.9
60,17,41234470,17:41234470,17.507844,rs1060915;COSM4416375,A,test,1,SNP,,PASS,BRCA1,True,True,BRCA1,LOW,synonymous_variant,synonymous_variant|synonymous_variant|synonymo...,False,,,,,,,G,0.566667,30.0,-1.0,0/1,,CPCT02010393,40,Melanoma,v1.9
61,17,41244000,17:41244000,17.507961,rs16942;COSM148277,T,test,2,SNP,,PASS,BRCA1,True,True,BRCA1,MODERATE,missense_variant,missense_variant|missense_variant|missense_var...,True,C,0.0779221,154.0,-1.0,0/1,,C,0.486486,37.0,-1.0,0/1,,CPCT02010393,40,Melanoma,v1.9
62,17,41244435,17:41244435,17.507966,rs16941;COSM3755563,T,test,1,SNP,,PASS,BRCA1,True,True,BRCA1,MODERATE,missense_variant,missense_variant|missense_variant|missense_var...,False,,,,,,,C,0.577778,45.0,-1.0,0/1,,CPCT02010393,40,Melanoma,v1.9
