### CONFIGURATION

In [1]:
import os
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', 200)
import numpy as np
import copy
import sys
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#PYFAIDX
from pyfaidx import Fasta
genes = Fasta('/Users/peterpriestley/hmf/data/refgenomes/Homo_sapiens.GRCh37.GATK.illumina/Homo_sapiens.GRCh37.GATK.illumina.fasta')

In [3]:
# Hartwig imports
sys.path.append("/Users/peterpriestley/hmf/repos/scripts/analysisscripts") 
import analyseVCF as aVCF
import venn as vn
import chromosomeDefinition as cd

In [43]:
# Input locations and switches for this analysis run.
# VCF
VCF_PATH = "/Users/peterpriestley/hmf/analyses/170219_CRC_liver_biopsy_analysis/"

# BED: path and file name handed to the BED loader when USE_BED is True.
# Alternative region sets, kept for reference (only one pair can be active):
#BED_PATH = "/Users/peterpriestley/hmf/analyses/giabTruthsets/"
#BED_FILE_NAME = "NA12878_GIAB_highconf_IllFB-IllGATKHC-CG-Ion-Solid_ALLCHROM_v3.2.2_highconf.bed"
#BED_FILE_NAME = "CPCT_Slicing.bed"
BED_PATH = "/Users/peterpriestley/hmf/analyses/slices/"
BED_FILE_NAME = "cosmicCancerCensus.bed"
USE_BED = True
LOAD_NON_BED_MUTATIONS = False

# HEALTH CHECKS
HEALTH_CSV_PATH="/Users/peterpriestley/hmf/analyses/healthCheckAnalysis/"
HEALTH_CSV_FILENAME="healthCheckFile.csv"

In [30]:
def loadHealthChecksAndMerge(df):
    """Left-join health-check metadata onto a variant frame.

    Reads the CSV named by the module-level HEALTH_CSV_PATH / HEALTH_CSV_FILENAME,
    keeps the TUMOR_PERCENTAGE, TUMOR_TYPE and PIPELINE_VERSION columns of the rows
    whose SAMPLE has a "T" as its 13th character, and merges them onto `df` by
    matching df['patientName'] against the first 12 characters of SAMPLE.

    df -- DataFrame with a 'patientName' column.

    Returns a new DataFrame; every missing value in it is replaced by "Unknown".
    """
    hc = pd.read_csv(os.path.join(HEALTH_CSV_PATH, HEALTH_CSV_FILENAME))
    hc = hc[['SAMPLE','TUMOR_PERCENTAGE','TUMOR_TYPE','PIPELINE_VERSION']]
    # only tumor rows; .copy() so the SAMPLE rewrite below works on an independent
    # frame instead of assigning into a filtered view of the CSV data
    hc = hc[hc['SAMPLE'].str[12]=="T"].copy()
    # NOTE(review): the join key is cut to 12 characters, so patientName values that
    # still carry the trailing "T" (13 characters) will not match — confirm.
    hc['SAMPLE'] = hc['SAMPLE'].str[:12]
    hc = hc.set_index(['SAMPLE'])
    df = pd.merge(df,hc,how='left', left_on='patientName',right_index=True)
    # NOTE(review): this fills NaN in every column of the merged frame, not only in
    # the three health-check columns — confirm that is intended.
    df.fillna("Unknown", inplace=True)
    return df

In [31]:
# SPECIFY SAMPLES
# Every file in VCF_PATH whose name ends in "d_melted.vcf" is a sample to load.
# os.listdir() returns entries in arbitrary order, so sort to keep the sample
# order stable between runs and machines.
VCF_SAMPLES = sorted(x for x in os.listdir(VCF_PATH) if x.endswith("d_melted.vcf"))

print(VCF_SAMPLES)

['CPCT02040058R_CPCT02040058T_merged_somatics_annotated_melted.vcf', 'CPCT02040062R_CPCT02040062T_merged_somatics_annotated_melted.vcf', 'CPCT02040067R_CPCT02040067T_merged_somatics_annotated_melted.vcf', 'CPCT02040068R_CPCT02040068T_merged_somatics_annotated_melted.vcf', 'CPCT02050068R_CPCT02050068T_merged_somatics_annotated_melted.vcf', 'CPCT02050085R_CPCT02050085T_merged_somatics_annotated_melted.vcf', 'CPCT02050093R_CPCT02050093T_merged_somatics_annotated_melted.vcf', 'CPCT02050097R_CPCT02050097T_merged_somatics_annotated_melted.vcf', 'CPCT02070066R_CPCT02070066T_merged_somatics_annotated_melted.vcf', 'CPCT02100042R_CPCT02100042T_merged_somatics_annotated_melted.vcf', 'CPCT02100047R_CPCT02100047T_merged_somatics_annotated_melted.vcf', 'CPCT02180008R_CPCT02180008T_merged_somatics_annotated_melted.vcf', 'CPCT02230003R_CPCT02230003T_merged_somatics_annotated_melted.vcf', 'CPCT02330007R_CPCT02330007T_merged_somatics_annotated_melted.vcf', 'CPCT02330009R_CPCT02330009T_merged_somatics_an

### LOAD VCF

In [32]:
# Preview the ID taken from each file name (characters 14-26), which is the
# slice the loading cell uses as the sample name.
for VCF_SAMPLE in VCF_SAMPLES:
    # parenthesised form prints the same text under Python 2 and also runs on Python 3
    print(VCF_SAMPLE[14:27])

CPCT02040058T
CPCT02040062T
CPCT02040067T
CPCT02040068T
CPCT02050068T
CPCT02050085T
CPCT02050093T
CPCT02050097T
CPCT02070066T
CPCT02100042T
CPCT02100047T
CPCT02180008T
CPCT02230003T
CPCT02330007T
CPCT02330009T


In [44]:
#Load all VCF_SAMPLES to DF
# Per-sample frames are collected and concatenated once at the end, instead of
# re-copying a growing frame on every iteration. The list is seeded with an empty
# frame so that df is still a DataFrame when VCF_SAMPLES is empty.
sampleFrames = [pd.DataFrame({})]
for VCF_SAMPLE in VCF_SAMPLES:
    VCF_FILE_NAME = VCF_SAMPLE
    # characters 14-26 of the file name are used as the sample / patient ID
    SAMPLE_ID = VCF_SAMPLE[14:27]
    SAMPLE_NAMES = {SAMPLE_ID:'melted'}
    if USE_BED:
        # NOTE(review): the BED file is re-read for every sample; it could be loaded
        # once before the loop if loadVariantsFromVCF leaves the bed object untouched — confirm.
        bed = aVCF.loadBEDFile(BED_PATH,BED_FILE_NAME)
        sampleFrames.append(aVCF.loadVariantsFromVCF(VCF_PATH,VCF_FILE_NAME,SAMPLE_NAMES,SAMPLE_ID,True,True,bed,LOAD_NON_BED_MUTATIONS))
    else:
        sampleFrames.append(aVCF.loadVariantsFromVCF(VCF_PATH,VCF_FILE_NAME,SAMPLE_NAMES,SAMPLE_ID,True))
df = pd.concat(sampleFrames)

reading vcf file: CPCT02040058R_CPCT02040058T_merged_somatics_annotated_melted.vcf
reading VCF File line: 1
Number variants loaded: 868
reading vcf file: CPCT02040062R_CPCT02040062T_merged_somatics_annotated_melted.vcf
reading VCF File line: 1
Number variants loaded: 847
reading vcf file: CPCT02040067R_CPCT02040067T_merged_somatics_annotated_melted.vcf
reading VCF File line: 1
Number variants loaded: 817
reading vcf file: CPCT02040068R_CPCT02040068T_merged_somatics_annotated_melted.vcf
reading VCF File line: 1
Number variants loaded: 826
reading vcf file: CPCT02050068R_CPCT02050068T_merged_somatics_annotated_melted.vcf
reading VCF File line: 1
Number variants loaded: 778
reading vcf file: CPCT02050085R_CPCT02050085T_merged_somatics_annotated_melted.vcf
reading VCF File line: 1
Number variants loaded: 1018
reading vcf file: CPCT02050093R_CPCT02050093T_merged_somatics_annotated_melted.vcf
reading VCF File line: 1
Number variants loaded: 996
reading vcf file: CPCT02050097R_CPCT02050097T_m

In [None]:
df = loadHealthChecksAndMerge(df)

### PON FILTERING

In [None]:
dfPON = aVCF.loadPON("/Users/peterpriestley/hmf/analyses/PON/","PON.tsv")

In [None]:
#del df['PONCount']
#del df['inPON']

In [None]:
# Annotate every variant with its PON count, joined on chromosome and position.
# Drop annotations left by an earlier run of this cell first; otherwise the merge
# yields suffixed PONCount_x / PONCount_y columns and the lookups below fail.
df = df.drop(['PONCount','inPON'], axis=1, errors='ignore')
df = pd.merge(df,dfPON,how='left', on=['chrom','pos'])
# Variants without a PON row get a count of 0. Assigned back explicitly rather than
# filled in place on a column selection, which is not guaranteed to reach df.
df['PONCount'] = df['PONCount'].fillna(0)
df['inPON'] = df.PONCount>0

In [None]:
# INDEL calls counted per (numCallers, vennSegment), one column per consensus value.
df[df.variantType == 'INDEL'].pivot_table(
    values='pos',
    index=['numCallers','vennSegment'],
    columns=['consensus'],
    aggfunc='count',
    margins=False,
).fillna("")

### VAF

In [None]:
# Cumulative distribution of meltedAF for calls that failed consensus and are not
# in dbSNP, one step curve per variant type.
for variantType in ['SNP','INDEL']:
    ser = df[(df['consensus']==False)&(df['inDBSNP']==False)&(df.variantType==variantType)]['meltedAF']
    ser = ser.sort_values()
    # y-values: evenly spaced cumulative fractions, one per sorted AF value
    cum_dist = np.linspace(0.,1.,len(ser))
    # the series name becomes the legend label: call count (c) and median AF (m)
    ser_cdf = pd.Series(cum_dist, index=ser,name=variantType+": c="+str(ser.count())+" m="+str(round(ser.median(),2)))
    # title now names both filters applied above (it previously omitted the dbSNP one)
    ser_cdf.plot(drawstyle='steps',legend=True,title=" Allelic Frequency by sample (FAILED CONSENSUS + NOT IN DBSNP)",figsize=[15,6])

In [None]:
# Cumulative distribution of meltedAF for calls that passed consensus and are not
# in dbSNP, one step curve per variant type.
for variantType in ['SNP','INDEL']:
    ser = df.loc[
        (df['consensus']==True)&(df['inDBSNP']==False)&(df.variantType==variantType),
        'meltedAF',
    ].sort_values()
    # evenly spaced cumulative fractions, one per sorted AF value
    cum_dist = np.linspace(0.,1.,len(ser))
    ser_cdf = pd.Series(
        cum_dist,
        index=ser,
        name=variantType+": c="+str(ser.count())+" m="+str(round(ser.median(),2)),
    )
    ser_cdf.plot(
        drawstyle='steps',
        legend=True,
        title=" Allelic Frequency by sample (PASSED CONSENSUS + NOT IN DBSNP)",
        figsize=[15,6],
    )

In [None]:
# ref -> alt counts for consensus SNPs with a single-base alt allele and meltedAF below 0.07.
df[
    (df.variantType=='SNP')
    &(df.meltedallele.str.len()==1)
    &(df.meltedAF<0.07)
    &(df.consensus==True)
].pivot_table(
    values='pos',
    index=['ref'],
    columns=['meltedallele'],
    aggfunc='count',
    margins=True,
).fillna("")

### HIGH and MODERATE impact NON-missense

In [None]:
#pd.pivot_table(df, values='pos', index=['chromPos','patientName','TUMOR_TYPE','annGene','annWorstImpact','annWorstEffect','consensus','meltedAF'], columns=['inCOSMIC'], aggfunc='count',margins=True).fillna("")

### Missense

In [None]:
df[(df.annAllEffects.str.contains("missense"))]

In [None]:
# Missense calls counted per (variantType, numCallers), split by consensus / inDBSNP / bedRegion.
df[df.annAllEffects.str.contains("missense")].pivot_table(
    values='pos',
    index=['variantType','numCallers'],
    columns=['consensus','inDBSNP','bedRegion'],
    aggfunc='count',
    margins=True,
).fillna("")

In [None]:
# All calls counted per (variantType, numCallers), split by consensus / inDBSNP / bedRegion.
df.pivot_table(
    values='pos',
    index=['variantType','numCallers'],
    columns=['consensus','inDBSNP','bedRegion'],
    aggfunc='count',
    margins=True,
).fillna("")

In [47]:
# Number of missense calls whose bedRegion is non-empty.
# `!=` replaces the `<>` spelling, which only Python 2 accepts; the result is the same.
len(df[(df.annAllEffects.str.contains("missense"))&(df.bedRegion != '')])

0

### Consensus

In [55]:
# Consensus calls with HIGH or MODERATE worst impact, counted per gene and patient;
# rows ordered by the "All" margin, largest first.
df[
    (df.consensus==True)
    &((df.annWorstImpact=="HIGH")|(df.annWorstImpact=="MODERATE"))
].pivot_table(
    values='pos',
    index=['annGene'],
    columns=['patientName'],
    aggfunc='count',
    margins=True,
).fillna("").sort_values("All",ascending=False)

patientName,CPCT02040058T,CPCT02040062T,CPCT02040067T,CPCT02040068T,CPCT02050068T,CPCT02050085T,CPCT02050093T,CPCT02050097T,CPCT02070066T,CPCT02100042T,CPCT02100047T,CPCT02180008T,CPCT02230003T,CPCT02330007T,CPCT02330009T,All
annGene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
All,9.0,8.0,12.0,6.0,11.0,13.0,10.0,7.0,15.0,12.0,3.0,13.0,22.0,8.0,9.0,158.0
APC,,3.0,2.0,,2.0,1.0,2.0,1.0,,2.0,,2.0,2.0,,1.0,18.0
TP53,1.0,,,2.0,1.0,1.0,1.0,,1.0,1.0,,,1.0,1.0,1.0,11.0
PIK3CA,,,,,,1.0,,1.0,,,,2.0,2.0,,,6.0
PTPRT,,,,1.0,2.0,,,,,1.0,1.0,,,,,5.0
KRAS,,,,,,1.0,,,,,,1.0,1.0,1.0,,4.0
SMAD4,,,,,,,,1.0,,1.0,,,2.0,,,4.0
UBR5,,,,,,1.0,,,1.0,,,,1.0,,,3.0
KIAA1549,,,1.0,,,,,,1.0,,,,,1.0,,3.0
GRIN2A,,,,,,,,,1.0,1.0,,,,,1.0,3.0


### Mutational Patterns

In [None]:
# ref -> alt counts for SNPs whose ref allele is a single base.
df[(df.ref.str.len()==1)&(df.variantType == 'SNP')].pivot_table(
    values='pos',
    index=['ref'],
    columns=['meltedallele'],
    aggfunc='count',
    margins=True,
).fillna("")

In [None]:
# One row per missense call (locus, patient, gene, AF, caller count, COSMIC flag),
# counted per consensus value.
df[df.annAllEffects.str.contains("missense")].pivot_table(
    values='pos',
    index=['chromPos','patientName','annGene','meltedAF','numCallers','inCOSMIC'],
    columns=['consensus'],
    aggfunc='count',
    margins=True,
).fillna("")

### SINGLE PATIENT ANALYSIS

In [None]:
df[(df.patientName=='CPCT02140013')]#[['patientName','TUMOR_TYPE','pos','chrom','id','variantType','ref','meltedallele','meltedDP','meltedAF','vennSegment','annGene']].head(20)

### SCRATCH

In [None]:
# All calls counted per (numCallers, vennSegment), one column per consensus value plus margins.
df.pivot_table(
    values='pos',
    index=['numCallers','vennSegment'],
    columns=['consensus'],
    aggfunc='count',
    margins=True,
).fillna("")

In [None]:
df[df.numCallers ==1]

In [None]:
def refgenomeContext(chromPos):
    """Return the reference sequence around a 'chrom:pos' locus as a string.

    chromPos -- text whose first two ':'-separated fields are the chromosome
                name and an integer position.

    Looks up the chromosome in the module-level `genes` object and takes the
    slice [pos-2:pos+1] (at most three characters). Returns -1 instead when the
    lookup raises TypeError.
    """
    fields = chromPos.split(':')
    chrom = fields[0]
    pos = int(fields[1])
    try:
        # presumably the base before, at and after a 1-based pos — confirm against
        # the indexing convention of the `genes` object
        window = genes[chrom][pos-2:pos+1]
        return str(window[:3])
    except TypeError:
        return -1

In [None]:
refgenomeContext('7:467652')

In [None]:
# Reference context for every variant, computed from its 'chrom:pos' text.
df['refTriplet'] = df['chromPos'].apply(refgenomeContext)

In [None]:
df.head(1)

In [None]:
# SNP counts per (refTriplet, alt allele), split by dbSNP membership; largest "All" total first.
df[df.variantType=='SNP'].pivot_table(
    values='pos',
    index=['refTriplet','meltedallele'],
    columns=['inDBSNP'],
    aggfunc='count',
    margins=True,
).fillna("").sort_values('All',ascending=False)

### VENN

In [None]:
# Venn diagram of the loci (chromPos) shared between three named samples, restricted
# to consensus missense SNPs that are not in the PON.
dftemp = df[
    (df.consensus== True)
    &(df.variantType == "SNP")
    &(df.annAllEffects.str.contains('missense'))
    &(df.inPON==False)
]
vn.venn(
    [
        dftemp[dftemp.patientName == 'PMC010001T']['chromPos'],
        dftemp[dftemp.patientName == 'PMC010001TII']['chromPos'],
        dftemp[dftemp.patientName == 'PMC010001TIII']['chromPos'],
    ],
    ['PMC010001T','PMC010001TII','PMC010001TIII'],
    figsize=(9,9),
)

### Common mutations

In [None]:
len(df)

In [None]:
# Calls with more than 1 instance
# chromPos is the only selected column that is not a grouping key, so after the
# 'count' aggregation it holds the number of rows that share each key combination.
dfTemp = (
    df[['consensus','inPON','numCallers','vennSegment','variantType','chrom','pos','bedRegion','annGene','annWorstEffect','chromPos','id']]
    .groupby(['inPON','consensus','numCallers','vennSegment','variantType','chrom','pos','bedRegion','annGene','annWorstEffect','id'])
    .agg('count')
    .sort_values('chromPos',ascending=False)
)
dfTemp = dfTemp[dfTemp.chromPos>1]
len(dfTemp)

In [None]:
#dfTemp = pd.pivot_table(df[(df.bedRegion=='Default')], values='chromPos', index=['chrom','pos','annGene','annWorstEffect','id'], columns=['bedRegion'], aggfunc='count',margins=True).fillna("")
#dfTemp = dfTemp[dfTemp.All>1].sort_values('All',ascending=False)
#len(dfTemp)

In [None]:
dfTemp = dfTemp.reset_index()

In [None]:
# Rows of dfTemp that passed consensus, counted per (inPON, variantType, numCallers).
(
    dfTemp.loc[dfTemp.consensus==True, ['inPON','variantType','numCallers','chromPos']]
    .groupby(['inPON','variantType','numCallers'])
    .agg('count')
)

In [None]:
# NOTE(review): the bare `dfTemp` below is not the last expression of the cell, so it displays nothing.
dfTemp
# NOTE(review): dfTemp as built in the "Calls with more than 1 instance" cell is grouped on
# variant-level keys and does not select 'TUMOR_TYPE' or 'patientName', so this line looks
# like it would raise a KeyError — confirm which frame was intended (possibly df).
dfTemp.groupby(['TUMOR_TYPE']).agg('count')['patientName']

In [None]:
#temp.reset_index
#groupby(['bedRegion']).agg('count')

In [None]:
# NOTE(review): `temp` is not assigned anywhere in the visible notebook, so on a fresh
# kernel this raises NameError — confirm where it should come from, or remove the cell.
temp.sort_values('All',ascending=False)

In [None]:
# Calls counted per annWorstEffect, one column per consensus value plus margins.
df.pivot_table(
    values='pos',
    index=['annWorstEffect'],
    columns=['consensus'],
    aggfunc='count',
    margins=True,
).fillna("")

### SOMATIC AF

In [None]:
############### SOMATIC AF ##############
# Scatter of meltedAF against chromFrac for consensus SNPs that are in neither dbSNP
# nor the PON, limited to chromFrac strictly between c+fractionMin and c+fractionMax.
c = 1
fractionMin = 0
fractionMax = 22
dfSom = df[(df['variantType']=="SNP")&(df['inDBSNP']==False)&(df['inPON']==False)&(df['consensus']==True)&(df.chromFrac>c+fractionMin)&(df.chromFrac<c+fractionMax)]
plt.figure(figsize=[18,5])
plt.scatter(dfSom.chromFrac, dfSom.meltedAF)
# The on/off flag is passed positionally: its keyword was renamed from `b` to
# `visible` in newer matplotlib, while the positional form works on both.
plt.grid(True, which='both', color='0.65',linestyle='-')
plt.axis([c+fractionMin, c+fractionMax,0, 1.0])
plt.show()

In [None]:
# Cumulative distribution of meltedAF over the dfSom selection.
ser = dfSom.meltedAF.sort_values()
# evenly spaced cumulative fractions, one per sorted AF value
cum_dist = np.linspace(0.,1.,len(ser))
ser_cdf = pd.Series(
    cum_dist,
    index=ser,
    name=": c="+str(ser.count())+" m="+str(round(ser.median(),2)),
)
ser_cdf.plot(
    drawstyle='steps',
    legend=True,
    title=" Allelic Frequency by ",
    figsize=[15,6],
)

### COPY NUM vs SOMATIC AF

In [None]:
def loadCNVforPatient(cnvFile,patientName):
    """Load one CNV file and pad the stretches it does not cover with copy number 2.

    cnvFile     -- path of a headerless delimited file read with pd.read_table;
                   its columns are named by the module-level CNV_COLUMNS.
    patientName -- value written into the 'patientName' column of the result.

    Filler rows (copyNum 2, gainOrLoss 'none') are appended for:
      * a gap between two consecutive records on the same chromosome,
      * the tail of a chromosome and the head of the next one when consecutive
        records are on different chromosomes,
      * the head of the first record's chromosome when that record does not start at 0.

    Returns the padded frame sorted by chrom, posStart, posEnd with the extra columns
    chromPos, cappedCopyNum (copyNum clipped at 5), regionLength and patientName.
    'chrom' is converted with cd.intChrom, and reset_index() leaves the pre-sort row
    labels in an 'index' column.
    """
    dfCNV = pd.read_table(cnvFile, names = CNV_COLUMNS )
    recordCount = dfCNV.shape[0]
    if recordCount > 0:   # an empty file has no first record to start from
        #add regions with default copy number
        last = dfCNV.iloc[0]
        # Walk every record after the first, including the final one (the earlier
        # upper bound of recordCount-1 never compared the last record with its
        # predecessor). Filler rows are appended at the end, so positions
        # 0..recordCount-1 keep pointing at the rows read from the file.
        for i in range(1, recordCount):
            current = dfCNV.iloc[i]
            if last['chrom']==current['chrom']:
                if last['posEnd']!=current['posStart']:
                    dfCNV.loc[len(dfCNV)] = [last['chrom'],last['posEnd'], current['posStart']-1,2,'none']
            else:
                # Chromosome boundary: decided on the chromosome alone, so that a
                # coincidental posEnd == posStart across two chromosomes no longer
                # suppresses the fillers.
                dfCNV.loc[len(dfCNV)] = [last['chrom'],last['posEnd'], cd.chromosomeLength[str(last['chrom'])],2,'none']
                if current['posStart']!=0:
                    dfCNV.loc[len(dfCNV)] = [current['chrom'],0, current['posStart'],2,'none']
            last = current
        #fix first record: pad from 0 on whichever chromosome the file starts with
        if dfCNV.iloc[0]['posStart']!=0:
            dfCNV.loc[len(dfCNV)] = [dfCNV.iloc[0]['chrom'],0, dfCNV.iloc[0]['posStart'],2,'none']
    #Additional Fields
    # chromPos = integer chromosome + fractional position of posStart along that chromosome
    dfCNV['chromPos']= dfCNV['chrom'].apply(lambda x: cd.intChrom(x)) + dfCNV['posStart'] / dfCNV['chrom'].apply(lambda x: cd.chromosomeLength[str(x)])
    dfCNV['chrom'] = dfCNV['chrom'].apply(lambda x: cd.intChrom(x))
    dfCNV['cappedCopyNum'] = dfCNV['copyNum'].clip(upper=5)
    dfCNV = dfCNV.sort_values(["chrom","posStart","posEnd"]).reset_index()
    dfCNV['regionLength'] = (dfCNV['posEnd']-dfCNV['posStart'])
    dfCNV['patientName']=patientName
    return dfCNV

In [None]:
# Column names applied to the headerless CNV files.
CNV_COLUMNS = ['chrom','posStart','posEnd','copyNum','gainOrLoss']
CNV_PATH="/Users/peterpriestley/hmf/analyses/CPCT02140029/"
# Every file in CNV_PATH whose name ends in ".bam_CNVs". os.listdir() returns entries
# in arbitrary order, so sort to keep the processing order stable between runs.
CNV_FILENAMES = sorted(x for x in os.listdir(CNV_PATH) if x.endswith(".bam_CNVs"))
CNV_FILENAMES

In [None]:
def calculateCopyNumForVariant(chromosome,position):
    """Mean copyNum of the rows of the module-level dfCNV that contain the position.

    A row is selected when its chrom equals `chromosome` and
    posStart < position <= posEnd.
    """
    onChromosome = dfCNV.chrom==chromosome
    startsBefore = dfCNV.posStart<position
    endsAtOrAfter = dfCNV.posEnd>=position
    return dfCNV.loc[onChromosome&startsBefore&endsAtOrAfter, 'copyNum'].mean()

In [None]:
# Load the CNV table of each file in CNV_FILENAMES; the first 12 characters of the
# file name are passed as the patient name.
# NOTE(review): dfCNV is reassigned on every pass, so after the loop only the table of
# the last file is kept — confirm that is intended when several files are present.
for filename in CNV_FILENAMES:
    patientName=filename[:12]
    dfCNV = loadCNVforPatient(CNV_PATH+filename,patientName)

In [None]:
df.head(1)

In [None]:
# Copy number for every variant: calculateCopyNumForVariant is applied element-wise
# to the integer-converted chromosome and the position.
df['copyNum'] = np.vectorize(calculateCopyNumForVariant)(
    df['chrom'].apply(cd.intChrom),
    df.pos,
)

### SCRATCH

In [None]:
genes['1'][10298755:10298770]

In [None]:
len(df[(df.inPON==False)&(df.inDBSNP==True)&(df.consensus==True)&(df.annAllEffects.str.contains("missense"))])

In [None]:
len(df)