In [None]:
import os
import pandas as pd
import numpy as np
import copy
import sys
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)
sys.path.append("/Users/peterpriestley/hmf/repos/scripts/analysisscripts") 
import chromosomeDefinition as cd
import analyseVCF as aVCF
from scipy.stats import norm
from sklearn.neighbors import KernelDensity

In [None]:
# Column names applied to the CNV table in loadCNVforPatient (passed as `names=` to pd.read_table).
CNV_COLUMNS = ['chrom','posStart','posEnd','copyNum','gainOrLoss','BAF','score','germlineOrSomatic','oneOrZero']

In [None]:
# LOAD BED
# NOTE(review): BED_PATH and BED_FILE_NAME are each assigned twice in this cell; the second
# pair (the exon slice) overwrites the first (the GIAB high-confidence BED), so only the
# second pair takes effect.
BED_PATH = "/Users/peterpriestley/hmf/analyses/giabTruthsets/"
BED_FILE_NAME = "NA12878_GIAB_highconf_IllFB-IllGATKHC-CG-Ion-Solid_ALLCHROM_v3.2.2_highconf.bed"
BED_PATH = "/Users/peterpriestley/hmf/analyses/slices/"
BED_FILE_NAME = "exons_UCSC_autosomes.bed"
#BED_FILE_NAME = "CPCT_Slicing.bed"
# USE_BED and LOAD_FULL_FILE are read as globals inside loadVCF.
USE_BED = True
LOAD_FULL_FILE = False

In [None]:
# NOTE(review): loadVCF loads the BED file itself on every call; this module-level `bed`
# is not referenced again in the cells visible here.
bed = aVCF.loadBEDFile(BED_PATH,BED_FILE_NAME)

### FUNCTIONS

In [None]:
def findFiles(path,suffix):
    """Return the names of the entries in `path` whose trailing characters equal `suffix`.

    Only the bare entry names are returned (no directory prefix), in the order
    produced by os.listdir.
    """
    tailLength = len(suffix)
    return [entry for entry in os.listdir(path) if entry[-tailLength:] == suffix]

In [None]:
def loadCNVforPatient(cnvFile,patientName):
    """Load a CNV table, add filler rows for uncovered regions and derive plotting columns.

    cnvFile is read with pd.read_table using CNV_COLUMNS as the column names.
    Filler rows (copyNum 2, gainOrLoss 'none') are appended wherever one row's
    posEnd differs from the next row's posStart. The returned frame is sorted by
    chrom/posStart/posEnd and carries the extra columns chromPos, cappedCopyNum,
    regionLength and patientName.
    """
    dfCNV = pd.read_table(cnvFile, names = CNV_COLUMNS )
    #add regions with default copy number
    last = dfCNV.iloc[0]
    # The range is evaluated once, so rows appended inside the loop are not themselves visited.
    # NOTE(review): the upper bound shape[0]-1 means the last original row is never compared
    # with its predecessor, and no filler is added after it - confirm whether that is intended.
    for i in range(1, dfCNV.shape[0]-1):
        if last['posEnd']<>dfCNV.iloc[i]['posStart']:
            if last['chrom']==dfCNV.iloc[i]['chrom']:
                # Gap inside one chromosome: fill from the previous end up to posStart-1.
                dfCNV.loc[len(dfCNV)] = [last['chrom'],last['posEnd'], dfCNV.iloc[i]['posStart']-1,2,'none','','',0,0]
            else:
                # Chromosome changed: fill the previous chromosome up to cd.chromosomeLength ...
                dfCNV.loc[len(dfCNV)] = [last['chrom'],last['posEnd'], cd.chromosomeLength[last['chrom']],2,'none','','',0,0]
                # ... and fill the new chromosome from 0 to the first posStart
                # (here the filler ends at posStart, not posStart-1 as in the branch above).
                if dfCNV.iloc[i]['posStart']<>0:
                    dfCNV.loc[len(dfCNV)] = [dfCNV.iloc[i]['chrom'],0, dfCNV.iloc[i]['posStart'],2,'none','','',0,0]
        last = dfCNV.iloc[i]
    #fix first record
    # The filler for the first record is always written for chromosome '1'.
    if dfCNV.iloc[0]['posStart']<>0:
        dfCNV.loc[len(dfCNV)] = ['1',0, dfCNV.iloc[0]['posStart'],2,'none','','',0,0]
    #Additional Fields
    # chromPos = cd.intChrom(chrom) + posStart / chromosome length.
    dfCNV['chromPos']= dfCNV['chrom'].apply(lambda x: cd.intChrom(x)) + dfCNV['posStart'] / dfCNV['chrom'].apply(lambda x: cd.chromosomeLength[str(x)])
    dfCNV['chrom'] = dfCNV['chrom'].apply(lambda x: cd.intChrom(x))
    dfCNV['cappedCopyNum'] = dfCNV['copyNum'].clip(upper=5)
    # reset_index() is called without drop=True, so the pre-sort index is kept as an 'index' column.
    dfCNV = dfCNV.sort_values(["chrom","posStart","posEnd"]).reset_index()
    dfCNV['regionLength'] = (dfCNV['posEnd']-dfCNV['posStart'])
    dfCNV['patientName']=patientName
    return dfCNV

In [None]:
def calculateMBAFBetween(chrom,minPos,maxPos):
    """Return (median, count) of mBAF over the global dfBAF rows on `chrom`
    whose Position lies strictly between minPos and maxPos."""
    onChrom = dfBAF.Chromosome==chrom
    insideWindow = (dfBAF.Position>minPos)&(dfBAF.Position<maxPos)
    mBAFValues = dfBAF[onChrom&insideWindow]['mBAF']
    return mBAFValues.median(),mBAFValues.count()

In [None]:
def loadBAFData(bafFile):
    """Read the file at `bafFile` with pd.read_table (default settings) and return the DataFrame."""
    return pd.read_table(bafFile)

In [None]:
def cnvChart(dfCNV,filename,minChrom=1,maxChrom=23):
    """Draw copyNum against chromPos as a step chart on a new 18x3 figure titled `filename`.

    The x axis runs from minChrom to maxChrom and the y axis is fixed to 0..6.
    """
    plt.figure(figsize=[18,3])
    plt.title(filename)
    axes = plt.gca()
    xValues = dfCNV['chromPos']
    yValues = dfCNV['copyNum']
    axes.plot(xValues, yValues, drawstyle="steps-post")
    axisLimits = [minChrom, maxChrom, 0, 6]
    axes.axis(axisLimits)
       

In [None]:
def cnvLabel(copyNumber):
    """Classify a copy number relative to 2: below is 'loss', above is 'amplification', otherwise 'normal'."""
    if copyNumber < 2:
        return 'loss'
    if copyNumber > 2:
        return 'amplification'
    return 'normal'

In [None]:
def createBAF(dfGermline):
    """Build a BAF table from heterozygous germline SNPs.

    Rows are kept when variantType is "SNP", normalSGT is "0/1", normalAF lies
    strictly between 0.4 and 0.65 and normalDP lies strictly between 30 and 100.
    The result has the columns Chromosome (converted with cd.intChrom), Position
    (numeric), BAF (the tumorAF value), mBAF (0.5 + |BAF - 0.5|) and chromPos
    (cd.intChrom(chromosome) + Position / chromosome length).
    """
    isHetSNP = (dfGermline['variantType']=="SNP")&(dfGermline['normalSGT']=="0/1")
    normalAFInRange = (dfGermline['normalAF']>0.4)&(dfGermline['normalAF']<0.65)
    normalDPInRange = (dfGermline['normalDP']>30)&(dfGermline['normalDP']<100)
    # Take an explicit copy: the column assignments below would otherwise be made on a
    # filtered slice of dfGermline, which pandas flags with SettingWithCopyWarning.
    dfBAF = dfGermline[isHetSNP&normalAFInRange&normalDPInRange][['chrom','pos','tumorAF']].copy()
    dfBAF.columns = ['Chromosome', 'Position','BAF']
    dfBAF['Position'] = pd.to_numeric(dfBAF['Position'])
    dfBAF['mBAF']= 0.5+abs(dfBAF['BAF']-0.5)
    # chromPos must be computed before Chromosome is converted, because
    # cd.chromosomeLength is looked up by the string form of the original value.
    dfBAF['chromPos']= dfBAF['Chromosome'].apply(lambda x: cd.intChrom(x)) + dfBAF['Position'] / dfBAF['Chromosome'].apply(lambda x: cd.chromosomeLength[str(x)])
    dfBAF['Chromosome']= dfBAF['Chromosome'].apply(lambda x: cd.intChrom(x))
    return dfBAF

In [None]:
def patientIDFromFilename(filename,findKey,idLength=12):
    """Extract the patient ID embedded in `filename`.

    The ID is the `idLength` characters (default 12) starting at the first
    occurrence of `findKey`.

    Raises:
        ValueError: if `findKey` does not occur in `filename`. Previously the -1
            returned by str.find was used as a slice start, which silently
            produced an empty or unrelated string.
    """
    patientIDStart = filename.find(findKey)
    if patientIDStart < 0:
        raise ValueError("key '%s' not found in filename '%s'" % (findKey, filename))
    return filename[patientIDStart:patientIDStart+idLength]

In [None]:
def germlineVCFSampleNames(filename,patientID):
    """Map the sample names patientID+'R' and patientID+'T' to 'normal' and 'tumor'.

    `filename` is accepted but not used.
    """
    normalSample = patientID+'R'
    tumorSample = patientID+'T'
    return {normalSample:'normal',tumorSample:'tumor'}

In [None]:
def loadVCF(path,filename,sampleNames,patientID,bedPath,bedFileName):
    """Load variants from a VCF through aVCF.loadVariantsFromVCF and return them as a DataFrame.

    When the global USE_BED is true, the BED file at bedPath/bedFileName is loaded
    and passed on together with the global LOAD_FULL_FILE; otherwise the loader is
    called without a BED. The loader's result is concatenated onto an empty DataFrame.
    """
    emptyFrame = pd.DataFrame()
    if not USE_BED:
        variants = aVCF.loadVariantsFromVCF(path,filename,sampleNames,patientID,True)
        return pd.concat([emptyFrame,variants])
    bedRegions = aVCF.loadBEDFile(bedPath,bedFileName)
    variants = aVCF.loadVariantsFromVCF(path,filename,sampleNames,patientID,True,True,bedRegions,LOAD_FULL_FILE)
    return pd.concat([emptyFrame,variants])

In [None]:
def AFByPosPlot(pos,AF,startChartPos,endChartPos,maxY=1.0,height=5):
    """Scatter-plot `AF` against `pos` on a new figure 18 wide and `height` tall.

    The x axis spans startChartPos..endChartPos and the y axis spans 0..maxY.
    The figure is shown immediately.
    """
    plt.figure(figsize=[18,height])
    plt.scatter(pos, AF)
    # The visibility flag is passed positionally: its keyword is `b` in older matplotlib
    # releases and `visible` in newer ones, so the positional form works with both.
    plt.grid(True, which='both', color='0.65',linestyle='-')
    plt.axis([startChartPos, endChartPos,0, maxY])
    plt.show()

In [None]:
def loadPON(aPath,aPONFile):
    """Read a tab-separated panel-of-normals file into a DataFrame of strings.

    The file is opened at aPath + aPONFile (plain string concatenation, so aPath
    needs its trailing separator). Each line is split on tabs after removing the
    newline; no type conversion is done. The columns are named chrom, pos, ref,
    alt and PONCount.

    An empty file yields an empty DataFrame with those five columns. Previously
    it raised a length-mismatch error when the column names were assigned.
    """
    ponColumns = ['chrom','pos','ref','alt','PONCount']
    with open(aPath + aPONFile, 'r') as f:
        myPON = [line.strip('\n').split('\t') for line in f]
    # Passing the names to the constructor (rather than assigning .columns afterwards)
    # is what makes the zero-row case work.
    return pd.DataFrame(myPON, columns=ponColumns)

In [None]:
def findPeaks(log_dens):
    """Locate local maxima and minima of the density exp(log_dens).

    Returns (peaks, troughs). Each entry is the index of a turning point divided
    by the number of samples, i.e. a fraction of the way along the input. A peak
    is where the step-to-step difference goes from positive to negative; a
    trough is where it goes from negative to positive.
    """
    density = np.exp(log_dens)
    sampleCount = len(density)
    steps = [later - earlier for earlier, later in zip(density[:-1], density[1:])]
    peaks, troughs = [], []
    for turnIndex in range(1, len(steps)):
        before = steps[turnIndex-1]
        after = steps[turnIndex]
        if before > 0 and after < 0:
            peaks.append(float(turnIndex)/sampleCount)
        if before < 0 and after > 0:
            troughs.append(float(turnIndex)/sampleCount)
    return peaks, troughs

In [None]:
def pdfChart(log_dens,maxYValue=8):
    """Plot the density exp(log_dens) against the global X_plot grid and show the figure.

    The x axis is fixed to 0..1 and the y axis to -0.02..maxYValue. X_plot is read
    as a global and must be defined before this is called.
    """
    fig, ax = plt.subplots()
    ax.plot(X_plot[:, 0], np.exp(log_dens), '-',label="kernel = '{0}'".format('gaussian'))
    ax.legend(loc='upper right')
    fig.set_figwidth(10)
    fig.set_figheight(5)
    ax.set_xlim(0, 1)
    ax.set_ylim(-0.02, maxYValue)
    # The visibility flag is passed positionally: its keyword is `b` in older matplotlib
    # releases and `visible` in newer ones, so the positional form works with both.
    ax.grid(True, which='both', color='0.65',linestyle='-')
    plt.show()

In [None]:
def calculateSomaticCNV(dfTumorCNV):
    """Add a copyNumSomatic column in which germline rows take the preceding somatic copy number.

    copyNumSomatic starts as a copy of copyNum. Walking the rows in order, a row
    whose germlineOrSomatic is "germline" is overwritten with the most recent
    copy number seen on a "somatic" or "-" row (2 until one has been seen).
    The input frame is modified in place and also returned.

    Rows are addressed by position throughout. The previous version read with
    .iloc but wrote with the label-based .ix, which is gone from current pandas
    and only agreed with the reads when the index was 0..n-1.
    """
    lastSomaticCopyNum = 2
    dfTumorCNV['copyNumSomatic']= dfTumorCNV['copyNum']
    originCol = dfTumorCNV.columns.get_loc('germlineOrSomatic')
    somaticCol = dfTumorCNV.columns.get_loc('copyNumSomatic')
    # NOTE(review): the range skips the first and the last row, exactly as before -
    # confirm whether those two rows are meant to be left untouched.
    for i in range(1, dfTumorCNV.shape[0]-1):
        origin = dfTumorCNV.iloc[i, originCol]
        if origin=="germline":
            dfTumorCNV.iloc[i, somaticCol] = lastSomaticCopyNum
        elif origin=="somatic" or origin=="-":
            lastSomaticCopyNum = dfTumorCNV.iloc[i, somaticCol]
    return dfTumorCNV

### LOAD PON

In [None]:
# Only needs to be run once
#dfPON2 = loadPON("/Users/peterpriestley/hmf/analyses/PON/779filePON/","PON.tsv")

In [None]:
#def loadPONvcf(PONFile):
#    numHeaderRows = 0
#    with open(PONFile) as fp:
#        while fp.readline()[0]=='#':
#            numHeaderRows = numHeaderRows+1
#    dfPON = pd.read_table(PONFile,skiprows =numHeaderRows-1, dtype={'#CHROM':'str','POS':'str'})  #names = CNV_COLUMNS
#    dfPON['PON_COUNT'] = dfPON['INFO'].apply(lambda x: x.split('=')[1])
#    dfPON.rename(columns={'#CHROM': 'chrom', 'POS': 'pos','REF':'ref','ALT':'alt'}, inplace=True)
#    return dfPON
#
#dfPON = loadPONvcf("/Users/peterpriestley/hmf/analyses/PON/PON.vcf")

### FIND ALL FILES

In [None]:
# Directory searched by the findFiles calls below and passed to loadVCF.
PATH = "/Users/peterpriestley/hmf/analyses/RNA/"

In [None]:
#germlineVCFFilename = findFiles(PATH,"SLICED.vcf")[0]
rnaVCFFilename = findFiles(PATH,"0011_AHTYKLBGX2.filtered_variants_snpEff_snpSift.vcf")[0]
# NOTE(review): [5] picks the sixth match in os.listdir order (findFiles does not sort),
# so which file is selected depends on the directory listing order.
somaticVCFFilename = findFiles(PATH,"_germline_exon_filtered.vcf")[5]
# The patient ID is the 12 characters starting at "CPCT0" in the somatic file name.
patientID = patientIDFromFilename(somaticVCFFilename,"CPCT0")
#cnvFilename = findFiles(PATH,"gz_CNVs")[0]
print "rna =",rnaVCFFilename
print "somatic =",somaticVCFFilename
print "patient =",patientID
#print "CNV =",cnvFilename

### LOAD VCFS

In [None]:
# The sample named patientID+'T' is mapped to 'RNA'; later cells read RNAallele/RNADP/RNAAF,
# so presumably the loader prefixes per-sample columns with this name - TODO confirm in analyseVCF.
dfRNA = loadVCF(PATH,rnaVCFFilename,{patientID+'T':'RNA'},patientID,BED_PATH,BED_FILE_NAME)
# Drop rows whose variantType is 'Missing Genotype'.
dfRNA = dfRNA[dfRNA.variantType  <> 'Missing Genotype']

In [None]:
#LOAD VCFS
# Same sample name, this time mapped to 'melted' (later cells read meltedallele/meltedAF/meltedDP).
dfSomatic = loadVCF(PATH,somaticVCFFilename,{patientID+'T':'melted'},patientID,BED_PATH,BED_FILE_NAME)

In [None]:
# APPLY PON to SOMATICs
#dfSomatic['alt'] = dfSomatic['meltedallele']
#dfSomatic = pd.merge(dfSomatic,dfPON,how='left', on=['chrom','pos','ref','alt'])
#dfSomatic['PON_COUNT'].fillna(0, inplace=True)
#dfSomatic['inPON'] = pd.to_numeric(dfSomatic.PON_COUNT,errors=coerce)>4

### RNA ANALYSIS

In [None]:
# Left-join the RNA calls onto the somatic calls by chrom/pos/ref; somatic rows without an
# RNA match get NaN in the RNA columns.
# NOTE(review): dfSomatic is reassigned from itself, so re-running this cell merges the RNA
# columns a second time.
dfSomatic = pd.merge(dfSomatic,dfRNA[['chrom','pos','ref','RNAallele','RNADP','RNAAF']],how='left', on=['chrom','pos','ref'])

In [None]:
# Unmatched rows get RNADP = -1.0; inRNA is true where RNAAF is present;
# sameAllele is true where the RNA allele equals the melted allele.
dfSomatic['RNADP'].fillna(-1.0, inplace=True)
dfSomatic['inRNA'] = ~pd.isnull(dfSomatic.RNAAF)
dfSomatic['sameAllele'] = (dfSomatic.RNAallele == dfSomatic.meltedallele)

In [None]:
# Row counts by variantType x inRNA, restricted to rows with RNADP > 2.
pd.pivot_table(dfSomatic[dfSomatic.RNADP > 2], values='pos', index=['variantType'], columns=['inRNA'], aggfunc='count',margins=False).fillna("")

In [None]:
# Row counts by variantType x filter for RNA calls with RNADP > 5.
pd.pivot_table(dfRNA[dfRNA.RNADP > 5], values='pos', index=['variantType'], columns=['filter'], aggfunc='count',margins=False).fillna("")

In [None]:

plt.figure(figsize=[18,4])

# Histogram of RNADP for SNPs with 0 < RNADP < 100 and chromFrac < 23.
plt.hist(dfRNA[(dfRNA.RNADP >0)&(dfRNA.RNADP < 100)&(dfRNA.variantType == 'SNP')&(dfRNA.chromFrac<23)]["RNADP"].values,bins=100)
# NOTE(review): no artist above is given a label, so this legend has nothing to show.
plt.legend()
plt.show()

In [None]:
# Empirical CDF: the sorted meltedDP values become the index of an evenly spaced 0..1 series.
ser = dfSomatic[(dfSomatic.variantType == 'SNP')&(dfSomatic.chromFrac<23)]["meltedDP"].sort_values()
cum_dist = np.linspace(0.,1.,len(ser))
ser_cdf = pd.Series(cum_dist, index=ser,name="melted DP")
ser_cdf.plot(drawstyle='steps',legend=True,title="CDF of melted DP (Exome only)",figsize=[13,3],xlim=[50,150],ylim=[0.0,1.0])

In [None]:
# Mean RNADP over all RNA rows; values that cannot be parsed as numbers become NaN.
pd.to_numeric(dfRNA['RNADP'], errors='coerce').mean()

In [None]:
# Same empirical-CDF construction for RNADP (SNPs with RNADP > 0 and chromFrac < 23).
ser = dfRNA[(dfRNA.RNADP >0)&(dfRNA.variantType == 'SNP')&(dfRNA.chromFrac<23)]["RNADP"].sort_values()
cum_dist = np.linspace(0.,1.,len(ser))
ser_cdf = pd.Series(cum_dist, index=ser,name="RNA DP, count="+str(len(ser)))
ser_cdf.plot(drawstyle='steps',legend=True,title="CDF of RNA DP (Exome only)",figsize=[13,3],xlim=[0.0,100],ylim=[0.0,1.0])

### TUMOR PURITY

In [None]:
# NOTE(review): dfBAF and dfCNV are not created by any cell visible in this notebook, and the
# mBAF / mBAFCount columns read from dfCNV below are not added by loadCNVforPatient - these
# cells depend on state from elsewhere (hidden kernel state or cells not shown).
# Plot window in chromPos units: from c+fractionMin to c+fractionMax.
c, fractionMin, fractionMax = 1,0,22
dfSom = dfBAF[(dfBAF.chromPos>c+fractionMin)&(dfBAF.chromPos<c+fractionMax)]
plt.figure(figsize=[18,5])
plt.scatter(dfSom.chromPos, dfSom.BAF)
# NOTE(review): the `b=` keyword of grid and the `normed=` keyword of hist used below are
# not accepted by recent matplotlib releases - confirm the installed version.
plt.grid(b=True, which='both', color='0.65',linestyle='-')
plt.axis([c+fractionMin, c+fractionMax,0, 1.0])
plt.show()
# CNV according to freeC
# Copy-number histogram weighted by regionLength.
plt.figure(figsize=[5,2])
plt.hist(dfCNV['copyNum'].values,cumulative=False, normed=True, bins=20,range=(0,10),weights=dfCNV['regionLength'].values)
plt.show()
# should only use regions with mbaf > x
dftemp = dfCNV[(dfCNV.mBAF>=0.0)&(dfCNV.mBAF<=1.0)&(dfCNV.mBAFCount>20)&(dfCNV.chromPos<=c+fractionMax)&(dfCNV.chromPos>=c+fractionMin)]
plt.figure(figsize=[18,5])
# One mBAF histogram per copy number 0..5, weighted by mBAFCount, all drawn on the same axes.
for CNV in range(0,6):
    plt.hist(dftemp[(dftemp.copyNum==CNV)]['mBAF'].values,cumulative=False, normed=False, bins=50,range=(0.5,1.0),weights=dftemp[(dftemp.copyNum==CNV)]['mBAFCount'].values,stacked = True,rwidth=0.8,label='CNV ='+str(CNV))
    plt.legend()
plt.show()

In [None]:
# NOTE(review): the upper bound 10.63 on mBAF looks like a typo (the mBAF built in createBAF
# is 0.5+abs(BAF-0.5)) - confirm the intended threshold.
dfCNV[(dfCNV.chrom==15)&(dfCNV.mBAFCount>=100)&(dfCNV.copyNum>=1)&(dfCNV.mBAF>=0.50)&(dfCNV.mBAF<=10.63)]

### SOMATIC AF

In [None]:
#dfSomatic[(dfSomatic.inPON==False)&(dfSomatic.consensus==True)&(dfSomatic.bedRegion<>"A")&((dfSomatic.annWorstImpact=='MODERATE')|(dfSomatic.annWorstImpact=='HIGH'))]

In [None]:
# ref x meltedallele counts for consensus SNPs that are not in the PON.
# NOTE(review): the cell that would create dfSomatic.inPON is commented out above, so this
# column only exists if the loader provides it or it is left over in the kernel - confirm.
pd.pivot_table(dfSomatic[(dfSomatic.variantType == 'SNP')&(dfSomatic.consensus == True)&(dfSomatic.inPON == False)], values='pos', index=['ref'], columns=['meltedallele'], aggfunc='count',margins=True).fillna("")

In [None]:
# meltedAF by position for consensus SNPs that are in dbSNP.
c,fractionMin,fractionMax = 1,0,22
dfSom = dfSomatic[(dfSomatic['variantType']=="SNP")&(dfSomatic['inDBSNP']==True)&(dfSomatic['consensus']==True)&(dfSomatic.chromFrac>c+fractionMin)&(dfSomatic.chromFrac<c+fractionMax)]
AFByPosPlot(dfSom.chromFrac, dfSom.meltedAF,c+fractionMin, c+fractionMax,1)

In [None]:
# Same plot for consensus SNPs that are NOT in dbSNP (only the inDBSNP test differs).
c,fractionMin,fractionMax = 1,0,22
dfSom = dfSomatic[(dfSomatic['variantType']=="SNP")&(dfSomatic['inDBSNP']==False)&(dfSomatic['consensus']==True)&(dfSomatic.chromFrac>c+fractionMin)&(dfSomatic.chromFrac<c+fractionMax)]
AFByPosPlot(dfSom.chromFrac, dfSom.meltedAF,c+fractionMin, c+fractionMax,1)

In [None]:
# One empirical CDF of meltedAF per distinct inDBSNP value.
for inDBSNP in dfSomatic.inDBSNP.unique():
    ser = dfSomatic[(dfSomatic.inDBSNP == inDBSNP)&(dfSomatic.chromFrac<23)&(dfSomatic.consensus==True)]["meltedAF"].sort_values()
    cum_dist = np.linspace(0.,1.,len(ser))
    ser_cdf = pd.Series(cum_dist, index=ser,name=inDBSNP)
    ser_cdf.plot(drawstyle='steps',legend=True,title="AF by inDBSNP",figsize=[13,3],xlim=[0.0,0.5],ylim=[0.0,1.0])

In [None]:
# meltedAF against meltedDP; dfSom here is whatever the most recently run cell assigned to it.
AFByPosPlot(dfSom.meltedDP, dfSom.meltedAF,0, 300,1.0,3)

In [None]:
c,fractionMin,fractionMax = 1,0,22
# NOTE(review): dfBAF (and dfCNV further down) are not created by any cell visible here.
dfTemp = dfBAF[(dfBAF.chromPos>c+fractionMin)&(dfBAF.chromPos<c+fractionMax)]
AFByPosPlot(dfTemp.chromPos, dfTemp.BAF,c+fractionMin, c+fractionMax)
# Consensus SNPs that are in neither the PON nor dbSNP, inside the plot window.
dfSom = dfSomatic[(dfSomatic['variantType']=="SNP")&(dfSomatic['inPON']==False)&(dfSomatic['inDBSNP']==False)&(dfSomatic['consensus']==True)&(dfSomatic.chromFrac>c+fractionMin)&(dfSomatic.chromFrac<c+fractionMax)]
AFByPosPlot(dfSom.chromFrac, dfSom.meltedAF,c+fractionMin, c+fractionMax,1)

#CDF
# The series name shows the count and the rounded median of the plotted AF values.
ser = dfSom.meltedAF.sort_values()
cum_dist = np.linspace(0.,1.,len(ser))
ser_cdf = pd.Series(cum_dist, index=ser,name=": c="+str(ser.count())+" m="+str(round(ser.median(),2)))
ser_cdf.plot(drawstyle='steps',legend=True,title=" Allelic Frequency by ",figsize=[15,6],xlim=[0.0,0.7])

#mbaf
# NOTE(review): this expression is not the last statement of the cell, so its result is
# computed and discarded rather than displayed.
dfCNV[(dfCNV.chromPos>c+fractionMin)&(dfCNV.chromPos<c+fractionMax)&(dfCNV.mBAFCount>500)][['chrom','posStart','copyNum','chromPos','regionLength','mBAF','mBAFCount']]

###PEAK
# Gaussian kernel density (bandwidth 0.02) fitted to meltedAF and evaluated at 100 evenly
# spaced points on [0, 1]; findPeaks then reports its turning points.
# NOTE(review): indexing a Series with [:, np.newaxis] is rejected by recent pandas - confirm version.
X = dfSom.meltedAF.sort_values()[:, np.newaxis]
X_plot = np.linspace(0,1,100)[:, np.newaxis]
kde = KernelDensity(kernel='gaussian', bandwidth=0.02).fit(X)
log_dens = kde.score_samples(X_plot)
#pdfChart(log_dens)
findPeaks(log_dens)

### SV ANALYSIS

In [None]:
# Hard-coded patient ID; this replaces the value derived from the somatic VCF file name earlier.
patientID="CPCT02070138"

In [None]:
# First entry (in os.listdir order) of PATH whose name ends in "somaticSV.vcf".
svVCFFilename = findFiles(PATH,"somaticSV.vcf")[0]
print "SV =",svVCFFilename
print "patient =",patientID

In [None]:
# BED file passed to loadVCF for the SV load below, kept separate from BED_PATH/BED_FILE_NAME.
SV_BED_PATH = "/Users/peterpriestley/hmf/analyses/slices/"
SV_BED_FILE_NAME = "cosmicCancerCensus.bed"
#BED_FILE_NAME = "exon6.bed"

In [None]:
# The sample patientID+'T' is mapped to 'tumor' (later cells read tumorAF, tumorallele, ...).
dfSV = loadVCF(PATH,svVCFFilename,{patientID+'T':'tumor'},patientID,SV_BED_PATH,SV_BED_FILE_NAME)

In [None]:
# Counts by variantSubType x variantType, excluding subtype "IGN" and rows with tumorAF < 0.001.
pd.pivot_table(dfSV[(dfSV.variantSubType<>"IGN")&(dfSV.tumorAF>=0.001)], values='id', index=['variantSubType'], columns=['variantType'], aggfunc='count').fillna("")

In [None]:
def findMatchingBNDChromPos(df):
    """Add a BNDMatchPos column holding the text between the brackets of each BND allele.

    For rows whose variantSubType is "BND", the tumorallele string is searched
    for a bracketed span ("]...]" is tried first, then "[...[") and the text
    between the two brackets is stored. All other rows, and BND alleles with no
    complete bracket pair, get "". The frame is modified in place and returned.

    The closing bracket is now searched with the same character as the opening
    one. Previously only "]" was searched, so an allele of the form "[p[t" was
    sliced with an end index of -1 and came back with a trailing "[".
    """
    def _matePosition(allele):
        # Return the text between the first pair of identical brackets, or "" if there is none.
        bracket = "]"
        opening = allele.find(bracket)
        if opening < 0:
            bracket = "["
            opening = allele.find(bracket)
        if opening < 0:
            return ""
        closing = allele.find(bracket, opening+1)
        if closing < 0:
            return ""
        return allele[opening+1:closing]

    # Build the whole column in one pass; only BND rows have their allele parsed,
    # so non-string alleles on other rows are never touched.
    df['BNDMatchPos'] = [
        _matePosition(allele) if subType == "BND" else ""
        for subType, allele in zip(df['variantSubType'], df['tumorallele'])
    ]
    return df

In [None]:
# Adds the BNDMatchPos column; findMatchingBNDChromPos modifies dfSV in place and returns it.
dfSV = findMatchingBNDChromPos(dfSV)

In [None]:
# DEL calls with tumorAF <= 0.01 whose bedRegion is not 'A'.
dfSV[(dfSV.bedRegion<>'A')&(dfSV.tumorAF<=0.01)&(dfSV.variantSubType=="DEL")][['chrom','pos','chromFrac','variantSubType','bedRegion','BNDMatchPos','tumorAF','tumorDP','tumorSGT','tumorSVLenMin','tumorSVLenMax','tumorSVStartMin','tumorSVStartMax']]

In [None]:
# One empirical CDF of tumorAF per variantSubType (rows with chromFrac < 23).
for variantSubType in dfSV.variantSubType.unique():
    ser = dfSV[(dfSV.variantSubType == variantSubType)&(dfSV.chromFrac<23)]["tumorAF"].sort_values()
    cum_dist = np.linspace(0.,1.,len(ser))
    ser_cdf = pd.Series(cum_dist, index=ser,name=variantSubType)
    ser_cdf.plot(drawstyle='steps',legend=True,title="AF",figsize=[13,3],xlim=[0.0,0.8],ylim=[0.0,1.0])

### SCRATCH

In [None]:
import random as rand
def randomiseAF(AF):
    """Return AF shifted by a uniform random offset in [-0.5/100000, +0.5/100000)."""
    jitter = (rand.random()-0.5)/100000
    return AF + jitter
# NOTE(review): dfGermline is not created by any cell visible in this notebook, and no random
# seed is set, so modAF differs from run to run.
dfGermline['modAF']= dfGermline['normalAF'].apply(lambda x: randomiseAF(x))

In [None]:
plt.figure(figsize=[18,4])
# Histogram of the jittered AF for heterozygous (0/1) calls whose chrom is not 'A'.
# NOTE(review): a two-entry label list is passed although only one dataset is plotted.
plt.hist(dfGermline[(dfGermline.chrom <> 'A')&(dfGermline.normalSGT=='0/1')]['modAF'].values,
    cumulative=False, normed=False, bins=50,range=(0.0,1.0),histtype='bar',stacked = False,rwidth=0.8,label=['True','False'])
plt.title('AF of reference sample HC heterozygous calls ')
plt.show()

### RNA

In [None]:
# First entry (in os.listdir order) of PATH whose name ends in "snpSift.vcf"; this replaces
# the rnaVCFFilename chosen earlier.
rnaVCFFilename = findFiles(PATH,"snpSift.vcf")[0]

In [None]:
# Reloads dfRNA, this time mapping the hard-coded sample 'FR13253675' to 'tumor'.
dfRNA = loadVCF(PATH,rnaVCFFilename,{'FR13253675':'tumor'},patientID,BED_PATH,BED_FILE_NAME)

In [None]:
dfRNA = dfRNA[dfRNA.variantType<>"Missing Genotype"]
# Expose the tumor allele and genotype as 'alt' / 'rnaGT' for the merge below.
dfRNA['alt']=dfRNA['tumorallele']
dfRNA['rnaGT']=dfRNA['tumorSGT']

In [None]:
dfRNA[['chrom','pos','ref','alt','rnaGT']].head()

In [None]:
# Left-join the RNA genotype onto the somatic calls by chrom/pos/ref/alt; rows without a
# match get rnaGT '' and inRNA False.
# NOTE(review): the only visible assignment of dfSomatic['alt'] is commented out, so this
# merge key must come from the loader or from leftover kernel state - confirm.
dfSomatic = pd.merge(dfSomatic,dfRNA[['chrom','pos','ref','alt','rnaGT']],how='left', on=['chrom','pos','ref','alt'])
dfSomatic['rnaGT'].fillna('', inplace=True)
dfSomatic['inRNA'] = dfSomatic.rnaGT<>''

In [None]:
# NOTE(review): rnaGT_x / rnaGT_y presumably only exist after the merge cell above has been
# run more than once (overlapping columns get suffixed), and deleting inRNA here makes the
# pivot tables below fail on a straight top-to-bottom run - this looks like a one-off clean-up cell.
del dfSomatic['rnaGT_x']
del dfSomatic['rnaGT_y']
del dfSomatic['inRNA']

In [None]:
# Counts of consensus calls by annWorstEffect x (variantType, inRNA).
pd.pivot_table(dfSomatic[dfSomatic.consensus==True], values='pos', index=['annWorstEffect'], columns=['variantType','inRNA'], aggfunc='count',margins=False).fillna("")

In [None]:
# Same table with inPON as an additional column level.
pd.pivot_table(dfSomatic[dfSomatic.consensus==True], values='pos', index=['annWorstEffect'], columns=['inPON','variantType','inRNA'], aggfunc='count',margins=False).fillna("")

In [None]:
# Number of heterozygous (0/1) missense RNA calls.
len(dfRNA[(dfRNA.annWorstEffect=='missense_variant')&(dfRNA.tumorSGT=='0/1')])

In [None]:
# Consensus somatic calls whose worst annotated impact is MODERATE or HIGH.
dfSomatic[((dfSomatic.annWorstImpact == "MODERATE")|(dfSomatic.annWorstImpact == "HIGH"))&(dfSomatic.consensus==True)][['chrom','pos','meltedAF','inRNA','inPON','inDBSNP','inCOSMIC','id','variantType','annGene','annWorstImpact','annWorstEffect']]

In [None]:
plt.figure(figsize=[18,4])
# Histogram of tumorDP (range 0..50, 51 bins) for missense RNA calls.
# NOTE(review): a two-entry label list is passed although only one dataset is plotted.
plt.hist(dfRNA[dfRNA.annWorstEffect =='missense_variant']['tumorDP'].values,
    cumulative=False, normed=False, bins=51,range=(0,50),histtype='bar',stacked = False,rwidth=0.8,label=['True','False'])
plt.title('RNA Depth PDF - missense variants')
plt.show()