In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
from collections import Counter
from matplotlib import cm
from matplotlib.colors import ListedColormap

import io
from pysam import VariantFile
from matplotlib_venn import venn3
import venn

# Tumor burden

In [None]:
# tumor burden
# CCG file: column 0 is the sample id, column 1 the ichorCNA estimate.
# patient = last '_' token of the text before the first '.'; date = text
# between the first and second '.'.
tf_file_1 = pd.read_csv('../data/tumor_burden/tumor_burden_ichorcna_CCG.txt', header=None)
ccg_parts = tf_file_1[0].str.split('.')
tf_file_1 = (
    tf_file_1
    .assign(
        patient=ccg_parts.str[0].str.split('_').str[-1],
        date=ccg_parts.str[1],
    )
    .assign(patient_date=lambda frame: frame['patient'] + '_' + frame['date'])
    .set_index('patient_date')
    .drop(columns=0)
    .rename(columns={1: 'tumor_burden'})
)

# CRC file: patient = first '_' token of the text after the first '-';
# date = second '_' token of the whole sample id.
tf_file_2 = pd.read_csv('../data/tumor_burden/tumor_burden_ichorcna_CRC.txt', header=None)
crc_ids = tf_file_2[0]
tf_file_2 = (
    tf_file_2
    .assign(
        patient=crc_ids.str.split('-').str[1].str.split('_').str[0],
        date=crc_ids.str.split('_').str[1],
    )
    .assign(patient_date=lambda frame: frame['patient'] + '_' + frame['date'])
    .set_index('patient_date')
    .drop(columns=0)
    .rename(columns={1: 'tumor_burden'})
)

print(tf_file_1.shape, tf_file_2.shape)

# Stack both cohorts, type the patient id and the DDMMYY date, and go back to
# a plain positional index.
tf_file = (
    pd.concat([tf_file_1, tf_file_2])
    .assign(
        patient=lambda frame: frame['patient'].astype(int),
        date=lambda frame: pd.to_datetime(frame['date'], format='%d%m%y'),
    )
    .reset_index()
    .drop(columns='patient_date')
)
tf_file

# Treatment

In [None]:
# Tab-separated treatment table: keep patient id (int), date (datetime) and
# the 'value' column renamed to 'treatment'.
treatment_file = (
    pd.read_csv('../data/treatment/patient_treatment_total_std_201109.txt', sep='\t')
    .assign(
        patient=lambda frame: frame['patient'].astype(int),
        date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d'),
    )
    .rename(columns={'value': 'treatment'})
    .loc[:, ['patient', 'date', 'treatment']]
)
treatment_file

## Patient 986

In [None]:
# Deep-WGS sample dates for this patient (presumably DDMMYY like the ichorCNA
# sample ids -- confirm); they are not used in this cell.
date_deepwgs_1 = '100215'
date_deepwgs_2 = '261016'

patient = 986

# Treatment events and ichorCNA estimates of this patient, oldest first, with
# the date column cast to strings.
treatment_patient = treatment_file[treatment_file['patient'] == patient].sort_values('date')
treatment_patient['date'] = treatment_patient['date'].astype(str)
tf_patient = tf_file[tf_file['patient'] == patient].sort_values('date')
tf_patient['date'] = tf_patient['date'].astype(str)

# One timeline holding both kinds of rows. Treatment rows have no tumor burden
# of their own, so they inherit the most recent earlier estimate.
# `.ffill()` replaces the deprecated `fillna(method='ffill')`.
df_patient = pd.concat([treatment_patient, tf_patient]).sort_values('date')
df_patient['tumor_burden'] = df_patient['tumor_burden'].ffill()

In [None]:
# The stray `plt.figure()` was removed: it only produced an extra empty figure
# because `plt.subplots` below creates the figure that is actually drawn on.
sns.set(font_scale=2)
fig, ax = plt.subplots(figsize=(50,10))
# Tumor burden carried along every event date of the patient timeline.
ax.plot(df_patient['date'], df_patient['tumor_burden'], 'k-', linewidth=4)
# Black squares at the ichorCNA sample dates. 'ks' sets the marker once; the
# previous "'k.'" + marker='s' combination defined it twice.
ax.plot(tf_patient['date'], tf_patient['tumor_burden'], 'ks', markersize=20)
ax.set_ylabel('tumor burden', fontsize = 25.0)
labels = ax.axes.get_xticklabels()
ax.axes.set_xticklabels(labels, rotation=45, size = 20.0)
# twin object for two different y-axis on the same plot
ax2=ax.twinx()
# make a plot with different y-axis using second axis object
sns.stripplot(y='treatment', x='date', hue='treatment', data=df_patient, s=20, ax=ax2) 
# Empty legend: hides the per-treatment hue legend.
plt.legend((), ())
# The figure is only written when the file does not exist yet, so an existing
# PNG is never refreshed by re-running this cell.
if not os.path.exists('../figures/oncosg_timeline_patient'+str(patient)+'.png'):
    plt.savefig('../figures/oncosg_timeline_patient'+str(patient)+'.png')

In [None]:
# check tf dates: list the sample dates and tumor-burden estimates held in
# tf_patient for the patient prepared above
tf_patient[['date', 'tumor_burden']]

## Patient 809

In [None]:
# Deep-WGS sample dates for this patient (presumably DDMMYY -- confirm); they
# match the two manually added estimates below.
date_deepwgs_1 = '110914'
date_deepwgs_2 = '030915'

patient = 809
treatment_patient = treatment_file[treatment_file['patient'] == patient].sort_values('date')
treatment_patient['date'] = treatment_patient['date'].astype(str)
tf_patient = tf_file[tf_file['patient'] == patient].sort_values('date')

# data from oncosg added manually
# Built as a typed frame and concatenated: `DataFrame.append` no longer exists
# in pandas 2.x, and appending plain date strings turned the datetime column
# into a mixed object column whose string form was inconsistent
# ('2014-09-11' vs '2015-01-01 00:00:00').
manual_tf = pd.DataFrame({
    'date': pd.to_datetime(['2014-09-11', '2015-09-03']),
    'patient': [809, 809],
    'tumor_burden': [0.3577, 0.4626],
})
tf_patient = pd.concat([tf_patient, manual_tf], ignore_index=True).sort_values('date')

tf_patient['date'] = tf_patient['date'].astype(str)
# One timeline with treatment and tumor-burden rows; treatment rows inherit
# the most recent earlier estimate.
df_patient = pd.concat([treatment_patient, tf_patient]).sort_values('date')
df_patient['tumor_burden'] = df_patient['tumor_burden'].ffill()

In [None]:
# The stray `plt.figure()` was removed: it only produced an extra empty figure
# because `plt.subplots` below creates the figure that is actually drawn on.
sns.set(font_scale=2)
fig, ax = plt.subplots(figsize=(50,10))
# Tumor burden carried along every event date of the patient timeline.
ax.plot(df_patient['date'], df_patient['tumor_burden'], 'k-', linewidth=4)
# Black squares at the sample dates. 'ks' sets the marker once; the previous
# "'k.'" + marker='s' combination defined it twice.
ax.plot(tf_patient['date'], tf_patient['tumor_burden'], 'ks', markersize=20)
ax.set_ylabel('tumor burden', fontsize = 25.0)
ax.set_ylim([0, 0.6])
labels = ax.axes.get_xticklabels()
ax.axes.set_xticklabels(labels, rotation=45, size = 20.0)
# twin object for two different y-axis on the same plot
ax2=ax.twinx()
# make a plot with different y-axis using second axis object
sns.stripplot(y='treatment', x='date', hue='treatment', data=df_patient, s=20, ax=ax2) 
# Empty legend: hides the per-treatment hue legend.
plt.legend((), ())
# The figure is only written when the file does not exist yet, so an existing
# PNG is never refreshed by re-running this cell.
if not os.path.exists('../figures/oncosg_timeline_patient'+str(patient)+'.png'):
    plt.savefig('../figures/oncosg_timeline_patient'+str(patient)+'.png')

In [None]:
# check tf dates: list the sample dates and tumor-burden estimates held in
# tf_patient for the patient prepared above
tf_patient[['date', 'tumor_burden']]

Conclusion: We can only find samples with a very low estimated tumor burden in patient 986.

Let's manually check that those samples do not contain any mutations.

# Mutation calls for patient 986
# Recent file

In [None]:
def read_vcf(path):
    """Load the record table of a plain-text VCF file into a DataFrame.

    Lines starting with '##' are skipped; the next line (starting with
    '#CHROM') becomes the header. The eight fixed VCF columns are read as
    strings, except POS which is read as int. The '#CHROM' column is returned
    under the name 'CHROM'.
    """
    column_types = {
        '#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
        'QUAL': str, 'FILTER': str, 'INFO': str,
    }
    with open(path, 'r') as handle:
        body = ''.join(row for row in handle if not row.startswith('##'))
    table = pd.read_csv(io.StringIO(body), dtype=column_types, sep='\t')
    return table.rename(columns={'#CHROM': 'CHROM'})

In [None]:
patient = 986
# Re-genotyped, VEP-annotated calls of patient 986; keep only the records
# whose FILTER field is PASS.
vcf_calls = read_vcf('../data/variant_calls/986_reGeno.VEP.vcf')
passing = vcf_calls['FILTER'] == 'PASS'
mutation_df = vcf_calls.loc[passing]
print(mutation_df.columns)

In [None]:
# Keep the fixed VCF columns plus the per-sample columns of the low tumor
# fraction time points.
# NOTE(review): `date_lowtftimepoints` is only assigned in the "Older file"
# section further down, so on a fresh kernel that section has to run first.
col = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']
for it in list(date_lowtftimepoints):
    # 'YYYY-MM-DD' -> sample column name '986_DDMMYY'
    parts = it.split('-')
    aux = '986_' + parts[-1] + parts[1] + parts[0][-2:]
    if aux in mutation_df.columns:
        col.append(aux)
    else:
        # time point without a column in the VCF
        print(aux)
print(col)
# Explicit copy: this frame gets columns overwritten later, which on a plain
# slice of mutation_df triggers SettingWithCopyWarning.
mutation_lowtftimepoints = mutation_df[col].copy()
mutation_lowtftimepoints

In [None]:
# For every call in sample 986_011015, print the last ':'-separated value of
# the sample field divided by the second-to-last one.
for sample_field in mutation_lowtftimepoints['986_011015']:
    fields = sample_field.split(':')
    print(float(fields[-1]) / float(fields[-2]))

In [None]:
def _sample_field_ratio(sample_field):
    """Return last / second-to-last ':'-separated value of a VCF sample field."""
    fields = sample_field.split(':')
    return float(fields[-1]) / float(fields[-2])


def foo2(x):
    """Legacy wrapper kept under its old name: the ratio wrapped in a Series."""
    return pd.Series(_sample_field_ratio(x))


# Replace the raw sample fields of the two time points by their ratio.
# - the scalar helper is mapped directly instead of building one pd.Series per
#   row inside `apply`;
# - columns that are already numeric are skipped, so re-running the cell no
#   longer fails on `float.split`;
# - `assign` returns a new frame, so nothing is written into a slice.
for sample_col in ('986_011015', '986_110615'):
    if not pd.api.types.is_numeric_dtype(mutation_lowtftimepoints[sample_col]):
        mutation_lowtftimepoints = mutation_lowtftimepoints.assign(
            **{sample_col: mutation_lowtftimepoints[sample_col].map(_sample_field_ratio)}
        )
mutation_lowtftimepoints

# Mutation calls for patient 986
# Older file

In [None]:
patient = 986
# Tab-separated table of variant calls; report the number of distinct genes
# in the whole table, then keep this patient's rows only.
mutation_file = pd.read_csv('../data/variant_calls/total_variants_mmpm_puradj.txt', sep='\t')
print(mutation_file['GENE'].nunique())
mutation_file = mutation_file.loc[mutation_file['pid'].eq(patient)]
mutation_file.groupby('GENE').count()

In [None]:
# Calls from time points whose ichorCNA tumor fraction is exactly 0, without
# the 'Germline' and 'Artifact' tiers.
zero_tf = mutation_file['ichor_tf'] == 0
mutation_lowtftimepoints = mutation_file.loc[zero_tf, ['date', 'GENE', 'REF', 'ALT', 'VAF', 'TIERS']]
excluded_tier = mutation_lowtftimepoints['TIERS'].isin(['Germline', 'Artifact'])
mutation_lowtftimepoints = mutation_lowtftimepoints[~excluded_tier]
mutation_lowtftimepoints

In [None]:
# Distinct dates among the zero-tumor-fraction calls.
date_lowtftimepoints = mutation_lowtftimepoints['date'].unique()
# Display the array (the original line misspelled the name and raised NameError).
date_lowtftimepoints

In [None]:
# Per-date summary of the zero-tumor-fraction calls: number of calls with a
# non-zero VAF (all tiers / 'Trusted' only) and the median VAF over all calls,
# over non-zero calls and over non-zero 'Trusted' calls.
unique_dates = mutation_lowtftimepoints['date'].unique()
lowtftimepoints_dict = {
    'date': unique_dates,
    'median VAF': [],
    '# mutated genes': [],
    'median VAF within mutated genes': [],
    '# mutated genes TRUSTED': [],
    'median VAF within mutated genes TRUSTED': [],
}

for date in unique_dates:
    calls_on_date = mutation_lowtftimepoints[mutation_lowtftimepoints['date'] == date]
    mutated = calls_on_date[calls_on_date['VAF'] != 0]
    mutated_trusted = mutated[mutated['TIERS'] == 'Trusted']

    median_all = np.median(calls_on_date['VAF'].values)
    median_mutated = np.median(mutated['VAF'].values)
    median_trusted = np.median(mutated_trusted['VAF'].values)

    lowtftimepoints_dict['# mutated genes'].append(mutated.shape[0])
    lowtftimepoints_dict['# mutated genes TRUSTED'].append(mutated_trusted.shape[0])
    lowtftimepoints_dict['median VAF'].append(median_all)
    lowtftimepoints_dict['median VAF within mutated genes'].append(median_mutated)
    lowtftimepoints_dict['median VAF within mutated genes TRUSTED'].append(median_trusted)

lowtftimepoints_pd = pd.DataFrame.from_dict(lowtftimepoints_dict).set_index('date')
lowtftimepoints_pd

In [None]:
# 'Trusted' calls with a non-zero VAF at the zero-tumor-fraction time points,
# grouped date by date, plus the list of distinct genes they hit.
trusted_nonzero = mutation_lowtftimepoints[
    (mutation_lowtftimepoints['VAF'] != 0) & (mutation_lowtftimepoints['TIERS'] == 'Trusted')
]

genes2checkmanually = []
frames_per_date = []
for date in mutation_lowtftimepoints['date'].unique():
    print(date)
    calls_on_date = trusted_nonzero[trusted_nonzero['date'] == date]
    frames_per_date.append(calls_on_date)
    genes2checkmanually.extend(calls_on_date['GENE'].values)

# Concatenate once instead of re-copying the growing frame on every iteration;
# stays None when there is no date at all, as before.
results_df = pd.concat(frames_per_date) if frames_per_date else None
genes2checkmanually = list(np.unique(genes2checkmanually))
results_df

In [None]:
# print genes to check manually
genes2checkmanually

In [None]:
# need to get gene location in hg19
# For every 'Trusted', non-zero-VAF call at a zero-tumor-fraction time point,
# look up the matching rows (same date and gene) of mutation_file and keep
# their genomic coordinates.
print(mutation_lowtftimepoints['date'].unique())
position_columns = ['date', '#CHROM', 'POS', 'REF', 'ALT', 'GENE']
trusted_nonzero = mutation_lowtftimepoints[
    (mutation_lowtftimepoints['VAF'] != 0) & (mutation_lowtftimepoints['TIERS'] == 'Trusted')
]

matches = []
for date in mutation_lowtftimepoints['date'].unique():
    for _, call in trusted_nonzero[trusted_nonzero['date'] == date].iterrows():
        same_call = (mutation_file['date'] == call['date']) & (mutation_file['GENE'] == call['GENE'])
        matches.append(mutation_file.loc[same_call, position_columns])

# Single concat instead of growing the frame in the loop. With no match at all
# the result is an empty frame with the expected columns (previously gene_df
# stayed None and the next line raised AttributeError).
if matches:
    gene_df = pd.concat(matches)
else:
    gene_df = mutation_file.iloc[0:0][position_columns]
# The same position can be hit on several dates: drop the date, then de-duplicate.
gene_df = gene_df.drop(columns='date').drop_duplicates()
gene_df

In [None]:
# Write chromosome, position, gene, ref and alt (comma-separated, no header,
# no index) to the position file.
gene_df.to_csv(
    '../data/variant_calls/positionfile.txt',
    columns=['#CHROM', 'POS', 'GENE', 'REF', 'ALT'],
    header=False,
    index=False,
)

In [None]:
# location
# /mnt/projects/zwpoh/cfDNA/bulk/ccg/ccg_batch2/lpwgs/hg19_bam/patient/986*.bam

# 5 mutations across time

In [None]:
# 'Trusted' calls in the five genes followed over time.
genes_of_interest = ['EPHB2', 'APC', 'SOX9', 'PIK3CA', 'TP53']
in_gene_set = mutation_file['GENE'].isin(genes_of_interest)
mutations_acrosstime = mutation_file.loc[in_gene_set, ['date', 'GENE', 'REF', 'ALT', 'VAF', 'TIERS']]
mutations_acrosstime = mutations_acrosstime[mutations_acrosstime['TIERS'] == 'Trusted']
mutations_acrosstime

In [None]:
# Reshape the long call table into one row per date and one column per gene,
# holding the first VAF reported for that gene on that date. The plotting
# cells below index the frame as mutations_acrosstime['APC'] and read the dates
# from its index; a bare transpose of the long table provides neither.
# The guard makes the cell safe to re-run: once pivoted there is no 'GENE'
# column left, so the frame is not reshaped a second time.
if 'GENE' in mutations_acrosstime.columns:
    mutations_acrosstime = mutations_acrosstime.pivot_table(
        values='VAF', index='date', columns='GENE', aggfunc='first'
    )
mutations_acrosstime

In [None]:
# Deep-WGS sample dates for this patient (not used in this cell).
date_deepwgs_1 = '100215'
date_deepwgs_2 = '261016'

patient = 986

# Treatment events of the patient, oldest first, dates as strings.
treatment_patient = treatment_file.loc[treatment_file['patient'] == patient].sort_values('date')
treatment_patient = treatment_patient.assign(date=treatment_patient['date'].astype(str))

# ichorCNA estimates of the patient, oldest first, dates as strings.
tf_patient = tf_file.loc[tf_file['patient'] == patient].sort_values('date')
tf_patient = tf_patient.assign(date=tf_patient['date'].astype(str))

# Combined timeline. Unlike the earlier timeline cells, tumor burden is NOT
# forward-filled here: treatment rows keep a missing tumor burden.
df_patient = (
    pd.concat([treatment_patient, tf_patient])
    .sort_values('date')
    .assign(date=lambda frame: frame['date'].astype(str))
    .sort_values(by='date')
)

In [None]:
def interpolate_gaps(values, limit=None):
    """
    Fill gaps (non-finite entries) of a 1-D sequence by linear interpolation
    over the positional index.

    Parameters
    ----------
    values : array-like
        1-D numeric data; NaN, inf and None entries are treated as gaps.
    limit : int, optional
        When given, a gap position is reset to NaN if it and the `limit`
        positions after it are all gaps (positions past the end of the array
        count as gaps). Gaps before the first / after the last finite value
        are otherwise filled with that nearest finite value.

    Returns
    -------
    numpy.ndarray
        Float array of the same length. If the input has no finite value at
        all (including empty input) an all-NaN array is returned instead of
        raising.
    """
    # Coerce to float so that object arrays holding None/NaN are accepted.
    values = np.asarray(values, dtype=float)
    positions = np.arange(values.size)
    valid = np.isfinite(values)

    # np.interp needs at least one sample point.
    if not valid.any():
        return np.full(values.shape, np.nan)

    filled = np.interp(positions, positions[valid], values[valid])

    if limit is not None:
        invalid = ~valid
        # After this loop invalid[k] is True only if positions k..k+limit are
        # all gaps.
        for n in range(1, limit+1):
            invalid[:-n] &= invalid[n:]
        filled[invalid] = np.nan

    return filled

# Linearly interpolated copy of the patient's tumor-burden column (no gap
# size limit); df_patient itself is left untouched.
btf = interpolate_gaps(df_patient['tumor_burden'].to_numpy(copy=True), limit=None)

In [None]:
# to solve x-axis issue


In [None]:
# One theme call: a second `sns.set(font_scale=2)` would reset style and
# context back to the seaborn defaults. (The stray `plt.figure()` that only
# produced an empty figure was removed as well.)
sns.set(style="ticks", context="talk", font_scale=2)
fig, ax2 = plt.subplots(figsize=(40,10))

# Treatment events as a categorical strip plot, one x position per date.
sns.stripplot(y='treatment', x='date', hue='treatment', data=df_patient, s=10, ax=ax2)
# Empty legend: hides the per-treatment hue legend.
plt.legend((), ())
labels = ax2.axes.get_xticklabels()

# Only the dates with an ichorCNA sample get an x tick. The positions are
# computed here rather than taken from `posnewlabels` / `newlabels`, which are
# only defined by a cell further down.
# NOTE(review): assumes seaborn puts the k-th distinct value of
# df_patient['date'] (in order of appearance) at x = k -- confirm.
plotted_dates = list(pd.unique(df_patient['date']))
sampled_dates = set(tf_patient['date'].values)
tick_positions = [k for k, day in enumerate(plotted_dates) if day in sampled_dates]
tick_dates = [plotted_dates[k] for k in tick_positions]

# twin object for two different y-axis on the same plot
ax=ax2.twinx()
# Plot every row first (rows without tumor burden draw nothing) so the shared
# categorical x-axis registers all dates in df_patient order. 'ks' = black
# squares; the marker is no longer defined twice.
ax.plot(df_patient['date'], df_patient['tumor_burden'], 'ks', markersize=10)
ax2.set_xticklabels([])
ax2.set_xticks(tick_positions)
ax2.set_xticklabels(tick_dates, rotation=90)

# Line through the measured tumor-burden values only.
measured = df_patient[~df_patient['tumor_burden'].isna()]
ax.plot(measured['date'], measured['tumor_burden'], 'k-', linewidth=4)
ax.set_ylabel('ichorCNA tumor burden', fontsize = 25.0)
ax.set_ylim(-0.01, 1)

ax.set_xticklabels([])
ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_dates)

# VAF of each followed gene, restricted to dates that exist on the timeline so
# that no new x categories are created.
on_timeline = mutations_acrosstime.index.isin(df_patient['date'].values)
vaf_on_timeline = mutations_acrosstime.loc[on_timeline]
xacrosstime = list(vaf_on_timeline.index)
gene_colors = (('APC', 'r'), ('PIK3CA', 'b'), ('EPHB2', 'g'), ('SOX9', 'c'), ('TP53', 'm'))
for gene, color in gene_colors:
    ax.plot(xacrosstime, vaf_on_timeline[gene].values, color=color, linestyle='-',
            linewidth=2, marker='D', markersize=10, label=gene)

# A single legend built from the line labels. The former `fig.legend` call was
# handed the lists returned by `ax.plot`, which are not valid legend handles.
ax.legend(loc='upper left')

ax.set_title('Patient 986: VAF evolution across timepoints')
plt.show()

In [None]:
# Dark variant of the VAF timeline figure.
plt.style.use("dark_background")
# One theme call: a second `sns.set(font_scale=2)` would reset style and
# context to the seaborn defaults and undo the grid line width set below.
# (The stray `plt.figure()` that only produced an empty figure was removed.)
sns.set(style="darkgrid", context="talk", font_scale=2)
plt.rcParams.update({"grid.linewidth":0.5, "grid.alpha":0.5})
fig, ax2 = plt.subplots(figsize=(40,10))

# Treatment events as a categorical strip plot, one x position per date.
sns.stripplot(y='treatment', x='date', hue='treatment', data=df_patient, s=10, ax=ax2)
# Empty legend: hides the per-treatment hue legend.
plt.legend((), ())
labels = ax2.axes.get_xticklabels()

# Only the dates with an ichorCNA sample get an x tick. The positions are
# computed here rather than taken from `posnewlabels` / `newlabels`, which are
# only defined by a cell further down.
# NOTE(review): assumes seaborn puts the k-th distinct value of
# df_patient['date'] (in order of appearance) at x = k -- confirm.
plotted_dates = list(pd.unique(df_patient['date']))
sampled_dates = set(tf_patient['date'].values)
tick_positions = [k for k, day in enumerate(plotted_dates) if day in sampled_dates]
tick_dates = [plotted_dates[k] for k in tick_positions]

# twin object for two different y-axis on the same plot
ax=ax2.twinx()
# Plot every row first (rows without tumor burden draw nothing) so the shared
# categorical x-axis registers all dates in df_patient order. 'ks' = black
# squares; the marker is no longer defined twice.
ax.plot(df_patient['date'], df_patient['tumor_burden'], 'ks', markersize=10)
ax2.set_xticklabels([])
ax2.set_xticks(tick_positions)
ax2.set_xticklabels(tick_dates, rotation=90)

# Line through the measured tumor-burden values only.
measured = df_patient[~df_patient['tumor_burden'].isna()]
ax.plot(measured['date'], measured['tumor_burden'], 'k-', linewidth=4)
ax.set_ylabel('ichorCNA tumor burden', fontsize = 25.0)
ax.set_ylim(-0.01, 1)

ax.set_xticklabels([])
ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_dates)

# VAF of each followed gene, restricted to dates that exist on the timeline so
# that no new x categories are created.
on_timeline = mutations_acrosstime.index.isin(df_patient['date'].values)
vaf_on_timeline = mutations_acrosstime.loc[on_timeline]
xacrosstime = list(vaf_on_timeline.index)
gene_colors = (('APC', 'r'), ('PIK3CA', 'b'), ('EPHB2', 'g'), ('SOX9', 'c'), ('TP53', 'm'))
for gene, color in gene_colors:
    ax.plot(xacrosstime, vaf_on_timeline[gene].values, color=color, linestyle='-',
            linewidth=2, marker='D', markersize=10, label=gene)

# A single legend built from the line labels. The former `fig.legend` call was
# handed the lists returned by `ax.plot`, which are not valid legend handles.
ax.legend(loc='upper left')

ax.set_title('Patient 986: VAF evolution across timepoints')
plt.show()

In [None]:
# From the x tick labels captured in `labels` by the timeline plot, keep those
# whose date has an ichorCNA sample: `posnewlabels` receives the tick
# positions, `newlabels` matching Text objects.
# The position and text are read through the Text API instead of being parsed
# out of the object's repr, and the new labels are built with `plt.Text`
# rather than `plt.text`, which drew every label onto the current figure.
newlabels = []
posnewlabels = []
sampled_dates = set(tf_patient['date'].values)
for tick_label in labels:
    x, y = tick_label.get_position()
    d = tick_label.get_text()
    if d in sampled_dates:
        posnewlabels.append(int(x))
        newlabels.append(plt.Text(int(x), y, d))
for ln in newlabels:
    print(ln)

In [None]:
# Print the date of every retained tick label; a date that is absent from the
# index of mutations_acrosstime is printed a second time.
known_dates = list(mutations_acrosstime.index)
for tick_label in newlabels:
    # third comma-separated field of the Text repr, without quotes and ')'
    date_text = str(tick_label).split(',')[2][2:-2]
    print(date_text)
    if date_text not in known_dates:
        print(date_text)

In [None]:
# Index labels of mutations_acrosstime as a plain list.
mutations_acrosstime.index.tolist()

In [None]:
# Index labels of mutations_acrosstime and how many there are.
index_labels = mutations_acrosstime.index.tolist()
print(index_labels)
len(index_labels)

In [None]:
# number of tick labels retained in `newlabels`
len(newlabels)

In [None]:
# Quick look at the TP53 column of mutations_acrosstime against its index.
# Drawn on explicit axes with labels and a title so the figure stands alone;
# `plt.show()` keeps the tick-object repr out of the cell output.
fig, ax = plt.subplots()
ax.plot(mutations_acrosstime.index, mutations_acrosstime['TP53'], 'm-', linewidth=4, label='TP53')
ax.set_xlabel('date')
ax.set_ylabel('VAF')
ax.set_title('TP53')
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Distinct genes with calls on chr22.
mutation_file.loc[mutation_file['#CHROM'] == 'chr22', 'GENE'].unique()

In [None]:
# Distinct (chromosome, position, ref, alt) combinations called on chr22.
mutation_file.loc[mutation_file['#CHROM'] == 'chr22', ['#CHROM', 'POS', 'REF', 'ALT']].drop_duplicates()