In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
from collections import Counter
from matplotlib import cm
from matplotlib.colors import ListedColormap
from datetime import datetime, date, timedelta

import io
from pysam import VariantFile

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Presentation target for all figures in this notebook: 'talk' or 'paper'.
context = 'paper'

if context != 'talk':
    # Any value other than 'talk' gets the paper styling and black lines.
    sns.set(context="paper", font_scale=4, rc={"lines.linewidth": 2, "legend.fontsize":25})
    plt.rcParams.update({'font.size': 35})
    lc = 'k'
else:
    # Dark slide styling with white lines.
    sns.set(style="darkgrid", context="talk", rc={"lines.linewidth": 2, "legend.fontsize":25})
    plt.style.use("dark_background")
    plt.rcParams.update({"grid.linewidth":0.5, "grid.alpha":0.5,'font.size': 30})
    lc = 'w'

# Both styles end by selecting the same colour palette.
sns.set_palette("deep")

# For one patient

In [None]:
# CRC deep WGS patients
# MSS: 986, 809, 1014, 519, 1279
# MS status not done: 512, 1531

# Patient ID used by the single-patient cells below. It is compared against the
# integer 'patient' columns and spliced into file names with str(patient).
patient = 986

## Tumor burden

In [None]:
# tumor burden
def _read_tumor_burden(path, has_sample_prefix=False):
    """Load one headerless ichorCNA tumor-burden table.

    Column 0 of the file is the sample name and column 1 the tumor-burden
    value. The sample name is split on '_': the first piece is the patient ID
    and the second the sampling date (kept as the raw string from the file).

    Parameters
    ----------
    path : str
        Location of the comma-separated, headerless text file.
    has_sample_prefix : bool, default False
        When True the patient ID is taken from the part of the sample name
        after the first '-' (the layout parsed for the batch1 file).

    Returns
    -------
    pandas.DataFrame
        Indexed by 'patient_date' ("<patient>_<date>") with the columns
        'tumor_burden', 'patient' (str) and 'date' (str), in that order.
    """
    raw = pd.read_csv(path, header=None)
    sample = raw[0]
    if has_sample_prefix:
        patient_id = sample.str.split('-').str[1].str.split('_').str[0]
    else:
        patient_id = sample.str.split('_').str[0]
    sample_date = sample.str.split('_').str[1]
    return (
        raw.rename(columns={1: 'tumor_burden'})
        .assign(patient=patient_id, date=sample_date,
                patient_date=patient_id + '_' + sample_date)
        .drop(columns=0)
        .set_index('patient_date')
    )


tf_file_1 = _read_tumor_burden('../data/tumor_burden/tumor_burden_ichorcna_batch1.txt', has_sample_prefix=True)
tf_file_2 = _read_tumor_burden('../data/tumor_burden/tumor_burden_ichorcna_batch2.txt')
tf_file_3 = _read_tumor_burden('../data/tumor_burden/tumor_burden_ichorcna_deepWGS.txt')

# Combined table: integer patient IDs, parsed dates (ddmmyy) and a fresh
# positional index (the 'patient_date' index is discarded).
tf_file = (
    pd.concat([tf_file_1, tf_file_2, tf_file_3])
    .assign(patient=lambda frame: frame['patient'].astype(int),
            date=lambda frame: pd.to_datetime(frame['date'], format='%d%m%y'))
    .reset_index(drop=True)
)

# Rows of the selected patient, filtered once and reused below.
_tf_selected = tf_file[tf_file['patient'] == patient]

tumorburden_dates = _tf_selected['date'].sort_values().astype(str).unique()
print(tumorburden_dates)
print(tf_file.shape)

# Row counts per input file, their sum, and the row count after concatenation.
print(tf_file_1.shape[0], tf_file_2.shape[0], tf_file_3.shape[0], tf_file_1.shape[0] + tf_file_2.shape[0] + tf_file_3.shape[0], tf_file.shape[0])
# NOTE(review): hard-coded total, presumably the expected per-file row counts
# of the data snapshot used when this was written -- confirm or turn into an assert.
print(65 + 192 + 10)

print(_tf_selected.shape[0])
_tf_selected

## Treatment

In [None]:
# Treatment records: integer patient ID, parsed ISO date, and the 'value'
# column exposed under the name 'treatment'. Only these three columns are kept.
treatment_file = (
    pd.read_csv('../data/treatment/patient_treatment_total_std_201109.txt', sep='\t')
    .assign(patient=lambda frame: frame['patient'].astype(int),
            date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d'))
    .rename(columns={'value': 'treatment'})
    [['patient', 'date', 'treatment']]
)

# Distinct treatment dates recorded for the selected patient (cell output).
treatment_file.loc[treatment_file['patient'] == patient, 'date'].astype(str).unique()

## Plot patient timeline

In [None]:
# Treatment rows of the selected patient, chronologically, dates as text.
treatment_patient = (
    treatment_file[treatment_file['patient'] == patient]
    .sort_values('date')
    .assign(date=lambda rows: rows['date'].astype(str))
)

# Tumor-burden rows of the same patient, prepared the same way.
tf_patient = (
    tf_file[tf_file['patient'] == patient]
    .sort_values('date')
    .assign(date=lambda rows: rows['date'].astype(str))
)

# One timeline table holding both kinds of rows, ordered by the date string.
df_patient = pd.concat([treatment_patient, tf_patient]).sort_values('date')

In [None]:
# Sanity check: tumor-burden dates absent from the merged timeline, then
# merged-timeline dates that have no tumor-burden sample.
print([str(day) for day in tf_patient['date'].unique() if day not in df_patient['date'].unique()])
print([str(day) for day in df_patient['date'].unique() if day not in tf_patient['date'].unique()])

# Sorted, de-duplicated union of every date string on the patient's timeline.
alldates = sorted(set(df_patient['date'].values).union(tf_patient['date'].values))
print(len(alldates))

In [None]:
# Number of days elapsed between the first timeline date and each date.
daytimes = [
    (datetime.strptime(day, '%Y-%m-%d') - datetime.strptime(alldates[0], '%Y-%m-%d')).days
    for day in alldates
]
print(daytimes)

In [None]:
# Patient timeline: treatments on the left (categorical) y-axis and the
# ichorCNA tumor burden on a twin right y-axis, both against the date strings.
# (No separate plt.figure() call: plt.subplots already creates the figure, and
# the extra call only produced an empty second figure.)
fig, ax2 = plt.subplots(figsize=(40,10))

# Treatments, one strip per treatment name.
sns.stripplot(y='treatment', x='date', hue='treatment', data=df_patient, s=10, ax=ax2)
ax2.grid(False)
# Replace the automatic hue legend on the current axes with an empty one.
plt.legend((), ())

# Twin axes sharing the x-axis, used for the tumor-burden fraction.
ax = ax2.twinx()
has_burden = df_patient['tumor_burden'].notna()

# Axes.plot returns a list of Line2D; unpack the single line so it can be used
# as a legend handle (a list is not a valid handle and is silently dropped).
ele0, = ax.plot(df_patient['date'], df_patient['tumor_burden'], lc+'.', marker='s', markersize=10, label='ichorCNA tumor burden')

# Connect only the timepoints that have a tumor-burden value, then redraw the
# markers on top of the connecting line.
ax.plot(df_patient['date'][has_burden], df_patient['tumor_burden'][has_burden], lc+'-', linewidth=4)
ax.plot(df_patient['date'][has_burden], df_patient['tumor_burden'][has_burden], lc+'.', marker='s', markersize=10)
ax.set_ylabel('fraction (tumor burden or VAF)', fontsize=30)
ax.set_ylim(-0.01, 1)

# Single legend built from the real line handle. The previous figure-level
# legend received a list handle and therefore never showed its entry.
ax.legend(handles=[ele0], loc='upper left')

# Label only the dates that have a tumor-burden sample; leave the rest blank.
labels = [ad if ad in tumorburden_dates else '' for ad in alldates]
ax2.set_xticklabels(labels, rotation=90, fontsize=30)
ax.set_xticklabels(labels, rotation=90, fontsize=30)

plt.title('Patient {}: VAF evolution across timepoints'.format(patient))

# Never overwrite a figure that was already exported.
timeline_png = '../figures/low_tb_timepoints/oncosg_timeline_patient'+str(patient)+'.png'
if not os.path.exists(timeline_png):
    fig.savefig(timeline_png, bbox_inches='tight')

plt.show()

In [None]:
# check tf dates

# Dates whose tumor-burden estimate is exactly zero.
date_lowtftimepoints = list(tf_patient.loc[tf_patient['tumor_burden'] == 0, 'date'].unique())
if not date_lowtftimepoints:
    print('no zero ichorCNA estimate tumor burden')
    lowest_burden = min(tf_patient['tumor_burden'])
    print('min tumor burden is {}'.format(lowest_burden))
    # Fall back to the date(s) of the minimum, but only when it is below 0.1.
    if lowest_burden < 0.1:
        date_lowtftimepoints = list(tf_patient.loc[tf_patient['tumor_burden'] == lowest_burden, 'date'].unique())
print(date_lowtftimepoints)
tf_patient[['date', 'tumor_burden']].T

## Mutation calls

In [None]:
def read_vcf(path):
    """Read the tabular part of a VCF file into a DataFrame.

    Lines starting with '##' are skipped; the remaining text (the '#CHROM ...'
    header line and the records) is parsed as tab-separated values. The eight
    fixed columns are read as strings, except POS which is read as an integer.
    The '#CHROM' column is returned under the name 'CHROM'.

    Parameters
    ----------
    path : str
        Path to a plain-text VCF file.

    Returns
    -------
    pandas.DataFrame
        One row per record line of the file.
    """
    column_types = {
        '#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
        'QUAL': str, 'FILTER': str, 'INFO': str,
    }
    with open(path, 'r') as handle:
        table_text = ''.join(line for line in handle if not line.startswith('##'))
    table = pd.read_csv(io.StringIO(table_text), dtype=column_types, sep='\t')
    return table.rename(columns={'#CHROM': 'CHROM'})

In [None]:
# 226 - panel
# Wide variant table of the selected patient: annotation columns plus one
# column per sequenced sample.
mutation_df = pd.read_excel('../data/variant_calls/226 PANEL VARIANTS CLASSIFICSATION EXCEL/CCG_226_'+str(patient)+'_reGeno.VEP.readable_tiers.xls')
print(mutation_df.columns)

col = ['#CHROM', 'POS', 'REF', 'ALT', 'GENE', 'TIERS']
targeted_lowtf = []
for it in date_lowtftimepoints:
    # 'YYYY-MM-DD' -> 'ddmmyy', the date format embedded in the sample column names.
    year, month, day = it.split('-')[0], it.split('-')[1], it.split('-')[-1]
    stem = 'CCG_226_'+str(patient)+'.'+day+month+year[-2:]
    # Prefer the '<stem>.P' column and fall back to the bare stem; a pattern is
    # accepted only when it identifies exactly one column.
    for pattern in (stem+'.P', stem):
        # regex=False: the '.' in sample names must match literally, not as a
        # regex wildcard; na=False treats non-string column labels as misses.
        is_hit = mutation_df.columns.str.contains(pattern, regex=False, na=False)
        if is_hit.sum() == 1:
            idx = is_hit.tolist().index(True)
            col.append(mutation_df.columns[idx])
            # Record the low tumor-burden date (as 'YYYY-MM-DD') that has panel data.
            targeted_lowtf.append(str(pd.to_datetime(mutation_df.columns[idx].split('.')[1], format='%d%m%y')).split(' ')[0])
            break
print(col)
mutation_df = mutation_df[col]
# Constant helper column: after stack()/unstack(-2) below it becomes the single
# value column 'hello' (renamed 'new_hello'), i.e. a wide -> long reshape with
# one row per (variant, sample column).
mutation_df.insert(loc = 6,
      column = 'helper',
      value = 'hello')
mutation_lowtftimepoints_226 = (mutation_df.set_index(col[:6]+['helper'])
                               .stack()
                               .unstack(-2)
                               .ffill(axis=1)
                               .bfill(axis=1, downcast='infer')
                               .add_prefix('new_')
                               .reset_index()
                               .rename({'level_6': 'date'}, axis=1))

# The sample column name carries the date as its second '.'-separated field.
mutation_lowtftimepoints_226['date'] = mutation_lowtftimepoints_226['date'].str.split('.').str[1]
mutation_lowtftimepoints_226['date'] = pd.to_datetime(mutation_lowtftimepoints_226['date'], format='%d%m%y').astype(str)
# Cell text is parsed as '<a> / <b> = ...'; VAF is a divided by b.
foo2 = lambda x: pd.Series(float(x.split(' / ')[0])/float(x.split(' / ')[1].split(' = ')[0]))
mutation_lowtftimepoints_226['VAF'] = mutation_lowtftimepoints_226['new_hello'].apply(foo2)
mutation_lowtftimepoints_226.drop('new_hello', axis=1, inplace=True)
#mutation_lowtftimepoints_226 = mutation_lowtftimepoints_226[mutation_lowtftimepoints_226['TIERS'] == 'Trusted']
print(mutation_lowtftimepoints_226.shape)
print(targeted_lowtf)
mutation_lowtftimepoints_226.head()

In [None]:
# MCP panel
# Wide variant table of the selected patient for the MCP panel.
mutation_df = pd.read_csv('../data/variant_calls/final_readablefile/CCG_MCP_'+str(patient)+'_reGeno.VEP.readable.txt', sep='\t')
print(mutation_df.columns)

col = ['#CHROM', 'POS', 'REF', 'ALT', 'GENE', 'TIERS']
# Files without a TIERS column get every variant labelled 'Trusted'.
if 'TIERS' not in mutation_df.columns:
    mutation_df['TIERS'] = 'Trusted'

for it in date_lowtftimepoints:
    # 'YYYY-MM-DD' -> 'ddmmyy', the date format embedded in the sample column names.
    year, month, day = it.split('-')[0], it.split('-')[1], it.split('-')[-1]
    stem = 'CCG_MCP_'+str(patient)+'.'+day+month+year[-2:]
    # Prefer the '<stem>.P' column and fall back to the bare stem. The matched
    # column is appended for BOTH patterns: previously the append lived in the
    # fallback branch only, so a '.P' match was never selected and a stale or
    # undefined idx was appended when nothing matched.
    for pattern in (stem+'.P', stem):
        # regex=False: match the '.' in sample names literally.
        is_hit = mutation_df.columns.str.contains(pattern, regex=False, na=False)
        if is_hit.sum() == 1:
            idx = is_hit.tolist().index(True)
            print(idx)
            col.append(mutation_df.columns[idx])
            break
print(col)
mutation_df = mutation_df[col]
# Constant helper column: after stack()/unstack(-2) below it becomes the single
# value column 'hello' (renamed 'new_hello'), i.e. a wide -> long reshape with
# one row per (variant, sample column).
mutation_df.insert(loc = 6,
      column = 'helper',
      value = 'hello')
mutation_lowtftimepoints_MCP = (mutation_df.set_index(col[:6]+['helper'])
                               .stack()
                               .unstack(-2)
                               .ffill(axis=1)
                               .bfill(axis=1, downcast='infer')
                               .add_prefix('new_')
                               .reset_index()
                               .rename({'level_6': 'date'}, axis=1))

# The sample column name carries the date as its second '.'-separated field.
mutation_lowtftimepoints_MCP['date'] = mutation_lowtftimepoints_MCP['date'].str.split('.').str[1]
mutation_lowtftimepoints_MCP['date'] = pd.to_datetime(mutation_lowtftimepoints_MCP['date'], format='%d%m%y').astype(str)
# Cell text is parsed as '<a> / <b> = ...'; VAF is a divided by b.
foo2 = lambda x: pd.Series(float(x.split(' / ')[0])/float(x.split(' / ')[1].split(' = ')[0]))
mutation_lowtftimepoints_MCP['VAF'] = mutation_lowtftimepoints_MCP['new_hello'].apply(foo2)
mutation_lowtftimepoints_MCP.drop('new_hello', axis=1, inplace=True)
print(mutation_lowtftimepoints_MCP.shape)
mutation_lowtftimepoints_MCP.head()

In [None]:
# Variant table used by the summary below: the 226 panel only. The
# commented-out line is the alternative that stacks both panels.
mutation_lowtftimepoints = mutation_lowtftimepoints_226
#mutation_lowtftimepoints = pd.concat([mutation_lowtftimepoints_226, mutation_lowtftimepoints_MCP])
print(mutation_lowtftimepoints.shape)
print(mutation_lowtftimepoints['date'].unique())
# Cell output: only the rows whose TIERS value is 'Trusted'.
mutation_lowtftimepoints[mutation_lowtftimepoints['TIERS'] == 'Trusted']

In [None]:
# Per low tumor-burden timepoint: row counts and median VAF of the panel calls.
# NOTE(review): the '# mutated genes' entries count table rows with VAF != 0,
# not distinct GENE values -- confirm this is the intended meaning.
lowtftimepoints_dict = {'date': date_lowtftimepoints,
                        'median VAF': [],
                        '# mutated genes' : [],
                        'median VAF within mutated genes': [],
                        '# mutated genes TRUSTED': [],
                        'median VAF within mutated genes TRUSTED': []
                       }

# The loop variable is deliberately not called `date`: that name is imported
# from datetime at the top of the notebook and would be overwritten.
for timepoint in date_lowtftimepoints:
    # Each subset is computed once and reused for its count and its median.
    at_timepoint = mutation_lowtftimepoints[mutation_lowtftimepoints['date'] == timepoint]
    mutated = at_timepoint[at_timepoint['VAF'] != 0]
    mutated_trusted = mutated[mutated['TIERS'] == 'Trusted']

    lowtftimepoints_dict['# mutated genes'].append(mutated.shape[0])
    lowtftimepoints_dict['# mutated genes TRUSTED'].append(mutated_trusted.shape[0])
    # np.median of an empty selection yields nan.
    lowtftimepoints_dict['median VAF'].append(np.median(at_timepoint['VAF'].values))
    lowtftimepoints_dict['median VAF within mutated genes'].append(np.median(mutated['VAF'].values))
    lowtftimepoints_dict['median VAF within mutated genes TRUSTED'].append(np.median(mutated_trusted['VAF'].values))


lowtftimepoints_pd = pd.DataFrame.from_dict(lowtftimepoints_dict).set_index('date')
# Cell output: only the low tumor-burden dates that have a 226-panel sample.
lowtftimepoints_pd.loc[targeted_lowtf]

In [None]:
# Cell output: the low tumor-burden dates for which a 226-panel sample column was found.
targeted_lowtf

## Plot mutations timeline

In [None]:
# Wide MCP-panel table: annotation columns plus one column per sequenced sample.
mutation_df_MCP = pd.read_csv('../data/variant_calls/final_readablefile/CCG_MCP_'+str(patient)+'_reGeno.VEP.readable.txt', sep='\t')
print(mutation_df_MCP.columns)

# Files without a TIERS column get every variant labelled 'Trusted'.
if 'TIERS' not in mutation_df_MCP.columns:
    mutation_df_MCP['TIERS'] = 'Trusted'

# Annotation columns followed by every sample column of this patient found
# after the first six columns.
col = ['#CHROM', 'POS', 'REF', 'ALT', 'GENE', 'TIERS']
col.extend(name for name in list(mutation_df_MCP.columns[6:]) if name.startswith('CCG_MCP_'+str(patient)))
print(col)

mutation_df_MCP = mutation_df_MCP[col]
# Constant helper column used by the stack()/unstack(-2) reshape: the stacked
# sample values end up in one column named 'hello' (then 'new_hello').
mutation_df_MCP.insert(loc=6, column='helper', value='hello')
mutations_acrosstime_MCP = (
    mutation_df_MCP.set_index(col[:6]+['helper'])
    .stack()
    .unstack(-2)
    .ffill(axis=1)
    .bfill(axis=1, downcast='infer')
    .add_prefix('new_')
    .reset_index()
    .rename({'level_6': 'date'}, axis=1)
)

# Second '.'-separated field of the sample column name is the date (ddmmyy).
sample_day = mutations_acrosstime_MCP['date'].str.split('.').str[1]
mutations_acrosstime_MCP['date'] = pd.to_datetime(sample_day, format='%d%m%y').astype(str)


def foo2(x):
    """Parse '<a> / <b> = ...' and return a / b wrapped in a one-element Series."""
    return pd.Series(float(x.split(' / ')[0])/float(x.split(' / ')[1].split(' = ')[0]))


mutations_acrosstime_MCP['VAF'] = mutations_acrosstime_MCP['new_hello'].apply(foo2)
mutations_acrosstime_MCP = mutations_acrosstime_MCP.drop('new_hello', axis=1)
mutations_acrosstime_MCP = mutations_acrosstime_MCP.rename(columns={'TIER': 'TIERS'})

# Dates as rows, genes as columns; the first VAF is kept per (gene, date).
mutations_acrosstime_MCP = mutations_acrosstime_MCP.pivot_table(values='VAF', index='GENE', columns='date', aggfunc='first').T
mutations_acrosstime_MCP.head()

In [None]:
# Wide 226-panel table: annotation columns plus one column per sequenced sample.
mutation_df_226 = pd.read_excel('../data/variant_calls/226 PANEL VARIANTS CLASSIFICSATION EXCEL/CCG_226_'+str(patient)+'_reGeno.VEP.readable_tiers.xls')
print(mutation_df_226.columns)

# Keep the 'Trusted' variants only.
mutation_df_226 = mutation_df_226[mutation_df_226["TIERS"] == 'Trusted']

# Annotation columns followed by every sample column of this patient found
# after the first six columns.
col = ['#CHROM', 'POS', 'REF', 'ALT', 'GENE', 'TIERS']
col.extend(name for name in list(mutation_df_226.columns[6:]) if name.startswith('CCG_226_'+str(patient)))
print(col)

mutation_df_226 = mutation_df_226[col]
# Constant helper column used by the stack()/unstack(-2) reshape: the stacked
# sample values end up in one column named 'hello' (then 'new_hello').
mutation_df_226.insert(loc=6, column='helper', value='hello')
mutations_acrosstime_226 = (
    mutation_df_226.set_index(col[:6]+['helper'])
    .stack()
    .unstack(-2)
    .ffill(axis=1)
    .bfill(axis=1, downcast='infer')
    .add_prefix('new_')
    .reset_index()
    .rename({'level_6': 'date'}, axis=1)
)

# Second '.'-separated field of the sample column name is the date (ddmmyy).
sample_day = mutations_acrosstime_226['date'].str.split('.').str[1]
mutations_acrosstime_226['date'] = pd.to_datetime(sample_day, format='%d%m%y').astype(str)


def foo2(x):
    """Parse '<a> / <b> = ...' and return a / b wrapped in a one-element Series."""
    return pd.Series(float(x.split(' / ')[0])/float(x.split(' / ')[1].split(' = ')[0]))


mutations_acrosstime_226['VAF'] = mutations_acrosstime_226['new_hello'].apply(foo2)
mutations_acrosstime_226 = mutations_acrosstime_226.drop('new_hello', axis=1)

# Dates as rows, genes as columns; the first VAF is kept per (gene, date).
mutations_acrosstime_226 = mutations_acrosstime_226.pivot_table(values='VAF', index='GENE', columns='date', aggfunc='first').T
mutations_acrosstime_226.head()

In [None]:
# Date x gene VAF table plotted below: the 226 panel only. The commented-out
# line is the alternative that places both panels side by side.
#mutations_acrosstime = pd.concat([mutations_acrosstime_226, mutations_acrosstime_MCP], axis=1)
mutations_acrosstime = mutations_acrosstime_226
print(mutations_acrosstime.shape)
# Cell output: the VAF rows at the low tumor-burden dates that have panel data.
mutations_acrosstime.loc[targeted_lowtf]

In [None]:
# Patient timeline with mutations: treatments on the left (categorical) y-axis;
# tumor burden and per-gene VAF on a twin right y-axis.
# (No separate plt.figure() call: plt.subplots already creates the figure, and
# the extra call only produced an empty second figure.)
fig, ax2 = plt.subplots(figsize=(40,10))

# Treatments, one strip per treatment name.
sns.stripplot(y='treatment', x='date', hue='treatment', data=df_patient, s=10, ax=ax2)
ax2.grid(False)
# Replace the automatic hue legend on the current axes with an empty one.
plt.legend((), ())

# Twin axes sharing the x-axis, used for the tumor-burden / VAF fractions.
ax = ax2.twinx()
has_burden = df_patient['tumor_burden'].notna()

# Axes.plot returns a list of Line2D; unpack the single line so it can be used
# as a legend handle.
ele0, = ax.plot(df_patient['date'], df_patient['tumor_burden'], lc+'.', marker='s', markersize=10, label='ichorCNA tumor burden')

# Connect only the timepoints that have a tumor-burden value, then redraw the
# markers on top of the connecting line.
ax.plot(df_patient['date'][has_burden], df_patient['tumor_burden'][has_burden], lc+'-', linewidth=4)
ax.plot(df_patient['date'][has_burden], df_patient['tumor_burden'][has_burden], lc+'.', marker='s', markersize=10)
ax.set_ylabel('fraction (tumor burden or VAF)', fontsize=30)
ax.set_ylim(-0.01, 1)

# Keep only the panel dates that exist on the patient's timeline; the row
# filter is computed once instead of once per gene.
on_timeline = mutations_acrosstime[mutations_acrosstime.index.isin(df_patient['date'].values)]
xacrosstime = list(on_timeline.index)

eles = [ele0]
collist = ['r', 'b', 'g', 'c', 'm', 'y', 'tab:pink', 'tab:orange', 'tab:olive', 'tab:purple', 'grey']
# Line style encodes the tier of the first plotted variant. Any other tier, or
# an empty table, falls back to a solid line instead of leaving lstype undefined.
first_tier = mutation_df_226["TIERS"].iloc[0] if len(mutation_df_226) else 'Trusted'
lstype = {'Trusted': '-', 'LowEvidence': '--'}.get(first_tier, '-')

for gi, gene in enumerate(mutations_acrosstime.columns):
    yacrosstime = on_timeline[gene].tolist()
    gene_colour = collist[gi%len(collist)]
    # Labelled line first (this is the legend handle), unlabelled markers on top.
    elei, = ax.plot(xacrosstime, yacrosstime, color=gene_colour, ls=lstype, linewidth=2, label=gene)
    ax.plot(xacrosstime, yacrosstime, color=gene_colour, ls=lstype, marker='D', markersize=10)
    eles.append(elei)

# Single legend built from the real line handles; each handle carries its own
# label. The previous figure-level legend was given list handles and a
# `list + pandas.Index` label expression (not a list concatenation), so it
# could not show the intended entries.
ax.legend(handles=eles, loc='upper left')

# Label only the dates that have a tumor-burden sample; leave the rest blank.
labels = [ad if ad in tumorburden_dates else '' for ad in alldates]
ax2.set_xticklabels(labels, rotation=90, fontsize=30)
ax.set_xticklabels(labels, rotation=90, fontsize=30)

# Blue tick labels: low tumor-burden timepoints.
for dltbt in date_lowtftimepoints:
    print(dltbt)
    if dltbt not in labels:
        # Nothing to colour when the date has no visible tick label.
        continue
    ax.get_xticklabels()[labels.index(dltbt)].set_color('blue')
    ax2.get_xticklabels()[labels.index(dltbt)].set_color('blue')

# Red tick labels: dates listed for this patient in the deep-WGS tumor-burden
# table (tf_file_3 keeps patient IDs as strings and dates as ddmmyy).
listdeepwgs = list(pd.to_datetime(tf_file_3[tf_file_3['patient'] == str(patient)]['date'], format='%d%m%y').astype(str).values)
for ldw in listdeepwgs:
    if ldw not in labels:
        continue
    ax.get_xticklabels()[labels.index(ldw)].set_color('red')
    ax2.get_xticklabels()[labels.index(ldw)].set_color('red')

plt.title('Patient {}: VAF evolution across timepoints'.format(patient))

# Never overwrite a figure that was already exported.
mutations_png = '../figures/low_tb_timepoints/oncosg_timeline_mutations_patient'+str(patient)+'.png'
if not os.path.exists(mutations_png):
    fig.savefig(mutations_png, bbox_inches='tight')

plt.show()

# For all patients

In [None]:
# Patient IDs that have a 226-panel file: the third '_'-separated field of
# every file name starting with 'CCG', in directory-listing order.
listpatients = []
for panel_file in os.listdir('../data/variant_calls/226 PANEL VARIANTS CLASSIFICSATION EXCEL/'):
    if panel_file.startswith('CCG'):
        listpatients.append(int(panel_file.split('_')[2]))
print(listpatients)
print(len(listpatients))

In [None]:
# ---------------------------------------------------------------------------
# Inputs shared by every patient. They do not depend on `patient`, so they are
# read once here instead of once per loop iteration.
# ---------------------------------------------------------------------------
_burden_tables = []
for _burden_path, _has_sample_prefix in (
    ('../data/tumor_burden/tumor_burden_ichorcna_batch1.txt', True),
    ('../data/tumor_burden/tumor_burden_ichorcna_batch2.txt', False),
    ('../data/tumor_burden/tumor_burden_ichorcna_deepWGS.txt', False),
):
    # Headerless file: column 0 is the sample name, column 1 the tumor burden.
    _raw = pd.read_csv(_burden_path, header=None)
    # For batch1 the patient ID is read from the part after the first '-'.
    _id_source = _raw[0].str.split('-').str[1] if _has_sample_prefix else _raw[0]
    _patient_id = _id_source.str.split('_').str[0]
    _sample_date = _raw[0].str.split('_').str[1]
    _burden_tables.append(
        _raw.rename(columns={1: 'tumor_burden'})
        .assign(patient=_patient_id, date=_sample_date,
                patient_date=_patient_id + '_' + _sample_date)
        .drop(columns=0)
        .set_index('patient_date')
    )
# Per-file tables keep string patient IDs and raw ddmmyy date strings.
tf_file_1, tf_file_2, tf_file_3 = _burden_tables

# Combined table: integer patient IDs, parsed dates, positional index.
tf_file = (
    pd.concat([tf_file_1, tf_file_2, tf_file_3])
    .assign(patient=lambda frame: frame['patient'].astype(int),
            date=lambda frame: pd.to_datetime(frame['date'], format='%d%m%y'))
    .reset_index(drop=True)
)

# Treatment records: integer patient ID, parsed date, 'value' -> 'treatment'.
treatment_file = (
    pd.read_csv('../data/treatment/patient_treatment_total_std_201109.txt', sep='\t')
    .assign(patient=lambda frame: frame['patient'].astype(int),
            date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d'))
    .rename(columns={'value': 'treatment'})
    [['patient', 'date', 'treatment']]
)

# Colour cycle for the per-gene VAF lines, and line style per variant tier.
collist = ['r', 'b', 'g', 'c', 'm', 'y', 'tab:pink', 'tab:orange', 'tab:olive', 'tab:purple', 'grey']
_tier_linestyle = {'Trusted': '-', 'LowEvidence': '--'}

for patient in listpatients:
    print(patient)

    # ---- per-patient timeline table -------------------------------------
    tumorburden_dates = tf_file[tf_file['patient'] == patient]['date'].sort_values().astype(str).unique()

    treatment_patient = (
        treatment_file[treatment_file['patient'] == patient]
        .sort_values('date')
        .assign(date=lambda rows: rows['date'].astype(str))
    )
    tf_patient = (
        tf_file[tf_file['patient'] == patient]
        .sort_values('date')
        .assign(date=lambda rows: rows['date'].astype(str))
    )
    df_patient = pd.concat([treatment_patient, tf_patient]).sort_values('date')

    # Sorted union of all date strings, and days elapsed since the first one.
    alldates = sorted(set(df_patient['date'].values) | set(tf_patient['date'].values))
    daytimes = [(datetime.strptime(day, '%Y-%m-%d') - datetime.strptime(alldates[0], '%Y-%m-%d')).days
                for day in alldates]

    # ---- low tumor-burden timepoints ------------------------------------
    # Dates with a zero estimate; otherwise the date(s) of the minimum, but
    # only when that minimum is below 0.1.
    date_lowtftimepoints = list(tf_patient[tf_patient['tumor_burden'] == 0]['date'].unique())
    if not date_lowtftimepoints:
        print('no zero ichorCNA estimate tumor burden')
        lowest_burden = min(tf_patient['tumor_burden'])
        print('min tumor burden is {}'.format(lowest_burden))
        if lowest_burden < 0.1:
            date_lowtftimepoints = list(tf_patient[tf_patient['tumor_burden'] == lowest_burden]['date'].unique())
    print(date_lowtftimepoints)

    # Tick labels shared by both figures: only dates with a tumor-burden sample.
    labels = [ad if ad in tumorburden_dates else '' for ad in alldates]
    has_burden = df_patient['tumor_burden'].notna()

    # ---- figure 1: treatments + tumor burden ----------------------------
    # plt.subplots creates the figure; the former extra plt.figure() call only
    # produced an empty figure per patient.
    fig, ax2 = plt.subplots(figsize=(40,10))

    sns.stripplot(y='treatment', x='date', hue='treatment', data=df_patient, s=10, ax=ax2)
    ax2.grid(False)
    # Replace the automatic hue legend on the current axes with an empty one.
    plt.legend((), ())

    # Twin axes sharing the x-axis, used for the tumor-burden fraction.
    ax = ax2.twinx()
    # Axes.plot returns a list of Line2D; unpack it to get a usable legend handle.
    ele0, = ax.plot(df_patient['date'], df_patient['tumor_burden'], lc+'.', marker='s', markersize=10, label='ichorCNA tumor burden')

    ax.plot(df_patient['date'][has_burden], df_patient['tumor_burden'][has_burden], lc+'-', linewidth=4)
    ax.plot(df_patient['date'][has_burden], df_patient['tumor_burden'][has_burden], lc+'.', marker='s', markersize=10)
    ax.set_ylabel('fraction (tumor burden or VAF)', fontsize=30)
    ax.set_ylim(-0.01, 1)

    # Single legend built from the real handle (the former figure-level legend
    # was given a list handle and could not show its entry).
    ax.legend(handles=[ele0], loc='upper left')

    ax2.set_xticklabels(labels, rotation=90, fontsize=30)
    ax.set_xticklabels(labels, rotation=90, fontsize=30)

    plt.title('Patient {}: VAF evolution across timepoints'.format(patient))

    # Never overwrite a timeline figure that was already exported.
    timeline_png = '../figures/low_tb_timepoints/oncosg_timeline_patient'+str(patient)+'.png'
    if not os.path.exists(timeline_png):
        fig.savefig(timeline_png, bbox_inches='tight')

    # Display, then release the figure so memory does not grow with the number
    # of patients.
    plt.show()
    plt.close(fig)

    # ---- 226 panel: variants at the low tumor-burden timepoints ---------
    # The Excel file is read once and reused for the across-time table below.
    panel_raw = pd.read_excel('../data/variant_calls/226 PANEL VARIANTS CLASSIFICSATION EXCEL/CCG_226_'+str(patient)+'_reGeno.VEP.readable_tiers.xls')

    col = ['#CHROM', 'POS', 'REF', 'ALT', 'GENE', 'TIERS']

    for it in date_lowtftimepoints:
        # 'YYYY-MM-DD' -> 'ddmmyy', the date format embedded in the column names.
        year, month, day = it.split('-')[0], it.split('-')[1], it.split('-')[-1]
        stem = 'CCG_226_'+str(patient)+'.'+day+month+year[-2:]
        # Prefer the '<stem>.P' column, fall back to the bare stem; a pattern is
        # accepted only when it identifies exactly one column.
        for pattern in (stem+'.P', stem):
            # regex=False: match the '.' in sample names literally.
            is_hit = panel_raw.columns.str.contains(pattern, regex=False, na=False)
            if is_hit.sum() == 1:
                idx = is_hit.tolist().index(True)
                col.append(panel_raw.columns[idx])
                break

    mutation_df = panel_raw[col]
    # Constant helper column used by the stack()/unstack(-2) reshape below.
    mutation_df.insert(loc = 6,
          column = 'helper',
          value = 'hello')
    # More than 7 columns means at least one sample column was matched.
    if mutation_df.shape[1] > 7:
        # Wide -> long: one row per (variant, sample column); the stacked
        # values end up in the single column 'new_hello'.
        mutation_lowtftimepoints_226 = (mutation_df.set_index(col[:6]+['helper'])
                                       .stack()
                                       .unstack(-2)
                                       .ffill(axis=1)
                                       .bfill(axis=1, downcast='infer')
                                       .add_prefix('new_')
                                       .reset_index()
                                       .rename({'level_6': 'date'}, axis=1))

        # Second '.'-separated field of the sample column name is the date.
        mutation_lowtftimepoints_226['date'] = mutation_lowtftimepoints_226['date'].str.split('.').str[1]
        mutation_lowtftimepoints_226['date'] = pd.to_datetime(mutation_lowtftimepoints_226['date'], format='%d%m%y').astype(str)
        # Cell text is parsed as '<a> / <b> = ...'; VAF is a divided by b.
        foo2 = lambda x: pd.Series(float(x.split(' / ')[0])/float(x.split(' / ')[1].split(' = ')[0]))
        mutation_lowtftimepoints_226['VAF'] = mutation_lowtftimepoints_226['new_hello'].apply(foo2)
        mutation_lowtftimepoints_226.drop('new_hello', axis=1, inplace=True)

        mutation_lowtftimepoints = mutation_lowtftimepoints_226

        # Per low tumor-burden timepoint: row counts and median VAF.
        lowtftimepoints_dict = {'date': date_lowtftimepoints,
                            'median VAF': [],
                            '# mutated genes' : [],
                            'median VAF within mutated genes': [],
                            '# mutated genes TRUSTED': [],
                            'median VAF within mutated genes TRUSTED': []
                           }

        # The loop variable is deliberately not called `date`: that name is
        # imported from datetime at the top of the notebook.
        for timepoint in date_lowtftimepoints:
            at_timepoint = mutation_lowtftimepoints[mutation_lowtftimepoints['date'] == timepoint]
            mutated = at_timepoint[at_timepoint['VAF'] != 0]
            mutated_trusted = mutated[mutated['TIERS'] == 'Trusted']
            lowtftimepoints_dict['# mutated genes'].append(mutated.shape[0])
            lowtftimepoints_dict['# mutated genes TRUSTED'].append(mutated_trusted.shape[0])
            # np.median of an empty selection yields nan.
            lowtftimepoints_dict['median VAF'].append(np.median(at_timepoint['VAF'].values))
            lowtftimepoints_dict['median VAF within mutated genes'].append(np.median(mutated['VAF'].values))
            lowtftimepoints_dict['median VAF within mutated genes TRUSTED'].append(np.median(mutated_trusted['VAF'].values))

        lowtftimepoints_pd = pd.DataFrame.from_dict(lowtftimepoints_dict).set_index('date')

        # ---- 226 panel: VAF of every sample across time ------------------
        # Use the 'Trusted' variants when there are any, else the 'LowEvidence' ones.
        col = ['#CHROM', 'POS', 'REF', 'ALT', 'GENE', 'TIERS']
        mutation_df_226 = panel_raw[panel_raw["TIERS"] == 'Trusted']
        if mutation_df_226.shape[0] == 0:
            mutation_df_226 = panel_raw[panel_raw["TIERS"] == 'LowEvidence']
        print(mutation_df_226.shape)
        if mutation_df_226.shape[0] > 0:

            # Every sample column of this patient found after the first six columns.
            col.extend(name for name in list(mutation_df_226.columns[6:]) if name.startswith('CCG_226_'+str(patient)))
            mutation_df_226 = mutation_df_226[col]
            mutation_df_226.insert(loc = 6,
                  column = 'helper',
                  value = 'hello')
            mutations_acrosstime_226 = (mutation_df_226.set_index(col[:6]+['helper'])
                                           .stack()
                                           .unstack(-2)
                                           .ffill(axis=1)
                                           .bfill(axis=1, downcast='infer')
                                           .add_prefix('new_')
                                           .reset_index()
                                           .rename({'level_6': 'date'}, axis=1))
            mutations_acrosstime_226['date'] = mutations_acrosstime_226['date'].str.split('.').str[1]
            mutations_acrosstime_226['date'] = pd.to_datetime(mutations_acrosstime_226['date'], format='%d%m%y').astype(str)
            mutations_acrosstime_226['VAF'] = mutations_acrosstime_226['new_hello'].apply(foo2)
            mutations_acrosstime_226.drop('new_hello', axis=1, inplace=True)
            # Dates as rows, genes as columns; first VAF kept per (gene, date).
            mutations_acrosstime_226 = mutations_acrosstime_226.pivot_table(values='VAF', index='GENE', columns='date', aggfunc='first').T

            mutations_acrosstime = mutations_acrosstime_226

            # ---- figure 2: treatments + tumor burden + per-gene VAF ------
            fig, ax2 = plt.subplots(figsize=(40,10))

            sns.stripplot(y='treatment', x='date', hue='treatment', data=df_patient, s=10, ax=ax2)
            ax2.grid(False)
            plt.legend((), ())

            ax = ax2.twinx()
            ele0, = ax.plot(df_patient['date'], df_patient['tumor_burden'], lc+'.', marker='s', markersize=10, label='ichorCNA tumor burden')

            ax.plot(df_patient['date'][has_burden], df_patient['tumor_burden'][has_burden], lc+'-', linewidth=4)
            ax.plot(df_patient['date'][has_burden], df_patient['tumor_burden'][has_burden], lc+'.', marker='s', markersize=10)
            ax.set_ylabel('fraction (tumor burden or VAF)', fontsize=30)
            ax.set_ylim(-0.01, 1)

            # Keep only the panel dates present on the patient's timeline; the
            # row filter is computed once instead of once per gene.
            on_timeline = mutations_acrosstime[mutations_acrosstime.index.isin(df_patient['date'].values)]
            xacrosstime = list(on_timeline.index)

            eles = [ele0]
            # Solid lines for 'Trusted', dashed for 'LowEvidence'.
            lstype = _tier_linestyle.get(mutation_df_226["TIERS"].iloc[0], '-')

            for gi, gene in enumerate(mutations_acrosstime.columns):
                yacrosstime = on_timeline[gene].tolist()
                gene_colour = collist[gi%len(collist)]
                # Labelled line first (legend handle), unlabelled markers on top.
                elei, = ax.plot(xacrosstime, yacrosstime, color=gene_colour, ls=lstype, linewidth=2, label=gene)
                ax.plot(xacrosstime, yacrosstime, color=gene_colour, ls=lstype, marker='D', markersize=10)
                eles.append(elei)

            # Single legend from the real handles; each handle carries its own
            # label. The former figure-level legend used list handles and a
            # `list + pandas.Index` label expression, which is not a list
            # concatenation.
            ax.legend(handles=eles, loc='upper left')

            ax2.set_xticklabels(labels, rotation=90, fontsize=30)
            ax.set_xticklabels(labels, rotation=90, fontsize=30)

            # Blue tick labels: low tumor-burden timepoints.
            for dltbt in date_lowtftimepoints:
                if dltbt not in labels:
                    continue
                ax.get_xticklabels()[labels.index(dltbt)].set_color('blue')
                ax2.get_xticklabels()[labels.index(dltbt)].set_color('blue')

            # Red tick labels: dates listed for this patient in the deep-WGS
            # table (string patient IDs, ddmmyy dates).
            listdeepwgs = list(pd.to_datetime(tf_file_3[tf_file_3['patient'] == str(patient)]['date'], format='%d%m%y').astype(str).values)
            for ldw in listdeepwgs:
                if ldw not in labels:
                    continue
                ax.get_xticklabels()[labels.index(ldw)].set_color('red')
                ax2.get_xticklabels()[labels.index(ldw)].set_color('red')

            plt.title('Patient {}: VAF evolution across timepoints'.format(patient))

            # This figure is always (re)written, unlike the timeline-only one.
            fig.savefig('../figures/low_tb_timepoints/oncosg_timeline_mutations_patient'+str(patient)+'.png', bbox_inches='tight')

            plt.show()
            plt.close(fig)