# Pooled healthy exploration

In [None]:
%load_ext autoreload
%autoreload 2

import io
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
import warnings
from sklearn.metrics import precision_recall_curve, f1_score, average_precision_score
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix
from tqdm.notebook import tqdm

# Set the working directory: if the current directory name does not end with
# 'cfdna_snv_benchmark' (presumably the repository root), step one level up so
# that the relative 'data/...' paths used below resolve.
already_in_repo = os.getcwd().endswith('cfdna_snv_benchmark')
if not already_in_repo:
    os.chdir('../')
print(f'Current working directory: {os.getcwd()}')

from utils.config import Config
from utils.calltable import *

In [None]:
# exome bed file
# Load the exome target intervals and expand every interval into per-base
# 'chrom_pos' keys, so that variant positions can later be matched with `isin`.
exomebed = pd.read_csv(os.path.join('data', 'extdata', 'xgen-exome-research-panel-targets_nochr.bed'), sep='\t', header=None)
exomebed.columns = ['chrom', 'startpos', 'endpos', 'gene', 'col4', 'col5']
exomebed_itemlist = []
# itertuples avoids creating one Series per row, which is what makes iterrows
# slow on large tables.
for row in tqdm(exomebed.itertuples(index=False), total=exomebed.shape[0]):
    chrom = str(row.chrom)
    # BED intervals are 0-based and half-open: [startpos, endpos) covers the
    # 1-based positions startpos+1 .. endpos. These keys are compared with
    # 1-based VCF POS values, so the 0-based start itself must not be emitted
    # (it is the base just before the target).
    exomebed_itemlist.extend(chrom + '_' + str(i) for i in range(row.startpos + 1, row.endpos + 1))
print(len(exomebed_itemlist))

In [None]:
# Preview the first rows of the exome BED table.
exomebed.head()

In [None]:
# Report how many per-base keys exist before and after removing duplicates.
print(len(exomebed_itemlist))
# Going through a set drops duplicates; the order of the resulting list is arbitrary.
exomebed_itemlist_unique = list(set(exomebed_itemlist))
print(len(exomebed_itemlist_unique))

In [None]:
# Show a few of the 'chrom_pos' keys to check their format.
exomebed_itemlist_unique[:5]

# 1. Patient 986 mutations

In [None]:
# Load the matched-tissue ensemble calls and keep only the records whose
# INFO 'CALLERS=' entry lists exactly 4 comma-separated callers.
tissue_df = read_vcf(os.path.join('data', 'matchedtissue', 'NCC_CRC-986_100215-T1W-ensemble-annotated.vcf.gz'))
n_callers = tissue_df["INFO"].str.split('CALLERS=').str[1].str.split(';').str[0].str.split(',').str.len()
# .copy() makes the filtered table independent, so the column assignments
# below do not write into a slice of the unfiltered frame.
tissue_df = tissue_df[n_callers == 4].copy()
print(tissue_df.shape)
# 'chrom_pos' stays as a column; 'chrom_pos_ref_alt' becomes the index.
tissue_df['chrom_pos'] = tissue_df['CHROM'].astype('str').str.cat(tissue_df['POS'].astype('str'), sep="_")
tissue_df['chrom_pos_ref_alt'] = tissue_df['chrom_pos'].str.cat(tissue_df['REF'].astype('str'), sep='_').str.cat(tissue_df['ALT'].astype('str'), sep='_')
tissue_df.set_index('chrom_pos_ref_alt', inplace=True)
tissue_df.head()

In [None]:
# Restrict the tissue calls to positions present in the expanded exome
# target list. The result is the filtered 'chrom_pos' Series; its index keeps
# the row labels of tissue_df.
tissue_exome_df = tissue_df['chrom_pos'][tissue_df['chrom_pos'].isin(exomebed_itemlist_unique)]

mutpos_exome_tissue = list(tissue_exome_df.index)
print(len(mutpos_exome_tissue))

# A bare expression in the middle of a cell is evaluated and discarded, so the
# shape has to be printed explicitly to be shown.
print(tissue_exome_df.shape)
tissue_exome_df.head()

In [None]:
# Load the NCC_CRC-986_100215_CW_T ensemble calls and keep only the records
# whose INFO 'CALLERS=' entry lists exactly 5 comma-separated callers.
highctdna_df = read_vcf(os.path.join('data', 'initialsamples',  'bcbio', 'NCC_CRC-986_100215_CW_T-ensemble-annotated.vcf.gz'))
n_callers = highctdna_df["INFO"].str.split('CALLERS=').str[1].str.split(';').str[0].str.split(',').str.len()
# .copy() makes the filtered table independent, so the column assignments
# below do not write into a slice of the unfiltered frame.
highctdna_df = highctdna_df[n_callers == 5].copy()
print(highctdna_df.shape)
# 'chrom_pos' stays as a column; 'chrom_pos_ref_alt' becomes the index.
highctdna_df['chrom_pos'] = highctdna_df['CHROM'].astype('str').str.cat(highctdna_df['POS'].astype('str'), sep="_")
highctdna_df['chrom_pos_ref_alt'] = highctdna_df['chrom_pos'].str.cat(highctdna_df['REF'].astype('str'), sep='_').str.cat(highctdna_df['ALT'].astype('str'), sep='_')
highctdna_df.set_index('chrom_pos_ref_alt', inplace=True)


# Mutation keys (chrom_pos_ref_alt) of the retained calls.
mutpos_hightctdna = list(highctdna_df.index)
print(len(mutpos_hightctdna))

highctdna_df.head()

In [None]:
# Keep the 'chrom_pos' entries of highctdna_df that appear in the expanded
# exome target list; the resulting Series keeps the row labels of highctdna_df.
highctdna_exome_df = highctdna_df.loc[
    highctdna_df['chrom_pos'].isin(exomebed_itemlist_unique),
    'chrom_pos',
]

mutpos_exome_hightctdna = highctdna_exome_df.index.tolist()
print(len(mutpos_exome_hightctdna))


print(highctdna_exome_df.shape)
highctdna_exome_df.head()

In [None]:
# Mutation keys per sample, then the keys shared by both samples and the keys
# found in at least one of them. Each size is printed as it is computed.
mutpos_tissue = tissue_df.index.tolist()
print(len(mutpos_tissue))

mutpos_hightctdna = highctdna_df.index.tolist()
print(len(mutpos_hightctdna))

mutpos_both = list(set(mutpos_tissue).intersection(set(mutpos_hightctdna)))
print(len(mutpos_both))

mutpos_union = list(set(mutpos_tissue).union(set(mutpos_hightctdna)))
print(len(mutpos_union))

In [None]:
# Same set algebra as for the full call sets, restricted to the exome subsets:
# per-sample keys, shared keys, and keys found in at least one sample.
mutpos_exome_tissue = tissue_exome_df.index.tolist()
print(len(mutpos_exome_tissue))

mutpos_exome_hightctdna = highctdna_exome_df.index.tolist()
print(len(mutpos_exome_hightctdna))

mutpos_exome_both = list(set(mutpos_exome_tissue).intersection(set(mutpos_exome_hightctdna)))
print(len(mutpos_exome_both))

mutpos_exome_union = list(set(mutpos_exome_tissue).union(set(mutpos_exome_hightctdna)))
print(len(mutpos_exome_union))

# 2. Patient 986 genotype on selected mutation loci 

In [None]:
# Load the GATK haplotype calls of sample NCC_CRC-986_100215_CW_T-N, index
# them by 'chrom_pos', and derive genotype, total depth, alt-allele depths and
# allele fractions from the colon-separated sample column.
sample_col = 'NCC_CRC-986_100215_CW_T-N'
highctdna_germline_df = read_vcf(os.path.join('data', 'initialsamples',  'bcbio', 'NCC_CRC-986_100215_CW_T-N-gatk-haplotype-annotated.vcf.gz'))
print(highctdna_germline_df.shape)
highctdna_germline_df['chrom_pos'] = highctdna_germline_df['CHROM'].astype('str').str.cat(highctdna_germline_df['POS'].astype('str'), sep="_")
highctdna_germline_df.set_index('chrom_pos', inplace=True)

# Split the sample column once instead of once per derived field.
# NOTE(review): fields 0 / 1 / 2 are read as genotype / per-allele depths /
# total depth, i.e. a FORMAT of GT:AD:DP:... is presumed - confirm against the
# VCF FORMAT column.
sample_fields = highctdna_germline_df[sample_col].str.split(':')
highctdna_germline_df['genotype'] = sample_fields.str[0]
print(highctdna_germline_df['genotype'].value_counts())

# Missing fields become 0. The filled Series is assigned back explicitly:
# `df[col].fillna(0, inplace=True)` acts on a temporary object and is not
# guaranteed to update the frame (it does not under pandas copy-on-write).
highctdna_germline_df['totcov'] = sample_fields.str[2].fillna(0).astype(int)

# Field 1 is comma-separated; entries 1 and 2 are the depths of the first and
# second alternate allele (0 when absent). Further alternates are ignored.
allele_depths = sample_fields.str[1].str.split(',')
highctdna_germline_df['altcov1'] = allele_depths.str[1].fillna(0).astype(int)
highctdna_germline_df['altcov2'] = allele_depths.str[2].fillna(0).astype(int)
highctdna_germline_df['altcov'] = highctdna_germline_df['altcov1'] + highctdna_germline_df['altcov2']

# Allele fractions; rows with totcov == 0 yield NaN (0/0) or inf (x/0).
highctdna_germline_df['vaf'] = highctdna_germline_df['altcov'] / highctdna_germline_df['totcov']
highctdna_germline_df['vaf1'] = highctdna_germline_df['altcov1'] / highctdna_germline_df['totcov']
highctdna_germline_df['vaf2'] = highctdna_germline_df['altcov2'] / highctdna_germline_df['totcov']
highctdna_germline_df = highctdna_germline_df[['REF', 'ALT', 'genotype', 'totcov', 'altcov', 'altcov1', 'altcov2', 'vaf', 'vaf1', 'vaf2']]
highctdna_germline_df.head(20)

# 3. Healthies germline mismatch?

In [None]:
# Every entry of data/healthies is taken as one healthy sample id; the next
# cell uses each name both as sub-directory and as VCF file prefix.
# os.listdir returns entries in arbitrary order.
healthies = os.listdir(os.path.join('data', 'healthies'))
print(healthies)
print(len(healthies))

In [None]:
# Read the GATK haplotype VCF of each healthy sample, tag every record with
# the sample id, index it by 'chrom_pos_ref_alt_healthyid', and stack all
# samples into one table.
healthy_frames = []
for healthy in healthies:
    print(healthy)
    healthy_vcf = os.path.join('data', 'healthies', healthy, healthy+'-gatk-haplotype-annotated.vcf.gz')
    healthy_df = read_vcf(healthy_vcf)
    healthy_df['healthyid'] = healthy
    # Build the record key by joining the five fields with underscores.
    record_key = healthy_df['CHROM'].astype('str')
    for key_part in ('POS', 'REF', 'ALT', 'healthyid'):
        record_key = record_key + '_' + healthy_df[key_part].astype('str')
    healthy_df['chrom_pos_ref_alt_healthyid'] = record_key
    healthy_df.set_index('chrom_pos_ref_alt_healthyid', inplace=True)
    healthy_frames.append(healthy_df)
healthies_df = pd.concat(healthy_frames)
healthies_df.head()

In [None]:
# check germline genotype is same at given mutated locus
#
# For every autosomal mutation in mutpos_union, keep it only when
#   * the patient has no germline call at the exact position,
#   * no healthy sample has a germline call at the exact position, and
#   * neither the patient nor any healthy sample has a germline call within
#     +/- `window` bp of it.
# Kept mutations are collected in mutpos_withoutgermlinemismatch. Mutations on
# contigs other than 1..22 are skipped entirely (never kept, never counted).

window = 20  # half-width (bp) of the neighbourhood searched around each mutation
autosomes = {str(i) for i in range(1, 23)}

# Loop-invariant lookups, built once instead of on every iteration.
patient_germline_index = highctdna_germline_df.index
patient_germline_keys = set(patient_germline_index)
patient_germline_chrom = patient_germline_index.str.rsplit('_', n=1).str[0]
patient_germline_pos = patient_germline_index.str.rsplit('_', n=1).str[-1].astype(int)

count = 0
count_vois = 0
mutpos_withoutgermlinemismatch = []
for mut in tqdm(mutpos_union):
    chrom, pos, _ref, _alt = mut.split('_')
    if chrom not in autosomes:
        continue
    pos_int = int(pos)
    cond = True  # stays True only if no germline call is found at or near the locus
    healthies_on_chrom = healthies_df['CHROM'] == chrom
    germline_healthies = healthies_df[healthies_on_chrom & (healthies_df['POS'] == pos_int)]
    if chrom + '_' + pos in patient_germline_keys:  # patient germline SNP
        print(germline_healthies)
        if germline_healthies.empty:
            print('Patient SNP not present in healthies')
            print(chrom, pos)
        else:
            print('Patient SNP is present in healthies?')
            print(germline_healthies)
        count += 1
        cond = False
    elif not germline_healthies.empty:  # healthy germline SNP, none in the patient
        print('Healhy SNP not present in patient')
        print('How many healthies carry the SNP?')
        print(chrom, pos)
        count += 1
        cond = False
    # Neighbourhood: the same inclusive window [pos - window, pos + window]
    # is applied to the healthy samples and to the patient.
    lo, hi = pos_int - window, pos_int + window
    germline_healthies_voisinage = healthies_df[healthies_on_chrom & (healthies_df['POS'] >= lo) & (healthies_df['POS'] <= hi)]
    in_patient_window = (patient_germline_chrom == chrom) & (patient_germline_pos >= lo) & (patient_germline_pos <= hi)
    patient_has_neighbour = in_patient_window.any()
    if patient_has_neighbour or not germline_healthies_voisinage.empty:
        if patient_has_neighbour:
            germline_patient_voisinage = highctdna_germline_df[in_patient_window]
            print(germline_patient_voisinage[['REF', 'ALT', 'genotype', 'vaf', 'totcov']])
            if germline_healthies_voisinage.empty:
                print('no SNP in healthies voisinage')
            else:
                print(germline_healthies_voisinage)
            print('Is there a mismatch between patient SNP and healthy SNP?')
        count_vois += 1
        cond = False
    if cond:
        mutpos_withoutgermlinemismatch.append(mut)
# Report against the list that was actually iterated.
print(count, count_vois, len(mutpos_union))
print(len(mutpos_withoutgermlinemismatch))

In [None]:
# Fraction of mutations that did not pass the germline-mismatch filter.
# NOTE(review): 8728 is a hard-coded denominator, presumably the number of
# candidate mutations when this was written (e.g. len(mutpos_union)) - confirm
# and replace it with the computed length.
1-(len(mutpos_withoutgermlinemismatch)/8728)

In [None]:
# Save the retained mutation keys as a one-column CSV and preview them.
# NOTE(review): a separate cell writes the same list under a 'CW_T' file name -
# confirm which mutation set each file is meant to hold.
t1w_csv_path = os.path.join(
    'data',
    'pooledhealthy',
    'mutpos_withoutgermlinemismatch_NCC_CRC-986_100215_T1W_T_17healthies.csv',
)
mutpos_withoutgermlinemismatch_df = pd.Series(mutpos_withoutgermlinemismatch)
mutpos_withoutgermlinemismatch_df.to_csv(t1w_csv_path)
mutpos_withoutgermlinemismatch_df.head()

In [None]:
# Save the retained mutation keys as a one-column CSV and preview them.
# NOTE(review): a separate cell writes the same list under a 'T1W_T' file name -
# confirm which mutation set each file is meant to hold.
cw_csv_path = os.path.join(
    'data',
    'pooledhealthy',
    'mutpos_withoutgermlinemismatch_NCC_CRC-986_100215_CW_T_17healthies.csv',
)
mutpos_withoutgermlinemismatch_df = pd.Series(mutpos_withoutgermlinemismatch)
mutpos_withoutgermlinemismatch_df.to_csv(cw_csv_path)
mutpos_withoutgermlinemismatch_df.head()

In [None]:
# check germline genotype is same at given mutated locus
#
# For every autosomal mutation in mutpos_both, keep it only when
#   * the patient has no germline call at the exact position,
#   * no healthy sample has a germline call at the exact position,
#   * no healthy sample has a germline call within +/- `healthy_window` bp, and
#   * the patient has no germline call within +/- `patient_window` bp.
# Kept mutations are collected in mutpos_withoutgermlinemismatch. Mutations on
# contigs other than 1..22 are skipped entirely (never kept, never counted).

# NOTE(review): the two neighbourhoods use different widths (20 bp for the
# healthy samples, 150 bp for the patient) - confirm this is intended.
healthy_window = 20
patient_window = 150
autosomes = {str(i) for i in range(1, 23)}

# Loop-invariant lookups, built once instead of on every iteration.
patient_germline_index = highctdna_germline_df.index
patient_germline_keys = set(patient_germline_index)
patient_germline_chrom = patient_germline_index.str.rsplit('_', n=1).str[0]
patient_germline_pos = patient_germline_index.str.rsplit('_', n=1).str[-1].astype(int)

count = 0
count_vois = 0
mutpos_withoutgermlinemismatch = []
for mut in tqdm(mutpos_both):
    chrom, pos, _ref, _alt = mut.split('_')
    if chrom not in autosomes:
        continue
    pos_int = int(pos)
    cond = True  # stays True only if no germline call is found at or near the locus
    healthies_on_chrom = healthies_df['CHROM'] == chrom
    germline_healthies = healthies_df[healthies_on_chrom & (healthies_df['POS'] == pos_int)]
    if chrom + '_' + pos in patient_germline_keys:  # patient germline SNP
        print(germline_healthies)
        if germline_healthies.empty:
            print('Patient SNP not present in healthies')
            print(chrom, pos)
        else:
            print('Patient SNP is present in healthies?')
            print(germline_healthies)
        count += 1
        cond = False
    elif not germline_healthies.empty:  # healthy germline SNP, none in the patient
        print('Healhy SNP not present in patient')
        print('How many healthies carry the SNP?')
        print(chrom, pos)
        count += 1
        cond = False
    # Neighbourhood: both windows are inclusive at both ends.
    germline_healthies_voisinage = healthies_df[
        healthies_on_chrom
        & (healthies_df['POS'] >= pos_int - healthy_window)
        & (healthies_df['POS'] <= pos_int + healthy_window)
    ]
    in_patient_window = (
        (patient_germline_chrom == chrom)
        & (patient_germline_pos >= pos_int - patient_window)
        & (patient_germline_pos <= pos_int + patient_window)
    )
    patient_has_neighbour = in_patient_window.any()
    if patient_has_neighbour or not germline_healthies_voisinage.empty:
        if patient_has_neighbour:
            germline_patient_voisinage = highctdna_germline_df[in_patient_window]
            print(germline_patient_voisinage[['REF', 'ALT', 'genotype', 'vaf', 'totcov']])
            if germline_healthies_voisinage.empty:
                print('no SNP in healthies voisinage')
            else:
                print(germline_healthies_voisinage)
            print('Is there a mismatch between patient SNP and healthy SNP?')
        count_vois += 1
        cond = False
    if cond:
        mutpos_withoutgermlinemismatch.append(mut)
# Report against the list that was actually iterated.
print(count, count_vois, len(mutpos_both))
print(len(mutpos_withoutgermlinemismatch))

In [None]:
# Summary of the filter run over mutpos_both: exact-position hits,
# neighbourhood hits, number of mutations examined, and number retained.
print(count, count_vois, len(mutpos_both))
print(len(mutpos_withoutgermlinemismatch))

In [None]:
# Save the retained mutation keys as a one-column CSV and preview them.
# NOTE(review): this file name says '12healthies' while the other exports in
# this notebook say '17healthies' - confirm the number of healthy samples used.
both_csv_path = os.path.join(
    'data',
    'pooledhealthy',
    'mutpos_withoutgermlinemismatch_986_100215_both_12healthies.csv',
)
mutpos_withoutgermlinemismatch_df = pd.Series(mutpos_withoutgermlinemismatch)
mutpos_withoutgermlinemismatch_df.to_csv(both_csv_path)
mutpos_withoutgermlinemismatch_df.head()

In [None]:
# check germline mismatch at given mutated locus
#
# For each autosomal entry of `mutpos`, count whether any healthy sample has a
# call at the exact position (count) and whether any has a call within
# +/- 20 bp, inclusive (count_vois). Other contigs are ignored.
# NOTE(review): `mutpos` is not assigned in any cell visible here - confirm
# which mutation list is intended before running.

count = 0
count_vois = 0
autosome_names = [str(i) for i in range(1, 23)]
for mut in mutpos:
    chrom, pos, ref, alt = mut.split('_')
    if chrom not in autosome_names:
        continue
    locus = int(pos)
    on_chrom = healthies_df['CHROM'] == chrom
    # exact position
    aux = healthies_df[on_chrom & (healthies_df['POS'] == locus)]
    if not aux.empty:
        print(chrom, pos)
        print('mutation position', aux.shape[0])
        count += 1
    # neighbourhood
    vois = healthies_df[on_chrom & (healthies_df['POS'] >= locus-20) & (healthies_df['POS'] <= locus+20)]
    if not vois.empty:
        print(chrom, pos)
        print('voisinage', vois.shape[0])
        count_vois += 1
print(count, count_vois, len(mutpos))

In [None]:
# Exact-position hits, neighbourhood hits, and number of mutations examined.
print(count, count_vois, len(mutpos))
# Percentage of (count + count_vois) relative to len(mutpos). The exact
# position lies inside the neighbourhood window, so a locus can contribute to
# both counters.
print(100*(count + count_vois)/len(mutpos))
len(mutpos) - count - count_vois

In [None]:
# Print the position of the first tissue variant and show the records of
# healthy_df at that exact position.
# NOTE(review): healthy_df holds only the last sample read by the loading
# loop; the pooled healthies_df may be what is intended - confirm.
first_variant = tissue_df.iloc[0]
print(first_variant['CHROM'], first_variant['POS'])
# This line was indented under the print, which is an IndentationError at top level.
healthy_df[(healthy_df['CHROM'] == first_variant['CHROM']) & (healthy_df['POS'] == first_variant['POS'])]