In [None]:
import os
import io
import re
import numpy as np
import pandas as pd
import pysam
from tqdm.notebook import tqdm 

# Read SNV file

In [None]:
# Subtype tag: it is interpolated into the input MAF file name and, upper-cased,
# into the names of the BED files written by this notebook.
subtype = 'msi'

In [None]:
# Load the prefiltered SNV table for the selected subtype (tab-separated, no header row)
# and name its seven columns.
snv_maf_path = "../data/common_cancer_mutations/nant2_SNV_prefiltered_"+subtype+"_newModel_v2.MAF"
vcf_pd = pd.read_csv(snv_maf_path, sep='\t', header=None)
vcf_pd.columns = ['chr', 'pos_start', 'pos_end', 'ref', 'alt', 'patient_id', 'cohort']

# Everything downstream only looks at chromosome 22.
print('analysis limited to chr22 only')
is_chr22 = vcf_pd['chr'] == 'chr22'
vcf_pd = vcf_pd[is_chr22]

n_rows = vcf_pd.shape[0]
print('number of rows in mutation file', n_rows)

# Sorted array of the distinct patient ids; used later to pick the annotation files to read.
patient_ids = vcf_pd['patient_id'].unique()
listpatient_ref = np.sort(patient_ids)
print(len(listpatient_ref))

print('number of patients', vcf_pd['patient_id'].nunique())

# Number of table rows per patient, then its mean.
mutations_per_patient = vcf_pd.groupby('patient_id').size()
print('average number of mutations per patient', mutations_per_patient.mean())

In [None]:
# Recurrent mutations: keep the variants whose occurrence count in the table is
# strictly greater than 2.
# NOTE(review): this cell was originally described as "two patients or more",
# which would be `>= 2`; the threshold is left at `> 2` here -- confirm the intent.

# Occurrences counted per exact variant (position + ref + alt).
a = vcf_pd.groupby(['chr', 'pos_start', 'pos_end', 'ref', 'alt']).size()
a = a[a > 2]
print(a.shape[0])

# Occurrences counted per position only, ignoring the alleles.
b = vcf_pd.groupby(['chr', 'pos_start', 'pos_end']).size()
b = b[b > 2]
print(b.shape[0])

# Check that the variant is the same: both groupings must select the same start
# positions with the same counts. np.array_equal gives a single boolean and
# returns False when the two selections differ in length; an element-wise `==`
# between arrays of different length is an error in recent NumPy versions, i.e.
# exactly in the situation this check is meant to detect.
print([ai[1] for ai in a.index] == [bi[1] for bi in b.index])
print(np.array_equal(a.values, b.values))

In [None]:
# Turn the recurrent-variant counts into a table: the group keys become columns,
# the count column gets an explicit label, 'ref' is discarded and an empty 'vaf'
# column is added as a placeholder to be filled in afterwards.
varfile = (
    a.reset_index(name='number of occurences')
     .drop(columns='ref')
     .assign(vaf=np.nan)
)
varfile

In [None]:
# Average the tumor variant allele fraction (VAF) of every recurrent SNV over the
# per-patient annotation files.
annot_dir = '../data/common_cancer_mutations/annotations/'

# Only the SNV annotation files are relevant here. The directory listing is
# filtered up front so that other files stored in the same directory (the indel
# annotations read further down in this notebook live there too) cannot
# contribute depths at an SNV position, and so that the patient id is only parsed
# from file names ending with the expected suffix. Sorting makes the processing
# order deterministic.
listfiles_annot = sorted(f for f in os.listdir(annot_dir) if f.endswith('snv_annotated.txt'))
listpatients_annot = np.unique([f.split('_')[0].split('-')[2] for f in listfiles_annot])

# Each annotation file is parsed once (instead of once per variant) and reduced
# to the rows that can match one of the variants of interest.
wanted_chr = set(varfile['chr'].str[3:])   # 'chr22' -> '22', the form compared below
wanted_pos = set(varfile['pos_start'])
annot_cols = ['Chr', 'START_POS_REF', 'REF', 'ALT', 'T_refDepth', 'T_altDepth']

annot_by_file = {}
for file in tqdm(listfiles_annot, desc='loading annotations'):
    # The patient id is the third '-'-separated token of the part before the first '_'.
    patient = file.split('_')[0].split('-')[2]
    if int(patient) not in listpatient_ref:
        continue
    annot_pd = pd.read_csv(annot_dir + file, sep='\t', low_memory=False, usecols=annot_cols)
    annot_pd['Chr'] = annot_pd['Chr'].astype(str)
    keep = annot_pd['Chr'].isin(wanted_chr) & annot_pd['START_POS_REF'].isin(wanted_pos)
    annot_by_file[file] = annot_pd[keep]

for i, variant in tqdm(varfile.iterrows(), total=varfile.shape[0]):
    vaf_list = []
    print(variant['chr'], variant['pos_start'])
    variant_chr = str(variant['chr'][3:])
    for annot_pd in annot_by_file.values():
        # NOTE(review): rows are matched on chromosome and start position only;
        # ALT is not compared for SNVs.
        aux = annot_pd[(annot_pd['Chr'] == variant_chr) & (annot_pd['START_POS_REF'] == variant['pos_start'])]
        if not aux.empty:
            # VAF = alt depth / (ref depth + alt depth); only the first matching row is used.
            vaf = aux['T_altDepth'] / (aux['T_refDepth'] + aux['T_altDepth'])
            vaf_list.append(vaf.values[0])
    # Sanity check: True when one VAF was found per counted occurrence of the variant.
    print(len(vaf_list) == variant['number of occurences'])
    # Without any matching annotation row the VAF stays NaN (and no NumPy
    # "mean of empty slice" warning is emitted).
    vaf_mean = np.mean(vaf_list) if vaf_list else np.nan
    # iterrows() yields index labels, so the result is written back by label.
    varfile.loc[i, 'vaf'] = np.round(vaf_mean, 3)

In [None]:
# Keep the columns written to the BED file, in output order. `.copy()` makes the
# selection an independent frame, so the column assignment below does not act on
# a slice of the previous table.
varfile = varfile[['chr', 'pos_start', 'pos_end', 'vaf', 'alt']].copy()
# Remove the literal 'chr' prefix. str.lstrip('chr') would strip any leading run
# of the characters 'c', 'h' and 'r' rather than the prefix itself, so an
# anchored replacement is used instead.
varfile['chr'] = varfile['chr'].str.replace('^chr', '', regex=True)
varfile

In [None]:
# Write the SNV table to disk as tab-separated text, without header row or index column.
snv_tf1_path = '../data/common_cancer_mutations/chr22_CRC_'+subtype.upper()+'_SNV_tf1.bed'
varfile.to_csv(snv_tf1_path, sep='\t', header=False, index=False)

# Get different tumor fractions

In [None]:
tf = 0.1  # tumor fraction
# The tf1 file is tab-separated and has no header row: the separator has to be
# given and the column names restored before 'vaf' can be addressed by name.
varfile_bis = pd.read_csv('../data/common_cancer_mutations/chr22_CRC_'+subtype.upper()+'_SNV_tf1.bed', sep='\t', header=None)
varfile_bis.columns = ['chr', 'pos_start', 'pos_end', 'vaf', 'alt']
# Scale every VAF by the tumor fraction and write the result next to the tf1 file.
varfile_bis['vaf'] = varfile_bis['vaf'] * tf
varfile_bis.to_csv('../data/common_cancer_mutations/chr22_CRC_'+subtype.upper()+'_SNV_tf'+str(tf)+'.bed', sep='\t', header=False, index=False)

# Read INDEL mutation file

In [None]:
# Subtype tag for the INDEL section: it is interpolated into the indel MAF file
# name and, upper-cased, into the names of the INDEL BED files.
subtype = 'msi'

In [None]:
# Load the prefiltered indel table for the selected subtype (tab-separated, no header row)
# and name its seven columns.
indel_maf_path = "../data/common_cancer_mutations/nant2_indel_prefiltered_"+subtype+"_newModel.MAF"
vcf_pd = pd.read_csv(indel_maf_path, sep='\t', header=None)
vcf_pd.columns = ['chr', 'pos_start', 'pos_end', 'ref', 'alt', 'patient_id', 'cohort']

# Everything downstream only looks at chromosome 22.
print('analysis limited to chr22 only')
is_chr22 = vcf_pd['chr'] == 'chr22'
vcf_pd = vcf_pd[is_chr22]

n_rows = vcf_pd.shape[0]
print('number of rows in mutation file', n_rows)

# Sorted array of the distinct patient ids; used later to pick the annotation files to read.
patient_ids = vcf_pd['patient_id'].unique()
listpatient_ref = np.sort(patient_ids)
print(len(listpatient_ref))

print('number of patients', vcf_pd['patient_id'].nunique())

# Number of table rows per patient, then its mean.
mutations_per_patient = vcf_pd.groupby('patient_id').size()
print('average number of mutations per patient', mutations_per_patient.mean())

In [None]:
# Recurrent mutations: keep the variants whose occurrence count in the table is
# strictly greater than 2.
# NOTE(review): this cell was originally described as "two patients or more",
# which would be `>= 2`; the threshold is left at `> 2` here -- confirm the intent.

# Occurrences counted per exact variant (position + ref + alt).
a = vcf_pd.groupby(['chr', 'pos_start', 'pos_end', 'ref', 'alt']).size()
a = a[a > 2]
print(a.shape[0])

# Occurrences counted per position only, ignoring the alleles.
b = vcf_pd.groupby(['chr', 'pos_start', 'pos_end']).size()
b = b[b > 2]
print(b.shape[0])

# Check that the variant is the same: both groupings must select the same start
# positions with the same counts. np.array_equal gives a single boolean and
# returns False when the two selections differ in length; an element-wise `==`
# between arrays of different length is an error in recent NumPy versions, i.e.
# exactly in the situation this check is meant to detect.
print([ai[1] for ai in a.index] == [bi[1] for bi in b.index])
print(np.array_equal(a.values, b.values))

In [None]:
# Turn the recurrent-indel counts into a table: the group keys become columns and
# the count column gets an explicit label. Unlike the SNV table, 'ref' is kept:
# the indel BED formatting further down derives the indel length from 'ref' and 'alt'.
# An empty 'vaf' column is added as a placeholder to be filled in afterwards.
varfile = a.reset_index(name='number of occurences').assign(vaf=np.nan)
varfile

In [None]:
# Average the tumor variant allele fraction (VAF) of every recurrent indel over
# the per-patient annotation files.
annot_dir = '../data/common_cancer_mutations/annotations/'

# Only the indel annotation files are relevant here. The directory listing is
# filtered up front so that other files stored in the same directory (the SNV
# annotations read earlier in this notebook live there too) are never parsed, and
# so that the patient id is only extracted from file names ending with the
# expected suffix. Sorting makes the processing order deterministic.
listfiles_annot = sorted(f for f in os.listdir(annot_dir) if f.endswith('indel_annotated.txt'))
listpatients_annot = np.unique([f.split('_')[0].split('-')[2] for f in listfiles_annot])

# Each annotation file is parsed once (instead of once per variant) and reduced
# to the rows that can match one of the variants of interest.
wanted_chr = set(varfile['chr'].str[3:])   # 'chr22' -> '22', the form compared below
wanted_pos = set(varfile['pos_start'])
annot_cols = ['Chr', 'START_POS_REF', 'REF', 'ALT', 'T_refDepth', 'T_altDepth']

annot_by_file = {}
for file in tqdm(listfiles_annot, desc='loading annotations'):
    # The patient id is the third '-'-separated token of the part before the first '_'.
    patient = file.split('_')[0].split('-')[2]
    if int(patient) not in listpatient_ref:
        continue
    annot_pd = pd.read_csv(annot_dir + file, sep='\t', low_memory=False, usecols=annot_cols)
    annot_pd['Chr'] = annot_pd['Chr'].astype(str)
    keep = annot_pd['Chr'].isin(wanted_chr) & annot_pd['START_POS_REF'].isin(wanted_pos)
    annot_by_file[file] = annot_pd[keep]

for i, variant in tqdm(varfile.iterrows(), total=varfile.shape[0]):
    vaf_list = []
    print(variant['chr'], variant['pos_start'])
    variant_chr = str(variant['chr'][3:])
    for annot_pd in annot_by_file.values():
        # Indels are matched on chromosome, start position and ALT allele.
        aux = annot_pd[(annot_pd['Chr'] == variant_chr) & (annot_pd['START_POS_REF'] == variant['pos_start']) & (annot_pd['ALT'] == variant['alt'])]
        if not aux.empty:
            # VAF = alt depth / (ref depth + alt depth); only the first matching row is used.
            vaf = aux['T_altDepth'] / (aux['T_refDepth'] + aux['T_altDepth'])
            vaf_list.append(vaf.values[0])
    # Sanity check: True when one VAF was found per counted occurrence of the variant.
    print(len(vaf_list) == variant['number of occurences'])
    # Without any matching annotation row the VAF stays NaN (and no NumPy
    # "mean of empty slice" warning is emitted).
    vaf_mean = np.mean(vaf_list) if vaf_list else np.nan
    # iterrows() yields index labels, so the result is written back by label.
    varfile.loc[i, 'vaf'] = np.round(vaf_mean, 3)

In [None]:
# /!\ 0-based index in bamsurgeon for indels
# but VCF del: ATC -> A, input BED del: - 2 nucleotides
# but VCF ins: A -> ATC, input BED ins: + TC

# Work on an independent copy of the needed columns so that the assignments
# below modify this table directly (no chained indexing on a slice).
varfile = varfile[['chr', 'pos_start', 'pos_end', 'vaf', 'ref', 'alt']].copy()
# Remove the literal 'chr' prefix. str.lstrip('chr') would strip any leading run
# of the characters 'c', 'h' and 'r' rather than the prefix itself.
varfile['chr'] = varfile['chr'].str.replace('^chr', '', regex=True)

# Signed length difference between ALT and REF: > 0 insertion, < 0 deletion.
varfile['len'] = (varfile['alt'].str.len() - varfile['ref'].str.len()).astype(int)
is_ins = varfile['len'] > 0
is_del = varfile['len'] < 0

# Object-typed column so that string labels can be stored; rows where REF and
# ALT have the same length keep a missing type.
varfile['type'] = pd.Series(np.nan, index=varfile.index, dtype=object)
varfile.loc[is_ins, 'type'] = 'INS'
varfile.loc[is_del, 'type'] = 'DEL'

# Insertions: drop the first base of ALT and shift the end by the length difference.
varfile.loc[is_ins, 'alt'] = varfile.loc[is_ins, 'alt'].str[1:]
varfile.loc[is_ins, 'pos_end'] = varfile.loc[is_ins, 'pos_end'] + varfile.loc[is_ins, 'len']

# Deletions: no ALT sequence is written; the end is shifted by the deleted length plus one.
varfile.loc[is_del, 'alt'] = ''
varfile.loc[is_del, 'pos_end'] = varfile.loc[is_del, 'pos_end'] - varfile.loc[is_del, 'len'] + 1  # negative length

# Final column order of the INDEL BED file.
varfile = varfile[['chr', 'pos_start', 'pos_end', 'vaf', 'type', 'alt']]
varfile

In [None]:
# Display the current subtype; it tags the name of the INDEL file written by the next cell.
subtype

In [None]:
# Write the INDEL table to disk as tab-separated text, without header row or index column.
indel_tf1_path = '../data/common_cancer_mutations/chr22_CRC_'+subtype.upper()+'_INDEL_tf1.bed'
varfile.to_csv(indel_tf1_path, sep='\t', header=False, index=False)


# Get different tumor fractions

In [None]:
tf = 0.1  # tumor fraction

# Input: the tf1 INDEL file; output: the same table with VAFs scaled by `tf`.
indel_tf1_path = '../data/common_cancer_mutations/chr22_CRC_'+subtype.upper()+'_INDEL_tf1.bed'
indel_scaled_path = '../data/common_cancer_mutations/chr22_CRC_'+subtype.upper()+'_INDEL_tf'+str(tf)+'.bed'

# The file is tab-separated without a header row, so the column names are restored after reading.
varfile_bis = pd.read_csv(indel_tf1_path, sep='\t', header=None)
varfile_bis.columns = ['chr', 'pos_start', 'pos_end', 'vaf', 'type', 'alt']

# Scale every VAF by the tumor fraction and save in the same layout.
varfile_bis['vaf'] = varfile_bis['vaf'].mul(tf)
varfile_bis.to_csv(indel_scaled_path, sep='\t', header=False, index=False)