In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from bs4 import BeautifulSoup
import json

In [2]:
with open( '../Data/VAF_data/p377_variant_summary.html', 'r') as mf:
    file = mf.read()

In [3]:
soup = BeautifulSoup(file, 'html.parser')

mysoup = soup.body.div.find_all("script")

mysoup = soup.body.div.find_all("script")[1].contents[0].strip()

In [4]:
table_dict = json.loads(mysoup)

columns = [stuff["label"][0] for stuff in table_dict["columns"]]

table_dict["data"]

data = []
for stuff in table_dict["data"]:
    rows = []
    for key in stuff:
        rows.append(stuff[key])
    data.append(rows)
    
variant_annotations = pd.DataFrame(data, columns = columns)

In [5]:
#Check where PD7151 SRSF2 data is (PD7151 = H198303, PD7153 = H198302)
variant_annotations.loc[variant_annotations['PATIENT'].isin(['IID_H198303'])].loc[variant_annotations['GENE'].isin(['SRSF2'])]['PROTEIN_CHANGE']

32    p.P95H
33    p.P95H
34    p.P95H
Name: PROTEIN_CHANGE, dtype: object

In [6]:
#Convert to a variant so it's not confused with PD7153 with the same protein change
variant_annotations.loc[32,'PROTEIN_CHANGE'] = 'p.P95Hb'
variant_annotations.loc[33,'PROTEIN_CHANGE'] = 'p.P95Hb'
variant_annotations.loc[34,'PROTEIN_CHANGE'] = 'p.P95Hb'

In [7]:
#Check this worked
variant_annotations.loc[variant_annotations['PATIENT'].isin(['IID_H198303'])].loc[variant_annotations['GENE'].isin(['SRSF2'])]['PROTEIN_CHANGE']

32    p.P95Hb
33    p.P95Hb
34    p.P95Hb
Name: PROTEIN_CHANGE, dtype: object

In [8]:
#Convert patient IDs
variant_annotations['pat'] = variant_annotations['PATIENT']

init_dict = {
    'IID_H198302': 'PD7153',
    'IID_H198303': 'PD7151',
    'IID_H198304': 'JP001'
}

variant_annotations['patient_init'] = variant_annotations['pat'].replace(init_dict)

In [9]:
#Add in cell type
variant_annotations['cel'] = variant_annotations['TARGET_NAME']
ct_dict = {
    'T01': 'BM',
    'T02': 'Mono',
    'T03': 'nBC',
    'T04': 'Neut'
}

patients = variant_annotations['TARGET_NAME'].to_list()
cells = []
for pat in patients: 
    for a in ct_dict:
        x = (pat.split('_')[2])
        if a == x:
            cells.append(ct_dict[a])
            
cell_dict = dict(zip(patients, cells))   

variant_annotations['celltype'] = variant_annotations['cel'].replace(cell_dict)

variant_annotations = variant_annotations.drop(columns = ['cel', 'pat'])

In [10]:
#Add in PCR amplicon name
variant_annotations['Mean_VAF'] = pd.to_numeric(variant_annotations['TARGET_VAF_MEAN'], downcast = 'float')

In [11]:
#Convert protein changes to amplicon names
#Create dictionary
amp_assign = {'p.Q1276*': 'PD7153_CUX1',
 'p.P95H': 'PD7153_SRSF2',       
 'p.L1065fs*1': 'PD7153_TET2a',
 'p.Q685*': 'PD7153_TET2b',
 'p.T360M': 'PD7153_TGFB3_g',
 'p.P95Hb': 'PD7151_SRSF2',
 'p.K1090fs*15': 'PD7151_TET2a',
 'p.A1224fs*2': 'PD7151_TET2b',
 'p.A187T': 'JP001_RUNX1_g',
 'p.P95R': 'JP001_SRSF2',
 'p.G1218fs*8': 'JP001_TET2a',
 'p.Y1337*': 'JP001_TET2b_g'}

#Create list of mutations/amplicons to include in final dataset
mut = []

for k in amp_assign.keys():
    mut.append(amp_assign[k])  

print(len(amp_assign), amp_assign)

variant_annotations['Amplicon'] = variant_annotations['PROTEIN_CHANGE'].replace(amp_assign)

12 {'p.Q1276*': 'PD7153_CUX1', 'p.P95H': 'PD7153_SRSF2', 'p.L1065fs*1': 'PD7153_TET2a', 'p.Q685*': 'PD7153_TET2b', 'p.T360M': 'PD7153_TGFB3_g', 'p.P95Hb': 'PD7151_SRSF2', 'p.K1090fs*15': 'PD7151_TET2a', 'p.A1224fs*2': 'PD7151_TET2b', 'p.A187T': 'JP001_RUNX1_g', 'p.P95R': 'JP001_SRSF2', 'p.G1218fs*8': 'JP001_TET2a', 'p.Y1337*': 'JP001_TET2b_g'}


In [12]:
# Make a smaller df that just contains PCR amplicons in it
variant_annotations_amps = variant_annotations.loc[variant_annotations['Amplicon'].isin(mut)]

#Test for where amplicon assignment doesn't fit patient (ie/ PD7151 SRSF2 which gets incorrectly labelled as a different patient because it wasn't PCR amplified)
#Now fixed above so that the mutation can be included
pt_test = []
for index, row in variant_annotations_amps.iterrows():
    pt_test.append(row['Amplicon'].split('_')[0])

variant_annotations_amps['pt_test'] = pt_test

variant_annotations_amps = variant_annotations_amps.loc[variant_annotations_amps['pt_test'] == variant_annotations_amps['patient_init']]   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  variant_annotations_amps['pt_test'] = pt_test


In [13]:
#Write dataframe to a file and streamline later analysis (no need to re-extract every time)
variant_annotations_amps.to_csv('../Data/VAF_data/bulkVAF_data.tsv', sep = '\t')