In [35]:
import pandas as pd 
import numpy as np
import os
from tqdm import tqdm
from glob import glob
import requests
import ast
from bs4 import BeautifulSoup
# Create the output directory that the cells below write their CSVs into;
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs('processed-data', exist_ok=True)

In [36]:
#copy number alterations
# Flatten every CSV under raw-data/cna/ into one table and write it to
# processed-data/oncokb_cna.csv. The gene name is taken from the part of the
# file name before the first underscore.


def _parse_literal(value):
    """Decode a cell holding a Python list/dict literal; return anything else unchanged."""
    # The isinstance check has to guard BOTH bracket tests. Written as
    # `isinstance(...) and A or B`, a non-string cell (e.g. NaN) falls through
    # to `B` and raises AttributeError on `.startswith`.
    if isinstance(value, str) and (
        (value.startswith('[') and value.endswith(']'))
        or (value.startswith('{') and value.endswith('}'))
    ):
        return ast.literal_eval(value)
    return value


frames = []
# sorted(): glob order is unspecified; sorting keeps the output reproducible.
for f in sorted(glob('raw-data/cna/*.csv')):
    df = pd.read_csv(f)
    for c in df.columns:
        df[c] = df[c].apply(_parse_literal)
    # basename/splitext instead of split('/') so Windows separators and the
    # '.csv' suffix never leak into the gene name.
    stem = os.path.splitext(os.path.basename(f))[0]
    df['gene'] = stem.split('_')[0]
    df['alterations'] = df['alterations'].apply(lambda x: ','.join(x))
    df['drugs'] = df['drugs'].apply(lambda x: ','.join([i['drugName'] for i in x]))
    # Only the first approved indication is kept; an empty list becomes NaN.
    df['approvedIndications'] = df['approvedIndications'].apply(lambda x: x[0] if len(x) > 0 else np.nan)
    # Keep the full parsed cancer-type record before reducing it to a name.
    df['cancer_type_raw_text'] = df['levelAssociatedCancerType']
    # Use the specific name, falling back to the main type's name when it is empty.
    df['levelAssociatedCancerType'] = df['levelAssociatedCancerType'].apply(lambda x: x['name'] if x['name'] != '' else x['mainType']['name'])
    # String entries are joined as before; dict entries contribute their 'name'
    # (the structural-variants cell reads x[0]['name'], so dicts can occur and
    # a plain ','.join(x) would raise TypeError on them).
    df['levelExcludedCancerTypes'] = df['levelExcludedCancerTypes'].apply(
        lambda x: ','.join(i['name'] if isinstance(i, dict) else i for i in x)
    )
    df['pmids'] = df['pmids'].apply(lambda x: ','.join(x))
    df['abstracts'] = df['abstracts'].apply(lambda x: ','.join([i['link'] for i in x]))
    frames.append(df)

if not frames:
    raise FileNotFoundError("no CSV files matched 'raw-data/cna/*.csv'")

# Concatenate once (instead of growing a frame inside the loop), then sort.
df_all = pd.concat(frames, axis=0).sort_values(by=['level', 'gene']).reset_index(drop=True)
print(df_all.shape)
df_all.to_csv('processed-data/oncokb_cna.csv', index=False)
df_all.alterations.value_counts()

(398, 12)


alterations
Oncogenic Mutations                                                                   264
Amplification                                                                         126
Deletion                                                                                6
Oncogenic Mutations (excluding Y646S, Y646H, Y646C, Y646F, Y646N, A682G and A692V)      2
Name: count, dtype: int64

In [38]:
#protein change
# Flatten the per-gene CSVs into one table and write it to
# processed-data/oncokb_protein_change.csv. File names are expected to look
# like <gene>_<change>.csv.
#
# NOTE(review): this cell globs raw-data/cna/ -- the same directory as the
# copy-number cell -- and its recorded output (398 rows, identical alteration
# counts) matches that cell exactly. This looks like a copy-paste leftover;
# confirm the intended protein-change input directory and update the pattern.


def _parse_literal(value):
    """Decode a cell holding a Python list/dict literal; return anything else unchanged."""
    # The isinstance check has to guard BOTH bracket tests. Written as
    # `isinstance(...) and A or B`, a non-string cell (e.g. NaN) falls through
    # to `B` and raises AttributeError on `.startswith`.
    if isinstance(value, str) and (
        (value.startswith('[') and value.endswith(']'))
        or (value.startswith('{') and value.endswith('}'))
    ):
        return ast.literal_eval(value)
    return value


frames = []
# sorted(): glob order is unspecified; sorting keeps the output reproducible.
for f in sorted(glob('raw-data/cna/*.csv')):
    df = pd.read_csv(f)
    for c in df.columns:
        df[c] = df[c].apply(_parse_literal)
    # basename/splitext instead of split('/') and replace('.csv', '') so
    # Windows separators and the suffix never leak into gene/change.
    name_parts = os.path.splitext(os.path.basename(f))[0].split('_')
    df['gene'] = name_parts[0]
    df['change'] = name_parts[1]
    df['alterations'] = df['alterations'].apply(lambda x: ','.join(x))
    df['drugs'] = df['drugs'].apply(lambda x: ','.join([i['drugName'] for i in x]))
    # Only the first approved indication is kept; an empty list becomes NaN.
    df['approvedIndications'] = df['approvedIndications'].apply(lambda x: x[0] if len(x) > 0 else np.nan)
    # Keep the full parsed cancer-type record before reducing it to a name.
    df['cancer_type_raw_text'] = df['levelAssociatedCancerType']
    # Use the specific name, falling back to the main type's name when it is empty.
    df['levelAssociatedCancerType'] = df['levelAssociatedCancerType'].apply(lambda x: x['name'] if x['name'] != '' else x['mainType']['name'])
    # String entries are joined as before; dict entries contribute their 'name'
    # (the structural-variants cell reads x[0]['name'], so dicts can occur and
    # a plain ','.join(x) would raise TypeError on them).
    df['levelExcludedCancerTypes'] = df['levelExcludedCancerTypes'].apply(
        lambda x: ','.join(i['name'] if isinstance(i, dict) else i for i in x)
    )
    df['pmids'] = df['pmids'].apply(lambda x: ','.join(x))
    df['abstracts'] = df['abstracts'].apply(lambda x: ','.join([i['link'] for i in x]))
    frames.append(df)

if not frames:
    raise FileNotFoundError("no CSV files matched 'raw-data/cna/*.csv'")

# Concatenate once (instead of growing a frame inside the loop), then sort.
df_all = pd.concat(frames, axis=0).sort_values(by=['level', 'gene']).reset_index(drop=True)
print(df_all.shape)
df_all.to_csv('processed-data/oncokb_protein_change.csv', index=False)
df_all.alterations.value_counts()

(398, 13)


alterations
Oncogenic Mutations                                                                   264
Amplification                                                                         126
Deletion                                                                                6
Oncogenic Mutations (excluding Y646S, Y646H, Y646C, Y646F, Y646N, A682G and A692V)      2
Name: count, dtype: int64

In [41]:
#structural variants
# Flatten every CSV under raw-data/sv/ into one table and write it to
# processed-data/oncokb_structural_variants.csv. The gene name is the part of
# the file name before the first underscore (or the whole stem if there is none).


def _parse_literal(value):
    """Decode a cell holding a Python list/dict literal; return anything else unchanged."""
    # The isinstance check has to guard BOTH bracket tests. Written as
    # `isinstance(...) and A or B`, a non-string cell (e.g. NaN) falls through
    # to `B` and raises AttributeError on `.startswith`.
    if isinstance(value, str) and (
        (value.startswith('[') and value.endswith(']'))
        or (value.startswith('{') and value.endswith('}'))
    ):
        return ast.literal_eval(value)
    return value


frames = []
# sorted(): glob order is unspecified; sorting keeps the output reproducible.
for f in sorted(glob('raw-data/sv/*.csv')):
    df = pd.read_csv(f)
    for c in df.columns:
        df[c] = df[c].apply(_parse_literal)
    # basename/splitext instead of split('/') and replace('.csv', '') so
    # Windows separators and the suffix never leak into the gene name.
    stem = os.path.splitext(os.path.basename(f))[0]
    df['gene'] = stem.split('_')[0]
    df['alterations'] = df['alterations'].apply(lambda x: ','.join(x))
    df['drugs'] = df['drugs'].apply(lambda x: ','.join([i['drugName'] for i in x]))
    # Only the first approved indication is kept; an empty list becomes NaN.
    df['approvedIndications'] = df['approvedIndications'].apply(lambda x: x[0] if len(x) > 0 else np.nan)
    # Keep the full parsed cancer-type record before reducing it to a name.
    df['cancer_type_raw_text'] = df['levelAssociatedCancerType']
    # Use the specific name, falling back to the main type's name when it is empty.
    df['levelAssociatedCancerType'] = df['levelAssociatedCancerType'].apply(lambda x: x['name'] if x['name'] != '' else x['mainType']['name'])
    # Only the FIRST excluded cancer type's name is kept (NaN when the list is
    # empty); any further entries are dropped.
    # NOTE(review): confirm that discarding the remaining entries is intended.
    df['levelExcludedCancerTypes'] = df['levelExcludedCancerTypes'].apply(lambda x: x[0]['name'] if len(x) > 0 else np.nan)
    df['pmids'] = df['pmids'].apply(lambda x: ','.join(x))
    df['abstracts'] = df['abstracts'].apply(lambda x: ','.join([i['link'] for i in x]))
    frames.append(df)

if not frames:
    raise FileNotFoundError("no CSV files matched 'raw-data/sv/*.csv'")

# Concatenate once (instead of growing a frame inside the loop), then sort.
df_all = pd.concat(frames, axis=0).sort_values(by=['level', 'gene']).reset_index(drop=True)
print(df_all.shape)
df_all.to_csv('processed-data/oncokb_structural_variants.csv', index=False)
df_all.alterations.value_counts()

(714, 12)


alterations
Fusions                                 427
Oncogenic Mutations                     119
BCR-ABL1 Fusion                          70
Oncogenic Mutations (excluding V600)     56
PML-RARA Fusion                          21
EWSR1-FLI1 Fusion                        14
COL1A1-PDGFB Fusion                       7
Name: count, dtype: int64

In [43]:
# Stack the three processed tables and drop rows that repeat across them,
# then write the result to processed-data/oncokb_all.csv.
processed_paths = [
    'processed-data/oncokb_protein_change.csv',
    'processed-data/oncokb_structural_variants.csv',
    'processed-data/oncokb_cna.csv',
]

# Columns that define a duplicate. 'gene', 'change' and 'cancer_type_raw_text'
# are not part of the key, so rows differing only in those collapse to the
# first one encountered (keep='first', in the file order listed above).
dedup_keys = [
    'alterations',
    'drugs',
    'approvedIndications',
    'level',
    'levelAssociatedCancerType',
    'levelExcludedCancerTypes',
    'abstracts',
    'pmids',
    'description',
    'fdaLevel',
]

aggr_df = pd.concat([pd.read_csv(path) for path in processed_paths], axis=0)
aggr_df = aggr_df.drop_duplicates(subset=dedup_keys, keep='first')
aggr_df.to_csv('processed-data/oncokb_all.csv', index=False)
print(aggr_df.shape)
aggr_df.head()

(291, 13)


Unnamed: 0,alterations,drugs,approvedIndications,level,fdaLevel,levelAssociatedCancerType,levelExcludedCancerTypes,pmids,abstracts,description,gene,change,cancer_type_raw_text
0,Oncogenic Mutations,Olaparib,,LEVEL_1,LEVEL_Fda2,"Prostate Cancer, NOS",,32343890,,Olaparib is a PARP inhibitor that is FDA-appro...,ATM,LOSS,"{'id': 944, 'code': '', 'color': 'Cyan', 'name..."
1,Oncogenic Mutations,Olaparib,,LEVEL_1,LEVEL_Fda2,Prostate Cancer,,32343890,,Olaparib is a PARP inhibitor that is FDA-appro...,ATM,LOSS,"{'id': 936, 'code': '', 'color': 'Cyan', 'name..."
2,Oncogenic Mutations,"Talazoparib,Enzalutamide",,LEVEL_1,LEVEL_Fda2,"Prostate Cancer, NOS",,37285865,,"Talazoparib, a small molecule PARP inhibitor, ...",ATM,LOSS,"{'id': 944, 'code': '', 'color': 'Cyan', 'name..."
3,Oncogenic Mutations,"Talazoparib,Enzalutamide",,LEVEL_1,LEVEL_Fda2,Prostate Cancer,,37285865,,"Talazoparib, a small molecule PARP inhibitor, ...",ATM,LOSS,"{'id': 936, 'code': '', 'color': 'Cyan', 'name..."
8,Oncogenic Mutations,"Talazoparib,Enzalutamide",,LEVEL_1,LEVEL_Fda2,"Prostate Cancer, NOS",,37285865,,"Talazoparib, a small molecule PARP inhibitor, ...",ATR,LOSS,"{'id': 944, 'code': '', 'color': 'Cyan', 'name..."
