In [1]:
import pandas as pd 
import numpy as np
import os
from tqdm import tqdm
from glob import glob
import requests
import ast
from bs4 import BeautifulSoup
os.makedirs('processed-data', exist_ok=True)

In [19]:
#copy number alterations
df_all = pd.DataFrame()
for f in glob('raw-data/cna/*.csv'):
    df = pd.read_csv(f)
    for c in df.columns:
        df[c] = df[c].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) and (x.startswith('[') and x.endswith(']')) or (x.startswith('{') and x.endswith('}')) else x)
    df['gene'] = f.split('/')[-1].split('_')[0]
    df['alterations'] = df['alterations'].apply(lambda x: ','.join(x))
    df['drugs'] = df['drugs'].apply(lambda x: ','.join([i['drugName'] for i in x]))
    df['approvedIndications'] = df['approvedIndications'].apply(lambda x: x[0] if len(x) > 0 else np.nan)
    df['cancer_type_raw_text'] = df['levelAssociatedCancerType']
    df['levelAssociatedCancerType'] = df['levelAssociatedCancerType'].apply(lambda x: x['name'] if x['name'] != '' else x['mainType']['name'])
    df['levelExcludedCancerTypes'] = df['levelExcludedCancerTypes'].apply(lambda x: ','.join(x))
    df['pmids'] = df['pmids'].apply(lambda x: ','.join(x))
    df['abstracts'] = df['abstracts'].apply(lambda x: ','.join([i['link'] for i in x]))
    df_all = pd.concat([df_all, df], axis=0)
df_all = df_all.reset_index(drop = True).sort_values(by = ['level', 'gene'])
print(df_all.shape)
df_all.to_csv('processed-data/oncokb_cna.csv', index=False)
df_all.alterations.value_counts()

(398, 12)


alterations
Oncogenic Mutations                                                                   264
Amplification                                                                         126
Deletion                                                                                6
Oncogenic Mutations (excluding Y646S, Y646H, Y646C, Y646F, Y646N, A682G and A692V)      2
Name: count, dtype: int64

In [20]:
#protein change
df_all = pd.DataFrame()
for f in glob('raw-data/protein_change/*.csv'):
    df = pd.read_csv(f)
    for c in df.columns:
        df[c] = df[c].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) and (x.startswith('[') and x.endswith(']')) or (x.startswith('{') and x.endswith('}')) else x)
    df['gene'] = f.split('/')[-1].split('_')[0]
    df['change'] =  f.split('/')[-1].split('_')[1].replace('.csv', '')
    df['alterations'] = df['alterations'].apply(lambda x: ','.join(x))
    df['drugs'] = df['drugs'].apply(lambda x: ','.join([i['drugName'] for i in x]))
    df['approvedIndications'] = df['approvedIndications'].apply(lambda x: x[0] if len(x) > 0 else np.nan)
    df['cancer_type_raw_text'] = df['levelAssociatedCancerType']
    df['levelAssociatedCancerType'] = df['levelAssociatedCancerType'].apply(lambda x: x['name'] if x['name'] != '' else x['mainType']['name'])
    df['levelExcludedCancerTypes'] = df['levelExcludedCancerTypes'].apply(lambda x: x)
    df['pmids'] = df['pmids'].apply(lambda x: ','.join(x))
    df['abstracts'] = df['abstracts'].apply(lambda x: ','.join([i['link'] for i in x]))
    df_all = pd.concat([df_all, df], axis=0)
df_all = df_all.reset_index(drop = True).sort_values(by = ['level', 'gene'])
print(df_all.shape)
df_all.to_csv('processed-data/oncokb_protein_change.csv', index=False)
df_all.alterations.value_counts()

(1667, 13)


alterations
Oncogenic Mutations                                                                                                                                                                                                              735
D368N,D275V,D275G,D275A,A463V,A463T,A463D,A288V,S461L,K425R,V464A,V411L,P286R,P286S,P286H,S459F,L424V,L424I,F367S,S297F,S297Y,A456P,P436R,M295R,V411M,M444K,D368Y,F367V,S459Y,P286L,Y458H,Y458C,S461T,S461P,P436S,P436H,F367L    296
Oncogenic Mutations (excluding V600)                                                                                                                                                                                              48
R132S,R132L,R132C,R132H,R132G                                                                                                                                                                                                     35
D402N,L474P,S478N,E318K                                                 

In [21]:
#structural variants
df_all = pd.DataFrame()
for f in glob('raw-data/sv/*.csv'):
    df = pd.read_csv(f)
    for c in df.columns:
        df[c] = df[c].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) and (x.startswith('[') and x.endswith(']')) or (x.startswith('{') and x.endswith('}')) else x)
    df['gene'] = f.split('/')[-1].split('_')[0].replace('.csv','')
    #df['change'] =  f.split('/')[-1].split('_')[1].replace('.csv', '')
    df['alterations'] = df['alterations'].apply(lambda x: ','.join(x))
    df['drugs'] = df['drugs'].apply(lambda x: ','.join([i['drugName'] for i in x]))
    df['approvedIndications'] = df['approvedIndications'].apply(lambda x: x[0] if len(x) > 0 else np.nan)
    df['cancer_type_raw_text'] = df['levelAssociatedCancerType']
    df['levelAssociatedCancerType'] = df['levelAssociatedCancerType'].apply(lambda x: x['name'] if x['name'] != '' else x['mainType']['name'])
    df['levelExcludedCancerTypes'] = df['levelExcludedCancerTypes'].apply(lambda x: x[0]['name'] if len(x) > 0 else np.nan)#x['name'] != '' else x['mainType']['name'])
    df['pmids'] = df['pmids'].apply(lambda x: ','.join(x))
    df['abstracts'] = df['abstracts'].apply(lambda x: ','.join([i['link'] for i in x]))
    df_all = pd.concat([df_all, df], axis=0)
df_all = df_all.reset_index(drop = True).sort_values(by = ['level', 'gene'])
print(df_all.shape)
df_all.to_csv('processed-data/oncokb_structural_variants.csv', index=False)
df_all.alterations.value_counts()

(714, 12)


alterations
Fusions                                 427
Oncogenic Mutations                     119
BCR-ABL1 Fusion                          70
Oncogenic Mutations (excluding V600)     56
PML-RARA Fusion                          21
EWSR1-FLI1 Fusion                        14
COL1A1-PDGFB Fusion                       7
Name: count, dtype: int64

In [22]:
aggr_df = pd.concat([
    pd.read_csv('processed-data/oncokb_protein_change.csv'),
    pd.read_csv('processed-data/oncokb_structural_variants.csv'),
    pd.read_csv('processed-data/oncokb_cna.csv')], axis=0)
aggr_df = aggr_df.drop_duplicates(subset = ['alterations', 'drugs', 'approvedIndications', 'level', 'levelAssociatedCancerType', 'levelExcludedCancerTypes', 'abstracts','pmids', 'description', 'fdaLevel'], keep = 'first')
aggr_df.to_csv('processed-data/oncokb_all.csv', index=False)
print(aggr_df.shape)
aggr_df.head()

(625, 13)


Unnamed: 0,alterations,drugs,approvedIndications,level,fdaLevel,levelAssociatedCancerType,levelExcludedCancerTypes,pmids,abstracts,description,gene,change,cancer_type_raw_text
0,T315I,Ponatinib,,LEVEL_1,LEVEL_Fda2,B-Lymphoblastic Leukemia/Lymphoma,[],241804942319022129567798,,Ponatinib is a small molecule kinase inhibitor...,ABL1,t315i,"{'id': 95, 'code': 'BLL', 'color': 'LimeGreen'..."
1,T315I,Ponatinib,,LEVEL_1,LEVEL_Fda2,Chronic Myelogenous Leukemia,[],241804942319022119878872,,Ponatinib is a small molecule kinase inhibitor...,ABL1,t315i,"{'id': 560, 'code': 'CML', 'color': 'LightSalm..."
2,T315I,Asciminib,,LEVEL_1,LEVEL_Fda2,Chronic Myelogenous Leukemia,[],,https://ash.confex.com/ash/2020/webprogram/Pap...,Asciminib is a STAMP (Specifically Targeting t...,ABL1,t315i,"{'id': 560, 'code': 'CML', 'color': 'LightSalm..."
3,E17K,"Capivasertib,Fulvestrant",,LEVEL_1,LEVEL_Fda2,Breast Cancer,[],37256976,,"Capivasertib is an orally available, ATP-compe...",AKT1,e17k,"{'id': 873, 'code': '', 'color': 'HotPink', 'n..."
4,V600E,Dabrafenib,Dabrafenib is FDA-approved for BRAF V600E muta...,LEVEL_1,LEVEL_Fda2,Melanoma,[],226083382305196622735384,,Dabrafenib is an orally bioavailable RAF inhib...,BRAF,v600e,"{'id': 453, 'code': 'MEL', 'color': 'Black', '..."


In [23]:
aggr_df.level.value_counts()

level
LEVEL_1     221
LEVEL_2     140
LEVEL_3A    104
LEVEL_4      71
LEVEL_R2     61
LEVEL_R1     28
Name: count, dtype: int64