# Preparation of CCLE data

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pathlib import Path
import janitor

# Plot styling.
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the old
# names were later removed. Use whichever name this matplotlib provides; if neither
# is listed, fall back to the original name so the failure mode is unchanged.
plt.style.use(
    next(
        (
            style
            for style in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
            if style in plt.style.available
        ),
        'seaborn-whitegrid',
    )
)
plt.rcParams['figure.figsize'] = (8.0, 5.0)
plt.rcParams['axes.titlesize'] = 18
plt.rcParams['axes.labelsize'] = 15

# Input and output directories, relative to the notebook's working directory.
data_dir = Path('../data')
modeling_data_dir = Path('../modeling_data')

In [2]:
# Load the CCLE mutation calls and clean the column names with pyjanitor's
# `clean_names()`.
# low_memory=False: the saved output of this cell ends with the tail of a warning
# raised during the read -- presumably pandas' mixed-type DtypeWarning, which comes
# from inferring dtypes chunk by chunk. Reading the file in one pass gives each
# column a single, consistently inferred dtype.
mutation_df = pd.read_csv(
    data_dir / 'CCLE_mutation_data.csv',
    low_memory=False,
).clean_names()
mutation_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,hugo_symbol,entrez_gene_id,ncbi_build,chromosome,start_position,end_position,strand,variant_classification,variant_type,reference_allele,...,iscosmichotspot,cosmichscnt,exac_af,wes_ac,sangerwes_ac,sangerrecalibwes_ac,rnaseq_ac,hc_ac,rd_ac,wgs_ac
0,AGRN,375790,37,1,979072,979072,+,Silent,SNP,A,...,False,0,,27:24,9:10,9:12,104:20,,,15:13
1,ATAD3A,55210,37,1,1459233,1459233,+,Silent,SNP,A,...,False,0,8e-06,29:49,33:40,30:38,315:308,,,17:31
2,NADK,65220,37,1,1685635,1685635,+,Missense_Mutation,SNP,G,...,False,0,,25:39,16:19,17:20,176:266,,,14:23
3,PLCH2,9651,37,1,2436128,2436128,+,Missense_Mutation,SNP,G,...,False,0,,9:20,19:22,20:20,,,,23:15
4,LRRC47,57470,37,1,3703695,3703695,+,Silent,SNP,G,...,False,0,3.3e-05,19:21,7:19,8:17,87:104,,,11:16


In [3]:
# Columns kept for downstream work: sample identifier, gene symbol, genomic
# position, and the variant annotation fields.
mutation_columns = [
    'tumor_sample_barcode',
    'hugo_symbol',
    'chromosome',
    'start_position',
    'end_position',
    'variant_classification',
    'variant_type',
    'protein_change',
]
# Narrow the table to exactly these columns, in this order.
mutation_df = mutation_df.loc[:, mutation_columns]

In [4]:
# KRAS codons treated as hotspots in this analysis.
hotspot_codons = [12, 13, 59, 61, 146]
# Match a hotspot codon only as a complete number. The look-arounds reject a digit
# on either side, so 'p.G12D' matches while e.g. 'p.A112V' or 'p.D126N' (which
# merely contain the digits "12") do not. Built from `hotspot_codons` so the list
# and the pattern cannot drift apart.
hotspot_codons_re = r'(?<!\d)(?:' + '|'.join(map(str, hotspot_codons)) + r')(?!\d)'

# One row per cell line with its KRAS hotspot missense allele(s); several distinct
# alleles in the same cell line are joined with ';'.
kras_mutations = (
    mutation_df
    .pipe(lambda x: x[x.hugo_symbol == 'KRAS'])
    .pipe(lambda x: x[x.variant_classification == 'Missense_Mutation'])
    # na=False: a missing protein_change counts as "no match" instead of putting
    # NaN into the boolean mask.
    .pipe(lambda x: x[x.protein_change.str.contains(hotspot_codons_re, na=False)])
    [['tumor_sample_barcode', 'protein_change']]
    .drop_duplicates()
    .rename({'tumor_sample_barcode': 'cell_line'}, axis=1)
    # Select the column explicitly so ';'.join receives the strings of each group
    # directly, independent of how pandas dispatches a callable on a whole frame.
    .groupby('cell_line')['protein_change']
    .agg(';'.join)
    # Back to a one-column DataFrame indexed by cell_line, as before.
    .to_frame()
)
kras_mutations

Unnamed: 0_level_0,protein_change
cell_line,Unnamed: 1_level_1
A427_LUNG,p.G12D
A549_LUNG,p.G12S
AGS_STOMACH,p.G12D
AMO1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,p.A146T
ASPC1_PANCREAS,p.G12D
...,...
SW948_LARGE_INTESTINE,p.Q61L
TCCPAN2_PANCREAS,p.G12R
TOV21G_OVARY,p.G13C
UMUC3_URINARY_TRACT,p.G12C


In [5]:
# Show the cell lines whose entry contains ';', i.e. more than one joined allele.
kras_mutations.loc[lambda df: df['protein_change'].str.contains(';')]

Unnamed: 0_level_0,protein_change
cell_line,Unnamed: 1_level_1
NCIH2291_LUNG,p.G12V;p.G12C


In [7]:
# Create the output directory if it is missing so the writes below do not fail
# on a fresh checkout.
modeling_data_dir.mkdir(parents=True, exist_ok=True)

# Column-reduced mutation table; the row index is not written.
mutation_df.to_csv(modeling_data_dir / 'ccle_mutations.csv', index=False)
# index=True writes the `cell_line` index as the first CSV column.
kras_mutations.to_csv(modeling_data_dir / 'kras_mutants.csv', index=True)