In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [69]:
# Read every input CSV into its own pandas data frame.
# The DepMap RNA-seq exports report expression in units of TPM (log2).
gdsc_pyr = pd.read_csv('GDSC_PYR.csv')  # GDSC pyrimethamine sensitivity data

# One DepMap RNA-seq export per gene. The order of this list must match the
# order of the names in the unpacking assignment below.
rnaseq_csv_files = [
    'UNG_19Q3.csv',      # UNG
    'TMEM173_19Q3.csv',  # TMEM173/STING
    'STAT3_19Q3.csv',    # STAT3
    'SOCS3_19Q3.csv',    # SOCS3
    'SMARCC2_19Q3.csv',  # SMARCC2
    'SMARCC1_19Q3.csv',  # SMARCC1
    'SMARCA4_19Q3.csv',  # SMARCA4/BRG1
    'SLC19A1_19Q3.csv',  # SLC19A1
    'CXCL10_19Q3.csv',   # CXCL10
    'BCL6_19Q3.csv',     # BCL6
    'APEX1_19Q3.csv',    # APEX1
    'DHFR_19Q3.csv',     # DHFR
]
(
    ung, tmem173, stat3, socs3, smarcc2, smarcc1,
    smarca4, slc19a1, cxcl10, bcl6, apex1, dhfr,
) = [pd.read_csv(csv_file) for csv_file in rnaseq_csv_files]

In [70]:
# Map each gene symbol to its RNA-seq data frame. The symbol becomes the name
# of that frame's expression column, so the merged table has one column per gene.
rnaseq_by_gene = {
    'UNG': ung,
    'TMEM173': tmem173,
    'STAT3': stat3,
    'SOCS3': socs3,
    'SMARCC2': smarcc2,
    'SMARCC1': smarcc1,
    'SMARCA4': smarca4,
    'SLC19A1': slc19a1,
    'CXCL10': cxcl10,
    'BCL6': bcl6,
    'APEX1': apex1,
    'DHFR': dhfr,
}

# Make a list of RNA-seq data frames (same objects, same order as the mapping)
df_list = list(rnaseq_by_gene.values())

# Rename the generic 'TPM (log2)' column to the gene symbol in every frame.
# The frames are modified in place because later cells refer to them by their
# individual names (ung, stat3, ...). rename() silently skips labels that are
# absent, so this step is safe to re-run.
for gene, df in rnaseq_by_gene.items():
    df.rename(columns={'TPM (log2)': gene}, inplace=True)

# Rename Primary Disease column to Cancer Type for ung data frame only
ung.rename(columns={'Primary Disease': 'Cancer Type'}, inplace=True)

# Delete DepMap ID and Lineage columns from RNA-seq data frames.
# errors='ignore' keeps the cell idempotent: without it, re-running the cell
# raises KeyError because the columns were already removed by the first run.
for df in df_list:
    df.drop(columns=['DepMap ID', 'Lineage'], inplace=True, errors='ignore')

In [71]:
# Delete TCGA classification, Tissue, and Tissue sub-type columns from gdsc_pyr data frame.
# errors='ignore' keeps the cell idempotent: without it, re-running the cell
# raises KeyError because the columns were already removed by the first run.
gdsc_pyr.drop(
    columns=['TCGA classification', 'Tissue', 'Tissue sub-type'],
    inplace=True,
    errors='ignore',
)

# Remove '-' from cell line names and force to uppercase. These names are the
# left-hand join key when merging with the RNA-seq frames ('Cell Line Name').
# regex=False makes the replacement an explicit literal match instead of
# relying on the pandas-version-dependent default.
gdsc_pyr['Cell line'] = (
    gdsc_pyr['Cell line']
    .str.replace('-', '', regex=False)
    .str.upper()
)

In [72]:
# Merge gdsc_pyr with RNA-seq data frames
def _merge_expression(left, rnaseq, keep):
    """Inner-join selected columns of one RNA-seq frame onto ``left``.

    Parameters
    ----------
    left : pandas.DataFrame
        Accumulated table; must contain the 'Cell line' key column.
    rnaseq : pandas.DataFrame
        RNA-seq frame; must contain 'Cell Line Name' and every label in ``keep``.
    keep : list of str
        Columns of ``rnaseq`` to carry into the result.

    Returns
    -------
    pandas.DataFrame
        ``left`` restricted to cell lines present in ``rnaseq``, with the
        ``keep`` columns appended and the redundant right-hand key removed.
    """
    right = rnaseq[['Cell Line Name'] + keep]
    merged = pd.merge(
        left=left,
        right=right,
        left_on='Cell line',
        right_on='Cell Line Name',
        how='inner',
    )
    # 'Cell Line Name' duplicates 'Cell line' after the join; dropping it here
    # stops it from colliding with the next frame's key column.
    return merged.drop(columns='Cell Line Name')


# Every RNA-seq frame shares the 'Cell Line Name' and 'Primary Disease'
# columns. Merging the full frames repeatedly makes the default '_x'/'_y'
# suffixes collide with suffixed columns created by earlier merges, which
# produces duplicate column labels (and raises MergeError on recent pandas).
# Only the columns that are used downstream are therefore merged:
#   - each gene's expression column,
#   - 'Cancer Type' from ung (dropped again in the next cell),
#   - 'Primary Disease' from dhfr (the copy that survived the original merges).
expression_columns = [
    (ung, ['UNG', 'Cancer Type']),
    (tmem173, ['TMEM173']),
    (stat3, ['STAT3']),
    (socs3, ['SOCS3']),
    (smarcc2, ['SMARCC2']),
    (smarcc1, ['SMARCC1']),
    (smarca4, ['SMARCA4']),
    (slc19a1, ['SLC19A1']),
    (cxcl10, ['CXCL10']),
    (bcl6, ['BCL6']),
    (apex1, ['APEX1']),
]

# temp_merge holds everything except DHFR, as before; final adds DHFR.
temp_merge = gdsc_pyr
for rnaseq, keep in expression_columns:
    temp_merge = _merge_expression(temp_merge, rnaseq, keep)
final = _merge_expression(temp_merge, dhfr, ['DHFR', 'Primary Disease'])

In [75]:
# Remove duplicate columns from final data frame.
# Merge suffixes ('_x' / '_y') mark columns that existed on both sides of a
# merge. Collect all of them first and drop them in a single call: dropping
# inside the loop re-submits labels that are already gone and raises KeyError
# on the following iteration.
to_drop = [
    column for column in final.columns
    if column.endswith(('_x', '_y'))
]
final = final.drop(columns=to_drop)

# Remove Cancer Type column from final data frame
final = final.drop(columns='Cancer Type')

# Reorder columns in final data frame: cell line identity and drug response
# first, then the gene expression columns in alphabetical order.
# reindex() also discards any column that is not listed here.
leading_columns = ['Cell line', 'Primary Disease', 'IC50', 'AUC']
gene_columns = [
    'APEX1',
    'BCL6',
    'CXCL10',
    'DHFR',
    'SLC19A1',
    'SMARCA4',
    'SMARCC1',
    'SMARCC2',
    'SOCS3',
    'STAT3',
    'TMEM173',
    'UNG',
]
final = final.reindex(columns=leading_columns + gene_columns)

In [76]:
# Display the merged table: cell line, primary disease, IC50/AUC, then one
# expression column per gene.
final

Unnamed: 0,Cell line,Primary Disease,IC50,AUC,APEX1,BCL6,CXCL10,DHFR,SLC19A1,SMARCA4,SMARCC1,SMARCC2,SOCS3,STAT3,TMEM173,UNG
0,LS411N,Colon/Colorectal Cancer,0.722230,0.420598,7.846932,2.454176,0.356144,5.849999,3.544733,6.252476,5.146492,5.495695,1.298658,2.100978,3.129283,6.725196
1,GIMEN,Neuroblastoma,1.431816,0.519293,7.530289,5.113534,0.000000,4.176323,1.863938,5.381975,4.894818,5.582255,6.113117,5.920293,2.937344,4.146492
2,697,Leukemia,1.565345,0.538762,8.614636,4.211012,0.000000,6.695019,4.401221,7.706254,6.159871,5.975676,0.056584,4.422906,2.805292,5.455163
3,ES2,Ovarian Cancer,1.912182,0.569532,7.926889,3.968091,0.000000,4.661065,4.155425,6.382494,4.948134,4.862451,4.458776,3.777157,3.958843,4.548437
4,LAMA84,Leukemia,2.059533,0.578236,7.640317,3.606442,0.310340,5.381283,3.694880,6.350144,5.408712,5.711220,5.046578,6.170726,7.203397,4.485427
5,OSRC2,Kidney Cancer,2.059810,0.567375,7.126601,4.761285,0.000000,5.414474,3.876762,5.816600,4.264536,6.022368,3.880686,5.850749,5.150153,5.233044
6,COLO829,Skin Cancer,2.358134,0.585737,7.265381,3.649615,0.028569,3.532317,3.799087,6.296824,5.105594,4.895303,0.650765,5.448901,4.739848,4.516646
7,SJSA1,Bone Cancer,2.437705,0.597080,7.247358,2.397803,0.807355,6.373996,4.497612,6.535742,4.796494,5.371907,4.531693,5.926474,6.200850,5.022812
8,UACC257,Skin Cancer,2.575552,0.606783,7.806324,2.070389,0.000000,5.712045,4.928844,7.171427,4.538538,4.934045,1.021480,5.093814,5.224195,5.979568
9,KM12,Colon/Colorectal Cancer,2.605882,0.609712,7.324901,2.671293,0.042644,5.683416,2.910733,5.418865,4.498251,4.915999,1.427606,3.670161,0.400538,4.503349


1. What is the relationship between IC50 and AUC?
2. Are certain types of cancers more sensitive to pyrimethamine?
3. Does pyrimethamine sensitivity correlate with the expression of individual genes? Does this change depending on the IC50-AUC relationship?