In [1]:
import numpy as np
import pandas as pd
from cell_name_mapping import name_map

In [2]:
# Read only the three columns used below from the zipped IC50 table.
ic50_columns = ["NSC", "CELL_NAME", "LOG_HI_CONCENTRATION"]
df = pd.read_csv("../../ablation_GraphPINE/data/IC50.zip", usecols=ic50_columns)

# Translate every raw cell-line name through `name_map` (imported from
# cell_name_mapping). Each name is looked up by indexing, so a name that is
# absent from the mapping fails loudly instead of turning into a missing value.
df["CELL_NAME"] = [name_map[raw_name] for raw_name in df["CELL_NAME"]]
df.head()

Unnamed: 0,NSC,LOG_HI_CONCENTRATION,CELL_NAME
0,123127,-4.6021,NCI_H23
1,123127,-4.6021,NCI_H522
2,123127,-4.6021,SF_268
3,123127,-4.6021,SF_295
4,123127,-4.6021,U251


In [3]:
# Load the pre-made train / validation / test splits in one pass.
train, valid, test = map(
    pd.read_csv,
    (
        "../../ablation_GraphPINE/data/train_IC50.csv",
        "../../ablation_GraphPINE/data/valid_IC50.csv",
        "../../ablation_GraphPINE/data/test_IC50.csv",
    ),
)

In [4]:
# Collect every drug id (NSC) and every cell-line name that occurs in any of
# the three splits, de-duplicated and sorted.
splits = (train, valid, test)
drugs = sorted(set().union(*(split["NSC"] for split in splits)))
cell_name = sorted(set().union(*(split["CELL_NAME"] for split in splits)))

In [5]:
# Keep only rows whose drug AND cell line both occur in the splits, then
# collapse repeated (drug, cell line) measurements to their mean.
# NOTE: this rebinds `df`, so re-running the cell operates on the already
# aggregated frame rather than on the raw table.
in_splits = df["NSC"].isin(drugs) & df["CELL_NAME"].isin(cell_name)
df = (
    df.loc[in_splits]
    .groupby(["NSC", "CELL_NAME"])["LOG_HI_CONCENTRATION"]
    .mean()
    .reset_index()
)
df.head()

Unnamed: 0,NSC,CELL_NAME,LOG_HI_CONCENTRATION
0,186,786_0,-4.0
1,186,A498,-3.5806
2,186,A549,-3.600571
3,186,ACHN,-4.0
4,186,BT_549,-4.0


In [6]:
# Reshape the long (NSC, CELL_NAME, value) table into a wide matrix and flip
# it so that rows are cell lines and columns are drug ids.
response_wide = df.pivot(
    index="NSC", columns="CELL_NAME", values="LOG_HI_CONCENTRATION"
)
df = response_wide.T

# Rebuild both axes from plain lists so the "NSC" / "CELL_NAME" axis names
# are dropped before the matrix is written out.
df.index = df.index.tolist()
df.columns = df.columns.tolist()

df.to_csv("Data/cell_drug.csv")
df

Unnamed: 0,186,295,721,740,750,752,755,757,762,844,...,801012,801013,801014,802113,803712,804962,807579,809693,813783,820799
786_0,-4.0,-3.5,-3.5,-3.676456,-3.475342,-3.606378,-3.1249,-4.0,-3.367571,-3.1505,...,-5.0,-5.30105,-5.30105,-4.1505,-5.0,-4.301,-4.301,-4.1249,-4.6021,-5.0
A498,-3.5806,-3.5,-3.5,-3.673421,-3.54189,-3.606288,-3.1249,-4.018717,-3.319892,-2.301,...,-5.0,-5.30105,-5.30105,-4.1505,-5.0,-4.301,-4.301,-4.1249,-4.6021,
A549,-3.600571,-3.5,-3.5,-3.698016,-3.554876,-3.605258,-3.1249,-4.085042,-3.327377,-3.1505,...,-5.0,-5.30105,-5.30105,-4.1505,-5.0,-4.301,-4.301,-4.1249,-4.6021,-5.0
ACHN,-4.0,-3.5,-3.5,-3.680634,-3.487414,-3.606425,-3.1249,-4.0,-3.3709,-3.1505,...,-5.0,-5.30105,-5.30105,-4.1505,-5.0,-4.301,-4.301,-4.1249,-4.6021,-5.0
BT_549,-4.0,-3.0,-3.5,-3.782474,-3.150525,-3.615821,-3.1249,-4.0,-3.500714,,...,-5.0,-5.30105,-5.30105,-4.1505,-5.0,-4.301,-4.301,-4.1249,-4.6021,-5.0
CAKI_1,-3.6505,-3.5,-3.5,-3.704931,-3.549743,-3.605854,-3.1249,-3.68164,-3.315872,-3.1505,...,-5.0,-5.30105,-5.30105,-4.1505,-5.0,-4.301,-4.301,-4.1249,-4.6021,-5.0
CCRF_CEM,-3.563125,-3.5,-3.5,-3.664506,-3.547364,-3.605388,-3.1249,-4.017277,-3.316533,-3.1505,...,-5.0,-5.30105,-5.30105,-4.1505,-5.0,-4.301,-4.301,-4.1249,-4.6021,-5.0
COLO205,-3.563125,-3.5,-3.5,-3.703706,-3.550857,-3.605416,-3.1249,-3.68164,-3.314706,-3.1505,...,-5.0,-5.30105,-5.30105,-4.1505,-5.0,-4.301,-4.301,-4.1249,-4.6021,-5.0
DU_145,-4.0,-3.5,-3.5,-3.827568,-3.30105,-3.614158,-3.1249,-4.0,-3.47575,,...,-5.0,-5.30105,-5.30105,-4.1505,-5.0,-4.301,-4.301,-4.1249,-4.6021,-5.0
EKVX,-3.600571,-3.5,-3.5,-3.703706,-3.549743,-3.60559,-3.1249,-4.018717,-3.316196,-3.1505,...,-5.0,-5.30105,-5.30105,-4.1505,-5.0,-4.301,-4.301,-4.1249,-4.6021,-5.0


In [23]:
# 0/1 indicator with the same shape as `df`: 1 where the (cell line, drug)
# entry is missing, 0 where a value is present. Computed once, saved, shown.
null_mask = df.isna().astype(int)
null_mask.to_csv("Data/null_mask.csv")
null_mask

Unnamed: 0,186,295,721,740,750,752,755,757,762,844,...,801012,801013,801014,802113,803712,804962,807579,809693,813783,820799
786_0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
A549,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ACHN,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BT_549,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
CAKI_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCRF_CEM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
COLO205,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DU_145,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
EKVX,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Binarise the matrix: 1 where the value is strictly below -2, else 0.
# Missing entries compare as False and therefore become 0 here; the separate
# null mask is what distinguishes them from genuine zeros.
df_bi = df.lt(-2).astype(int)
df_bi.to_csv("Data/cell_drug_binary.csv")
df_bi

Unnamed: 0,186,295,721,740,750,752,755,757,762,844,...,801012,801013,801014,802113,803712,804962,807579,809693,813783,820799
786_0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
A498,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,0
A549,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
ACHN,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
BT_549,1,1,1,1,1,1,1,1,1,0,...,1,1,1,1,1,1,1,1,1,1
CAKI_1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
CCRF_CEM,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
COLO205,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
DU_145,1,1,1,1,1,1,1,1,1,0,...,1,1,1,1,1,1,1,1,1,1
EKVX,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [14]:
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until the file is moved next to the other data inputs.
nsc_smiles_all = pd.read_csv("/Users/inouey2/Downloads/nsc_smiles.csv")

# Keep only the drugs that appear as columns of the binary response matrix.
in_response_matrix = nsc_smiles_all["NSC"].isin(df_bi.columns)
nsc2SMILES = nsc_smiles_all[in_response_matrix]
nsc2SMILES.head()

Unnamed: 0,NSC,SMILES
185,186,C[C@H]1OC=C2C(=C(C(=O)O)C(=O)C(=C2[C@@H]1C)C)O
294,295,OC(=O)CCCc1ccccc1
720,721,NC(=S)N\N=C\1/C(=O)Nc2ccccc12
739,740,CN(Cc1cnc2nc(N)nc(N)c2n1)c3ccc(cc3)C(=O)N[C@@H...
749,750,CS(=O)(=O)OCCCCOS(=O)(=O)C


In [15]:
from rdkit import Chem
from rdkit.Chem import AllChem

# Morgan fingerprint settings: neighbourhood radius and bit-vector length.
MORGAN_RADIUS = 2
MORGAN_NBITS = 920

# SMILES parser configuration.
# NOTE(review): `useChirality` and `radicalElectrons` do not appear to be
# documented SmilesParserParams fields, so these assignments presumably have
# no effect on parsing -- confirm against the installed RDKit version.
params = Chem.SmilesParserParams()
params.useChirality = True
params.radicalElectrons = 2
params.removeHs = False
params.replacements = {}

# Build one fingerprint row per drug, in the row order of `nsc2SMILES`.
mfp = []
for nsc, smiles in zip(nsc2SMILES["NSC"], nsc2SMILES["SMILES"]):
    mol = Chem.MolFromSmiles(smiles, params=params)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending drug named instead of letting the fingerprint
        # call raise an opaque argument error.
        raise ValueError(f"RDKit could not parse SMILES for NSC {nsc}: {smiles!r}")
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, MORGAN_RADIUS, nBits=MORGAN_NBITS)
    mfp.append(np.array(fp))

# Rows are indexed by NSC id; columns are bit positions 0..MORGAN_NBITS-1.
mfp = pd.DataFrame(mfp, dtype=np.float32, index=list(nsc2SMILES["NSC"]))
mfp.to_csv("Data/drug_feature.csv")
mfp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,910,911,912,913,914,915,916,917,918,919
186,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
295,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
721,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
804962,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
807579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
809693,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
813783,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Attach the stored labels to the test split, rename the key columns to the
# names used in the exported files, and keep just the three output columns.
# NOTE(review): presumably the .npy rows are in the same order as
# test_IC50.csv -- confirm against whatever produced the label file.
test_labels = np.load("../../ablation_GraphPINE/data/test_IC50_labels.npy")
test = (
    test.assign(labels=test_labels)
    .rename(columns={"NSC": "drugs", "CELL_NAME": "cells"})
    .loc[:, ["cells", "drugs", "labels"]]
)
test.to_csv("Data/testset.csv")
test

Unnamed: 0,cells,drugs,labels
0,SK_MEL_2,767354,1.0
1,HOP_62,800108,1.0
2,OVCAR_5,800937,1.0
3,OVCAR_5,654830,1.0
4,BT_549,800812,1.0
...,...,...,...
6520,SK_MEL_2,756328,1.0
6521,SK_OV_3,799339,1.0
6522,SK_MEL_2,799341,1.0
6523,SN12C,799363,1.0


In [17]:
# Read the expression table using its first column as the index, then
# transpose it so rows are cell lines and columns are genes before saving.
expression_raw = pd.read_csv(
    "../../ablation_GraphPINE/data/nci60_gene_exp.csv", index_col=0
)
gene_exp = expression_raw.transpose()
gene_exp.to_csv("Data/gene_feature.csv")
gene_exp.head()

Unnamed: 0,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
MCF7,1.92,2.89,0.03,0.03,0.21,0.0,0.0,0.0,2.46,0.0,...,8.52,22.48,0.44,1.51,5.59,0.97,0.94,6.99,2.52,3.05
MDA_MB_231,0.49,0.12,0.02,0.02,0.0,0.01,0.0,0.07,0.22,0.0,...,16.76,27.56,0.32,3.43,1.44,0.55,1.58,8.15,0.45,18.57
HS578T,3.37,1.17,0.04,0.04,0.0,0.09,0.0,0.0,0.35,0.3,...,1.97,3.22,0.16,0.34,2.11,0.14,1.42,103.53,1.27,1.65
BT_549,6.0,1.92,0.0,0.0,0.0,0.0,0.0,0.0,5.52,0.0,...,1.93,3.75,0.45,0.94,2.79,0.49,1.22,36.05,2.5,2.31
T47D,3.73,1.65,0.01,0.01,0.06,0.0,0.0,0.15,0.71,0.0,...,4.02,25.63,0.55,1.85,5.18,0.8,1.51,13.0,1.64,1.28


In [18]:
def _labelled_pairs(split, labels_path):
    """Return `split` as a (cells, drugs, labels) frame.

    The array stored at `labels_path` is attached as the `labels` column,
    `NSC` / `CELL_NAME` are renamed to `drugs` / `cells`, and only those three
    columns are kept, in that order.
    """
    return (
        split.assign(labels=np.load(labels_path))
        .rename(columns={"NSC": "drugs", "CELL_NAME": "cells"})
        .loc[:, ["cells", "drugs", "labels"]]
    )


train = _labelled_pairs(
    train, "../../ablation_GraphPINE/data/train_IC50_labels.npy"
)
valid = _labelled_pairs(
    valid, "../../ablation_GraphPINE/data/valid_IC50_labels.npy"
)

# Stack train on top of valid to form the cross-validation pool.
# NOTE(review): each split keeps its own row labels, so the saved index
# column contains repeated values -- confirm downstream readers do not rely
# on it being unique.
tmp = pd.concat([train, valid])
tmp.to_csv("Data/5-fold_CV.csv")
tmp

Unnamed: 0,cells,drugs,labels
0,NCI_H522,756364,1.0
1,SF_539,800964,1.0
2,ACHN,666096,0.0
3,SF_295,251222,0.0
4,UO_31,799346,1.0
...,...,...,...
4511,SF_539,756381,1.0
4512,MOLT_4,701996,0.0
4513,MDA_N,752,0.0
4514,NCI_H522,756303,1.0
