In [1]:
import pandas as pd
import numpy as np 
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import json
import random

In [2]:
# Drug response matrix: one row per cell line (ARXSPAN_ID) and one column per drug.
# NOTE(review): the -99999.0 entries look like a missing-value placeholder (they are
# dropped later by the `> 0` filter) — confirm against the data source.
drug_response_data = pd.read_csv("/data/yingfei/cancer_data/cell_drug_auc_final_1111.csv")
print(drug_response_data.shape)
drug_response_data.head()

(692, 386)


Unnamed: 0,ARXSPAN_ID,JW-7-24-1,KIN001-260,NSC-87877,GNE-317,NAVITOCLAX,PLX-4720,ERK5-IN-1,VX-11E,TGX-221,...,BIX02189,ISPINESIB MESYLATE,KIN001-135,KIN001-206,KIN001-236,KIN001-266,LUMINESPIB,NUTLIN-3A,SGC0946,SL 0101-1
0,ACH-000001,0.778432,0.951321,0.840287,-99999.0,-99999.0,0.93641,0.891908,0.402122,0.6596,...,0.982272,0.733492,0.072277,0.983536,-99999.0,0.651746,0.982084,0.893777,0.983861,0.933035
1,ACH-000002,0.788327,0.87476,0.760137,0.905754,0.930764,0.991068,0.278288,0.972905,0.558774,...,0.899628,0.604228,0.046061,0.918265,0.865628,-99999.0,0.985921,0.839175,0.988639,0.533204
2,ACH-000004,0.73763,0.938733,0.835441,0.808964,0.966133,0.983552,-99999.0,0.993912,0.596027,...,0.965245,0.782295,0.366912,0.844979,0.761323,-99999.0,0.980945,0.946694,0.992605,0.761956
3,ACH-000006,0.176396,0.571569,0.581729,0.585938,0.95851,0.935574,0.600428,-99999.0,0.596056,...,0.834158,0.60536,0.12493,0.813384,0.782187,-99999.0,0.971058,0.871277,-99999.0,0.49502
4,ACH-000007,0.569751,0.755449,0.954924,0.939587,0.965638,0.932666,-99999.0,0.956481,0.838692,...,0.927519,0.812955,0.71978,0.815099,0.795736,0.860768,0.983317,0.888067,0.987564,0.787948


In [3]:
# Cell-line mutation matrix: one row per cell line (DepMap_ID) and one column per gene.
# A value of 1 is what the record-building loop treats as "mutated".
cell_line_mutation = pd.read_csv("/data/yingfei/cancer_data/mutations_raw_324_final.csv")
print(cell_line_mutation.shape)
cell_line_mutation.head()

(692, 325)


Unnamed: 0,DepMap_ID,ABL1,ACVR1B,AKT1,AKT2,AKT3,ALK,ALOX12B,FAM123B,APC,...,VEGFA,VHL,WHSC1,WHSC1L1,WT1,XPO1,XRCC2,ZNF217,ZNF703,TERC
0,ACH-000001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
1,ACH-000002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,ACH-000004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,ACH-000006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,ACH-000007,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [4]:
# Drug name -> SMILES lookup table. The file is read with header=None, so every line is
# treated as data and the two column names are supplied here.
drug_smiles_data = pd.read_csv("/data/yingfei/cancer_data/drug_smiles.csv", header = None, names = ['drug_name', 'smiles'])
print(drug_smiles_data.shape)
drug_smiles_data.head()

(408, 2)


Unnamed: 0,drug_name,smiles
0,JW-7-24-1,COC1=CC(=CC(=C1)C2=CC3=C4C(=CN=C3C=C2)C=CC(=O)...
1,KIN001-260,C1CC1COC2=CC=CC(=O)C2=C3C=C(C(=C(N3)N)C#N)C4CC...
2,NSC-87877,C1=CC2=C(C(=O)C(=NNC3=CC4=C(C=C3)C=C(C=C4)S(=O...
3,GNE-317,CC1=C(SC2=C1N=C(N=C2N3CCOCC3)C4=CN=C(N=C4)N)C5...
4,NAVITOCLAX,CC1(CCC(=C(C1)CN2CCN(CC2)C3=CC=C(C=C3)C(=O)NS(...


In [5]:
### drug sensitivity task [given the prompt with a cell line and 1 candidate drug -> predict sensitive/resistant and provide reasoning]

### Input Prompt:
# Think step by step and decide in a single word reflecting the drug sensitivity of the drug on the cell line with given mutations: [Sensitive/Resistant], [Reasoning]
# Drug and cell line mutations: 
# The drug is AGI-6780. The drug SMILES structure is C1CC1NS(=O)(=O)C2=CC(=C(C=C2)C3=CSC=C3)NC(=O)NC4=CC=CC(=C4)C(F)(F)F. The drug target is IDH2(R140Q). The drug target pathway is Metabolism. 
# The mutations of the cell line are NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, WHSC1L1.
# Drug Sensitivity: ?

### Data needed for prompt: cell line mutation data, drug information (drug name, drug SMILES structure, target pathway [KEGG/database])
### Data needed for output: cell line drug response prediction (drug with lower AUDRC value indicates better response)

### Intermediate dataset: Cell line name, mutation status columns, Drug 1 name, Drug 1 SMILES, Drug 1 target pathway, Drug 1 response, Drug 2 name, ...
### Initialize the data
# One independent, initially empty list per output column. The key order below is the
# column order of any DataFrame built from this dict.
interm_data = {
    column: []
    for column in (
        'cell_id', 'mut_status',
        'drug_name', 'drug_SMILES', 'drug_target', 'drug_target_path',
        'drug_response', 'threshold_high', 'threshold_low', 'sensitivity',
    )
}

In [6]:
# Per-cell-line cut-offs: the 95th / 5th percentile of that cell line's valid responses.
threshold_columns = ['threshold_high', 'threshold_low']
# Column 0 is the cell-line ID. The threshold columns are excluded too, so re-running
# this cell does not feed previously computed thresholds back into the quantiles.
drug_response_subset = (
    drug_response_data[drug_response_data.columns[1:]]
    .drop(columns=threshold_columns, errors='ignore')
    .copy()
)
# Non-positive entries (e.g. the -99999.0 values in the raw table) become NaN here and
# are therefore ignored by quantile().
valid_responses = drug_response_subset.where(drug_response_subset > 0)
# can change to other threshold
drug_response_data['threshold_high'] = valid_responses.quantile(0.95, axis = 1)
drug_response_data['threshold_low'] = valid_responses.quantile(0.05, axis = 1)
drug_response_data.head()

Unnamed: 0,ARXSPAN_ID,JW-7-24-1,KIN001-260,NSC-87877,GNE-317,NAVITOCLAX,PLX-4720,ERK5-IN-1,VX-11E,TGX-221,...,KIN001-135,KIN001-206,KIN001-236,KIN001-266,LUMINESPIB,NUTLIN-3A,SGC0946,SL 0101-1,threshold_high,threshold_low
0,ACH-000001,0.778432,0.951321,0.840287,-99999.0,-99999.0,0.93641,0.891908,0.402122,0.6596,...,0.072277,0.983536,-99999.0,0.651746,0.982084,0.893777,0.983861,0.933035,0.984736,0.510903
1,ACH-000002,0.788327,0.87476,0.760137,0.905754,0.930764,0.991068,0.278288,0.972905,0.558774,...,0.046061,0.918265,0.865628,-99999.0,0.985921,0.839175,0.988639,0.533204,0.988788,0.348857
2,ACH-000004,0.73763,0.938733,0.835441,0.808964,0.966133,0.983552,-99999.0,0.993912,0.596027,...,0.366912,0.844979,0.761323,-99999.0,0.980945,0.946694,0.992605,0.761956,0.992118,0.391353
3,ACH-000006,0.176396,0.571569,0.581729,0.585938,0.95851,0.935574,0.600428,-99999.0,0.596056,...,0.12493,0.813384,0.782187,-99999.0,0.971058,0.871277,-99999.0,0.49502,0.977434,0.264454
4,ACH-000007,0.569751,0.755449,0.954924,0.939587,0.965638,0.932666,-99999.0,0.956481,0.838692,...,0.71978,0.815099,0.795736,0.860768,0.983317,0.888067,0.987564,0.787948,0.985497,0.566427


In [7]:
### Fill in data features
# Builds one record per (cell line, drug) pair whose response lies outside the cell
# line's [threshold_low, threshold_high] band; pairs inside the band are skipped.
gene_columns = cell_line_mutation.columns[1:]
# Select the drug columns by name rather than by position (`columns[1:-2]`), so the
# selection does not depend on the threshold columns being exactly the last two.
drug_columns = drug_response_data.columns[1:].drop(['threshold_high', 'threshold_low'])

# Start from empty lists so that re-running this cell does not append duplicate records.
for key in interm_data:
    interm_data[key] = []

# The mutation string depends only on the cell line, so build it once per cell line
# (keyed by ID) instead of once per (cell line, drug) pair. For a repeated ID the
# first row wins.
mut_status_by_cell = {}
for _, mutation_row in cell_line_mutation.iterrows():
    mut_status_by_cell.setdefault(
        mutation_row['DepMap_ID'],
        ', '.join(gene for gene in gene_columns if mutation_row[gene] == 1),
    )

for i in tqdm(range(len(drug_response_data))):
    # Join the two tables on the cell-line ID instead of assuming identical row order.
    cell_id = drug_response_data.loc[i, 'ARXSPAN_ID']
    if cell_id not in mut_status_by_cell:
        raise KeyError(f"No mutation profile found for cell line {cell_id!r}")
    mut_status = mut_status_by_cell[cell_id]
    threshold_high = drug_response_data.loc[i, 'threshold_high']
    threshold_low = drug_response_data.loc[i, 'threshold_low']
    for drug_name, drug_response in drug_response_data.loc[i, drug_columns].items():
        # Non-positive (and NaN) responses are not usable measurements: skip them.
        if not drug_response > 0:
            continue
        if drug_response > threshold_high: # resistant
            sensitivity = "Resistant"
        elif drug_response < threshold_low:
            sensitivity = "Sensitive"
        else:
            continue
        interm_data['sensitivity'].append(sensitivity)
        interm_data['cell_id'].append(cell_id)
        interm_data['mut_status'].append(mut_status)
        interm_data['drug_name'].append(drug_name)
        interm_data['drug_response'].append(drug_response)
        interm_data['threshold_high'].append(threshold_high)
        interm_data['threshold_low'].append(threshold_low)

100%|████████████████████████████████████████████████████████████████████████████████████████| 692/692 [02:36<00:00,  4.42it/s]


In [8]:
def drug_info_ws_v2(drug_name, timeout=30): ### faster
    """Look up a drug's target and target pathway via the cancerrxgene.org search page.

    The 3rd and 4th cells of the first table on the result page are returned as the
    target and the pathway.
    NOTE(review): the first table row is not checked against `drug_name`, so a fuzzy
    search hit for a different compound would be accepted — confirm this is acceptable.

    Args:
        drug_name: Name to search for. It is sent as a query parameter so that spaces,
            '+', parentheses, etc. are URL-encoded correctly.
        timeout: Seconds to wait for the server before giving up on the request.

    Returns:
        {'name': drug_name, 'target': ..., 'pathway': ...} on success, or a dict with
        all three values set to "" when the lookup fails for any reason.
    """
    try:
        response = requests.get(
            'https://www.cancerrxgene.org/search',
            params={'query': drug_name},
            timeout=timeout,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        drug_info_table = soup.find_all('table')[0].find_all('td')
        drug_target = drug_info_table[2].text.strip()
        drug_target_pathway = drug_info_table[3].text.strip()
        return {'name': drug_name, 'target': drug_target, 'pathway': drug_target_pathway}
    except Exception:
        # Deliberately best-effort: callers detect a failed lookup through the empty
        # 'target' value, so network errors and missing result tables must not raise.
        return {'name': "", 'target': "", 'pathway': ""}

In [9]:
### Drug features (dict)
# drug_info_dict maps each drug name to its SMILES string, target and target pathway.
# not_avail_drug_lst collects the drugs whose web lookup returned no target.
drug_info_dict = {}
not_avail_drug_lst = []
# Index SMILES by drug name once. For a duplicated name the first row wins, which
# matches the previous `.values[0]` lookup.
smiles_by_drug = drug_smiles_data.drop_duplicates('drug_name').set_index('drug_name')['smiles']
for drug_name in tqdm(drug_columns):
    # Fail with a clear message before spending time on the web lookup.
    if drug_name not in smiles_by_drug.index:
        raise KeyError(f"No SMILES entry for drug {drug_name!r} in drug_smiles_data")
    smiles = smiles_by_drug.loc[drug_name]
    drug_info_data = drug_info_ws_v2(drug_name)
    drug_info = drug_info_dict.setdefault(drug_name, {})
    drug_info['SMILES'] = smiles
    if drug_info_data['target'] == "":
        # Lookup failed: always store 'Unknown'. Only the bookkeeping list is guarded
        # against duplicates, so a repeated failure can no longer fall through to the
        # else branch and store empty strings.
        if drug_name not in not_avail_drug_lst:
            not_avail_drug_lst.append(drug_name)
        drug_info['target'] = 'Unknown'
        drug_info['target_pathway'] = 'Unknown'
    else:
        drug_info['target'] = drug_info_data['target']
        drug_info['target_pathway'] = drug_info_data['pathway']

100%|████████████████████████████████████████████████████████████████████████████████████████| 385/385 [07:39<00:00,  1.19s/it]


In [10]:
# Persist the collected drug metadata as JSON in the working directory.
with open("drug_info_dict.json", "w") as file:
    serialized_drug_info = json.dumps(drug_info_dict)
    file.write(serialized_drug_info)

In [11]:
# Read the saved drug metadata back from the JSON file into drug_info_json.
with open("drug_info_dict.json","r") as file:
    drug_info_json = json.loads(file.read())

In [12]:
# Attach SMILES / target / pathway to every record, in the order of
# interm_data['drug_name']. The lists are rebuilt by assignment instead of appended
# to, so re-running this cell cannot make them longer than the other columns.
record_drug_names = interm_data['drug_name']
interm_data['drug_SMILES'] = [drug_info_dict[name]['SMILES'] for name in record_drug_names]
interm_data['drug_target'] = [drug_info_dict[name]['target'] for name in record_drug_names]
interm_data['drug_target_path'] = [drug_info_dict[name]['target_pathway'] for name in record_drug_names]

In [13]:
# Assemble the per-record lists into a table, one column per interm_data key.
# pandas requires every list to have the same length here.
interm_df = pd.DataFrame(interm_data)
print(interm_df.shape)
interm_df.head()

(22940, 10)


Unnamed: 0,cell_id,mut_status,drug_name,drug_SMILES,drug_target,drug_target_path,drug_response,threshold_high,threshold_low,sensitivity
0,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",VX-11E,CC1=CN=C(NC2=C(Cl)C=C(F)C=C2)N=C1C3=CNC(C(N[C@...,ERK2,ERK MAPK signaling,0.402122,0.984736,0.510903,Sensitive
1,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",AGI-6780,C1CC1NS(=O)(=O)C2=CC(=C(C=C2)C3=CSC=C3)NC(=O)N...,IDH2(R140Q),Metabolism,0.49162,0.984736,0.510903,Sensitive
2,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",AZD8931,CNC(=O)CN1CCC(CC1)OC2=C(C=C3C(=C2)C(=NC=N3)NC4...,"EGFR, ERBB2, ERBB3",RTK signaling,0.507522,0.984736,0.510903,Sensitive
3,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",IMATINIB,CC1=C(C=C(C=C1)NC(=O)C2=CC=C(C=C2)CN3CCN(CC3)C...,"ABL, KIT, PDGFR","Other, kinases",0.4067,0.984736,0.510903,Sensitive
4,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",XMD11-85H,O=C1CCN(C2CCCC2)C3=NC(NC4=CC=C(C(NC5CCN(C)CC5)...,"LRRK2, ERK5","Other, kinases",0.990736,0.984736,0.510903,Resistant


In [14]:
# Show the keys of interm_data (these are the columns of interm_df).
interm_data.keys()

dict_keys(['cell_id', 'mut_status', 'drug_name', 'drug_SMILES', 'drug_target', 'drug_target_path', 'drug_response', 'threshold_high', 'threshold_low', 'sensitivity'])

In [15]:
# Number of drugs whose target lookup came back empty.
len(not_avail_drug_lst)

38

In [16]:
### Drugs for which the cancerrxgene.org search returned no target; their target and
### pathway are stored as 'Unknown'. Is this information simply missing on the GDSC platform?
print(not_avail_drug_lst)

['SB-505124', 'TORIN-2', 'RU-SKI-43', 'JQ1-(+)', 'NU-7441', 'ZOLEDRONATE', 'VINCRISTINE', 'ABT-737', 'TENIPOSIDE', 'DACOMITINIB', 'SB-525334', 'JNJ-38877605', 'CANERTINIB', 'SINULARIN', 'PODOPHYLLOTOXIN', 'FH-535', 'ELEPHANTIN', 'NSC-319726', 'ALVOCIDIB', 'TWS-119', 'NELARABINE', 'QS-11', 'GW-441756', 'LCL-161', 'GW-843682X', 'FILANESIB', 'TRICHOSTATIN-A', 'EPOTHILONE-B', 'DIHYDROROTENONE', 'T-0901317', 'BRYOSTATIN-1', 'TUBASTATIN-A', 'IKK-2-INHIBITOR-V', 'ACETALAX', 'NINTEDANIB', 'SU-11274', 'CARMUSTINE', 'GALLIBISCOQUINAZOLE']


In [17]:
# Save the intermediate table (one row per retained cell line / drug pair) without the index column.
interm_df.to_csv("/data/yingfei/cancer_data/llm_prompt_data/llm_interm_data_task1_simple.csv", index=False)

In [18]:
### import interm_df to create the prompt_data
# keep_default_na=False keeps empty text fields (e.g. a cell line with no mutated
# genes, or a drug with an empty pathway) as "" instead of converting them to NaN,
# which would otherwise be rendered as the literal text "nan" inside the prompts.
interm_df = pd.read_csv(
    "/data/yingfei/cancer_data/llm_prompt_data/llm_interm_data_task1_simple.csv",
    keep_default_na=False,
)
print(interm_df.shape)
interm_df.head()

(22940, 10)


Unnamed: 0,cell_id,mut_status,drug_name,drug_SMILES,drug_target,drug_target_path,drug_response,threshold_high,threshold_low,sensitivity
0,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",VX-11E,CC1=CN=C(NC2=C(Cl)C=C(F)C=C2)N=C1C3=CNC(C(N[C@...,ERK2,ERK MAPK signaling,0.402122,0.984736,0.510903,Sensitive
1,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",AGI-6780,C1CC1NS(=O)(=O)C2=CC(=C(C=C2)C3=CSC=C3)NC(=O)N...,IDH2(R140Q),Metabolism,0.49162,0.984736,0.510903,Sensitive
2,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",AZD8931,CNC(=O)CN1CCC(CC1)OC2=C(C=C3C(=C2)C(=NC=N3)NC4...,"EGFR, ERBB2, ERBB3",RTK signaling,0.507522,0.984736,0.510903,Sensitive
3,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",IMATINIB,CC1=C(C=C(C=C1)NC(=O)C2=CC=C(C=C2)CN3CCN(CC3)C...,"ABL, KIT, PDGFR","Other, kinases",0.4067,0.984736,0.510903,Sensitive
4,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",XMD11-85H,O=C1CCN(C2CCCC2)C3=NC(NC4=CC=C(C(NC5CCN(C)CC5)...,"LRRK2, ERK5","Other, kinases",0.990736,0.984736,0.510903,Resistant


In [19]:
### prompt data
# One independent, initially empty list per column of the prompt table.
prompt_data = {column: [] for column in ('cell_id', 'prompt', 'answer')}

In [20]:
# Build one prompt per record of interm_df; the expected answer is the record's
# 'sensitivity' label.

# Start from empty lists so that re-running this cell does not duplicate prompts.
for key in ('cell_id', 'prompt', 'answer'):
    prompt_data[key] = []

# The instruction header is identical for every prompt, so build it once.
prompt_header = "Think step by step and decide in a single word reflecting the drug sensitivity of the drug on the cell line with given mutations: [Sensitive/Resistant], [Reasoning].\n"
prompt_header += "Drug and cell line mutations: \n"

for cell_drug_data in tqdm(interm_df.itertuples(index=False), total=len(interm_df)):
    # An empty mutation list comes back from the CSV as NaN under default parsing;
    # render it as an empty string rather than the literal text "nan".
    mut_status = "" if pd.isna(cell_drug_data.mut_status) else cell_drug_data.mut_status
    prompt = prompt_header
    prompt += f"The drug is {cell_drug_data.drug_name}. The drug SMILES structure is {cell_drug_data.drug_SMILES}. Drug target is {cell_drug_data.drug_target}. Drug target pathway is {cell_drug_data.drug_target_path}.\n"
    prompt += f"The mutations of the cell line are {mut_status}.\n"
    prompt += "Drug Sensitivity: ?"
    prompt_data['cell_id'].append(cell_drug_data.cell_id)
    prompt_data['prompt'].append(prompt)
    prompt_data['answer'].append(cell_drug_data.sensitivity)

100%|██████████████████████████████████████████████████████████████████████████████████| 22940/22940 [00:02<00:00, 9029.64it/s]


In [21]:
prompt_df = pd.DataFrame(prompt_data)
# Preview the first three prompts, each followed by its label and a blank line.
for row_label in range(3):
    print(
        prompt_df.loc[row_label, 'prompt'],
        prompt_df.loc[row_label, 'answer'],
        "",
        sep="\n",
    )

Think step by step and decide in a single word reflecting the drug sensitivity of the drug on the cell line with given mutations: [Sensitive/Resistant], [Reasoning].
Drug and cell line mutations: 
The drug is VX-11E. The drug SMILES structure is CC1=CN=C(NC2=C(Cl)C=C(F)C=C2)N=C1C3=CNC(C(N[C@@H](C4=CC(Cl)=CC=C4)CO)=O)=C3. Drug target is ERK2. Drug target pathway is ERK MAPK signaling.
The mutations of the cell line are NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, WHSC1L1.
Drug Sensitivity: ?
Sensitive

Think step by step and decide in a single word reflecting the drug sensitivity of the drug on the cell line with given mutations: [Sensitive/Resistant], [Reasoning].
Drug and cell line mutations: 
The drug is AGI-6780. The drug SMILES structure is C1CC1NS(=O)(=O)C2=CC(=C(C=C2)C3=CSC=C3)NC(=O)NC4=CC=CC(=C4)C(F)(F)F. Drug target is IDH2(R140Q). Drug target pathway is Metabolism.
The mutations of the cell line are NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, WHSC1L1.
Drug Sensitivity: ?
Sens

In [22]:
# Look up the intermediate record for cell line ACH-000001 and drug AGI-6780.
interm_df.loc[(interm_df.cell_id == "ACH-000001") & (interm_df.drug_name == "AGI-6780")]

Unnamed: 0,cell_id,mut_status,drug_name,drug_SMILES,drug_target,drug_target_path,drug_response,threshold_high,threshold_low,sensitivity
1,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",AGI-6780,C1CC1NS(=O)(=O)C2=CC(=C(C=C2)C3=CSC=C3)NC(=O)N...,IDH2(R140Q),Metabolism,0.49162,0.984736,0.510903,Sensitive


In [23]:
### train_test_split
# The split is made by cell line: each text file lists one cell-line ID per line, and a
# prompt goes to the split whose list contains its cell_id.
### train data
# `with` closes the file handle; previously both handles were left open.
with open("/home/yingfei/train_celllines_v1_1111.txt", "r") as f:
    train_cell_line = [line.strip() for line in f]
print(len(train_cell_line))

### test data
with open("/home/yingfei/test_celllines_v1_1111.txt", "r") as f:
    test_cell_line = [line.strip() for line in f]
print(len(test_cell_line))

### train_prompt_data
train_prompt_data = prompt_df.loc[prompt_df.cell_id.isin(train_cell_line)].reset_index(drop = True)
print(train_prompt_data.shape)

### test_prompt_data
test_prompt_data = prompt_df.loc[prompt_df.cell_id.isin(test_cell_line)].reset_index(drop = True)
print(test_prompt_data.shape)

623
69
(20628, 3)
(2312, 3)


In [24]:
# Write the train and test prompt tables to CSV without the index column.
train_prompt_data.to_csv("/data/yingfei/cancer_data/llm_prompt_data/train_prompt_data_task1_simple.csv", index = False)
test_prompt_data.to_csv("/data/yingfei/cancer_data/llm_prompt_data/test_prompt_data_task1_simple.csv", index = False)

In [25]:
# Look up the record for drug JW-7-24-1 on cell line ACH-000001. Pairs whose response
# lies between the cell line's two thresholds are not kept in interm_df, so an empty
# result means this pair was skipped.
interm_df.loc[(interm_df.drug_name == "JW-7-24-1") & (interm_df.cell_id == "ACH-000001")]

Unnamed: 0,cell_id,mut_status,drug_name,drug_SMILES,drug_target,drug_target_path,drug_response,threshold_high,threshold_low,sensitivity


In [26]:
# Inspect the records with a low response value (< 0.4).
interm_df.loc[interm_df.drug_response < 0.4]

Unnamed: 0,cell_id,mut_status,drug_name,drug_SMILES,drug_target,drug_target_path,drug_response,threshold_high,threshold_low,sensitivity
6,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",YK-4-279,COC1=CC=C(C=C1)C(=O)CC2(C3=C(C=CC(=C3NC2=O)Cl)...,RNA helicase A,Other,0.365219,0.984736,0.510903,Sensitive
15,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",FORETINIB,COC1=CC2=C(C=CN=C2C=C1OCCCN3CCOCC3)OC4=C(C=C(C...,"MET, KDR, TIE2, VEGFR3/FLT4, RON, PDGFR, FGFR1...",RTK signaling,0.360070,0.984736,0.510903,Sensitive
17,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",ARA-G,C1=NC2=C(N1[C@H]3[C@H]([C@@H]([C@H](O3)CO)O)O)...,Anti-metabolite,Other,0.370906,0.984736,0.510903,Sensitive
20,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",GSK2606414,CN1C=C(C2=C(N=CN=C21)N)C3=CC4=C(C=C3)N(CC4)C(=...,PERK,Metabolism,0.236837,0.984736,0.510903,Sensitive
22,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",AGI-5198,CC1=CC=CC=C1C(C(=O)NC2CCCCC2)N(C3=CC(=CC=C3)F)...,IDH1 (R132H),Metabolism,0.379095,0.984736,0.510903,Sensitive
...,...,...,...,...,...,...,...,...,...,...
22925,ACH-001716,"ATM, BRCA1, CHEK2, FGF14, IDH1, KDM5C, MKNK1, ...",ARA-G,C1=NC2=C(N1[C@H]3[C@H]([C@@H]([C@H](O3)CO)O)O)...,Anti-metabolite,Other,0.262695,0.982525,0.484057,Sensitive
22934,ACH-001716,"ATM, BRCA1, CHEK2, FGF14, IDH1, KDM5C, MKNK1, ...",VENETOCLAX,CC1(CCC(=C(C1)C2=CC=C(C=C2)Cl)CN3CCN(CC3)C4=CC...,BCL2,Apoptosis regulation,0.298000,0.982525,0.484057,Sensitive
22936,ACH-001716,"ATM, BRCA1, CHEK2, FGF14, IDH1, KDM5C, MKNK1, ...",WYE-125132,CNC(=O)NC1=CC=C(C=C1)C2=NC3=C(C=NN3C4CCC5(CC4)...,mTOR,PI3K/MTOR signaling,0.159410,0.982525,0.484057,Sensitive
22938,ACH-001716,"ATM, BRCA1, CHEK2, FGF14, IDH1, KDM5C, MKNK1, ...",GW-2580,COC1=CC=C(C=C1)COC2=C(C=C(C=C2)CC3=CN=C(N=C3N)...,CSF1R,RTK signaling,0.238713,0.982525,0.484057,Sensitive


In [29]:
# Inspect the records with a high response value (> 0.8).
interm_df.loc[interm_df.drug_response > 0.8]

Unnamed: 0,cell_id,mut_status,drug_name,drug_SMILES,drug_target,drug_target_path,drug_response,threshold_high,threshold_low,sensitivity
4,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",XMD11-85H,O=C1CCN(C2CCCC2)C3=NC(NC4=CC=C(C(NC5CCN(C)CC5)...,"LRRK2, ERK5","Other, kinases",0.990736,0.984736,0.510903,Resistant
9,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",XMD8-92,CCOC1=C(C=CC(=C1)N2CCC(CC2)O)NC3=NC=C4C(=N3)N(...,ERK5,ERK MAPK signaling,0.985743,0.984736,0.510903,Resistant
11,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",EX-527,C1CC(C2=C(C1)C3=C(N2)C=CC(=C3)Cl)C(=O)N,SIRT1,Chromatin histone acetylation,0.989182,0.984736,0.510903,Resistant
12,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",KRAS (G12C) INHIBITOR-12,C=CC(=O)N1CCN(CC1)C(=O)CNC2=CC(=C(C=C2O)Cl)I,KRAS (G12C),ERK MAPK signaling,0.991612,0.984736,0.510903,Resistant
14,ACH-000001,"NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, W...",LEFLUNOMIDE,CC1=C(C=NO1)C(=O)NC2=CC=C(C=C2)C(F)(F)F,Pyrimidine synthesis inhibitor,DNA replication,0.987970,0.984736,0.510903,Resistant
...,...,...,...,...,...,...,...,...,...,...
22930,ACH-001716,"ATM, BRCA1, CHEK2, FGF14, IDH1, KDM5C, MKNK1, ...",FMK,C[C@@H](C(=O)N[C@@H](CC(=O)O)C(=O)CF)NC(=O)[C@...,RSK,"Other, kinases",0.982835,0.982525,0.484057,Resistant
22931,ACH-001716,"ATM, BRCA1, CHEK2, FGF14, IDH1, KDM5C, MKNK1, ...",PHA-665752,CC1=C(NC(=C1C(=O)N2CCC[C@@H]2CN3CCCC3)C)C=C4C5...,MET,RTK signaling,0.982877,0.982525,0.484057,Resistant
22932,ACH-001716,"ATM, BRCA1, CHEK2, FGF14, IDH1, KDM5C, MKNK1, ...",KIN001-042,C1=CC(=CC(=C1)I)CSC2=NN=C(O2)C3=CC=NC=C3,GSK3B,WNT signaling,0.985676,0.982525,0.484057,Resistant
22933,ACH-001716,"ATM, BRCA1, CHEK2, FGF14, IDH1, KDM5C, MKNK1, ...",CEDIRANIB,CC1=CC2=C(N1)C=CC(=C2F)OC3=NC=NC4=CC(=C(C=C43)...,"VEGFR, FLT1, FLT2, FLT3, FLT4, KIT, PDGFRB",RTK signaling,0.983767,0.982525,0.484057,Resistant


In [30]:
# Display the full prompt text of one training example (positional row 93).
train_prompt_data.iloc[93].prompt

'Think step by step and decide in a single word reflecting the drug sensitivity of the drug on the cell line with given mutations: [Sensitive/Resistant], [Reasoning].\nDrug and cell line mutations: \nThe drug is NIRAPARIB. The drug SMILES structure is C1C[C@H](CNC1)C2=CC=C(C=C2)N3C=C4C=CC=C(C4=N3)C(=O)N. Drug target is PARP1, PARP2. Drug target pathway is Genome integrity.\nThe mutations of the cell line are FGFR3, POLE, TP53, ZNF703.\nDrug Sensitivity: ?'