In [1]:
# Report the GPUs visible on this host (driver/CUDA version, memory use, utilisation) before any model is loaded.
!nvidia-smi

Wed Apr  3 20:57:14 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 455.23.05    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  On   | 00000000:3B:00.0 Off |                    0 |
| N/A   29C    P0    35W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  On   | 00000000:5E:00.0 Off |                    0 |
| N/A   28C    P0    37W / 250W |      2MiB / 16280MiB |      0%      Default |
|       

In [2]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, BitsAndBytesConfig, Trainer, pipeline
from peft import LoraConfig
from datasets import Dataset
from langchain.prompts.prompt import PromptTemplate

from trl import SFTTrainer

# Local directory holding the base checkpoint (named llama-2-7b-chat-hf).
model_name = "/data/yingfei/models/llm/llama2/llama/llama-2-7b-chat-hf"

# Tokenizer stored alongside the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# bitsandbytes settings: 4-bit NF4 weights, float16 compute dtype,
# double quantisation switched off.
quant_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Quantised base model; device_map={"": 0} maps the whole model to device index 0.
based_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={"": 0},
    quantization_config=quant_config,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at /data/yingfei/models/llm/llama2/llama/llama-2-7b-chat-hf and are newly initialized: ['model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'mode

In [3]:
# Data processing: read the task-2 training prompts from CSV
import pandas as pd

train_csv_path = "/data/yingfei/cancer_data/llm_prompt_data/train_prompt_data_task2_simple.csv"
train_prompt_data = pd.read_csv(train_csv_path)

# Sanity check: print (rows, columns), then preview the first rows
print(train_prompt_data.shape)
train_prompt_data.head()

(623, 3)


Unnamed: 0,cell_id,prompt,answer
0,ACH-000001,Think step by step and decide the best drug op...,IMATINIB
1,ACH-000002,Think step by step and decide the best drug op...,VX-11E
2,ACH-000004,Think step by step and decide the best drug op...,CETUXIMAB
3,ACH-000007,Think step by step and decide the best drug op...,AVAGACESTAT
4,ACH-000008,Think step by step and decide the best drug op...,AZD5363


In [4]:
# Show the complete text of the prompt stored under index label 0
first_prompt = train_prompt_data["prompt"][0]
print(first_prompt)

Think step by step and decide the best drug option for the cell line with given mutations: [Drug Name], [Reasoning].
Drug 1: The drug is UNC1215. The drug SMILES structure is C1CCN(C1)C2CCN(CC2)C(=O)C3=CC(=C(C=C3)C(=O)N4CCC(CC4)N5CCCC5)NC6=CC=CC=C6. Drug target is L3MBTL3. Drug target pathway is Chromatin other.
Drug 2: The drug is IMATINIB. The drug SMILES structure is CC1=C(C=C(C=C1)NC(=O)C2=CC=C(C=C2)CN3CCN(CC3)C)NC4=NC=CC(=N4)C5=CN=CC=C5. Drug target is ABL, KIT, PDGFR. Drug target pathway is Other, kinases.
Drug 3: The drug is AT7867. The drug SMILES structure is C1CNCCC1(C2=CC=C(C=C2)C3=CNN=C3)C4=CC=C(C=C4)Cl. Drug target is AKT. Drug target pathway is PI3K/MTOR signaling.
The mutations of the cell line are NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, WHSC1L1.
Best drug option: ?


In [5]:
# Instruction sentence used as the system part of every training example.
# The raw prompts are expected to contain this sentence, which ends with the
# "[Reasoning]." marker, followed by the drug/mutation description.
instruction = "Think step by step and decide the best drug option for the cell line with given mutations: [Drug Name], [Reasoning]."

# Build (question, answer) pairs.
# The two columns are iterated positionally rather than via
# `.loc[i, ...]` over `range(len(...))`, so the cell does not depend on the
# frame still having its default 0..n-1 index.
q_a_lst = []
for raw_prompt, answer in zip(train_prompt_data["prompt"], train_prompt_data["answer"]):
    # Keep everything after the FIRST marker; partition() never truncates the
    # tail if the marker happens to occur again later in the prompt.
    _, marker, tail = raw_prompt.partition("[Reasoning].")
    if not marker:
        # Fail with a readable message instead of an IndexError from split()[1].
        raise ValueError(f"prompt has no '[Reasoning].' marker: {raw_prompt[:80]!r}")
    # Newlines are dropped so the question becomes a single line.
    q_a_lst.append((tail.replace("\n", ""), answer))
print(q_a_lst[:2])

[('Drug 1: The drug is UNC1215. The drug SMILES structure is C1CCN(C1)C2CCN(CC2)C(=O)C3=CC(=C(C=C3)C(=O)N4CCC(CC4)N5CCCC5)NC6=CC=CC=C6. Drug target is L3MBTL3. Drug target pathway is Chromatin other.Drug 2: The drug is IMATINIB. The drug SMILES structure is CC1=C(C=C(C=C1)NC(=O)C2=CC=C(C=C2)CN3CCN(CC3)C)NC4=NC=CC(=N4)C5=CN=CC=C5. Drug target is ABL, KIT, PDGFR. Drug target pathway is Other, kinases.Drug 3: The drug is AT7867. The drug SMILES structure is C1CNCCC1(C2=CC=C(C=C2)C3=CNN=C3)C4=CC=C(C=C4)Cl. Drug target is AKT. Drug target pathway is PI3K/MTOR signaling.The mutations of the cell line are NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, WHSC1L1.Best drug option: ?', 'IMATINIB'), ('Drug 1: The drug is VX-11E. The drug SMILES structure is CC1=CN=C(NC2=C(Cl)C=C(F)C=C2)N=C1C3=CNC(C(N[C@@H](C4=CC(Cl)=CC=C4)CO)=O)=C3. Drug target is ERK2. Drug target pathway is ERK MAPK signaling.Drug 2: The drug is XMD8-92. The drug SMILES structure is CCOC1=C(C=CC(=C1)N2CCC(CC2)O)NC3=NC=C4C(=N3)N(C5=

In [6]:
# Dataset
# How many (question, answer) pairs from q_a_lst are turned into training
# text. Presumably kept at 10 for a quick trial run; set to None to use
# every pair (a slice bound of None means "no limit").
NUM_TRAIN_EXAMPLES = 10

# One training string per example: instruction inside <<SYS>>…<</SYS>>,
# question before [/INST], answer after it.
# NOTE(review): the template writes <s> / </s> literally; confirm the
# tokenizer is not also adding its own BOS token when the text is encoded.
prompt_template = PromptTemplate(
    input_variables=["instruction", "question", "answer"],
    template="<s>[INST] <<SYS>>{instruction}<</SYS>>{question}[/INST]{answer}</s>",
)

prompt_data = [
    prompt_template.format(instruction=instruction, question=q, answer=a)
    for q, a in q_a_lst[:NUM_TRAIN_EXAMPLES]
]

# Single-column dataset; the column name "text" is what the trainer's
# dataset_text_field refers to.
dataset = Dataset.from_dict({"text": prompt_data})

In [7]:
# Model-side configuration flags set before fine-tuning
based_model.config.pretraining_tp = 1
based_model.config.use_cache = False

# Tokenizer-side settings: pad on the right and reuse the EOS token as the pad token
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [8]:
# LoRA adapter hyperparameters
lora_settings = dict(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)
peft_params = LoraConfig(**lora_settings)

# Optimisation and logging settings; checkpoints/logs go under ./results
training_settings = dict(
    output_dir="./results",
    learning_rate=2e-4,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    fp16=True,
    logging_steps=1,
)
training_params = TrainingArguments(**training_settings)

# Supervised fine-tuning on the "text" column of the dataset, without packing
trainer = SFTTrainer(
    model=based_model,
    args=training_params,
    peft_config=peft_params,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=None,
    packing=False,
)

# Start training; the returned TrainOutput is the value displayed by the cell
trainer.train()



Map:   0%|          | 0/10 [00:00<?, ? examples/s]

    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.5905
2,2.5308


TrainOutput(global_step=2, training_loss=2.5606207847595215, metrics={'train_runtime': 25.5428, 'train_samples_per_second': 0.783, 'train_steps_per_second': 0.078, 'total_flos': 187433617489920.0, 'train_loss': 2.5606207847595215, 'epoch': 2.0})

In [9]:
# Directory that receives the fine-tuned artefacts.
new_model = "/data/yingfei/models/llm/llama2/llama/llama_finetune/llama2_task2"

# Write the model held by the trainer into that directory.
trainer.model.save_pretrained(new_model)
# Write the tokenizer into the same directory. This is the last expression of
# the cell, so its return value (the tuple of written tokenizer file paths)
# is what the cell displays.
trainer.tokenizer.save_pretrained(new_model)

('/data/yingfei/models/llm/llama2/llama/llama_finetune/llama2_task2/tokenizer_config.json',
 '/data/yingfei/models/llm/llama2/llama/llama_finetune/llama2_task2/special_tokens_map.json',
 '/data/yingfei/models/llm/llama2/llama/llama_finetune/llama2_task2/tokenizer.model',
 '/data/yingfei/models/llm/llama2/llama/llama_finetune/llama2_task2/added_tokens.json',
 '/data/yingfei/models/llm/llama2/llama/llama_finetune/llama2_task2/tokenizer.json')

In [10]:
# Print the prompt under index label 0 again for reference
print(train_prompt_data["prompt"][0])
# Preview the rows at positions 21 through 24
train_prompt_data.iloc[21:25]

Think step by step and decide the best drug option for the cell line with given mutations: [Drug Name], [Reasoning].
Drug 1: The drug is UNC1215. The drug SMILES structure is C1CCN(C1)C2CCN(CC2)C(=O)C3=CC(=C(C=C3)C(=O)N4CCC(CC4)N5CCCC5)NC6=CC=CC=C6. Drug target is L3MBTL3. Drug target pathway is Chromatin other.
Drug 2: The drug is IMATINIB. The drug SMILES structure is CC1=C(C=C(C=C1)NC(=O)C2=CC=C(C=C2)CN3CCN(CC3)C)NC4=NC=CC(=N4)C5=CN=CC=C5. Drug target is ABL, KIT, PDGFR. Drug target pathway is Other, kinases.
Drug 3: The drug is AT7867. The drug SMILES structure is C1CNCCC1(C2=CC=C(C=C2)C3=CNN=C3)C4=CC=C(C=C4)Cl. Drug target is AKT. Drug target pathway is PI3K/MTOR signaling.
The mutations of the cell line are NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, WHSC1L1.
Best drug option: ?


Unnamed: 0,cell_id,prompt,answer
21,ACH-000052,Think step by step and decide the best drug op...,VX-11E
22,ACH-000053,Think step by step and decide the best drug op...,YK-4-279
23,ACH-000054,Think step by step and decide the best drug op...,GSK650394
24,ACH-000055,Think step by step and decide the best drug op...,FORETINIB


In [11]:
# Text-generation pipeline around the fine-tuned model and its tokenizer.
# max_length=2000 bounds the total sequence length (prompt plus generation).
pipe = pipeline(task="text-generation", model=trainer.model, tokenizer=trainer.tokenizer, max_length=2000)

# Must be an assignment (`=`). Written as `instruction: "..."` it is only an
# annotation that binds no value, so the f-string below would silently depend
# on an `instruction` variable left over from an earlier cell (or raise
# NameError on a fresh kernel).
instruction = "Think step by step and decide the best drug option for the cell line with given mutations: [Drug Name], [Reasoning]."

# Question text as a single line with no separators between sentences.
# Adjacent string literals are concatenated by Python, giving exactly the
# same string as one literal with backslash line continuations.
prompt = (
    "Drug 1: The drug is UNC1215. The drug SMILES structure is C1CCN(C1)C2CCN(CC2)C(=O)C3=CC(=C(C=C3)C(=O)N4CCC(CC4)N5CCCC5)NC6=CC=CC=C6. Drug target is L3MBTL3. Drug target pathway is Chromatin other."
    "Drug 2: The drug is IMATINIB. The drug SMILES structure is CC1=C(C=C(C=C1)NC(=O)C2=CC=C(C=C2)CN3CCN(CC3)C)NC4=NC=CC(=N4)C5=CN=CC=C5. Drug target is ABL, KIT, PDGFR. Drug target pathway is Other, kinases."
    "Drug 3: The drug is AT7867. The drug SMILES structure is C1CNCCC1(C2=CC=C(C=C2)C3=CNN=C3)C4=CC=C(C=C4)Cl. Drug target is AKT. Drug target pathway is PI3K/MTOR signaling."
    "The mutations of the cell line are NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, WHSC1L1."
    "Best drug option: ?"
)

# Same layout as the training text, stopping after [/INST] so the model
# supplies the answer.
prompt_content = f"<s>[INST] <<SYS>>{instruction}<</SYS>>{prompt}[/INST]"

# Run prompt and pipeline
result = pipe(prompt_content)
# Observed: the generated text echoes the prompt and continues with a full
# explanation rather than a single drug name.
print(result[0]['generated_text']) # Not in one word???

    PyTorch 2.2.2+cu121 with CUDA 1201 (you have 1.10.1+cu111)
    Python  3.9.19 (you have 3.9.12)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHe

<s>[INST] <<SYS>>Think step by step and decide the best drug option for the cell line with given mutations: [Drug Name], [Reasoning].<</SYS>>Drug 1: The drug is UNC1215. The drug SMILES structure is C1CCN(C1)C2CCN(CC2)C(=O)C3=CC(=C(C=C3)C(=O)N4CCC(CC4)N5CCCC5)NC6=CC=CC=C6. Drug target is L3MBTL3. Drug target pathway is Chromatin other.Drug 2: The drug is IMATINIB. The drug SMILES structure is CC1=C(C=C(C=C1)NC(=O)C2=CC=C(C=C2)CN3CCN(CC3)C)NC4=NC=CC(=N4)C5=CN=CC=C5. Drug target is ABL, KIT, PDGFR. Drug target pathway is Other, kinases.Drug 3: The drug is AT7867. The drug SMILES structure is C1CNCCC1(C2=CC=C(C=C2)C3=CNN=C3)C4=CC=C(C=C4)Cl. Drug target is AKT. Drug target pathway is PI3K/MTOR signaling.The mutations of the cell line are NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, WHSC1L1.Best drug option: ?[/INST]  Based on the information provided, the best drug option for the cell line with the given mutations would be Drug 3, AT7867.
Reasoning:
1. Drug target: AT7867 targets AKT, whic