In [1]:
# Shell escape: print the NVIDIA driver/CUDA versions and per-GPU memory and utilization.
!nvidia-smi

Thu Apr  4 13:48:53 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 455.23.05    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  On   | 00000000:3B:00.0 Off |                    0 |
| N/A   30C    P0    25W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  On   | 00000000:5E:00.0 Off |                    0 |
| N/A   29C    P0    27W / 250W |      2MiB / 16280MiB |      0%      Default |
|       

In [2]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, BitsAndBytesConfig, Trainer, pipeline
from peft import LoraConfig
from datasets import Dataset
from langchain.prompts.prompt import PromptTemplate

from trl import SFTTrainer

# Local directory holding the Llama-2 7B checkpoint that is fine-tuned below.
model_name = "/data/yingfei/models/llm/llama2/llama/llama-2-7b-hf"

# bitsandbytes settings: 4-bit NF4 weights, float16 compute, double quantization off.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load tokenizer (same checkpoint directory as the weights)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load llama2 model with the quantization config above.
# The '' key of device_map addresses the root module, so everything is placed on device 0.
based_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={'': 0},
    quantization_config=quant_config,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at /data/yingfei/models/llm/llama2/llama/llama-2-7b-hf and are newly initialized: ['model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.26.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.la

In [3]:
# Data processing
import pandas as pd

# Training prompts for task 1; the preview below shows the columns cell_id, prompt and answer.
train_csv_path = "/data/yingfei/cancer_data/llm_prompt_data/train_prompt_data_task1_simple.csv"
train_prompt_data = pd.read_csv(train_csv_path)

# Report (n_rows, n_columns), then preview the first rows as the cell output.
print(train_prompt_data.shape)
train_prompt_data.head()

(20628, 3)


Unnamed: 0,cell_id,prompt,answer
0,ACH-000001,Think step by step and decide in a single word...,Sensitive
1,ACH-000001,Think step by step and decide in a single word...,Sensitive
2,ACH-000001,Think step by step and decide in a single word...,Sensitive
3,ACH-000001,Think step by step and decide in a single word...,Sensitive
4,ACH-000001,Think step by step and decide in a single word...,Resistant


In [4]:
# Show the raw prompt text stored under index label 0, to see the format before it is reshaped.
print(train_prompt_data.loc[0, "prompt"])

Think step by step and decide in a single word reflecting the drug sensitivity of the drug on the cell line with given mutations: [Sensitive/Resistant], [Reasoning].
Drug and cell line mutations: 
The drug is VX-11E. The drug SMILES structure is CC1=CN=C(NC2=C(Cl)C=C(F)C=C2)N=C1C3=CNC(C(N[C@@H](C4=CC(Cl)=CC=C4)CO)=O)=C3. Drug target is ERK2. Drug target pathway is ERK MAPK signaling.
The mutations of the cell line are NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, WHSC1L1.
Drug Sensitivity: ?


In [5]:
# Instruction shared by every example; it is placed in the <<SYS>> slot of the prompt template.
instruction = "Think step by step and decide in a single word reflecting the drug sensitivity of the drug on the cell line with given mutations: [Sensitive/Resistant]."

# Build (question, answer) pairs in row order.
# The question is whatever follows the "[Reasoning]." marker in the stored prompt, with
# newlines removed (no space is inserted, so adjacent sentences end up joined).
# Iterating the two columns directly avoids one `.loc[i, ...]` scalar lookup per row and
# does not depend on the frame having a default 0..n-1 index.
q_a_lst = [
    (prompt_text.split("[Reasoning].")[1].replace("\n", ""), label)
    for prompt_text, label in zip(train_prompt_data["prompt"], train_prompt_data["answer"])
]
print(q_a_lst[:2])

[('Drug and cell line mutations: The drug is VX-11E. The drug SMILES structure is CC1=CN=C(NC2=C(Cl)C=C(F)C=C2)N=C1C3=CNC(C(N[C@@H](C4=CC(Cl)=CC=C4)CO)=O)=C3. Drug target is ERK2. Drug target pathway is ERK MAPK signaling.The mutations of the cell line are NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, WHSC1L1.Drug Sensitivity: ?', 'Sensitive'), ('Drug and cell line mutations: The drug is AGI-6780. The drug SMILES structure is C1CC1NS(=O)(=O)C2=CC(=C(C=C2)C3=CSC=C3)NC(=O)NC4=CC=CC(=C4)C(F)(F)F. Drug target is IDH2(R140Q). Drug target pathway is Metabolism.The mutations of the cell line are NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, WHSC1L1.Drug Sensitivity: ?', 'Sensitive')]


In [6]:
# Dataset
# One training string per example: instruction in the <<SYS>> slot, question inside
# [INST]...[/INST], and the expected answer after it, wrapped in <s> ... </s>.
prompt_template = PromptTemplate(
    template="<s>[INST] <<SYS>>{instruction}<</SYS>>{question}[/INST]{answer}</s>",
    input_variables=["instruction", "question", "answer"],
)

# Only the first 100 (question, answer) pairs are formatted and used for training.
prompt_data = [
    prompt_template.format(instruction=instruction, question=qa_question, answer=qa_answer)
    for qa_question, qa_answer in q_a_lst[:100]
]

# Single-column dataset; the column name "text" is what the trainer is pointed at.
dataset = Dataset.from_dict({"text": prompt_data})

In [7]:
# Model config tweaks applied before training.
based_model.config.pretraining_tp = 1  # NOTE(review): presumably the tensor-parallel degree; confirm against the LlamaConfig docs
based_model.config.use_cache = False   # turn off the cache flag on the model config

# Tokenizer padding: reuse the EOS token as the pad token and pad on the right-hand side.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# LoRA adapter settings for a causal-LM task: rank 8, alpha 16, dropout 0.1, bias "none".
peft_params = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Optimisation settings: 2 epochs, batch of 4 per device with no gradient accumulation,
# learning rate 2e-4, fp16 enabled, a log entry every step, output under ./results.
training_params = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    fp16=True,
    logging_steps=1,
)

# Supervised fine-tuning on the "text" column of `dataset`, without packing,
# using the quantized base model together with the LoRA config above.
trainer = SFTTrainer(
    model=based_model,
    tokenizer=tokenizer,
    peft_config=peft_params,
    args=training_params,
    train_dataset=dataset,
    dataset_text_field="text",
    packing=False,
    max_seq_length=None,
)

# Kept as the last expression so the cell displays the value returned by train().
trainer.train()



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Destination directory for the fine-tuned artifacts.
new_model = "/data/yingfei/models/llm/llama2/llama/llama_finetune/llama2_7b_task1"

# Write the trained model first, then the tokenizer, into the same directory.
# NOTE(review): with a PEFT-wrapped model this presumably stores only the adapter weights — confirm.
trainer.model.save_pretrained(save_directory=new_model)
trainer.tokenizer.save_pretrained(save_directory=new_model)

In [None]:
# Re-print the prompt stored under index label 0 for reference.
print(train_prompt_data.loc[0, "prompt"])

# Positions 101-104 lie past the first 100 pairs that were formatted for training.
train_prompt_data.iloc[slice(101, 105)]

In [None]:
# Text-generation pipeline around the trained model and its tokenizer.
# max_length bounds the total sequence length (prompt plus generated tokens).
pipe = pipeline(task="text-generation", model=trainer.model, tokenizer=trainer.tokenizer, max_length=2000)

# The system instruction is deliberately the training-time `instruction` defined with the
# dataset, so the inference prompt matches the text the model was fine-tuned on.
# A bare `instruction: "..."` line used to sit here; that is a variable annotation, not an
# assignment, so it never bound a value and only looked like it changed the instruction.
prompt = "Drug and cell line mutations: The drug is VX-11E. The drug SMILES structure is CC1=CN=C(NC2=C(Cl)C=C(F)C=C2)N=C1C3=CNC(C(N[C@@H](C4=CC(Cl)=CC=C4)CO)=O)=C3. Drug target is ERK2. Drug target pathway is ERK MAPK signaling.The mutations of the cell line are NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, WHSC1L1.Drug Sensitivity: ?"

# Same wrapper as the training template, minus the answer and the closing </s>.
# NOTE(review): the literal <s> may be duplicated if the tokenizer also prepends BOS — confirm.
prompt_content = f"<s>[INST] <<SYS>>{instruction}<</SYS>>{prompt}[/INST]"

# Run prompt and pipeline
result = pipe(prompt_content)
print(result[0]['generated_text'])

In [26]:
### Results from the 7b_chat model: the code below is commented out; only its recorded output is kept for comparison
# pipe = pipeline(task="text-generation", model=trainer.model, tokenizer=trainer.tokenizer, max_length=2000)

# instruction: "Think step by step and decide in a single word reflecting the drug sensitivity of the drug on the cell line: [Sensitive/Resistant]"
# prompt = "Drug and cell line mutations: The drug is VX-11E. The drug SMILES structure is CC1=CN=C(NC2=C(Cl)C=C(F)C=C2)N=C1C3=CNC(C(N[C@@H](C4=CC(Cl)=CC=C4)CO)=O)=C3. Drug target is ERK2. Drug target pathway is ERK MAPK signaling.The mutations of the cell line are NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, WHSC1L1.Drug Sensitivity: ?"
# prompt_content = f"<s>[INST] <<SYS>>{instruction}<</SYS>>{prompt}[/INST]"

# # Run prompt and pipeline
# result = pipe(prompt_content)
# print(result[0]['generated_text'])  # NOTE: the response is not a single word, even though the instruction asks for one

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausal

<s>[INST] <<SYS>>Think step by step and decide in a single word reflecting the drug sensitivity of the drug on the cell line with given mutations: [Sensitive/Resistant].<</SYS>>Drug and cell line mutations: The drug is VX-11E. The drug SMILES structure is CC1=CN=C(NC2=C(Cl)C=C(F)C=C2)N=C1C3=CNC(C(N[C@@H](C4=CC(Cl)=CC=C4)CO)=O)=C3. Drug target is ERK2. Drug target pathway is ERK MAPK signaling.The mutations of the cell line are NOTCH1, NOTCH3, PIK3R1, PPP2R1A, TP53, TSC2, WHSC1L1.Drug Sensitivity: ?[/INST]  Based on the information provided, I would classify the drug sensitivity of VX-11E on the cell line with the given mutations as follows:
* NOTCH1: Resistant
* NOTCH3: Resistant
* PIK3R1: Sensitive
* PPP2R1A: Sensitive
* TP53: Sensitive
* TSC2: Resistant
* WHSC1L1: Sensitive

The drug target of VX-11E is ERK2, which is a key player in the ERK MAPK signaling pathway. The mutations in the cell line may affect the expression or activity of this pathway, leading to changes in drug sensiti