In [1]:
import huggingface_hub
import torch
import time
import pandas as pd
import numpy as np
from datasets import Dataset, load_dataset, concatenate_datasets
import random
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from trl import SFTTrainer
import json

In [2]:
import os
os.environ['HF_TOKEN'] = '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'

In [3]:
def get_model(model_id,is_8bit = True):
    if is_8bit:
        bnb_config = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_quant_type="nf4",
            bnb_8bit_compute_dtype=torch.bfloat16
        )
    else:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config = bnb_config,
        device_map = "auto",
        cache_dir = "cache"
    )
    return model

In [4]:
def get_tokenizer(model_id,stop_tokens=True):
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token_id = (
            tokenizer.eos_token_id
        )    
    return tokenizer

In [5]:
MAX_LEN = 1000
def tokenize(tokenizer, prompt, add_eos_token=True):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=MAX_LEN,
        padding=False,
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < MAX_LEN
        and add_eos_token
    ):
        result["input_ids"][MAX_LEN-1] = tokenizer.eos_token_id
        result["attention_mask"][MAX_LEN-1] = 1
    
    result["labels"] = result["input_ids"].copy()
    
    return result

### Data Preparation

In [6]:
BLACK_BOX = False

In [7]:
prompt_black = """
Capture entity values from the LAST UTTERANCE of the conversation.
FOCUS ONLY ON THE VALUES MENTIONED IN THE LAST UTTERANCE.
Format the output as a valid JSON object, and for each entity-value pair, along with their pair-level confidence (0-1).
Format: {{"state": {{"_entity_":"_value_"}}, "confidence": "X"}}
Fill the actual entity value into the placeholder encapsulated with underscores.
Put "```" as EOS token at the end of response.
{}
Do not capture any other values!
If not specified, do not respond to that slot-value.

Provide 1 posiible entity values based on the last utterance, along with their confidence (0-1). MAKE SURE TO SEPARATE EACH SLOT-VALUE WITH ITS CONFIDENCE PAIR.
Format the output as:
```json{{[{{"state": {{"_entity1_":"_value1_"}}, "confidence": "X"}}, {{"state": {{"_entity2_":"_value2_"}}, "confidence": "X"}}]}}```
Where X is the Confidence of the answer.

Now complete the following example, AND PROVIDE CONFIDENCE THAT IT'S CORRECT:
input: {}  

***Output JSON format***
Output: ```json{{"""

In [8]:
# prompt5
prompt = """
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
Capture entity values from the LAST UTTERANCE of the conversation.
FOCUS ONLY ON THE VALUES MENTIONED IN THE LAST UTTERANCE.
Format the output as a valid JSON object for each entity-value pair.
Format: {{"state": {{"_entity_":"_value_"}}}}
Fill the actual entity value into the placeholder encapsulated with underscores.
Put "```" as EOS token at the end of response.
Values that should be captured are:
{}
Do not capture any other values!
If not specified, do not respond to that slot-value.

MAKE SURE TO SEPARATE EACH SLOT-VALUE PAIR.
Format the output as:
```json
[
  {{"state": {{"_entity1_": "_value1_"}}}},
  {{"state": {{"_entity2_": "_value2_"}}}}
]```

Now complete the following example, AND PROVIDE CONFIDENCE THAT IT'S CORRECT:
input: <|eot_id|>
<|start_header_id|>user<|end_header_id|>
{}
<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>
***Output JSON format***
Output: ```json
"""

In [9]:
def get_turn_info(dataset):
    for dialog in dataset:
        dialog_id = dialog["dialogue_id"].split('.')[0].lower()
        
        last_state = {}
        for tn in range(0, len(dialog["turns"]["utterance"]), 2):
            context = [f"Customer: {t}" if n % 2== 0 else f"Assistant: {t}" for n, t in enumerate(dialog["turns"]["utterance"][:tn+1])]
            state = dialog["turns"]["frames"][tn]["state"]
            
            gt_domain = []
            if len(state) == 0:
                state = {}
            else:
                state = [state[i]["slots_values"] for i in range(len(state))]
                state = [{k: v[0] for k, v in zip(state[i]["slots_values_name"], state[i]["slots_values_list"])} for i in range(len(state)) if len(state[i]["slots_values_name"]) > 0]
            
            new_state = last_state
            for i in range(len(state)):
                for sl, val in state[i].items():
                    domain, name = sl.split("-")
                    if domain not in new_state:
                        new_state[domain] = {name: val}
                    else:
                        new_state[domain][name] = val
                        
            state_update = {}
            for domain, domain_state in new_state.items():
                for slot, value in domain_state.items():
                    if slot not in last_state.get(domain, {}) or last_state[domain][slot] != value:
                        if domain not in gt_domain:
                            print(f"append domain: {domain}")
                            gt_domain.append(domain)
                        if domain not in state_update:
                            state_update[domain] = {}
                        state_update[domain][slot] = value
                        
            last_state = new_state
            
            turn = {
                "question":dialog["turns"]["utterance"][tn],
                "gt_state": last_state, # total state
                "dialog_id": dialog_id,
                "metadata": {
                    "domain": gt_domain,
                    "turn_state": state_update,
                    "total_state": last_state,
                    "context": "\n".join(context[-6:])
                }
            }
            yield turn

In [10]:
from slot_description import DOMAIN_SLOT_DESCRIPTION, DOMAIN_EXPECTED_SLOT, EXPECTED_DOMAIN

In [11]:
def generate_instruction_dataset(data_point):
    ### INFO
    gt_domain = data_point["metadata"]["domain"]
    context = data_point["metadata"]["context"]
    # print("context:\n")
    # print(context)
    utterance = data_point["question"]
    # print("utterance:\n")
    # print(utterance)
    turn_state = data_point["metadata"]["turn_state"]
    domain_description = ""
    if gt_domain:
        # print(f"gt_domain: {gt_domain}")
        for domain in gt_domain:
            domain_description += DOMAIN_SLOT_DESCRIPTION[domain]
    
    target_str = ""
    for domain in turn_state.keys():
        for slot, value in turn_state[domain].items():
            buf = "{" + "\"" + "state\": " + "{\"" + str(slot) + "\": \"" + str(value) + "\"}}, "
            # print(buf)
            target_str += buf

    if target_str.endswith(", "):
        target_str = target_str[:-2]

    target_str = "[" + target_str + "]" + "```" 
    text = "###Prompt###" + prompt.format(domain_description, context) + "###Completion###\n" + target_str + tokenizer.eos_token
    
    return {"text": text, "labels": target_str}
    
        

In [12]:
def process_dataset(data):
    dataset = []
    for i in range(len(data)):
        dialog = data[i]
        dataset.append(generate_instruction_dataset(dialog))
    return dataset

In [13]:
import copy
def get_turn_info(dataset):
    all_turns = []
    for dialog in dataset:
        dialog_id = dialog["dialogue_id"].split('.')[0].lower()
        
        last_state = {}
        for tn in range(0, len(dialog["turns"]["utterance"]), 2):
            context = [f"Customer: {t}" if n % 2== 0 else f"Assistant: {t}" for n, t in enumerate(dialog["turns"]["utterance"][:tn+1])]
            state = dialog["turns"]["frames"][tn]["state"]
            
            gt_domain = []
            if len(state) == 0:
                state = {}
            else:
                state = [state[i]["slots_values"] for i in range(len(state))]
                state = [{k: v[0] for k, v in zip(state[i]["slots_values_name"], state[i]["slots_values_list"])} for i in range(len(state)) if len(state[i]["slots_values_name"]) > 0]

            new_state = copy.deepcopy(last_state)
            for i in range(len(state)):
                for sl, val in state[i].items():
                    domain, name = sl.split("-")
                    if domain not in new_state:
                        new_state[domain] = {name: val}
                    else:
                        new_state[domain][name] = val
            state_update = {}
            for domain, domain_state in new_state.items():
                for slot, value in domain_state.items():
                    if slot not in last_state.get(domain, {}) or last_state[domain][slot] != value:
                        if domain not in state_update:
                            state_update[domain] = {}
                        state_update[domain][slot] = value
                        
            for domain, domain_state in state_update.items():
                gt_domain.append(domain)
            # if len(gt_domain) > 1:
            #     print(f"multiple gt_domain: {gt_domain}")
            last_state = new_state
            
            turn = {
                "question":dialog["turns"]["utterance"][tn],
                "gt_state": copy.deepcopy(last_state), # total state
                "dialog_id": copy.deepcopy(dialog_id),
                "metadata": {
                    "domain": copy.deepcopy(gt_domain),
                    "turn_state": copy.deepcopy(state_update),
                    "total_state": copy.deepcopy(last_state),
                    "context": "\n".join(context[-6:])
                }
            }
            all_turns.append(turn)
    
    return all_turns
            

In [14]:
def get_train_valid_data(sample):
    dataset = load_dataset("multi_woz_v22")

    train_data = dataset["train"]
    valid_data = dataset["validation"]
    

    if sample:
        train_data = train_data.select([i for i in range(sample['train_size'])])
        valid_data = valid_data.select([i for i in range(sample['valid_size'])])

    train_turn_data = get_turn_info(train_data)
    valid_turn_data = get_turn_info(valid_data)
    
    train_turn_data = process_dataset(train_turn_data)
    valid_turn_data = process_dataset(valid_turn_data)
    
    return train_turn_data, valid_turn_data

In [19]:
def get_model_setup(model,batch_size,OUTPUT_DIR,epochs):
    
    lora_config = LoraConfig(
        r=16,
        lora_alpha=64,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], #specific to Llama models.
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM"
    )
    
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)
    
    print(model)
    
    model = get_peft_model(model, lora_config)
    
    training_arguments = TrainingArguments(
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=4,
        optim="adamw_torch",
        logging_steps=1,
        learning_rate=1e-6,
        fp16=True,
        max_grad_norm=0.3,
        num_train_epochs=epochs,
        evaluation_strategy="steps",
        eval_steps=0.2,
        warmup_ratio=0.05,
        save_strategy="epoch",
        group_by_length=True,
        output_dir=OUTPUT_DIR,
        report_to="wandb",
        save_safetensors=True,
        lr_scheduler_type="cosine",
        seed=42,
    )
    model.config.use_cache = True  # silence the warnings. Please re-enable for inference!
    return lora_config,training_arguments,model


### Data loading

In [20]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
batch_size = 5
epochs = 1
OUTPUT_DIR = "llama3-confidence"
sample = None
# sample = {"train_size" : 2000, "valid_size":400} # None

In [21]:
tokenizer = get_tokenizer(model_id,stop_tokens=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [22]:
train_data, validation_data = get_train_valid_data(sample)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [23]:
print(len(train_data))
print(len(validation_data))

56776
7374


In [24]:
print(train_data[0]["text"])

###Prompt###
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
Capture entity values from the LAST UTTERANCE of the conversation.
FOCUS ONLY ON THE VALUES MENTIONED IN THE LAST UTTERANCE.
Format the output as a valid JSON object for each entity-value pair.
Format: {"state": {"_entity_":"_value_"}}
Fill the actual entity value into the placeholder encapsulated with underscores.
Put "```" as EOS token at the end of response.
Values that should be captured are:


In the DOMAIN of "restaurant", the values that should be captured are:
 - "pricerange" that specifies the price range of the restaurant (cheap/moderate/expensive)
 - "area" that specifies the area where the restaurant is located (north/east/west/south/centre)
 - "food" that specifies the type of food the restaurant serves
 - "name" that specifies the name of the restaurant
 - "bookday" that specifies the day of the booking
 - "booktime" that specifies the time of the booking
 - "bookpeople" that specifies for how many 

In [25]:
print(validation_data[0]["text"])

###Prompt###
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
Capture entity values from the LAST UTTERANCE of the conversation.
FOCUS ONLY ON THE VALUES MENTIONED IN THE LAST UTTERANCE.
Format the output as a valid JSON object for each entity-value pair.
Format: {"state": {"_entity_":"_value_"}}
Fill the actual entity value into the placeholder encapsulated with underscores.
Put "```" as EOS token at the end of response.
Values that should be captured are:


In the DOMAIN of "restaurant", the values that should be captured are:
 - "pricerange" that specifies the price range of the restaurant (cheap/moderate/expensive)
 - "area" that specifies the area where the restaurant is located (north/east/west/south/centre)
 - "food" that specifies the type of food the restaurant serves
 - "name" that specifies the name of the restaurant
 - "bookday" that specifies the day of the booking
 - "booktime" that specifies the time of the booking
 - "bookpeople" that specifies for how many 

In [26]:
model_loaded = "false"
if model_loaded == "false":
    model = get_model(model_id,is_8bit = True)

Unused kwargs: ['bnb_8bit_use_double_quant', 'bnb_8bit_quant_type', 'bnb_8bit_compute_dtype']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [27]:
lora_config,training_arguments,model = get_model_setup(model,batch_size,OUTPUT_DIR,epochs)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear8bitLt(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )


In [28]:
from transformers import TrainerCallback
class CustomWandbLoggingCallback(TrainerCallback):
    def on_log(self, args, state, control, **kwargs):
        if state.is_world_process_zero:
            logs = {k: v for k, v in state.log_history[-1].items() if isinstance(v, (int, float))}
            wandb.log(logs)

In [29]:
def fine_tune_model(train_subset, valid_subset, saved_model):
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_subset,
        eval_dataset=valid_subset,
        peft_config=lora_config,
        dataset_text_field="text",
        max_seq_length=1500,
        tokenizer=tokenizer,
        args=training_arguments,
        callbacks=[CustomWandbLoggingCallback()],
    )
    trainer.train()
    
    trainer.model.save_pretrained(saved_model)
    tokenizer.save_pretrained(saved_model)
    return trainer

In [30]:
def evaluate_model(saved_model):
    model = AutoModelForCausalLM.from_pretrained(saved_model, cache_dir="cache")
    tokenizer = AutoTokenizer.from_pretrained(saved_model, cache_dir="cache")
    
    command = [
        "python", "run.py",
        "--model_name=saved_model",
        "--dials_total=1",
        "--temperature=1"
    ]
    
    result = subprocess.run(command, capture_output=True, text=True)
    print(f"saved_model: {saved_model}")
    print(f"result: {result}")
    
    return result

In [31]:
import matplotlib.pyplot as plt
import subprocess

## Fine-Tuning

### Full training set

In [32]:
import wandb
wandb.init(project="llama3-finetuning-DST", entity="jennysun-cs09", name="fullset_epoch1_lr6")
wandb.config.update(training_arguments)

train_subset = Dataset.from_dict({key: [dic[key] for dic in train_data] for key in train_data[0]})
valid_subset = Dataset.from_dict({key: [dic[key] for dic in validation_data] for key in validation_data[0]})

saved_model = os.path.join("saved_model_5", f"fullset_epoch1_lr6")
model = fine_tune_model(train_subset, valid_subset, saved_model)

wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjennysun-cs09[0m. Use [1m`wandb login --relogin`[0m to force relogin


Map:   0%|          | 0/56776 [00:00<?, ? examples/s]

Map:   0%|          | 0/7374 [00:00<?, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
568,0.8006,0.838747
1136,0.8155,0.594729
1704,0.5198,0.533505
2272,0.4548,0.504787


VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval/loss,█▃▂▁
eval/runtime,█▁▆▃
eval/samples_per_second,▁█▂▆
eval/steps_per_second,▁█▃▆
eval_loss,█▃▂▁
eval_runtime,█▁▆▃
eval_samples_per_second,▁█▂▆
eval_steps_per_second,▁█▃▆
grad_norm,█▇▂▂▃▅▅▂▂▂▂▄▂▂▁▂▃▁▁▁▁▂▃▁▂▁▁▂▁▁▂▂▁▁▁▁▁▁▁▁

0,1
epoch,1.0
eval/loss,0.50479
eval/runtime,627.6184
eval/samples_per_second,11.749
eval/steps_per_second,1.469
eval_loss,0.50479
eval_runtime,627.6184
eval_samples_per_second,11.749
eval_steps_per_second,1.469
grad_norm,5.02621


### 250 samples

In [None]:
train_subset = train_data[:train_samples]
print(f"train_subset len: {len(train_subset)}")
train_data = train_data[train_samples:]
print(f"train_data len: {len(train_data)}")

train_subset = Dataset.from_dict({key: [dic[key] for dic in train_data] for key in train_subset[0]})
valid_subset = Dataset.from_dict({key: [dic[key] for dic in validation_data[:valid_samples]] for key in validation_data[0]})
print(f"valid_subset len: {len(valid_subset)}")
saved_model = os.path.join("saved_model_5", f"250_sample")
model = fine_tune_model(train_subset, valid_subset, saved_model)

# result = evaluate_model(saved_model)
# JGA.append(result["JGA"])
# ECE.append(result["ECE"])
# AUC.append(result["AUC"])

In [None]:
saved_model = os.path.join("saved_model_2", f"250_sample")
model = AutoModelForCausalLM.from_pretrained(saved_model, cache_dir="cache")
tokenizer = AutoTokenizer.from_pretrained(saved_model, cache_dir="cache")
!python run.py --model_name="saved_model" --dials_total=100 --temperature=1

In [None]:
# https://wandb.ai/jennysun-cs09/prompting-strategies/runs/czxogh1t
JGA.append(0.2468)
ECE.append(0.2366)
AUC.append(0.8219)

In [None]:
train_subset = train_data[:train_samples]
print(f"train_subset len: {len(train_subset)}")
train_data = train_data[train_samples:]
print(f"train_data len: {len(train_data)}")

### 500 Samples

In [None]:
saved_model = os.path.join("saved_model", f"250_sample")
model = AutoModelForCausalLM.from_pretrained(saved_model, cache_dir="cache")
tokenizer = AutoTokenizer.from_pretrained(saved_model, cache_dir="cache")

In [None]:
train_subset = train_data[:train_samples]
print(f"train_subset len: {len(train_subset)}")
train_data = train_data[train_samples:]
print(f"train_data len: {len(train_data)}")

train_subset = Dataset.from_dict({key: [dic[key] for dic in train_data] for key in train_subset[0]})
valid_subset = Dataset.from_dict({key: [dic[key] for dic in validation_data[:valid_samples]] for key in validation_data[0]})
print(f"valid_subset len: {len(valid_subset)}")
saved_model = os.path.join("saved_model", f"500_sample")
model = fine_tune_model(train_subset, valid_subset, saved_model)

In [None]:
saved_model = os.path.join("saved_model", f"500_sample")
model = AutoModelForCausalLM.from_pretrained(saved_model, cache_dir="cache")
tokenizer = AutoTokenizer.from_pretrained(saved_model, cache_dir="cache")
!python run.py --model_name="saved_model" --dials_total=100 --temperature=1

In [None]:
# https://wandb.ai/jennysun-cs09/prompting-strategies/runs/82ksgaq2
JGA.append(0.2624)
ECE.append(0.2328)
AUC.append(0.8223)

### 750 Samples

In [None]:
saved_model = os.path.join("saved_model", f"500_sample")
model = AutoModelForCausalLM.from_pretrained(saved_model, cache_dir="cache")
tokenizer = AutoTokenizer.from_pretrained(saved_model, cache_dir="cache")

In [None]:
train_subset = train_data[:train_samples]
print(f"train_subset len: {len(train_subset)}")
train_data = train_data[train_samples:]
print(f"train_data len: {len(train_data)}")

train_subset = Dataset.from_dict({key: [dic[key] for dic in train_data] for key in train_subset[0]})
valid_subset = Dataset.from_dict({key: [dic[key] for dic in validation_data[:valid_samples]] for key in validation_data[0]})
print(f"valid_subset len: {len(valid_subset)}")
saved_model = os.path.join("saved_model", f"750_sample")
model = fine_tune_model(train_subset, valid_subset, saved_model)

In [None]:
saved_model = os.path.join("saved_model", f"750_sample")
model = AutoModelForCausalLM.from_pretrained(saved_model, cache_dir="cache")
tokenizer = AutoTokenizer.from_pretrained(saved_model, cache_dir="cache")
!python run.py --model_name="saved_model" --dials_total=100 --temperature=1

In [None]:
# https://wandb.ai/jennysun-cs09/prompting-strategies/runs/l78q0sgn
JGA.append(0.2560)
ECE.append(0.2212)
AUC.append(0.8173)

### 1000 Samples

In [None]:
saved_model = os.path.join("saved_model", f"750_sample")
model = AutoModelForCausalLM.from_pretrained(saved_model, cache_dir="cache")
tokenizer = AutoTokenizer.from_pretrained(saved_model, cache_dir="cache")

In [None]:
train_subset = train_data[:train_samples]
print(f"train_subset len: {len(train_subset)}")
train_data = train_data[train_samples:]
print(f"train_data len: {len(train_data)}")

train_subset = Dataset.from_dict({key: [dic[key] for dic in train_data] for key in train_subset[0]})
valid_subset = Dataset.from_dict({key: [dic[key] for dic in validation_data[:valid_samples]] for key in validation_data[0]})
print(f"valid_subset len: {len(valid_subset)}")
saved_model = os.path.join("saved_model", f"1000_sample")
model = fine_tune_model(train_subset, valid_subset, saved_model)

In [None]:
saved_model = os.path.join("saved_model", f"1000_sample")
model = AutoModelForCausalLM.from_pretrained(saved_model, cache_dir="cache")
tokenizer = AutoTokenizer.from_pretrained(saved_model, cache_dir="cache")
!python run.py --model_name="saved_model" --dials_total=100 --temperature=1

In [None]:
# https://wandb.ai/jennysun-cs09/prompting-strategies/runs/o2ok12o3
JGA.append(0.2751)
ECE.append(0.2136)
AUC.append(0.8114)

### 1250 Samples

In [None]:
saved_model = os.path.join("saved_model", f"1000_sample")
model = AutoModelForCausalLM.from_pretrained(saved_model, cache_dir="cache")
tokenizer = AutoTokenizer.from_pretrained(saved_model, cache_dir="cache")

In [None]:
train_subset = train_data[:train_samples]
print(f"train_subset len: {len(train_subset)}")
train_data = train_data[train_samples:]
print(f"train_data len: {len(train_data)}")

train_subset = Dataset.from_dict({key: [dic[key] for dic in train_data] for key in train_subset[0]})
valid_subset = Dataset.from_dict({key: [dic[key] for dic in validation_data[:valid_samples]] for key in validation_data[0]})
print(f"valid_subset len: {len(valid_subset)}")
saved_model = os.path.join("saved_model", f"1250_sample")
model = fine_tune_model(train_subset, valid_subset, saved_model)

In [None]:
saved_model = os.path.join("saved_model", f"1250_sample")
model = AutoModelForCausalLM.from_pretrained(saved_model, cache_dir="cache")
tokenizer = AutoTokenizer.from_pretrained(saved_model, cache_dir="cache")
!python run.py --model_name="saved_model" --dials_total=100 --temperature=1

In [None]:
# https://wandb.ai/jennysun-cs09/prompting-strategies/runs/bn6z99b0
JGA.append(0.2624)
ECE.append(0.2328)
AUC.append(0.8223)

### 1500 Samples

In [None]:
saved_model = os.path.join("saved_model", f"1250_sample")
model = AutoModelForCausalLM.from_pretrained(saved_model, cache_dir="cache")
tokenizer = AutoTokenizer.from_pretrained(saved_model, cache_dir="cache")

In [None]:
train_subset = train_data[:train_samples]
print(f"train_subset len: {len(train_subset)}")
train_data = train_data[train_samples:]
print(f"train_data len: {len(train_data)}")

train_subset = Dataset.from_dict({key: [dic[key] for dic in train_data] for key in train_subset[0]})
valid_subset = Dataset.from_dict({key: [dic[key] for dic in validation_data[:valid_samples]] for key in validation_data[0]})
print(f"valid_subset len: {len(valid_subset)}")
saved_model = os.path.join("saved_model", f"1500_sample")
model = fine_tune_model(train_subset, valid_subset, saved_model)

In [None]:
saved_model = os.path.join("saved_model", f"1500_sample")
model = AutoModelForCausalLM.from_pretrained(saved_model, cache_dir="cache")
tokenizer = AutoTokenizer.from_pretrained(saved_model, cache_dir="cache")
!python run.py --model_name="saved_model" --dials_total=100 --temperature=1

### 1750 Samples

In [None]:
saved_model = os.path.join("saved_model", f"1500_sample")
model = AutoModelForCausalLM.from_pretrained(saved_model, cache_dir="cache")
tokenizer = AutoTokenizer.from_pretrained(saved_model, cache_dir="cache")

In [None]:
train_subset = train_data[:train_samples]
print(f"train_subset len: {len(train_subset)}")
train_data = train_data[train_samples:]
print(f"train_data len: {len(train_data)}")

train_subset = Dataset.from_dict({key: [dic[key] for dic in train_data] for key in train_subset[0]})
valid_subset = Dataset.from_dict({key: [dic[key] for dic in validation_data[:valid_samples]] for key in validation_data[0]})
print(f"valid_subset len: {len(valid_subset)}")
saved_model = os.path.join("saved_model", f"1750_sample")
model = fine_tune_model(train_subset, valid_subset, saved_model)

In [None]:
saved_model = os.path.join("saved_model", f"1750_sample")
model = AutoModelForCausalLM.from_pretrained(saved_model, cache_dir="cache")
tokenizer = AutoTokenizer.from_pretrained(saved_model, cache_dir="cache")
!python run.py --model_name="saved_model" --dials_total=100 --temperature=1

### 2000 Samples

In [None]:
saved_model = os.path.join("saved_model", f"1750_sample")
model = AutoModelForCausalLM.from_pretrained(saved_model, cache_dir="cache")
tokenizer = AutoTokenizer.from_pretrained(saved_model, cache_dir="cache")

In [None]:
train_subset = train_data[:train_samples]
print(f"train_subset len: {len(train_subset)}")
train_data = train_data[train_samples:]
print(f"train_data len: {len(train_data)}")

train_subset = Dataset.from_dict({key: [dic[key] for dic in train_data] for key in train_subset[0]})
valid_subset = Dataset.from_dict({key: [dic[key] for dic in validation_data[:valid_samples]] for key in validation_data[0]})
print(f"valid_subset len: {len(valid_subset)}")
saved_model = os.path.join("saved_model", f"2000_sample")
model = fine_tune_model(train_subset, valid_subset, saved_model)

In [None]:
saved_model = os.path.join("saved_model", f"2000_sample")
model = AutoModelForCausalLM.from_pretrained(saved_model, cache_dir="cache")
tokenizer = AutoTokenizer.from_pretrained(saved_model, cache_dir="cache")
!python run.py --model_name="saved_model" --dials_total=100 --temperature=1

In [None]:
plt.figure()
plt.plot(training_size, JGA, label="JGA")
plt.xlabel("Number of Training Samples (Turn)")
plt.ylabel("JGA")
plt.legend()
plt.show()

plt.figure()
plt.plot(training_size, ECE, label="ECE")
plt.xlabel("Number of Training Samples (Turn)")
plt.ylabel("ECE")
plt.legend()
plt.show()

plt.figure()
plt.plot(training_size, AUC, label="AUC")
plt.xlabel("Number of Training Samples (Turn)")
plt.ylabel("AUC")
plt.legend()
plt.show()

### Testing

In [None]:
save_path = "saved_model"
model = AutoModelForCausalLM.from_pretrained(save_path, cache_dir="cache")
tokenizer = AutoTokenizer.from_pretrained(save_path, cache_dir="cache")

In [None]:
!python run.py --model_name="saved_model" --dials_total=100 --temperature=1

In [None]:
# Defining a custom stopping criteria class for the model's text generation.
from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer

class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        stop_ids = [74694,55375,5658,14196]  # IDs of tokens where the generation should stop.
        for stop_id in stop_ids:
            if input_ids[0][-1] == stop_id:  # Checking if the last generated token is a stop token.
                return True
        return False

stop = StopOnTokens()

In [None]:
streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)

In [None]:
data_index = 10
input_text = validation_data[data_index]['text']
labels = validation_data[data_index]['labels']
inputs = tokenizer(input_text, return_tensors="pt").input_ids.cuda()

In [None]:
print(input_text)

In [None]:
outputs = model.generate(
    inputs,
    streamer=streamer,
    max_new_tokens=200,  # Adjust max_length as needed
    early_stopping=True,
    top_p=0.95,
    top_k=50,
    temperature=0.7,
    repetition_penalty=1.0,
    num_beams=1,
    stopping_criteria=StoppingCriteriaList([stop])
)
output = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
print("### output #### : ", output.strip())
print("### labels #### : ", labels)