In [12]:
import pandas as pd
import torch, math, os, re, json
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset
from peft import get_peft_model, LoraConfig, TaskType, PeftModel, PeftConfig
from bert_score import score
from nltk import ngrams
from collections import Counter
from tqdm import tqdm

In [2]:
# Load files
targets_df = pd.read_csv("targets/test_targets.csv")
structured_df = pd.read_csv("structured/test_structured.csv")

In [3]:
# Initialize model/tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [4]:
tokenizer.pad_token = tokenizer.eos_token  # for safety

# Count tokens for each prompt
token_counts = [len(tokenizer.encode(prompt)) for prompt in targets_df["Prompt"]]

# Display stats
print(f"Max prompt length (in tokens): {max(token_counts)}")
print(f"Average prompt length (in tokens): {sum(token_counts) / len(token_counts):.2f}")
print(f"Number of prompts: {len(token_counts)}")

Max prompt length (in tokens): 829
Average prompt length (in tokens): 203.61
Number of prompts: 2610


In [5]:
# Generate responses
generated_responses = []
first_sentences = []
tokenizer.pad_token = tokenizer.eos_token
sentence_splitter = re.compile(r"(?<=[\.\?!…])\s+")
for prompt in tqdm(targets_df["Prompt"], desc="Generating responses"):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            pad_token_id=tokenizer.pad_token_id
        )
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Remove prompt from beginning if present
    if generated.startswith(prompt):
        generated = generated[len(prompt):].strip()
    generated_responses.append(generated)

    # Extract the first sentence using regex
    sentences = sentence_splitter.split(generated.strip())
    first_sentence = sentences[0].strip() if sentences else ""
    first_sentences.append(first_sentence)

# Match Dialogue_ID and Utterance_ID
structured_ids = structured_df[["Dialogue_ID", "Utterance_ID"]]
baseline_df = structured_ids.copy()
baseline_df["Response"] = first_sentences
baseline_df["Original Response"] = generated_responses

# Save to file
os.makedirs("baseline", exist_ok=True)
baseline_df.to_csv("baseline/test_baseline.csv", index=False)

Generating responses: 100%|██████████| 2610/2610 [21:33<00:00,  2.02it/s]


In [4]:
# 1. Perplexity
def compute_perplexity(sentences):
    ppl_list = []
    for sent in sentences:
        enc = tokenizer.encode(sent, return_tensors='pt').to(device)
        with torch.no_grad():
            outputs = model(enc, labels=enc)
        loss = outputs.loss
        ppl = math.exp(loss.item())
        ppl_list.append(ppl)
    return sum(ppl_list) / len(ppl_list)

# 2. Distinct-n
def compute_dist_n(sentences, n):
    all_ngrams = []
    for sent in sentences:
        tokens = sent.split()
        all_ngrams.extend(ngrams(tokens, n))
    total = len(all_ngrams)
    unique = len(set(all_ngrams))
    return unique / total if total > 0 else 0

In [12]:
# Get targets
reference_responses = structured_df["Response"].tolist()

# Metrics computation
ppl = compute_perplexity(generated_responses)
dist1 = compute_dist_n(generated_responses, 1)
dist2 = compute_dist_n(generated_responses, 2)
# bertscore = compute_bertscore(generated_responses, reference_responses)
P, R, F1 = score(generated_responses, reference_responses, lang="en")
bertscore = {
    "precision": P.mean().item(),
    "recall": R.mean().item(),
    "f1": F1.mean().item()
}

# Print results
# print("Perplexity:", ppl)
# print("Dist-1:", dist1)
# print("Dist-2:", dist2)
# print("BERTScore:", bertscore)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Collect all metrics
metrics = {
    "Perplexity": ppl,
    "Dist-1": dist1,
    "Dist-2": dist2,
    "BERTScore": bertscore
}

# Print to console
for k, v in metrics.items():
    print(f"{k}:", v)

# Save to JSON
os.makedirs("baseline", exist_ok=True)
with open("baseline/test_baseline_metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

Perplexity: 14.028045468447138
Dist-1: 0.07552239508002022
Dist-2: 0.3687123817712812
BERTScore: {'precision': 0.792173445224762, 'recall': 0.8516309857368469, 'f1': 0.8206309080123901}


In [16]:
# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # avoid padding warning
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias='none',
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)



In [17]:
# Load and preprocess your dataset
df = pd.read_csv("targets/train_targets.csv")
dataset = Dataset.from_pandas(df)

def tokenize(batch):
    texts = [
        prompt + tokenizer.eos_token + response
        for prompt, response in zip(batch["Prompt"], batch["Response"])
    ]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=1024)

tokenized_dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/9989 [00:00<?, ? examples/s]

In [None]:
# Training setup
training_args = TrainingArguments(
    output_dir="gpt2-lora-checkpoints",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=100,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [19]:
# Train
trainer.train()

# Save
model.save_pretrained("gpt2-lora-finetuned")
tokenizer.save_pretrained("gpt2-lora-finetuned")

Step,Training Loss
10,3.4778
20,3.3546
30,3.3669
40,3.4142
50,3.3099
60,3.2685
70,3.3674
80,3.2064
90,3.085
100,3.1149


('gpt2-lora-finetuned\\tokenizer_config.json',
 'gpt2-lora-finetuned\\special_tokens_map.json',
 'gpt2-lora-finetuned\\vocab.json',
 'gpt2-lora-finetuned\\merges.txt',
 'gpt2-lora-finetuned\\added_tokens.json')

In [9]:
# Load tokenizer and base model
tokenizer = AutoTokenizer.from_pretrained("gpt2-lora-finetuned")
tokenizer.pad_token = tokenizer.eos_token  # Important!

# Load PEFT config and fine-tuned model
config = PeftConfig.from_pretrained("gpt2-lora-finetuned")
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, "gpt2-lora-finetuned")

# Move to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
           

In [10]:
# Generate responses
generated_responses = []
first_sentences = []

sentence_splitter = re.compile(r"(?<=[\.\?!…])\s+")
for prompt in tqdm(targets_df["Prompt"], desc="Generating responses"):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            pad_token_id=tokenizer.pad_token_id
        )
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Remove prompt from beginning if present
    if generated.startswith(prompt):
        generated = generated[len(prompt):].strip()
    generated_responses.append(generated)

    # Extract the first sentence using regex
    sentences = sentence_splitter.split(generated.strip())
    first_sentence = sentences[0].strip() if sentences else ""
    first_sentences.append(first_sentence)

# Match Dialogue_ID and Utterance_ID
structured_ids = structured_df[["Dialogue_ID", "Utterance_ID"]]
baseline_df = structured_ids.copy()
baseline_df["Response"] = first_sentences
baseline_df["Original Response"] = generated_responses

# Save to file
os.makedirs("baseline", exist_ok=True)
baseline_df.to_csv("baseline/test_finetuned.csv", index=False)

Generating responses: 100%|██████████| 2610/2610 [28:39<00:00,  1.52it/s]


In [11]:
# Get targets
reference_responses = structured_df["Response"].tolist()

# Metrics computation
ppl = compute_perplexity(generated_responses)
dist1 = compute_dist_n(generated_responses, 1)
dist2 = compute_dist_n(generated_responses, 2)

P, R, F1 = score(generated_responses, reference_responses, lang="en")
bertscore = {
    "precision": P.mean().item(),
    "recall": R.mean().item(),
    "f1": F1.mean().item()
}

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Collect all metrics
metrics = {
    "Perplexity": ppl,
    "Dist-1": dist1,
    "Dist-2": dist2,
    "BERTScore": bertscore
}

# Print to console
for k, v in metrics.items():
    print(f"{k}:", v)

# Save to JSON
os.makedirs("baseline", exist_ok=True)
with open("baseline/test_finetuned_metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

Perplexity: 8.42267339915282
Dist-1: 0.0483044691669534
Dist-2: 0.28082316045926964
BERTScore: {'precision': 0.7918421626091003, 'recall': 0.8541335463523865, 'f1': 0.8216411471366882}
