In [12]:
import pandas as pd
import torch, math, os, re
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from bert_score import score
from nltk import ngrams
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel
from tqdm import tqdm
import json
from datasets import Dataset

In [13]:
# Load files
targets_df = pd.read_csv("targets/test_targets.csv")
structured_df = pd.read_csv("structured/test_structured.csv")

In [14]:
from collections import defaultdict

train_df = pd.read_csv("data/train_sent_emo.csv")
dev_df = pd.read_csv("data/dev_sent_emo.csv")
test_df = pd.read_csv("data/test_sent_emo.csv")

def create_formatted_inputs(df):
    dialogues = defaultdict(list)

    # Group utterances by dialogue
    for _, row in df.iterrows():
        dialogues[row["Dialogue_ID"]].append((row["Speaker"], row["Utterance"], row["Emotion"]))

    # Convert to list format for easy processing
    dialogues = list(dialogues.values())

    # Generate formatted inputs with context
    formatted_inputs = []
    for dialogue in dialogues:
        context = ""
        for idx, (speaker, utterance, emotion) in enumerate(dialogue):
            context += f"{speaker}: {utterance}\n"

            last_speaker = speaker
            # Determine next speaker based on alternation if possible
            if idx + 1 < len(dialogue):
                next_speaker = dialogue[idx + 1][0]
            else:
                next_speaker = speaker  # Default to last speaker if no next available

            prompt = r"""### INSTRUCTIONS ###
Continue the conversation by generating **only the next line** spoken by the indicated character.
Your response must be empathetic, showing understanding or emotional attunement to the preceding dialogue.

### EXAMPLE ###

=== DIALOGUE HISTORY ===
Rachel: Hey!
Ross: Hi!
Rachel: What are you doing here?
Ross: Ah y'know, this building is on my paper route so I...
Rachel: Oh.
Ross: Hi.
Rachel: Hi.
Ross: How'd did it go?
Rachel: Oh well, the woman I interviewed with was pretty tough, but y'know thank God Mark coached me, because once I started talking about the fall line, she got all happy and wouldn't shut up.
Ross:

=== RESPONSE ===
That sounds like a huge relief.

### TASK ###

=== DIALOGUE HISTORY ===
{dialogue_hist}

=== RESPONSE ===
            """

            # full_input = (
            #     "### TASK ###\n"
            #     "Continue the conversation by generating **only one line** as the next speaker.\n"
            #     "This response should be **empathetic**, acknowledging or reflecting the emotional tone of the previous dialogue.\n"
            #     "DO NOT generate multiple lines.\n"
            #     "DO NOT summarize, analyze, or explain.\n"
            #     "Only generate one line and nothing more.\n\n"
            #     "### DIALOGUE HISTORY ###\n"
            #     f"{context.strip()}\n"
            #     f"{next_speaker}:"
            # )

            formatted_inputs.append(prompt.format(dialogue_hist=f"{context}{next_speaker}:"))

    return formatted_inputs

# Apply to each split
train_formatted_inputs = create_formatted_inputs(train_df)
dev_formatted_inputs = create_formatted_inputs(dev_df)
test_formatted_inputs = create_formatted_inputs(test_df)

print(f"Train inputs: {len(train_formatted_inputs)}")
print(f"Dev inputs: {len(dev_formatted_inputs)}")
print(f"Test inputs: {len(test_formatted_inputs)}")

Train inputs: 9989
Dev inputs: 1109
Test inputs: 2610


In [28]:
print(train_formatted_inputs[0])

### INSTRUCTIONS ###
Continue the conversation by generating **only the next line** spoken by the indicated character.
Your response must be empathetic, showing understanding or emotional attunement to the preceding dialogue.

### EXAMPLE ###

=== DIALOGUE HISTORY ===
Rachel: Hey!
Ross: Hi!
Rachel: What are you doing here?
Ross: Ah y'know, this building is on my paper route so I...
Rachel: Oh.
Ross: Hi.
Rachel: Hi.
Ross: How'd did it go?
Rachel: Oh well, the woman I interviewed with was pretty tough, but y'know thank God Mark coached me, because once I started talking about the fall line, she got all happy and wouldn't shut up.
Ross:

=== RESPONSE ===
That sounds like a huge relief.

### TASK ###

=== DIALOGUE HISTORY ===
Chandler: also I was the point person on my company’s transition from the KL-5 to GR-6 system.
The Interviewer:

=== RESPONSE ===
            


In [None]:
# Initialize model/tokenizer
os.environ["HF_TOKEN"] = "HUGGING_FACE_TOKEN"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True
)
model.eval()

Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.99s/it]


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32768, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): Mist

In [16]:
# Generate responses
generated_responses = []
first_sentences = []
tokenizer.pad_token = tokenizer.eos_token
sentence_splitter = re.compile(r"(?<=[\.\?!…])\s+")
for prompt in tqdm(test_formatted_inputs, desc="Generating responses"):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=25,
            # do_sample=True,
            # top_k=50,
            # top_p=0.95,
            pad_token_id=tokenizer.pad_token_id
        )
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Remove prompt from beginning if present
    if generated.startswith(prompt):
        generated = generated[len(prompt):].strip()
    generated_responses.append(generated)

    # Extract the first sentence using regex
    sentences = sentence_splitter.split(generated.strip())
    first_sentence = sentences[0].strip() if sentences else ""
    first_sentences.append(first_sentence)

# Match Dialogue_ID and Utterance_ID
structured_ids = structured_df[["Dialogue_ID", "Utterance_ID"]]
baseline_df = structured_ids.copy()
baseline_df["Response"] = first_sentences
baseline_df["Original Response"] = generated_responses

# Save to file
os.makedirs("baseline", exist_ok=True)
baseline_df.to_csv("baseline/mistral_25_test_baseline.csv", index=False)

Generating responses: 100%|██████████| 2610/2610 [37:49<00:00,  1.15it/s]


In [17]:
# 1. Perplexity
def compute_perplexity(sentences):
    total_loss = 0.0
    total_tokens = 0

    for sent in sentences:
        enc = tokenizer.encode(sent, return_tensors='pt').to(device)
        with torch.no_grad():
            outputs = model(enc, labels=enc)
        loss = outputs.loss  # This is *average loss per token* for the sentence
        num_tokens = enc.size(1)  # Sequence length

        total_loss += loss.item() * num_tokens  # Recover total loss for all tokens
        total_tokens += num_tokens

    avg_loss = total_loss / total_tokens
    ppl = math.exp(avg_loss)
    return ppl


# 2. Distinct-n
def compute_dist_n(sentences, n):
    all_ngrams = []
    for sent in sentences:
        tokens = sent.split()
        all_ngrams.extend(ngrams(tokens, n))
    total = len(all_ngrams)
    unique = len(set(all_ngrams))
    return unique / total if total > 0 else 0

In [18]:
# Get targets
reference_responses = structured_df["Response"].tolist()

trimmed_responses = baseline_df["Response"].tolist()

# Metrics computation
ppl = compute_perplexity(trimmed_responses)
dist1 = compute_dist_n(trimmed_responses, 1)
dist2 = compute_dist_n(trimmed_responses, 2)
# bertscore = compute_bertscore(generated_responses, reference_responses)
P, R, F1 = score(trimmed_responses, reference_responses, lang="en")
bertscore = {
    "precision": P.mean().item(),
    "recall": R.mean().item(),
    "f1": F1.mean().item()
}

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Collect all metrics
metrics = {
    "Perplexity": ppl,
    "Dist-1": dist1,
    "Dist-2": dist2,
    "BERTScore": bertscore
}

# Print to console
for k, v in metrics.items():
    print(f"{k}:", v)

# Save to JSON
os.makedirs("baseline", exist_ok=True)
with open("baseline/mistral_25_baseline_metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

Perplexity: 23.559440080183258
Dist-1: 0.09818864590236359
Dist-2: 0.31100787198348173
BERTScore: {'precision': 0.8756473660469055, 'recall': 0.8778427243232727, 'f1': 0.8765256404876709}


In [None]:
os.environ["HF_TOKEN"] = "HUGGING_FACE_TOKEN"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True
)

# Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    # bias="none",
    task_type="CAUSAL_LM",
    # target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)
model = get_peft_model(model, lora_config)

Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.69s/it]


In [6]:
# Load and preprocess your dataset
df = pd.read_csv("targets/train_targets.csv")
dataset = Dataset.from_pandas(df)

# Attach train_formatted_inputs directly to the dataset
dataset = dataset.add_column("Formatted_Prompt", train_formatted_inputs)

def tokenize(batch):
    texts = [
        prompt + tokenizer.eos_token + response
        for prompt, response in zip(batch["Formatted_Prompt"], batch["Response"])
    ]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=2048)

tokenized_dataset = dataset.map(tokenize, batched=True)


Map: 100%|██████████| 9989/9989 [00:04<00:00, 2154.05 examples/s]


In [7]:
# Training setup
training_args = TrainingArguments(
    output_dir="mistral-lora-checkpoints",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=100,
    save_total_limit=2,
    fp16=True,
    report_to="none"
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Train
trainer.train()

# Save
model.save_pretrained("mistral-lora-finetuned")
tokenizer.save_pretrained("mistral-lora-finetuned")

Step,Training Loss
50,1.3588
100,0.7198
150,0.6837
200,0.6743
250,0.6793
300,0.6522
350,0.6509


In [6]:
# Load tokenizer and base model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained("mistral-lora-finetuned", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Load PEFT config and fine-tuned model
config = PeftConfig.from_pretrained("mistral-lora-finetuned")
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True
)
model = PeftModel.from_pretrained(base_model, "mistral-lora-finetuned")

# Move to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

Loading checkpoint shards: 100%|██████████| 3/3 [02:12<00:00, 44.03s/it]


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj):

In [8]:
# Generate responses
targets_df = pd.read_csv("targets/test_targets.csv")
structured_df = pd.read_csv("structured/test_structured.csv")

generated_responses = []
first_sentences = []

sentence_splitter = re.compile(r"(?<=[\.\?!…])\s+")
for prompt in tqdm(test_formatted_inputs, desc="Generating responses"):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=25,
            # do_sample=True,
            # top_k=50,
            # top_p=0.95,
            pad_token_id=tokenizer.pad_token_id
        )
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if generated.startswith(prompt):
        generated = generated[len(prompt):].strip()
    generated_responses.append(generated)

    sentences = sentence_splitter.split(generated.strip())
    first_sentence = sentences[0].strip() if sentences else ""
    first_sentences.append(first_sentence)

# Match Dialogue_ID and Utterance_ID
structured_ids = structured_df[["Dialogue_ID", "Utterance_ID"]]
baseline_df = structured_ids.copy()
baseline_df["Response"] = first_sentences
baseline_df["Original Response"] = generated_responses

# Save to file
os.makedirs("baseline", exist_ok=True)
baseline_df.to_csv("baseline/mistral_test_finetuned.csv", index=False)

Generating responses: 100%|██████████| 2610/2610 [32:54<00:00,  1.32it/s]


In [10]:
# Get targets
reference_responses = structured_df["Response"].tolist()

trimmed_responses = baseline_df["Response"].tolist()

# Metrics computation
ppl = compute_perplexity(trimmed_responses)
dist1 = compute_dist_n(trimmed_responses, 1)
dist2 = compute_dist_n(trimmed_responses, 2)
# bertscore = compute_bertscore(generated_responses, reference_responses)
P, R, F1 = score(trimmed_responses, reference_responses, lang="en")
bertscore = {
    "precision": P.mean().item(),
    "recall": R.mean().item(),
    "f1": F1.mean().item()
}

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Collect all metrics
metrics = {
    "Perplexity": ppl,
    "Dist-1": dist1,
    "Dist-2": dist2,
    "BERTScore": bertscore
}

# Print to console
for k, v in metrics.items():
    print(f"{k}:", v)

# Save to JSON
os.makedirs("baseline", exist_ok=True)
with open("baseline/mistral_finetuned_metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

Perplexity: 17.815717100430387
Dist-1: 0.06838336727320501
Dist-2: 0.19654324902877984
BERTScore: {'precision': 0.8906997442245483, 'recall': 0.8879204392433167, 'f1': 0.8890686631202698}
