# Load Fine-Tuned Model

In [1]:
import nbimporter
from finetune_mllm import EmpatheticMLLM

  warn(f"Failed to load image Python extension: {e}")


In [2]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = EmpatheticMLLM()
model.load_state_dict(torch.load("finetuned_mllm.pth"))
model.to(device)
model.eval()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

EmpatheticMLLM(
  (llm): PeftModelForCausalLM(
    (base_model): LoraModel(
      (model): MistralForCausalLM(
        (model): MistralModel(
          (embed_tokens): Embedding(32768, 4096)
          (layers): ModuleList(
            (0-31): 32 x MistralDecoderLayer(
              (self_attn): MistralAttention(
                (q_proj): lora.Linear4bit(
                  (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=4096, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=8, out_features=4096, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magn

# Generate Responses for Test Set

In [3]:
from finetune_mllm import prepare_split, MultimodalMELD
from torch.utils.data import DataLoader

test_data = prepare_split('test')
test_dataset = MultimodalMELD(test_data)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [4]:
from tqdm import tqdm

generated_responses = []
target_responses = []
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Generating Responses"):
        x = {
            'multimodal_embed': batch['multimodal_embed'].to(device),
            'prompt': batch['prompt']
        }

        generated_ids = model(x)
        generated_response = model.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        generated_responses.append(generated_response)
        target_responses.append(batch['target_response'][0])

Generating Responses:   0%|          | 0/2157 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Responses:   0%|          | 1/2157 [00:01<54:26,  1.51s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Responses:   0%|          | 2/2157 [00:02<33:03,  1.09it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Responses:   0%|          | 3/2157 [00:03<36:46,  1.02s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Responses:   0%|          | 4/2157 [00:03<32:27,  1.11it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Responses:   0%|          | 5/2157 [00:05<35:43,  1.00it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Responses:   0%|          | 6/2157 [00:05<31:50,  1.13it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Responses:   0%|          | 7/2

In [5]:
import csv

with open('mllm_test_outputs.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Prompt', 'Generated Response', 'Target Response'])

    for i in range(len(test_data)):
        prompt = test_data[i]['Prompt']
        generated = generated_responses[i]
        target = target_responses[i]
        writer.writerow([prompt, generated, target])

# Calculate BERTScore

In [6]:
from bert_score import score

P, R, F1 = score(generated_responses, target_responses, lang="en")

print(f"PBERT: {P.mean():.4f}")
print(f"RBERT: {R.mean():.4f}")
print(f"FBERT: {F1.mean():.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PBERT: 0.8883
RBERT: 0.8961
FBERT: 0.8919


# Calculate Perplexity

In [7]:
import torch
import math
from tqdm import tqdm

total_log_likelihood = 0.0
total_token_count = 0

for response in tqdm(target_responses, desc="Calculating Perplexity"):
    input_ids = model.tokenizer.encode(response, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model.llm(input_ids, labels=input_ids)
    loss = outputs.loss  # average loss over tokens in this sequence
    num_tokens = input_ids.size(1)
    
    # multiply by number of tokens to get total negative log-likelihood for this sequence
    total_log_likelihood += loss.item() * num_tokens
    total_token_count += num_tokens

# compute per-token perplexity
avg_loss = total_log_likelihood / total_token_count
perplexity = math.exp(avg_loss)

print(f"Per-token PPL: {perplexity:.4f}")

Calculating Perplexity: 100%|██████████| 2157/2157 [02:34<00:00, 14.00it/s]

Per-token PPL: 22.3506





# Calculate Diversity

In [8]:
from nltk import ngrams

def compute_dist_n(responses, n):
    all_ngrams = []
    for response in responses:
        tokens = response.split()
        all_ngrams.extend(ngrams(tokens, n))
    total = len(all_ngrams)
    unique = len(set(all_ngrams))
    return unique / total if total > 0 else 0

dist1 = compute_dist_n(generated_responses, 1)
dist2 = compute_dist_n(generated_responses, 2)

print(f"Dist-1: {dist1:.4f}")
print(f"Dist-2: {dist2:.4f}")

Dist-1: 0.0914
Dist-2: 0.2907
