<a href="https://colab.research.google.com/github/januverma/llm-cross-encoders-for-recsys/blob/main/LLM_Classifier_for_Movie_Rec_only_prompt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tune an LLM for Movie Rating Prediction

## Install Dependencies

In [None]:
! pip install -U "transformers[torch]" datasets accelerate peft bitsandbytes evaluate



## Load and Process Data

In [None]:
import ast
import json
import re

import pandas as pd
from datasets import Dataset

In [None]:
# Read the train and test interaction tables from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./train_data_it.csv', './test_data_it.csv')
)

In [None]:
# Each past_movies cell holds the string form of a Python list. Parse it with
# ast.literal_eval, which only accepts literals, rather than eval, which would
# execute any expression that happens to be stored in the CSV.
train_data['past_movies'] = train_data['past_movies'].apply(ast.literal_eval)
test_data['past_movies'] = test_data['past_movies'].apply(ast.literal_eval)

In [None]:
train_data.head()

Unnamed: 0,userId,movie_count,past_movies,past_movie_ids,candidate,movieId,rating
0,1,11,"[Girl, Interrupted (1999):::Drama:::4, Titanic...","[3186, 1721, 1270, 1022, 2340, 1836, 3408, 120...",Wallace & Gromit: The Best of Aardman Animatio...,720,3.0
1,1,11,[One Flew Over the Cuckoo's Nest (1975):::Dram...,"[1193, 919, 608, 2692, 1961, 2028, 3105, 938, ...",Mary Poppins (1964):::Children's|Comedy|Musical,1028,5.0
2,1,11,"[Bambi (1942):::Animation|Children's:::4, Apol...","[2018, 150, 1097, 914, 1287, 2797, 1246, 2762,...","Secret Garden, The (1993):::Children's|Drama",531,4.0
3,1,11,[Toy Story 2 (1999):::Animation|Children's|Com...,"[3114, 2791, 1029, 2321, 1197, 594, 2398, 1545...",Beauty and the Beast (1991):::Animation|Childr...,595,5.0
4,1,3,[Aladdin (1992):::Animation|Children's|Comedy|...,"[588, 1]",Tarzan (1999):::Animation|Children's,2687,3.0


In [None]:
test_data.head()

Unnamed: 0,userId,past_movies,past_movie_ids,candidate,movieId,rating
0,238,"[Philadelphia (1993):::Drama:::4, Crimes of th...","[508, 2738, 3546, 1747, 3135, 3071, 3194, 85, ...",Clara's Heart (1988):::Drama,3714,3.0
1,591,"[Postino, Il (The Postman) (1994):::Drama|Roma...","[58, 3370, 2611, 3765, 2000, 1246, 509, 3135, ...","Untouchables, The (1987):::Action|Crime|Drama",2194,3.0
2,165,"[Man Who Knew Too Much, The (1956):::Thriller:...","[2183, 2700, 3061, 2863, 2946, 2747, 3922, 203...",Repo Man (1984):::Comedy|Sci-Fi,1965,4.0
3,631,[Superman (1978):::Action|Adventure|Sci-Fi:::5...,"[2640, 2302, 3081, 1732, 1042, 440, 986, 2278,...",God Said 'Ha!' (1998):::Comedy,2499,5.0
4,588,[Singin' in the Rain (1952):::Musical|Romance:...,"[899, 2130, 1734, 2697, 3588, 1247, 1041, 3545...",What Ever Happened to Baby Jane? (1962):::Dram...,3546,4.0


In [None]:
train_data.iloc[0]

Unnamed: 0,0
userId,1
movie_count,11
past_movies,"[Girl, Interrupted (1999):::Drama:::4, Titanic..."
past_movie_ids,"[3186, 1721, 1270, 1022, 2340, 1836, 3408, 120..."
candidate,Wallace & Gromit: The Best of Aardman Animatio...
movieId,720
rating,3.0


In [None]:
train_data.past_movies.iloc[1]

["One Flew Over the Cuckoo's Nest (1975):::Drama:::5",
 "Wizard of Oz, The (1939):::Adventure|Children's|Drama|Musical:::4",
 'Fargo (1996):::Crime|Drama|Thriller:::4',
 'Run Lola Run (Lola rennt) (1998):::Action|Crime|Romance:::4',
 'Rain Man (1988):::Drama:::5',
 'Saving Private Ryan (1998):::Action|Drama|War:::5',
 'Awakenings (1990):::Drama:::5',
 'Gigi (1958):::Musical:::4',
 'Sound of Music, The (1965):::Musical:::5',
 'Driving Miss Daisy (1989):::Drama:::4']

In [None]:
# Every history entry is ":::"-separated; keep only the title, i.e. the text
# that precedes the first ":::" separator.
train_data['past_movies'] = train_data['past_movies'].apply(
    lambda entries: [entry.partition(':::')[0] for entry in entries]
)
test_data['past_movies'] = test_data['past_movies'].apply(
    lambda entries: [entry.partition(':::')[0] for entry in entries]
)

In [None]:
train_data.past_movies.iloc[1]

["One Flew Over the Cuckoo's Nest (1975)",
 'Wizard of Oz, The (1939)',
 'Fargo (1996)',
 'Run Lola Run (Lola rennt) (1998)',
 'Rain Man (1988)',
 'Saving Private Ryan (1998)',
 'Awakenings (1990)',
 'Gigi (1958)',
 'Sound of Music, The (1965)',
 'Driving Miss Daisy (1989)']

In [None]:
test_data.past_movies.iloc[1]

['Postino, Il (The Postman) (1994)',
 'Betrayed (1988)',
 'Winslow Boy, The (1998)',
 'Hot Spot, The (1990)',
 'Lethal Weapon (1987)',
 'Dead Poets Society (1989)',
 'Piano, The (1993)',
 'Great Santini, The (1979)',
 'Quiz Show (1994)',
 'Breaking the Waves (1996)']

## Create Hugging Face datasets

In [None]:
train_dataset = Dataset.from_pandas(train_data)
train_dataset

Dataset({
    features: ['userId', 'movie_count', 'past_movies', 'past_movie_ids', 'candidate', 'movieId', 'rating'],
    num_rows: 84297
})

In [None]:
test_dataset = Dataset.from_pandas(test_data)
test_dataset

Dataset({
    features: ['userId', 'past_movies', 'past_movie_ids', 'candidate', 'movieId', 'rating'],
    num_rows: 2000
})

In [None]:
# Validation split: the first `val_sample_size` rows of test_dataset (or all of
# them when it has fewer rows).
# NOTE(review): these rows come from test_dataset itself, so anything selected
# using val_dataset (e.g. a best checkpoint) has already seen test rows —
# consider holding out part of the training data instead.
val_sample_size = 1000  # Adjust the sample size as needed
val_dataset = test_dataset.select(range(min(val_sample_size, len(test_dataset))))

## Tokenize

In [None]:
import os
import numpy as np
import torch
import evaluate
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, TrainingArguments, Trainer,
    BitsAndBytesConfig
)

In [None]:
# Hugging Face checkpoint id used for both the tokenizer and the model.
MODEL_NAME = "Qwen/Qwen2.5-7B"
# Number of output classes; labels are built as int(rating) - 1.
NUM_LABELS = 5
# max_length handed to the tokenizer when truncating prompts.
MAX_LEN    = 512

In [None]:
# Load the tokenizer that belongs to MODEL_NAME.
tok = AutoTokenizer.from_pretrained(MODEL_NAME)
# Disabled: no extra "[CAND]" special token is registered, so this cell leaves
# the tokenizer vocabulary unchanged.
# tok.add_special_tokens({"additional_special_tokens": ["[CAND]"]})

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

In [None]:
def preprocess(ex):
    """
    Turn one dataset row into tokenized model inputs plus a class label.

    Args:
        ex: Mapping with "past_movies" (list of strings), "candidate" (string)
            and "rating" (number).

    Returns:
        The tokenizer output for the prompt (truncated to MAX_LEN tokens) with
        an added "labels" entry equal to int(rating) - 1.
    """
    history_block = "\n".join(ex["past_movies"])
    candidate_movie = ex["candidate"]

    instruction = (
        "Given a user's past watched movies, predict the rating the user will give to the candidate movie. "
        "The ratings vary from 1.0 to 5.0.\n"
    )
    question = f"The candidate movie is {candidate_movie}. The rating given by user is?"
    prompt = instruction + history_block + "\n" + question

    encoded = tok(prompt, truncation=True, max_length=MAX_LEN)
    # Shift the rating down by one so class ids start at 0.
    encoded["labels"] = int(ex["rating"]) - 1
    return encoded


In [None]:
# Tokenize the three splits in order (train, validation, test). The raw columns
# are dropped so each result holds only what preprocess returns.
train_tok, val_tok, test_tok = (
    split.map(preprocess, remove_columns=split.column_names)
    for split in (train_dataset, val_dataset, test_dataset)
)

Map:   0%|          | 0/84297 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

## Load Model

In [None]:
## BnB config
# 4-bit weights with double quantization. The compute dtype is set explicitly to
# bfloat16 so the quantized layers compute in the dtype the model is loaded in;
# when left unset, bitsandbytes computes in float32, which is slower.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Quantized backbone with a sequence-classification head of NUM_LABELS outputs.
base = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    device_map="auto",
    quantization_config=bnb_cfg,
    torch_dtype=torch.bfloat16,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
base.config.pad_token_id = tok.pad_token_id   # keep config consistent

In [None]:
# No "[CAND]" token is registered (that call is commented out in the tokenizer
# cell), so a resize is only needed if the tokenizer has more entries than the
# embedding matrix has rows. Resizing unconditionally would shrink the matrix
# whenever the checkpoint ships more rows than the tokenizer uses.
embedding_rows = base.get_input_embeddings().weight.shape[0]
if len(tok) > embedding_rows:
    base.resize_token_embeddings(len(tok))
base.get_input_embeddings()

Embedding(151665, 3584)

## LoRA Setup

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings for the sequence-classification task; adapters are
# attached to the four projection modules named in target_modules.
lora_cfg = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Qwen attention
)
# NOTE(review): `base` is loaded in 4-bit; PEFT provides
# `prepare_model_for_kbit_training` for quantized models — confirm whether it
# should be applied before wrapping.
model = get_peft_model(base, lora_cfg)
# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()

trainable params: 10,110,464 || all params: 7,079,317,504 || trainable%: 0.1428


## Training Setup

In [None]:
# bf16 needs hardware support, not merely a CUDA device; asking for it on a GPU
# without bf16 support makes TrainingArguments reject the configuration.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

args = TrainingArguments(
    output_dir      = "./qwen_cls_lora",
    per_device_train_batch_size = 4,
    per_device_eval_batch_size  = 4,
    gradient_accumulation_steps = 8,   # effective 32
    num_train_epochs = 5,
    learning_rate   = 2e-5,
    bf16            = use_bf16,
    logging_steps   = 50,
    eval_strategy = "steps",
    eval_steps = 500,
    save_strategy   = "steps",
    # load_best_model_at_end needs checkpoints aligned with evaluations, so the
    # save interval is spelled out instead of relying on its default.
    save_steps = 500,
    load_best_model_at_end = True,
    metric_for_best_model = "accuracy",
    greater_is_better = True,   # higher accuracy is better
    label_names=["labels"],
)

In [None]:
# -- 4. Metrics -----------------------------------------------------------
# The metric handle is deliberately not called `acc`: that name is reassigned to
# a plain float later in the notebook, which would break compute_metrics on any
# subsequent evaluation.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(pred):
    """
    Compute classification accuracy for the Trainer.

    Args:
        pred: A (logits, labels) pair as passed by the Trainer.

    Returns:
        dict: {"accuracy": <fraction of rows whose argmax class equals the label>}.
    """
    logits, labels = pred
    predicted_classes = np.argmax(logits, axis=-1)
    result = accuracy_metric.compute(predictions=predicted_classes, references=labels)
    return {"accuracy": result["accuracy"]}

In [None]:
## Data collator for padding
collator = DataCollatorWithPadding(tok)

In [None]:
# Assemble the Trainer: the LoRA-wrapped model, batches padded by `collator`,
# training on train_tok and evaluation on val_tok scored by compute_metrics.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collator,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    compute_metrics=compute_metrics,
)

## Training

In [None]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mj-verma5[0m ([33mj-verma5-self[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy
500,1.4267,1.527605,0.333
1000,1.4354,1.561064,0.33
1500,1.3944,1.440115,0.343
2000,1.3873,1.440268,0.332
2500,1.3634,1.438043,0.34
3000,1.3757,1.411107,0.339
3500,1.3628,1.409562,0.365
4000,1.3449,1.41402,0.342
4500,1.3372,1.420563,0.344
5000,1.3635,1.399691,0.369




TrainOutput(global_step=13175, training_loss=1.3528579144414508, metrics={'train_runtime': 32618.6341, 'train_samples_per_second': 12.922, 'train_steps_per_second': 0.404, 'total_flos': 2.905773769551443e+18, 'train_loss': 1.3528579144414508, 'epoch': 5.0})

In [None]:
trainer.evaluate(test_tok)

{'eval_loss': 1.3712871074676514,
 'eval_accuracy': 0.3755,
 'eval_runtime': 64.4444,
 'eval_samples_per_second': 31.034,
 'eval_steps_per_second': 7.759,
 'epoch': 5.0}

In [None]:
test_dataset[0]

{'userId': 238,
 'past_movies': ['Philadelphia (1993)',
  'Crimes of the Heart (1986)',
  'What Ever Happened to Baby Jane? (1962)',
  'Wag the Dog (1997)',
  'Great Santini, The (1979)',
  'Stand and Deliver (1987)',
  'Way We Were, The (1973)',
  'Angels and Insects (1995)',
  'American Buffalo (1996)',
  'Oscar and Lucinda (a.k.a. Oscar & Lucinda) (1997)'],
 'past_movie_ids': '[508, 2738, 3546, 1747, 3135, 3071, 3194, 85, 806, 2801]',
 'candidate': "Clara's Heart (1988):::Drama",
 'movieId': 3714,
 'rating': 3.0}

In [None]:
def predict_label(history, candidate):
    """
    Predict the rating for a candidate movie given a user's watch history.

    The prompt text and the truncation settings mirror the ones used when the
    training data was tokenized, so inference inputs match training inputs.

    Args:
        history (List[str]): The user's past movies, one string per movie.
        candidate (str): The candidate movie to score.

    Returns:
        int: The predicted rating, i.e. the argmax class index plus 1.
    """
    # Build the input text
    user_movies = "\n".join(history)
    text = (
        "Given a user's past watched movies, predict the rating the user will give to the candidate movie. "
        "The ratings vary from 1.0 to 5.0.\n"
        f"{user_movies}\n"
        f"The candidate movie is {candidate}. "
        "The rating given by user is?"
    )
    # Tokenize with the same truncation as training, then move to the model device
    ids = tok(
        text, return_tensors="pt", truncation=True, max_length=MAX_LEN
    ).to(model.device)

    # Inference must run with dropout disabled, whatever mode the model was left in
    if model.training:
        model.eval()

    # Run the model without tracking gradients
    with torch.no_grad():
        logits = model(**ids).logits  # one row of class logits for the single prompt

    # Class indices are zero-based; ratings start at 1
    predicted_idx = int(logits.argmax(dim=-1).item())
    return predicted_idx + 1

In [None]:
predict_label(test_dataset[0]['past_movies'], test_dataset[0]['candidate'])

NameError: name 'ex' is not defined

In [None]:
preds = []
actuals = []
# Iterate over rows directly: looking up test_dataset[i] three times per step
# would rebuild the same row three times.
for row in test_dataset:
    preds.append(predict_label(row['past_movies'], row['candidate']))
    actuals.append(row['rating'])

In [None]:
len(preds), len(actuals)

In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Error of the predicted ratings against the true ratings.
mae = mean_absolute_error(actuals, preds)
rmse = np.sqrt(mean_squared_error(actuals, preds))

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

In [None]:
train_dataset[0]

In [None]:
preds[:20]

In [None]:
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    accuracy_score,
    confusion_matrix
)

# Collapse ratings to a binary target: 1 when the rating is at least 4, else 0.
actuals_binary = [int(rating >= 4) for rating in actuals]
preds_binary = [int(rating >= 4) for rating in preds]

In [None]:
# Classification metrics on the binarized (rating >= 4) targets and predictions.
acc = accuracy_score(actuals_binary, preds_binary)
precision = precision_score(actuals_binary, preds_binary)
recall = recall_score(actuals_binary, preds_binary)
f1 = f1_score(actuals_binary, preds_binary)
# NOTE(review): roc_auc_score receives hard 0/1 predictions here rather than
# continuous scores, so the value reflects a single operating point instead of
# a ranking over thresholds — confirm this is the intended metric.
auc = roc_auc_score(actuals_binary, preds_binary)

In [None]:
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1:        {f1:.4f}")
print(f"AUC:       {auc:.4f}")
print(f"Accuracy:  {acc:.4f}")