In [None]:
# Install necessary libraries
!pip install transformers peft datasets torch torchvision open-clip-torch decord moviepy

In [None]:

import numpy as np
import open_clip
import torch
from datasets import load_dataset
from decord import VideoReader, cpu
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from PIL import Image
from torchvision import transforms
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Setup: identifiers, paths and quantization options used by the cells below.
# NOTE(review): confirm this repository id exists on the Hub and is loadable
# as a causal LM — TODO verify.
BASE_MODEL = "meta-llama/Llama-3.2-11B"
OUTPUT_DIR = "./llama-movie-recommender-multimodal"
DEVICE_MAP = "auto"
# NOTE(review): IMAGE_MODEL is not referenced by any other visible cell; the
# CLIP encoder is created through open_clip with its own model name.
IMAGE_MODEL = "openai/clip-vit-base-patch32"
# Keyword options for 4-bit NF4 loading with float16 compute.
QUANTIZATION_CONFIG = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Quantization options are passed as an explicit BitsAndBytesConfig through
# `quantization_config`; spreading `load_in_4bit` / `bnb_4bit_*` as loose
# keyword arguments into `from_pretrained` is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map=DEVICE_MAP,
    quantization_config=BitsAndBytesConfig(**QUANTIZATION_CONFIG),
)

# Load CLIP model for image and video features
# `create_model_and_transforms` returns (model, train_transform, val_transform);
# unpacking into two names raises ValueError. The validation transform is the
# deterministic one meant for inference, so that is what `preprocess` holds.
clip_model, _, preprocess = open_clip.create_model_and_transforms(
    model_name="ViT-B-32", pretrained="openai"
)
# Inference only: put the encoder in eval mode.
clip_model.eval()

# Video preprocessing
# Frames are fed to the CLIP image encoder, so they are normalised with the
# statistics the OpenAI CLIP weights were trained with, not the ImageNet ones.
CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
video_preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=CLIP_MEAN, std=CLIP_STD),
])
    

In [None]:

# Define functions for feature extraction
def extract_image_features(image_path):
    """Encode a single image file with the CLIP image encoder.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        NumPy array with the output of ``clip_model.encode_image`` for a batch
        containing this one image (leading batch axis of size 1).
    """
    # The context manager closes the file handle once the pixels are decoded;
    # `convert` returns an independent in-memory copy.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    # Follow whatever device the encoder currently lives on.
    device = next(clip_model.parameters()).device
    image_tensor = preprocess(image).unsqueeze(0).to(device)
    with torch.no_grad():
        image_features = clip_model.encode_image(image_tensor)
    # `.cpu()` is a no-op on CPU and required before `.numpy()` on GPU.
    return image_features.cpu().numpy()

def extract_trailer_features(trailer_path, max_frames=None):
    """Encode a video with CLIP and average the per-frame embeddings.

    Args:
        trailer_path: Path of the video file handed to ``decord.VideoReader``.
        max_frames: Optional cap on the number of frames to encode. When set
            and smaller than the video length, frames are sampled at evenly
            spaced positions; ``None`` (default) encodes every frame.

    Returns:
        NumPy array: mean over frames of ``clip_model.encode_image`` outputs,
        keeping the leading batch axis of size 1.

    Raises:
        ValueError: If ``max_frames`` is smaller than 1 or the video has no
            frames.
    """
    if max_frames is not None and max_frames < 1:
        raise ValueError(f"max_frames must be >= 1, got {max_frames}")

    vr = VideoReader(trailer_path, ctx=cpu(0))
    total_frames = len(vr)
    if total_frames == 0:
        # Averaging an empty list would silently produce NaN features.
        raise ValueError(f"No frames could be decoded from trailer: {trailer_path}")

    if max_frames is None or max_frames >= total_frames:
        frame_indices = range(total_frames)
    else:
        frame_indices = np.linspace(0, total_frames - 1, num=max_frames, dtype=int)

    # Follow whatever device the encoder currently lives on instead of
    # assuming "cuda".
    device = next(clip_model.parameters()).device
    frame_features = []
    with torch.no_grad():
        for frame_index in frame_indices:
            # torchvision's Resize does not accept raw ndarrays; wrap the
            # decoded frame in a PIL image first.
            frame = Image.fromarray(vr[int(frame_index)].asnumpy())
            image = video_preprocess(frame).unsqueeze(0).to(device)
            features = clip_model.encode_image(image)
            frame_features.append(features.cpu().numpy())
    video_features = np.mean(frame_features, axis=0)
    return video_features
    

In [None]:

# Load dataset and add multimodal features
# NOTE(review): the cells below read the columns "poster_path", "trailer_path",
# "user_preferences" and "recommended_movies" from every row; confirm that the
# "movielens" dataset actually provides them — TODO verify.
print("Loading Dataset...")
dataset = load_dataset("movielens", split="train")

# Attach poster, trailer and combined feature arrays to each dataset row
def add_multimodal_features(row):
    """Compute poster/trailer features for one row and store them on it.

    Reads ``row["poster_path"]`` and ``row["trailer_path"]``; writes
    ``poster_features``, ``trailer_features`` and ``movie_features``. The
    combined array joins the two along NumPy's default axis (axis 0).
    """
    poster_vec = extract_image_features(row["poster_path"])
    row["poster_features"] = poster_vec
    trailer_vec = extract_trailer_features(row["trailer_path"])
    row["trailer_features"] = trailer_vec
    row["movie_features"] = np.concatenate([poster_vec, trailer_vec])
    return row

# Run the feature extraction over the whole dataset with 4 worker processes.
dataset = dataset.map(add_multimodal_features, num_proc=4)

# Build the training prompt for fine-tuning
def format_multimodal(row):
    """Write a four-line prompt into ``row["text"]`` and return the row.

    The lines are, in order: a fixed instruction, the user's preferences, the
    string form of ``row["movie_features"]``, and the recommended movies.
    """
    # NOTE(review): the feature values are embedded via their string form, so
    # prompt length grows with the feature size — confirm this is intended.
    prompt_lines = [
        "You are a multimodal movie recommender system. Suggest movies based on user preferences, posters, and trailers.",
        f"User Preferences: {row['user_preferences']}",
        f"Movie Features: {row['movie_features']}",
        f"Recommended Movies: {row['recommended_movies']}",
    ]
    row["text"] = "\n".join(prompt_lines)
    return row

# Add the "text" prompt column to every row, using 4 worker processes.
dataset = dataset.map(format_multimodal, num_proc=4)
    

In [None]:

# Configure LoRA
print("Setting up LoRA...")
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# The base model is loaded 4-bit quantized. PEFT's recommended preparation
# step for k-bit training (freezing base weights, making the model ready for
# gradient checkpointing) has to run before the adapters are attached.
model = prepare_model_for_kbit_training(model)
# NOTE: re-running this cell wraps the already-wrapped model again; restart
# the kernel (or reload the base model) before executing it a second time.
model = get_peft_model(model, lora_config)
    

In [None]:

# Tokenize the prompts, hold out an evaluation split, and build the Trainer
MAX_SEQ_LENGTH = 1024   # token budget per example
EVAL_FRACTION = 0.1     # share of rows held out for evaluation
SPLIT_SEED = 42         # makes the train/eval split reproducible

# Padding a batch needs a pad token; fall back to EOS when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_text(batch):
    """Tokenize the "text" column of a batch, truncated to MAX_SEQ_LENGTH."""
    # NOTE(review): truncation cuts from the end of the prompt; verify that the
    # "Recommended Movies" part still fits inside MAX_SEQ_LENGTH tokens.
    return tokenizer(batch["text"], truncation=True, max_length=MAX_SEQ_LENGTH)


# The Trainer consumes token ids, not raw strings, so the original columns are
# replaced by the tokenizer output.
tokenized_dataset = dataset.map(
    tokenize_text,
    batched=True,
    remove_columns=dataset.column_names,
)

# `evaluation_strategy="steps"` needs an evaluation set; a held-out split also
# makes overfitting visible as diverging train and eval loss.
dataset_splits = tokenized_dataset.train_test_split(
    test_size=EVAL_FRACTION, seed=SPLIT_SEED
)

# Set training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` and the Trainer's `tokenizer` to `processing_class`; adjust
# if the installed version rejects these names.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    num_train_epochs=1,
    learning_rate=2e-4,
    warmup_steps=10,
    fp16=True,
    save_steps=100,
    logging_steps=10,
    logging_dir="./logs",
    evaluation_strategy="steps",
    eval_steps=100,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splits["train"],
    eval_dataset=dataset_splits["test"],
    tokenizer=tokenizer,
    # Causal-LM collator: pads each batch and derives labels from input_ids.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
    

In [None]:

# Start training
# Runs the fine-tuning loop on the `trainer` built in the previous cell; as the
# last expression of the cell, the value returned by `train()` is displayed.
print("Starting Training...")
trainer.train()
    

In [None]:

# Persist the fine-tuned model and its tokenizer into OUTPUT_DIR
print("Saving Model...")
for artifact in (model, tokenizer):
    # Model first, tokenizer second, both into the same directory.
    artifact.save_pretrained(OUTPUT_DIR)

print(f"Fine-tuned model saved to {OUTPUT_DIR}")
    

## The model turned out to be overfitted. Fixing it!