In [31]:
import os
import json
import pandas as pd
from ast import literal_eval
import random

### Format Metadata

In [32]:
metadata_path = "../data/movie_metadata.csv"
metadata_df = pd.read_csv(metadata_path)
metadata_df.set_index("id", inplace=True)
metadata_df.head()

Unnamed: 0_level_0,Unnamed: 0.1,Unnamed: 0,title,release_date,revenue,runtime,adult,budget,overview,popularity,poster_path,genres,keywords,year,actors,emotions,director
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
565770,0,0,Blue Beetle,2023-08-16,124818235,128,False,120000000,Recent college grad Jaime Reyes returns home f...,2994.357,/mXLOHHc1Zeuwsl4xYKjKh2280oL.jpg,"['Action', 'Science Fiction', 'Adventure']","armor, superhero, family relationships, family...",2023,"['Xolo Mariduena', 'Bruna Marquezine', 'Susan ...","['excitement', 'pride', 'inspiration']",Ángel Manuel Soto
980489,1,1,Gran Turismo,2023-08-09,114800000,135,False,60000000,The ultimate wish-fulfillment tale of a teenag...,2680.593,/51tqzRtKMMZEYUpSYkrUE7v9ehm.jpg,"['Action', 'Drama', 'Adventure']","based on true story, racing, based on video ga...",2023,"['Archie Madekwe', 'David Harbour', 'Orlando B...","['tension', 'determination', 'hope']",Neill Blomkamp
968051,3,3,The Nun II,2023-09-06,231200000,110,False,38500000,"In 1956 France, a priest is violently murdered...",1692.778,/c9kVD7W8CT5xe4O3hQ7bFWwk68U.jpg,"['Horror', 'Mystery', 'Thriller']","france, bullying, sequel, religion, demon, got...",2023,"['Taissa Farmiga', 'Jonas Bloquet', 'Storm Rei...","['fear', 'suspense', 'dread']",Michael Chaves
615656,4,4,Meg 2: The Trench,2023-08-02,384056482,116,False,129000000,An exploratory dive into the deepest depths of...,1567.273,/4m1Au3YkjqsxF8iwQy0fPYSxE0h.jpg,"['Action', 'Science Fiction', 'Horror']","based on novel or book, sequel, shark, kaiju, ...",2023,"['Jason Statham', 'Wu Jing', 'Shuya Sophia Cai...","['fear', 'adrenaline', 'awe']",Ben Wheatley
762430,5,5,Retribution,2023-08-23,12905464,91,False,20000000,When a mysterious caller puts a bomb under his...,1547.22,/oUmmY7QWWn7OhKlcPOnirHJpP1F.jpg,"['Action', 'Mystery', 'Thriller', 'Crime']","assassination, bomb, remake, divorce, duringcr...",2023,"['Liam Neeson', 'Lilly Aspell', 'Jack Champion...","['anger', 'revenge', 'justice']",Nimród Antal


In [33]:
def format_metadata(row):
    def safe_eval(val):
        try:
            return ", ".join(literal_eval(val)) if isinstance(val, str) else ""
        except:
            return ""
    
    return f"""Title: {row['title']}
            Genres: {safe_eval(row.get('genres', ''))}
            Tags: {row.get('keywords', '')}
            Actors: {safe_eval(row.get('actors', ''))}
            Emotion Tags: {safe_eval(row.get('emotions', ''))}
            Overview: {row.get('overview', '')}"""

### Json/Jsonl Loaders

In [34]:
def load_json_array(filepath):
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
    except Exception as e:
        print(f"Skipping file {filepath} due to error: {e}")
        return []

def load_json_lines(filepath):
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return [json.loads(line.strip()) for line in f if line.strip()]
    except Exception as e:
        print(f"Skipping file {filepath} due to error: {e}")
        return []

### Clarification Batches

In [35]:
clarification_data = []
clarification_dir = "../data/clarification_batches"

for file in os.listdir(clarification_dir):
    if file.endswith(".json"):
        path = os.path.join(clarification_dir, file)
        samples = load_json_array(path)
        for sample in samples:
            clarification_data.append({
                "instruction": sample["instruction"],
                "input": "",
                "output": sample["output"]
            })

In [36]:
clarification_data[0]

{'instruction': 'Looking for something fun to watch tonight',
 'input': '',
 'output': 'Hey! Are you in the mood for a laugh, some action, or maybe a bit of drama? 🎭'}

In [37]:
len(clarification_data)

300

### Direct Recommendation

In [38]:
def convert_direct_dataset(folder_path):
    data = []
    for file in os.listdir(folder_path):
        if file.endswith(".jsonl"):
            movie_id = int(os.path.splitext(file)[0])
            if movie_id not in metadata_df.index:
                continue
            metadata = format_metadata(metadata_df.loc[movie_id])
            path = os.path.join(folder_path, file)
            samples = load_json_lines(path)
            for sample in samples:
                data.append({
                    "instruction": sample["instruction"],
                    "input": metadata,
                    "output": sample["output"]
                })
    return data

simple_direct_dir = "../data/simple_recommendation_batches"
specific_direct_dir = "../data/specific_recommendation_batches"

simple_direct_data = convert_direct_dataset(simple_direct_dir)
specific_direct_data = convert_direct_dataset(specific_direct_dir)

In [39]:
simple_direct_data[0]

{'instruction': 'looking for a scary movie with a killer mask and some mystery',
 'input': "Title: Scream 4\n            Genres: Horror, Mystery\n            Tags: rescue, mask, sheriff, jealousy, victim, knife, self-inflicted injury, sociopath, murder, house party, teen movie, serial killer, author, hospital, slasher, home invasion, self-referential, cousin cousin relationship, phone call, millennials, publicist, fame-seeking, new generation, return to hometown, masked killer, remakes\n            Actors: David Arquette, Neve Campbell, Courteney Cox, Emma Roberts, Hayden Panettiere, Anthony Anderson, Adam Brody, Rory Culkin, Mary McDonnell, Marley Shelton, Alison Brie, Marielle Jaffe, Nico Tortorella, Erik Knudsen, Anna Paquin, Kristen Bell, Lucy Hale, Shenae Grimes-Beech, Britt Robertson, Aimee Teegarden, Roger L. Jackson, Nancy O'Dell, Justin Michael Brandt, Dane Farwell, Gordon Michaels, John Lepard, Kim Adams, Devin Scillian, Mark Aaron Buerkle, Julia Ho, Alexander Punch, Glenda L

In [40]:
specific_direct_data[0]

{'instruction': 'an intense crime thriller with a gritty vibe about a heist gone wrong and a hunt for buried cash, preferably featuring complex characters and lots of tension',
 'input': 'Title: Paydirt\n            Genres: Action, Crime, Thriller\n            Tags: nan\n            Actors: Luke Goss, Val Kilmer, Mike Hatton, Paul Sloan, Nick Vallelonga, Mirtha Michelle, Veronika Bozeman, Murielle Telio, Mercedes Kilmer, Lorin McCraley, Mary Christina Brown, Teodora Djuric, Mark Sherman, Jay Montalvo, Mara Fimbres\n            Emotion Tags: darkness, madness, tension\n            Overview: A parolee teams up with his old crew determined to find a buried bag of cash stolen a decade ago from a DEA bust gone bad, while being tracked by a retired Sheriff.',
 'output': 'If you’re craving a gritty crime thriller packed with tension and a twisted hunt for stolen cash, you’ve gotta check out Paydirt! 💥 Luke Goss and Val Kilmer bring some serious edge as a parolee and his crew digging up a dark

In [41]:
len(simple_direct_data), len(specific_direct_data)

(12363, 2085)

### Multi-turn Dataset

In [42]:
# STEP 4: Process all multi-turn samples
multi_turn_data = []
multi_turn_dir = "../data/multi_turn_recommendation_batches"  

for file in os.listdir(multi_turn_dir):
    if file.endswith(".jsonl"):
        try:
            movie_id = int(os.path.splitext(file)[0])
        except ValueError:
            print(f"Skipping invalid filename: {file}")
            continue

        if movie_id not in metadata_df.index:
            print(f"Skipping unknown movie ID: {movie_id}")
            continue

        metadata = format_metadata(metadata_df.loc[movie_id])
        file_path = os.path.join(multi_turn_dir, file)
        samples = load_json_lines(file_path)

        for sample in samples:
            convo = sample.get("conversation", [])
            if not convo or len(convo) < 2:
                continue

            # Flatten turns except last assistant reply
            turns = [f"{'User' if t['role'] == 'user' else 'Assistant'}: {t['content']}" for t in convo[:-1]]
            instruction = "\n".join(turns)
            output = convo[-1]["content"]

            multi_turn_data.append({
                "instruction": instruction,
                "input": metadata,
                "output": output
            })

In [43]:
multi_turn_data[0]

{'instruction': "User: I'm in the mood for a scary movie with a strong mystery plot.\nAssistant: Do you prefer a classic slasher vibe or something with a modern twist?\nUser: I'd love a movie that brings back some nostalgia but also feels fresh and has a killer mystery.",
 'input': "Title: Scream 4\n            Genres: Horror, Mystery\n            Tags: rescue, mask, sheriff, jealousy, victim, knife, self-inflicted injury, sociopath, murder, house party, teen movie, serial killer, author, hospital, slasher, home invasion, self-referential, cousin cousin relationship, phone call, millennials, publicist, fame-seeking, new generation, return to hometown, masked killer, remakes\n            Actors: David Arquette, Neve Campbell, Courteney Cox, Emma Roberts, Hayden Panettiere, Anthony Anderson, Adam Brody, Rory Culkin, Mary McDonnell, Marley Shelton, Alison Brie, Marielle Jaffe, Nico Tortorella, Erik Knudsen, Anna Paquin, Kristen Bell, Lucy Hale, Shenae Grimes-Beech, Britt Robertson, Aimee 

In [44]:
len(multi_turn_data), len(os.listdir(multi_turn_dir))

(4502, 4506)

### Sample and Format

In [45]:
random.seed(42)  # For reproducibility

# Sample 10K from your full sets
clar_subset = clarification_data  # all 300
simple_subset = random.sample(simple_direct_data, 5000)
spec_subset = random.sample(specific_direct_data, 1500)
multi_subset = random.sample(multi_turn_data, 3200)

# Combine
quantization_samples = clar_subset + simple_subset + spec_subset + multi_subset

In [46]:
huggingface_format_data = []

for example in quantization_samples:
    user_msg = example["instruction"]
    if example["input"]:
        user_msg += "\n\n" + example["input"]

    huggingface_format_data.append({
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": example["output"]}
        ]
    })

In [47]:
with open("../data/llama3_quant_10k_hf_chat.jsonl", "w", encoding="utf-8") as f:
    for item in huggingface_format_data:
        json.dump(item, f, ensure_ascii=False)
        f.write("\n")