In [1]:
import os
from huggingface_hub import login
from datasets import load_dataset
import pandas as pd
from mlx_lm import load, generate

In [2]:
# Authenticate this session with the Hugging Face Hub. The call passes no
# token, and the recorded output shows an interactive widget, so this cell
# waits for manual input.
# NOTE(review): an unattended "Run All" will presumably stall here - confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Dataset preparation

In [3]:
# Fetch the resume-summarization dataset and bind its three splits by name.
dataset = load_dataset("jbeiroa/resume-summarization-dataset")
train, val, test = (dataset[split_name] for split_name in ("train", "validation", "test"))

In [19]:
# Chunking parameters, measured in tokenizer tokens: they are used as slice
# bounds over the encoded token list in split_resume_into_chunks.
CHUNK_SIZE = 1500  # maximum number of tokens per chunk
OVERLAP = 250  # tokens shared between the end of one chunk and the start of the next

from transformers import AutoTokenizer

# Tokenizer used to cut resumes into chunks. It is loaded from the same
# checkpoint id that the fine-tuning command passes as --model.
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

def split_resume_into_chunks(resume_text, chunk_size=CHUNK_SIZE, overlap=OVERLAP):
    """Split a resume into overlapping text chunks bounded by token count.

    The text is tokenized with the module-level ``tokenizer``, cut into
    windows of at most ``chunk_size`` tokens where consecutive windows share
    ``overlap`` tokens, and each window is decoded back to text.

    Args:
        resume_text: Full resume text to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens repeated between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of decoded chunk strings in document order. Text that encodes
        to zero tokens yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` would make the window
            stand still or move backwards (previously an endless loop).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of tokens")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    # Encode without special tokens: otherwise the BOS marker the tokenizer
    # prepends is decoded back into the first chunk as literal text.
    tokens = tokenizer.encode(resume_text, add_special_tokens=False)
    stride = chunk_size - overlap

    chunks = []
    start = 0
    while start < len(tokens):
        end = start + chunk_size
        chunks.append(tokenizer.decode(tokens[start:end]))
        # Stop once the window has reached the end of the text; advancing
        # again would emit a tail chunk made only of already-covered tokens.
        if end >= len(tokens):
            break
        start += stride
    return chunks

In [20]:
# Instruction text placed in front of each resume chunk when prompts are
# built; it ends with a blank line so the chunk starts in its own paragraph.
PROMPT_TEMPLATE = (
    "Summarize the following resume in 3-4 sentences, "
    "focusing on key skills, experience, and education."
    "\n\n"
)

In [59]:
def convert_split(split):
    """Turn a dataset split into a flat list of prompt/response records.

    Each example's ``"resume"`` is cut into chunks with
    ``split_resume_into_chunks``; every chunk becomes one record whose
    ``"prompt"`` is ``PROMPT_TEMPLATE`` followed by the chunk and whose
    ``"response"`` is the example's full ``"summary"``.

    Args:
        split: Iterable of mappings with ``"resume"`` and ``"summary"`` keys.

    Returns:
        A list of ``{"prompt": ..., "response": ...}`` dicts, ordered by
        example and then by chunk position.
    """
    return [
        {"prompt": PROMPT_TEMPLATE + piece, "response": record["summary"]}
        for record in split
        for piece in split_resume_into_chunks(record["resume"])
    ]

In [60]:
train_mlx = convert_split(train)
val_mlx = convert_split(val)
test_mlx = convert_split(test)

In [61]:
import json

def save_jsonl(data, filename):
    """Write ``data`` to ``filename`` in JSON Lines format.

    Args:
        data: Iterable of JSON-serializable items.
        filename: Path of the file to create or overwrite.

    Each item is serialized on its own line, followed by a newline.
    """
    with open(filename, "w") as out:
        for record in data:
            print(json.dumps(record), file=out)

# Write the three splits where the fine-tuning command looks for them.
# The directory is resolved from the current user's home instead of a
# hard-coded /Users/<name> path, so it matches the `~/Code/...` path passed
# to mlx_lm.lora, and it is created up front so the writes cannot fail on a
# missing folder.
FINETUNING_DATA_DIR = os.path.expanduser("~/Code/thereisnohr/data/finetuning")
os.makedirs(FINETUNING_DATA_DIR, exist_ok=True)

# File names are unchanged: train.jsonl, valid.jsonl, test.jsonl.
save_jsonl(train_mlx, os.path.join(FINETUNING_DATA_DIR, "train.jsonl"))
save_jsonl(val_mlx, os.path.join(FINETUNING_DATA_DIR, "valid.jsonl"))
save_jsonl(test_mlx, os.path.join(FINETUNING_DATA_DIR, "test.jsonl"))

# Finetuning

In [21]:
# Short alias -> model identifier for each candidate base model.
# NOTE(review): not referenced by the other visible cells; the training
# command spells out the TinyLlama id directly.
models_to_finetune = dict(
    [
        ("phi2", "microsoft/phi-2"),
        ("tinyllama", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"),
        ("gemma2b", "mlx-community/gemma-2-2b-it"),
        ("llama3.2", "mlx-community/Llama-3.2-1B-Instruct-MLXTuned"),
    ]
)

In [None]:
# Run the mlx_lm.lora command-line tool on the TinyLlama chat checkpoint.
# Both --train and --test are passed in this single invocation.
# --data and --adapter-path use `~`, so they resolve against the home
# directory of whoever runs the cell.
# NOTE(review): --data presumably must be the folder holding the train.jsonl,
# valid.jsonl and test.jsonl files written by save_jsonl - confirm that the
# two locations agree on this machine.
!mlx_lm.lora \
    --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
    --train \
    --data ~/Code/thereisnohr/data/finetuning \
    --adapter-path ~/Code/thereisnohr/adapters/tinyllama \
    --batch-size 2 \
    --num-layers 4 \
    --grad-checkpoint \
    --test