In [1]:
import os
from huggingface_hub import login
from datasets import load_dataset
import pandas as pd
from mlx_lm import load, generate

In [2]:
# Authenticate this kernel session with the Hugging Face Hub via huggingface_hub.login().
# NOTE(review): this call is interactive (it renders a login widget), so an unattended
# "Run All" will stop here; presumably needed for the model/dataset downloads below — confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Dataset preparation

In [3]:
# Fetch the resume summarization dataset and keep one handle per split.
dataset = load_dataset("jbeiroa/resume-summarization-dataset")
train, val, test = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [4]:
# Chunking window used by split_resume_into_chunks, measured in tokenizer tokens.
CHUNK_SIZE = 1500  # maximum number of tokens in one chunk
OVERLAP = 250  # tokens repeated at the start of the next chunk

# Instruction text placed in front of every resume chunk to build the prompt.
PROMPT_TEMPLATE = "Summarize the following resume in 3-4 sentences, focusing on key skills, experience, and education.\n\n"

from transformers import AutoTokenizer
import json

# Hugging Face model ids whose tokenizers are used to build the datasets.
# Entries are paired BY POSITION with `out_dirs` below (the export loop zips them).
models_to_finetune = [
    "microsoft/Phi-3.5-mini-instruct",
    "mlx-community/TinyLlama-1.1B-Chat-v1.0-mlx",
    "mlx-community/gemma-2-2b-it",
    "mlx-community/Llama-3.2-1B-Instruct-MLXTuned"
]

# Root folder for the generated datasets. Built from the current user's home
# directory instead of a hard-coded /Users/<name>/ prefix, so it resolves to the
# same tree as the `~/Code/thereisnohr/data/...` paths used by the fine-tuning cells.
DATA_ROOT = os.path.join(os.path.expanduser("~"), "Code", "thereisnohr", "data")

# One output directory per model, same order as `models_to_finetune`.
# The trailing "" component keeps a trailing path separator on every entry, so
# code that appends a file name directly to the directory string keeps working.
out_dirs = [
    os.path.join(DATA_ROOT, folder, "")
    for folder in (
        "phi-finetuning",
        "tinyllama-finetuning",
        "gemma-finetuning",
        "llama-finetuning",
    )
]

def split_resume_into_chunks(resume_text, tokenizer, chunk_size=CHUNK_SIZE, overlap=OVERLAP):
    """Split a resume into overlapping text chunks bounded by token count.

    The text is encoded with ``tokenizer.encode``, sliced into windows of at
    most ``chunk_size`` tokens that start every ``chunk_size - overlap`` tokens,
    and each window is turned back into text with ``tokenizer.decode``.

    Args:
        resume_text: The resume text to split.
        tokenizer: Object exposing ``encode(text)`` returning a sliceable
            token sequence and ``decode(tokens)`` returning a string.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of decoded chunk strings, in document order. Empty when the
        tokenizer produces no tokens.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is outside
            ``[0, chunk_size)``. An overlap that is not smaller than the chunk
            size gives a non-positive stride, so the window would never advance.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    # NOTE(review): depending on the tokenizer, encode() may add special tokens
    # (e.g. a BOS marker) that decode() then renders into the chunk text —
    # confirm whether they should be excluded.
    tokens = tokenizer.encode(resume_text)
    stride = chunk_size - overlap

    chunks = []
    for start in range(0, len(tokens), stride):
        end = start + chunk_size
        chunks.append(tokenizer.decode(tokens[start:end]))
        # Once a window reaches the end of the tokens, any further window would
        # only repeat the overlap region already contained in this chunk.
        if end >= len(tokens):
            break
    return chunks

def convert_split(split, tokenizer):
    """Build prompt/completion records for every example of a dataset split.

    Each example's ``"resume"`` text is chunked with
    ``split_resume_into_chunks``; every chunk produces one record whose
    ``"prompt"`` is ``PROMPT_TEMPLATE`` followed by the chunk and whose
    ``"completion"`` is that example's ``"summary"``.

    Args:
        split: Iterable of examples indexable by ``"resume"`` and ``"summary"``.
        tokenizer: Tokenizer forwarded to ``split_resume_into_chunks``.

    Returns:
        A flat list of ``{"prompt": ..., "completion": ...}`` dicts, ordered by
        example and then by chunk.
    """
    return [
        {"prompt": PROMPT_TEMPLATE + piece, "completion": record["summary"]}
        for record in split
        for piece in split_resume_into_chunks(record["resume"], tokenizer)
    ]

def save_jsonl(data, filename):
    """Write ``data`` to ``filename`` as JSON Lines (one JSON document per line).

    Any missing parent directory of ``filename`` is created first, and an
    existing file is overwritten.

    Args:
        data: Iterable of JSON-serialisable items.
        filename: Destination path of the ``.jsonl`` file.

    Raises:
        TypeError: If an item cannot be serialised by ``json.dumps``.
        OSError: If the directory or file cannot be created or written.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        # Without this, writing into a not-yet-existing output folder fails.
        os.makedirs(parent_dir, exist_ok=True)
    # Write-only mode is enough; the encoding is pinned so the result does not
    # depend on the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        for item in data:
            f.write(json.dumps(item) + "\n")

# For every (model, output directory) pair: load that model's tokenizer, chunk
# all three dataset splits with it, then write the resulting records as JSONL.
for model, path in zip(models_to_finetune, out_dirs):
    tokenizer = AutoTokenizer.from_pretrained(model)
    # Make sure the target folder exists before anything is written into it.
    os.makedirs(path, exist_ok=True)

    train_mlx = convert_split(train, tokenizer)
    print(f"Splitting of train data for {model} finished.")
    val_mlx = convert_split(val, tokenizer)
    print(f"Splitting of validation data for {model} finished.")
    test_mlx = convert_split(test, tokenizer)
    print(f"Splitting of test data for {model} finished.")

    # NOTE(review): train/valid/test.jsonl are presumably the file names that
    # mlx_lm.lora looks for inside its --data directory — confirm.
    for file_name, pairs in (
        ("train.jsonl", train_mlx),
        ("valid.jsonl", val_mlx),
        ("test.jsonl", test_mlx),
    ):
        # os.path.join works whether or not `path` ends with a separator.
        save_jsonl(pairs, os.path.join(path, file_name))
        print(f"Saved {file_name} to {path}")

Splitting of train data for microsoft/Phi-3.5-mini-instruct finished.
Splitting of validation data for microsoft/Phi-3.5-mini-instruct finished.
Splitting of test data for microsoft/Phi-3.5-mini-instruct finished.
Saved train.jsonl to /Users/juanbeiroa/Code/thereisnohr/data/phi-finetuning/
Saved valid.jsonl to /Users/juanbeiroa/Code/thereisnohr/data/phi-finetuning/
Saved test.jsonl to /Users/juanbeiroa/Code/thereisnohr/data/phi-finetuning/


Token indices sequence length is longer than the specified maximum sequence length for this model (2800 > 2048). Running this sequence through the model will result in indexing errors


Splitting of train data for mlx-community/TinyLlama-1.1B-Chat-v1.0-mlx finished.
Splitting of validation data for mlx-community/TinyLlama-1.1B-Chat-v1.0-mlx finished.
Splitting of test data for mlx-community/TinyLlama-1.1B-Chat-v1.0-mlx finished.
Saved train.jsonl to /Users/juanbeiroa/Code/thereisnohr/data/tinyllama-finetuning/
Saved valid.jsonl to /Users/juanbeiroa/Code/thereisnohr/data/tinyllama-finetuning/
Saved test.jsonl to /Users/juanbeiroa/Code/thereisnohr/data/tinyllama-finetuning/
Splitting of train data for mlx-community/gemma-2-2b-it finished.
Splitting of validation data for mlx-community/gemma-2-2b-it finished.
Splitting of test data for mlx-community/gemma-2-2b-it finished.
Saved train.jsonl to /Users/juanbeiroa/Code/thereisnohr/data/gemma-finetuning/
Saved valid.jsonl to /Users/juanbeiroa/Code/thereisnohr/data/gemma-finetuning/
Saved test.jsonl to /Users/juanbeiroa/Code/thereisnohr/data/gemma-finetuning/
Splitting of train data for mlx-community/Llama-3.2-1B-Instruct-MLX

# Finetuning

In [None]:
# Run the mlx_lm.lora CLI for the Llama 3.2 1B model, reading data from the
# llama-finetuning directory and using adapters/llama-3.2-1b as the adapter path.
# NOTE(review): --train combined with --test presumably trains and then evaluates
# on the test file — confirm against the mlx_lm documentation.
!mlx_lm.lora \
    --model mlx-community/Llama-3.2-1B-Instruct-MLXTuned \
    --train \
    --data ~/Code/thereisnohr/data/llama-finetuning \
    --adapter-path ~/Code/thereisnohr/adapters/llama-3.2-1b \
    --batch-size 2 \
    --num-layers 4 \
    --test

In [None]:
# Run the mlx_lm.lora CLI for the Gemma 2 2B model, reading data from the
# gemma-finetuning directory and using adapters/gemma as the adapter path.
# NOTE(review): --grad-checkpoint is passed here but not for the 1B-class models,
# presumably to reduce memory use on the larger model — confirm.
!mlx_lm.lora \
    --model mlx-community/gemma-2-2b-it \
    --train \
    --data ~/Code/thereisnohr/data/gemma-finetuning \
    --adapter-path ~/Code/thereisnohr/adapters/gemma \
    --batch-size 2 \
    --num-layers 4 \
    --grad-checkpoint \
    --test

In [None]:
# Run the mlx_lm.lora CLI for the Phi-3.5-mini model, reading data from the
# phi-finetuning directory.
# NOTE(review): the adapter directory is named `phi-2` although the model is
# Phi-3.5-mini-instruct — confirm the name is intentional before reusing the adapters.
!mlx_lm.lora \
    --model microsoft/Phi-3.5-mini-instruct \
    --train \
    --data ~/Code/thereisnohr/data/phi-finetuning \
    --adapter-path ~/Code/thereisnohr/adapters/phi-2 \
    --batch-size 2 \
    --num-layers 4 \
    --grad-checkpoint \
    --test

In [None]:
# Run the mlx_lm.lora CLI for the TinyLlama 1.1B chat model, reading data from the
# tinyllama-finetuning directory and using adapters/tinyllama as the adapter path.
# Same flags as the Llama 3.2 cell (batch size 2, 4 layers, no --grad-checkpoint).
!mlx_lm.lora \
    --model mlx-community/TinyLlama-1.1B-Chat-v1.0-mlx \
    --train \
    --data ~/Code/thereisnohr/data/tinyllama-finetuning \
    --adapter-path ~/Code/thereisnohr/adapters/tinyllama \
    --batch-size 2 \
    --num-layers 4 \
    --test