# Fine-tune a small LM on a customer-support dataset (Local MacBook)

This notebook is adapted to run on a local MacBook (CPU or Apple MPS). It uses a small model (`distilgpt2`) by default and is conservative with batch sizes/epochs so it can run locally for demonstration. If the public dataset is unavailable the notebook falls back to a tiny synthetic dataset.

### How to use
1. (Optional) In the first code cell uncomment the `!pip install` line to install dependencies.
2. Adjust settings in the `User settings` cell (model, dataset, epochs, batch_size).
3. Run cells top-to-bottom. Training on CPU/MPS is slow; expect longer runtimes.


In [20]:
%pip install --upgrade transformers datasets accelerate peft bitsandbytes trl

Note: you may need to restart the kernel to use updated packages.


In [21]:
import os
# Prevent tokenizers parallelism warnings & deadlocks
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

import torch
# If using MPS or CPU, pin_memory should be False to avoid warnings
# We'll set a flag used in TrainingArguments below
use_pin_memory = False if (not torch.cuda.is_available()) else True

In [22]:
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)

# Device selection (robust)
if torch.cuda.is_available():
    device = 'cuda'
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

# Choose dtype depending on device to avoid unsupported dtypes on CPU
if device == 'cuda' or device == 'mps':
    model_dtype = torch.float16
else:
    model_dtype = torch.float32

print(f'Using device: {device} (model dtype={model_dtype})')


Using device: mps (model dtype=torch.float16)


In [23]:
# ----------------- User settings (adjust before run) -----------------
MODEL_ID = os.environ.get('MODEL_ID', 'microsoft/Phi-3-mini-4k-instruct') # CHANGED
DATASET_ID = os.environ.get('DATASET_ID', 'bitext/Bitext-customer-support-llm-chatbot-training-dataset')
OUTPUT_DIR = os.environ.get('OUTPUT_DIR', './local_ft_output') # CHANGED OUTPUT DIR to avoid mixing checkpoints
EPOCHS = int(os.environ.get('EPOCHS', '3'))
BATCH_SIZE = int(os.environ.get('BATCH_SIZE', '1')) # Adjusted BATCH_SIZE for larger model
MAX_LENGTH = int(os.environ.get('MAX_LENGTH', '256')) # Increased length for modern model
USE_LORA = os.environ.get('USE_LORA', 'true').lower() in ('1', 'true', 'yes')

print('Settings:')
print(f' MODEL_ID={MODEL_ID}')
print(f' DATASET_ID={DATASET_ID}')
print(f' OUTPUT_DIR={OUTPUT_DIR}')
print(f' EPOCHS={EPOCHS}, BATCH_SIZE={BATCH_SIZE}, MAX_LENGTH={MAX_LENGTH}, USE_LORA={USE_LORA}')
# --------------------------------------------------------------------

Settings:
 MODEL_ID=microsoft/Phi-3-mini-4k-instruct
 DATASET_ID=bitext/Bitext-customer-support-llm-chatbot-training-dataset
 OUTPUT_DIR=./local_ft_output
 EPOCHS=3, BATCH_SIZE=1, MAX_LENGTH=256, USE_LORA=True


In [24]:
def safe_load_customer_dataset(dataset_id):
    try:
        ds = load_dataset(dataset_id)
        if isinstance(ds, dict) and 'train' in ds:
            return ds['train']
        return ds
    except Exception as e:
        print(f'Could not load dataset {dataset_id}: {e}')
        print('Falling back to a tiny synthetic customer support dataset for demo.')
        samples = [
            {'customer': "My order hasn't arrived, it's been 10 days.", 'agent': "I'm sorry. Can you share your order id?"},
            {'customer': 'I was charged twice for the same order.', 'agent': "I can help. Please share the transaction id."},
            {'customer': 'How do I return an item?', 'agent': "You can start a return from your orders page."},
        ]
        return Dataset.from_list(samples)

def build_prompt(row):
    if 'customer' in row and 'agent' in row:
        return f"Human: {row['customer']}\nAssistant: {row['agent']}\n"
    if 'input' in row and 'output' in row:
        return f"Human: {row['input']}\nAssistant: {row['output']}\n"
    if 'text' in row:
        return row['text'] + "\n"
    return str(row)

print('Helper functions defined')

Helper functions defined


In [25]:
raw_ds = safe_load_customer_dataset(DATASET_ID)
print(f'Loaded dataset size: {len(raw_ds)} (showing first 2 examples)')
for i,ex in enumerate(raw_ds[:2]):
    print('\n--- example', i, '---')
    print(ex)

# Map to text prompts
if isinstance(raw_ds[0], dict):
    def map_to_prompt(example):
        return {'text': build_prompt(example)}
    ds = raw_ds.map(map_to_prompt)
else:
    ds = raw_ds.map(lambda x: {'text': str(x)})

# Split and reduce for local run
if len(ds) > 2000:
    ds = ds.train_test_split(test_size=0.05, shuffle=True, seed=42)
    train_ds = ds['train'].select(range(4096))
    eval_ds = ds['test'].select(range(128))
else:
    split = ds.train_test_split(test_size=0.1, seed=42)
    train_ds = split['train']
    eval_ds = split['test']

print(f'Train size: {len(train_ds)}, Eval size: {len(eval_ds)}')

Loaded dataset size: 26872 (showing first 2 examples)

--- example 0 ---
flags

--- example 1 ---
instruction

--- example 2 ---
category

--- example 3 ---
intent

--- example 4 ---
response
Train size: 4096, Eval size: 128


In [26]:
# ----------------- Tokenization and Model Loading -----------------
# Load tokenizer and model with a dtype appropriate for the device

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# ensure pad token exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '<|padding|>'})

# Load the model. Avoid forcing float16 on CPU.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=model_dtype,
    low_cpu_mem_usage=True
)

# Resize token embeddings if we added special tokens
model.resize_token_embeddings(len(tokenizer))

print('Model and tokenizer loaded.')

# Tokenization helper that uses the 'text' column (created earlier by map_to_prompt)
def tokenize_for_lm(examples):
    """Robust tokenizer that normalizes nested text structures to flat strings."""
    raw_texts = examples.get('text')
    normalized = []
    for t in raw_texts:
        if t is None:
            normalized.append("")
        elif isinstance(t, str):
            normalized.append(t)
        elif isinstance(t, (list, tuple)):
            # Flatten nested lists or tokens into a single string
            flat = []
            for x in t:
                if isinstance(x, (list, tuple)):
                    flat += [str(y) for y in x]
                else:
                    flat.append(str(x))
            normalized.append(" ".join(flat))
        else:
            normalized.append(str(t))

    # Now safely tokenize normalized strings
    outputs = tokenizer(
        normalized,
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length"  # ensures consistent tensor shapes
    )
    outputs["labels"] = outputs["input_ids"].copy()
    return outputs


print('Tokenizing datasets (this may take a bit)...')
train_tok = train_ds.map(tokenize_for_lm, batched=True, remove_columns=[c for c in train_ds.column_names if c!='text'])
eval_tok = eval_ds.map(tokenize_for_lm, batched=True, remove_columns=[c for c in eval_ds.column_names if c!='text'])

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
print('Tokenization complete. Examples:')
for i in range(min(2, len(train_tok))):
    print(train_tok[i]['text'])

Loading checkpoint shards: 100%|██████████| 2/2 [00:23<00:00, 11.72s/it]


Model and tokenizer loaded.
Tokenizing datasets (this may take a bit)...


Map: 100%|██████████| 128/128 [00:00<00:00, 2301.66 examples/s]

Tokenization complete. Examples:
{'flags': 'BILQ', 'instruction': 'what do i have to do to recover my profile key', 'category': 'ACCOUNT', 'intent': 'recover_password', 'response': 'Indeed! I\'m here to assist you in recovering your profile key. Let\'s tackle this together:\n\n1. Access our platform\'s "{{Login Page URL}}" to initiate the recovery process.\n2. Locate the "{{Forgot Password}}" option and select it to proceed.\n3. You will be prompted to provide your email address associated with your profile. Kindly input the relevant information.\n4. Keep an eye on your inbox as you should receive an email containing detailed instructions on how to recover your profile key. In case you don\'t find it in your primary inbox, please check your spam or other folders.\n5. Follow the instructions provided in the email carefully to regain access to your profile key.\n\nShould you face any hurdles or experience delays in receiving the email, don\'t hesitate to reach out. Remember, I\'m here to




In [27]:
# ----------------- PEFT / LoRA Configuration (optional) -----------------
use_peft = False
if USE_LORA:
    try:
        from peft import LoraConfig, get_peft_model
        use_peft = True
        lora_config = LoraConfig(
            r=8,
            lora_alpha=16,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], 
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )
        model = get_peft_model(model, lora_config)
        print('Applied LoRA adapter to model (PEFT).')
    except Exception as e:
        print(f'PEFT/LoRA unavailable or failed: {e}. Continuing without LoRA.')

print('use_peft =', use_peft)


Applied LoRA adapter to model (PEFT).
use_peft = True


In [28]:
# ----------------- Training Arguments and Trainer -----------------
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_steps=10,
    save_total_limit=2,
    fp16=(device in ('cuda', 'mps')),
    remove_unused_columns=False,
    push_to_hub=False,
    dataloader_num_workers=2,
    dataloader_pin_memory=use_pin_memory
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    data_collator=data_collator,
)

print('Starting training... (this may be slow on CPU/MPS)')

Starting training... (this may be slow on CPU/MPS)


In [29]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Load tokenizer from the OUTPUT_DIR first (this contains any special tokens you saved)
gen_tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR, trust_remote_code=True)

# Load model with ignore_mismatched_sizes so from_pretrained won't error on mismatch.
gen_model = AutoModelForCausalLM.from_pretrained(
    OUTPUT_DIR,
    trust_remote_code=True,
    torch_dtype=model_dtype,
    low_cpu_mem_usage=True,
    ignore_mismatched_sizes=True  # allow different embedding sizes to be adjusted
)

# Ensure model embedding matrix matches tokenizer size
gen_model.resize_token_embeddings(len(gen_tokenizer))

# Move to device and build pipeline
gen_model.to(device)
device_id = 0 if device == 'cuda' else -1
generator = pipeline('text-generation', model=gen_model, tokenizer=gen_tokenizer, device=device_id)

prompt = "Human: I haven't received my refund after 10 days. What can I do?\nAssistant:"
print('Generating an example response:')
print(generator(prompt, max_length=200, do_sample=True, top_p=0.9, num_return_sequences=1)[0]['generated_text'])


RuntimeError: Error(s) in loading state_dict for GPT2LMHeadModel:
	size mismatch for transformer.wte.weight: copying a param with shape torch.Size([50258, 768]) from checkpoint, the shape in current model is torch.Size([50257, 768]).
	size mismatch for lm_head.weight: copying a param with shape torch.Size([50258, 768]) from checkpoint, the shape in current model is torch.Size([50257, 768]).