In [6]:
import sys, platform

# Record the interpreter and OS behind this kernel so the run can be reproduced.
for label, value in (
    ("Python:", sys.version),
    ("Executable:", sys.executable),
    ("Platform:", platform.platform()),
):
    print(label, value)

Python: 3.13.7 (v3.13.7:bcee1c32211, Aug 14 2025, 19:10:51) [Clang 16.0.0 (clang-1600.0.26.6)]
Executable: /usr/local/bin/python3.13
Platform: macOS-15.1-arm64-arm-64bit-Mach-O


In [7]:
%pip install -U "torch>=2.3,<3" "transformers>=4.44,<5" "datasets>=2.20,<3" "accelerate>=0.33,<1" huggingface_hub sentencepiece

Collecting datasets<3,>=2.20
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting accelerate<1,>=0.33
  Downloading accelerate-0.34.2-py3-none-any.whl.metadata (19 kB)
Collecting huggingface_hub
  Using cached huggingface_hub-1.1.2-py3-none-any.whl.metadata (13 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets<3,>=2.20)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting fsspec>=0.8.5 (from torch<3,>=2.3)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets<3,>=2.20)
  Downloading multiprocess-0.70.17-py313-none-any.whl.metadata (7.2 kB)
  Using cached multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m17.9

In [1]:
### Step 3: Pick the compute device and the matching dtype
import os, torch

# Turn off the hf_transfer download path so the optional package is not required.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"

# Apple-silicon GPU (MPS) in half precision when available, otherwise CPU in full precision.
dev = "mps" if torch.backends.mps.is_available() else "cpu"
dtype = torch.float16 if dev == "mps" else torch.float32
device_map = {"": dev}  # empty-string key maps the whole model onto one device

print("Device:", dev, "| dtype:", dtype)


Device: mps | dtype: torch.float16


In [13]:
### Step 4: Download the model snapshot
from huggingface_hub import snapshot_download

MODEL_ID  = "HuggingFaceTB/SmolLM2-135M"
CACHE_DIR = "./_hf_cache_smol"   # used as `local_dir`: files are materialized in this folder

# Fetch only weights, configs and tokenizer files.
# `resume_download` is deliberately not passed: huggingface_hub deprecated it as a
# no-op (interrupted downloads are always resumed when possible) and announced its
# removal for 1.0, so keeping it only produces a warning or, later, a TypeError.
local_model_path = snapshot_download(
    repo_id=MODEL_ID,
    local_dir=CACHE_DIR,
    allow_patterns=["*.safetensors","*.bin","*.json","*.model","tokenizer*","*merges*"],
    max_workers=8,
)



Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

In [14]:
### Step 5: Load tokenizer and model
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(local_model_path, use_fast=True)
if tokenizer.pad_token is None:
    # Reuse EOS as the padding token when the tokenizer does not define one.
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    local_model_path,
    torch_dtype=dtype,          # fp16 on MPS, fp32 on CPU
    low_cpu_mem_usage=True,
    device_map=device_map,
    attn_implementation="sdpa",
)

# Only grow the embedding table when the tokenizer has more entries than the model.
# An unconditional resize would also shrink checkpoints whose embedding matrix is
# intentionally larger than the tokenizer vocabulary.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

# Display the (possibly resized) input embedding module as the cell output.
model.get_input_embeddings()


Embedding(49152, 576)

In [31]:
import re
from transformers import StoppingCriteria, StoppingCriteriaList
import torch

# tiny stop: cut if model starts a new turn marker
class StopOnMarkers(StoppingCriteria):
    """Stop generation once any marker string appears in the generated text.

    Args:
        markers: Iterable of substrings that end generation when they appear.
        tokenizer: Tokenizer used to decode token ids back to text.
        prompt_len: Number of leading tokens (the prompt) to skip before
            searching. Defaults to 0, which searches the whole sequence and
            matches the previous behaviour.
    """

    def __init__(self, markers, tokenizer, prompt_len: int = 0):
        # Materialize once so a generator argument is not exhausted after the first call.
        self.markers = list(markers)
        self.tokenizer = tokenizer
        self.prompt_len = prompt_len

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
        """Return one stop flag per sequence in the batch (True = stop that row)."""
        # Decode only what follows the prompt: a marker that is part of the prompt
        # must not end generation right after the first new token.
        texts = self.tokenizer.batch_decode(
            input_ids[:, self.prompt_len:], skip_special_tokens=True
        )
        hits = [any(marker in text for marker in self.markers) for text in texts]
        # Build the flags on the same device as input_ids so they can be combined
        # with the flags of other stopping criteria.
        return torch.tensor(hits, dtype=torch.bool, device=input_ids.device)

In [36]:
def _first_sentence(generated: str) -> str:
    """Return the first non-empty line of `generated`, cut after its first sentence if it has one."""
    import re

    # Strip before cutting at the newline: a generation that starts with "\n"
    # would otherwise be reduced to an empty string.
    stripped = generated.strip()
    if not stripped:
        return ""
    first_line = stripped.split("\n", 1)[0].strip()
    # Shortest prefix ending in . ! or ? that is followed by whitespace or end of line.
    m = re.search(r'(.+?[.!?])(\s|$)', first_line)
    return m.group(1).strip() if m else first_line


def qa_complete(prompt: str) -> str:
    """Answer `prompt` with at most one short sentence using greedy decoding.

    Relies on the notebook-level `tokenizer` and `model`.

    Args:
        prompt: The instruction or question to answer.

    Returns:
        The first sentence (or first line, if no sentence terminator is found)
        of the model's continuation; an empty string if nothing was generated.
    """
    # Use tags the base model hasn't memorized as a dialogue format
    text = f"Instruction: {prompt}\nAnswer:"
    enc = tokenizer(text, return_tensors="pt")
    model_device = next(model.parameters()).device
    enc = {k: v.to(model_device) for k, v in enc.items()}
    prompt_len = enc["input_ids"].shape[1]

    # Block common turn/openers the model drifts into
    bad_phrases = ["\nQ:", "\nA:", "\nB:", "\nB)", "\nC:", "\nC)", "Q:", "B:", "B)", "C:", "C)"]
    encoded_phrases = (tokenizer.encode(p, add_special_tokens=False) for p in bad_phrases)
    # generate() rejects empty token lists, so drop any phrase that encodes to nothing.
    bad_words_ids = [ids for ids in encoded_phrases if ids]

    with torch.inference_mode():
        out = model.generate(
            **enc,
            max_new_tokens=16,          # tight to avoid rambling
            do_sample=False,            # greedy for stability
            use_cache=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
            bad_words_ids=bad_words_ids,
        )

    # The returned sequence starts with the prompt tokens. Slice them off rather than
    # splitting decoded text on "Answer:", which breaks when the prompt contains it.
    generated = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
    return _first_sentence(generated)

In [37]:
# Smoke-test the helper with one conversational and one arithmetic prompt.
for label, question in (
    ("hello →", "Say hello in one short sentence."),
    ("math  →", "what's 2 times 2?"),
):
    print(label, qa_complete(question))

hello → I am a student.
math  → 4


In [18]:
### Clear GPU memory
import gc
import torch

# Collect unreachable Python objects first so any tensors they hold are released.
gc.collect()

# Then return unoccupied cached MPS memory to the system. Tensors that are still
# referenced (for example a loaded model that has not been deleted) stay allocated.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()
