In [None]:
%%capture
# Install Unsloth from PyPI first, then replace the package itself with the
# current GitHub version. The second install uses --no-deps, so the
# dependencies resolved by the first install are left in place.
# (%%capture hides the output of this cell.)
!pip install unsloth
# Get the latest version of Unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
import os

from google.colab import drive

# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Create the project directory on Drive (no error if it already exists).
project_dir = '/content/drive/MyDrive/ID2223_Lab2'
os.makedirs(project_dir, exist_ok=True)

print(f"✅ 项目目录: {project_dir}")

Mounted at /content/drive
✅ 项目目录: /content/drive/MyDrive/ID2223_Lab2


In [None]:
from unsloth import FastLanguageModel
import torch

# Loading options, kept as notebook-level names because later cells reuse
# max_seq_length.
#   dtype = None        -> auto-detect
#   load_in_4bit = True -> use 4-bit quantization to save memory
max_seq_length, dtype, load_in_4bit = 2048, None, True

# Load the Llama-3.2 1B Instruct model (the 1B variant is smaller and
# trains faster).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

print("模型加载成功！")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

模型加载成功！


In [None]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Wrap the loaded model with LoRA adapters (rank 16, alpha 16, no dropout,
# no bias terms, fixed random state for repeatability).
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

print("LoRA适配器添加成功！")

Unsloth 2025.11.3 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


LoRA适配器添加成功！


In [None]:
from unsloth.chat_templates import get_chat_template, standardize_sharegpt
from datasets import load_dataset

# Attach the "llama-3.1" chat template to the tokenizer.
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

def formatting_prompts_func(examples):
    """Render a batch of conversations into chat-template text.

    Args:
        examples: A batch mapping with a "conversations" entry holding one
            conversation per example.

    Returns:
        A dict with a single "text" key: one rendered string per
        conversation, in input order. Rendering uses the notebook-level
        `tokenizer` with tokenization and the generation prompt disabled.
    """
    texts = []
    for convo in examples["conversations"]:
        rendered = tokenizer.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=False,
        )
        texts.append(rendered)
    return {"text": texts}

# Load the FineTome dataset (the dataset required by the lab), standardize
# its conversation format, then add the rendered "text" column.
print("正在下载数据集...")
raw_dataset = load_dataset("mlabonne/FineTome-100k", split="train")
dataset = standardize_sharegpt(raw_dataset).map(
    formatting_prompts_func,
    batched=True,
)

print(f"数据集准备完成！共 {len(dataset)} 条数据")

正在下载数据集...


README.md:   0%|          | 0.00/982 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

Unsloth: Standardizing formats (num_proc=2):   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

数据集准备完成！共 100000 条数据


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
import os

# ⭐️ Checkpoints are written straight to Google Drive.
checkpoint_dir = "/content/drive/MyDrive/ID2223_Lab2/checkpoints"
# Created here but not referenced by the trainer below.
output_dir = "/content/drive/MyDrive/ID2223_Lab2/outputs"

os.makedirs(checkpoint_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

# Single source of truth for the run length: used both by the progress
# message and by TrainingArguments, so the two cannot drift apart.
max_steps = 60


def _checkpoint_step(path):
    """Return the step number of a `checkpoint-<step>` path, or -1 if absent."""
    suffix = os.path.basename(path).rsplit("-", 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


# Find the checkpoint to resume from. The newest one is chosen by its step
# number rather than by filesystem ctime, because ctime on a Drive mount is
# not guaranteed to follow training order.
resume_from_checkpoint = None
checkpoints = [
    os.path.join(checkpoint_dir, entry)
    for entry in os.listdir(checkpoint_dir)
    if entry.startswith("checkpoint-")
    and _checkpoint_step(entry) >= 0
    and os.path.isdir(os.path.join(checkpoint_dir, entry))
]
if checkpoints:
    resume_from_checkpoint = max(checkpoints, key=_checkpoint_step)
    step_num = _checkpoint_step(resume_from_checkpoint)
    print(f"🔄 从checkpoint恢复: {resume_from_checkpoint}")
    # Show current progress against the real step budget.
    print(f"📊 当前进度: 步骤 {step_num}/{max_steps}")
    if step_num >= max_steps:
        # Resuming at or past max_steps leaves essentially nothing to train.
        print(
            f"⚠️ checkpoint步骤 {step_num} 已达到或超过 max_steps={max_steps}，"
            "继续训练几乎不会产生新的更新；请增大 max_steps 或删除旧的checkpoint。"
        )

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = max_steps,
        learning_rate = 2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = checkpoint_dir,  # save directly to Drive
        save_strategy = "steps",
        save_steps = 100,
        report_to = "none",
    ),
)

# resume_from_checkpoint is None on a fresh run, which starts from scratch.
trainer_stats = trainer.train(resume_from_checkpoint=resume_from_checkpoint)

🔄 从checkpoint恢复: /content/drive/MyDrive/ID2223_Lab2/checkpoints/checkpoint-62
📊 当前进度: 步骤 62/1000


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/100000 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)
	save_steps: 100 (from args) != 50 (from trainer_state.json)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
63,0.9095


In [None]:
import os
from getpass import getpass

# Option 1: save the adapter and tokenizer locally (temporary).
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

# Option 2: push to the Hugging Face Hub (recommended).
hf_username = "kkkkkkatherine"  # replace with your username

# Never hard-code an access token in a notebook: read it from the HF_TOKEN
# environment variable, or prompt for it without echoing.
# NOTE(review): a token was previously committed in this cell; it must be
# revoked in the Hugging Face account settings and replaced.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

# Both uploads go to the same repository.
hub_repo_id = f"{hf_username}/llama-3.2-1b-finetome"

model.push_to_hub(hub_repo_id, token=hf_token)
tokenizer.push_to_hub(hub_repo_id, token=hf_token)

print("✅ 模型已上传到Hugging Face！")

README.md:   0%|          | 0.00/619 [00:00<?, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   0%|          | 23.3kB / 45.1MB            

Saved model to https://huggingface.co/kkkkkkatherine/llama-3.2-1b-finetome


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...mpnxkdrbpp/tokenizer.json: 100%|##########| 17.2MB / 17.2MB            

No files have been modified since last commit. Skipping to prevent empty commit.


✅ 模型已上传到Hugging Face！


In [None]:
from transformers import TextStreamer

# Put the model into inference mode via Unsloth's helper.
FastLanguageModel.for_inference(model)

messages = [{"role": "user", "content": "What is machine learning?"}]

# Render the chat with a generation prompt, tokenize it to a PyTorch
# tensor, and move that tensor to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Stream generated tokens as they are produced, skipping the prompt text.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
)

Machine learning is a subset of artificial intelligence (AI) that involves training algorithms to make predictions or decisions based on data. It's a way to improve the accuracy of computer systems by using patterns and relationships in the data to make predictions. Machine learning can be used to classify objects into different categories, recognize images, or predict the outcome of a given situation.

There are several types of machine learning, including supervised learning, unsupervised learning, and reinforcement learning. Supervised learning is the most common type, where the algorithm is trained on labeled data to learn patterns and make predictions. Unsupervised learning is used to find patterns in unlabeled


In [None]:
# Cell 1: Load the base model, apply the LoRA adapter, merge, and save
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model_id = "unsloth/Llama-3.2-1B-Instruct"
adapter_repo_id = "kkkkkkatherine/llama-3.2-1b-finetome"
merged_dir = "/content/merged_model"

print("Loading base model and LoRA adapter from HuggingFace...")

# Base model in float16.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    dtype=torch.float16,
    device_map="auto",
)

# LoRA adapter applied on top of the base model.
model = PeftModel.from_pretrained(base_model, adapter_repo_id)

# The tokenizer is taken from the base model repository.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Fold the adapter weights into the base weights and drop the PEFT wrapper.
print("Merging LoRA weights into base model...")
model = model.merge_and_unload()

# Write the merged model and the tokenizer to the same directory.
print("Saving merged model...")
for artifact in (model, tokenizer):
    artifact.save_pretrained(merged_dir)

print("Merged model saved successfully!")

Loading base model and LoRA adapter from HuggingFace...
Merging LoRA weights into base model...
Saving merged model...
Merged model saved successfully!


In [None]:
# Cell 2: Download llama.cpp
# Fetches a fresh shallow copy of the llama.cpp repository into
# /content/llama.cpp.
import os

# Work from /content so the relative paths below resolve there.
os.chdir('/content')

print("Downloading llama.cpp...")
# Remove any previous checkout so the clone below starts clean.
!rm -rf llama.cpp
# --depth=1 fetches only the latest commit.
!git clone --depth=1 https://github.com/ggerganov/llama.cpp.git

# NOTE(review): this message is printed even if the clone above failed.
print("llama.cpp downloaded successfully!")

Downloading llama.cpp...
Cloning into 'llama.cpp'...
remote: Enumerating objects: 2137, done.[K
remote: Counting objects: 100% (2137/2137), done.[K
remote: Compressing objects: 100% (1630/1630), done.[K
remote: Total 2137 (delta 478), reused 1574 (delta 434), pack-reused 0 (from 0)[K
Receiving objects: 100% (2137/2137), 25.74 MiB | 4.98 MiB/s, done.
Resolving deltas: 100% (478/478), done.
llama.cpp downloaded successfully!


In [None]:
# Cell 3: Install dependencies
# Installs the system build tools and the Python packages listed below.
print("Installing dependencies...")

# Install build tools (cmake and the compiler toolchain); -qq keeps apt quiet.
!apt-get update -qq
!apt-get install -y -qq cmake build-essential

# Install Python packages
# NOTE(review): presumably these are what the GGUF conversion script needs —
# confirm against llama.cpp's own requirements file. Versions are not pinned.
!pip install -q gguf numpy sentencepiece

print("Dependencies installed!")

Installing dependencies...
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Dependencies installed!


In [None]:
# Cell 4: Build llama.cpp with CMake
# Configures and compiles the checkout in /content/llama.cpp, then lists the
# llama-quantize binary to confirm it was produced.
import os

# Run the build from inside the repository checkout.
os.chdir('/content/llama.cpp')

print("Building llama.cpp with CMake...")
print("This may take 3-5 minutes...\n")

# Configure build (Release mode, build tree in ./build)
!cmake -B build -DCMAKE_BUILD_TYPE=Release

# Build (using 8 parallel jobs)
!cmake --build build --config Release -j 8

# NOTE(review): "Build completed!" is printed regardless of the build's exit
# status; the `ls` below is the actual check that the tool exists.
print("\nBuild completed!")
print("Checking for quantize tool...")
!ls -lh /content/llama.cpp/build/bin/llama-quantize

Building llama.cpp with CMake...
This may take 3-5 minutes...

-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
[0mCMAKE_BUILD_TYPE=Release[0m
-- Found Git: /usr/bin/git (found version "2.34.1")
-- The ASM compiler identification is GNU
-- Found assembler: /usr/bin/cc
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- GGML_SYSTEM_ARCH: x86
-- Including CPU backend
-- Found OpenMP_C: -fopenmp (found version "4.5")
-- Found OpenMP_CXX:

In [None]:
# Cell 5: Convert to FP16 GGUF
# Runs llama.cpp's convert_hf_to_gguf.py on /content/merged_model and writes
# /content/model_fp16.gguf, then reports the size of that file.
import os

os.chdir('/content')

print("Converting merged model to FP16 GGUF format...")
print("This may take 5-10 minutes...\n")

# --outtype f16 requests FP16 output for the GGUF file.
!python3 /content/llama.cpp/convert_hf_to_gguf.py \
    /content/merged_model \
    --outfile /content/model_fp16.gguf \
    --outtype f16

# Check file size (bytes -> MB); this raises if the conversion above did not
# produce the output file.
fp16_size = os.path.getsize('/content/model_fp16.gguf') / (1024 * 1024)
print(f"\nFP16 GGUF created: {fp16_size:.2f} MB")

Converting merged model to FP16 GGUF format...
This may take 5-10 minutes...

INFO:hf-to-gguf:Loading model: merged_model
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:hf-to-gguf:gguf: indexing model part 'model.safetensors'
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {32}
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> F16, shape = {2048, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.float16 --> F32, shape = {2048}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.float16 --> F16, shape = {8192, 2048}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.float16 --> F16, shape = {2048, 8192}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.float16 --> F16, shape = {2048, 8192}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.float16 --> F32, shape = {2048}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.flo

In [None]:
# Cell 6: Quantize to Q4_K_M
import os

os.chdir('/content')

print("Quantizing to Q4_K_M format...")
print("This will reduce size by ~70%...\n")

!/content/llama.cpp/build/bin/llama-quantize \
    /content/model_fp16.gguf \
    /content/model.gguf \
    Q4_K_M

# Check results
fp16_size = os.path.getsize('/content/model_fp16.gguf') / (1024 * 1024)
q4_size = os.path.getsize('/content/model.gguf') / (1024 * 1024)

print(f"\nQuantization completed!")
print(f"Original FP16: {fp16_size:.2f} MB")
print(f"Quantized Q4_K_M: {q4_size:.2f} MB")
print(f"Size reduction: {(1-q4_size/fp16_size)*100:.1f}%")

# Clean up temporary FP16 file
print("\nCleaning up temporary FP16 file...")
!rm /content/model_fp16.gguf

print("Done! Final model: /content/model.gguf")

Quantizing to Q4_K_M format...
This will reduce size by ~70%...

main: build = 1 (3f3a4fb)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04.2) 11.4.0 for x86_64-linux-gnu
main: quantizing '/content/model_fp16.gguf' to '/content/model.gguf' as Q4_K_M
llama_model_loader: loaded meta data with 29 key-value pairs and 147 tensors from /content/model_fp16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Merged_Model
llama_model_loader: - kv   3:                         general.size_label str              = 1.2B
llama_model_loader: - kv   4:                          llama.block_count u32              = 16
llama_model_loader: - kv

In [None]:
# Cell 7: Test Q4_K_M model
print("Testing Q4_K_M model...")

!pip install -q llama-cpp-python

from llama_cpp import Llama

# Load model
print("Loading model...")
llm = Llama(
    model_path="/content/model.gguf",
    n_ctx=2048,
    n_threads=4,
    n_gpu_layers=0,
    verbose=False,
)

# Test inference
print("Running test inference...\n")
test_prompt = "User: What is machine learning?\nAssistant:"

response = llm(
    test_prompt,
    max_tokens=100,
    temperature=0.7,
    stop=["User:", "\n\n"],
)

print("Test Response:")
print(response['choices'][0]['text'].strip())
print("\nModel is working correctly!")

Testing Q4_K_M model...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
Loading model...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Running test inference...

Test Response:
Machine learning is a subset of artificial intelligence (AI) that involves training algorithms to learn patterns and make predictions based on data. Unlike traditional AI, which relies on programming and manual data analysis, machine learning algorithms can learn from data, recognize patterns, and make decisions without being explicitly programmed. This enables machines to improve their performance over time, leading to advancements in fields such as computer vision, natural language processing, and predictive analytics.

Model is working correctly!


In [None]:
# Cell 8: Create new repo and upload to HuggingFace
import os
from getpass import getpass

from huggingface_hub import HfApi, create_repo

print("Creating new repository and uploading model...")

api = HfApi()
# Never hard-code an access token in a notebook: read it from the HF_TOKEN
# environment variable, or prompt for it without echoing.
# NOTE(review): a token was previously committed in this cell; it must be
# revoked in the Hugging Face account settings and replaced.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")
new_repo_id = "kkkkkkatherine/llama-3.2-1b-finetome-gguf"

# Step 1: Create the repository.
# exist_ok=True already makes an existing repository a non-error, so any
# exception raised here is a real failure (bad token, network, permissions)
# and is allowed to propagate instead of being swallowed before the upload.
print(f"Creating repository: {new_repo_id}")
create_repo(
    repo_id=new_repo_id,
    token=token,
    repo_type="model",
    exist_ok=True,
)
print("Repository created successfully!")

# Step 2: Upload the GGUF file
print(f"\nUploading model.gguf to {new_repo_id}...")
api.upload_file(
    path_or_fileobj="/content/model.gguf",
    path_in_repo="model.gguf",
    repo_id=new_repo_id,
    token=token,
)

print("\nUpload successful!")
print(f"Model URL: https://huggingface.co/{new_repo_id}/blob/main/model.gguf")
print("\nYou now have two repos:")
print("  1. Original (PyTorch): kkkkkkatherine/llama-3.2-1b-finetome")
print(f"  2. GGUF version: {new_repo_id}")

Creating new repository and uploading model...
Creating repository: kkkkkkatherine/llama-3.2-1b-finetome-gguf
Repository created successfully!

Uploading model.gguf to kkkkkkatherine/llama-3.2-1b-finetome-gguf...

Upload successful!
Model URL: https://huggingface.co/kkkkkkatherine/llama-3.2-1b-finetome-gguf/blob/main/model.gguf

You now have two repos:
  1. Original (PyTorch): kkkkkkatherine/llama-3.2-1b-finetome
  2. GGUF version: kkkkkkatherine/llama-3.2-1b-finetome-gguf


In [None]:
# Cell 9: Backup to Google Drive (optional)
import os
import shutil

from google.colab import drive

# Mount Drive if not already mounted.
# The check looks for MyDrive rather than /content/drive itself, because the
# bare mount point may exist as a plain directory while Drive is not mounted;
# in that case the "backup" would land on the local disk.
if not os.path.isdir('/content/drive/MyDrive'):
    print("Mounting Google Drive...")
    drive.mount('/content/drive')

# Copy file to Drive
drive_path = '/content/drive/MyDrive/ID2223_Lab2/model.gguf'
os.makedirs(os.path.dirname(drive_path), exist_ok=True)

print("Copying to Google Drive...")
# shutil raises on failure (missing source, no space, ...), so the success
# message below is only printed when the copy really happened. A shell `cp`
# failure would have been ignored.
shutil.copyfile('/content/model.gguf', drive_path)

print(f"Backup saved to: {drive_path}")

Copying to Google Drive...
Backup saved to: /content/drive/MyDrive/ID2223_Lab2/model.gguf


In [None]:
# Cell 10: Final summary
# Prints where the quantized model lives and what to do next.
import os

print("="*60)
print("GGUF CONVERSION COMPLETE!")
print("="*60)

# File info (bytes -> MB)
file_size = os.path.getsize('/content/model.gguf') / (1024 * 1024)

print("\nFinal Model Details:")
print("  Format: GGUF Q4_K_M")
print(f"  Size: {file_size:.2f} MB")
print("  Location: /content/model.gguf")
# The GGUF file was uploaded to the "-gguf" repository; the repository
# without that suffix holds the LoRA adapter, not this file.
print("  HuggingFace: kkkkkkatherine/llama-3.2-1b-finetome-gguf")

print("\nNext Steps:")
print("  1. Model is ready to use")
print("  2. Update your app.py on HuggingFace Spaces")
print("  3. Upload model.gguf to your Space")

print("\n" + "="*60)

GGUF CONVERSION COMPLETE!

Final Model Details:
  Format: GGUF Q4_K_M
  Size: 770.28 MB
  Location: /content/model.gguf
  HuggingFace: kkkkkkatherine/llama-3.2-1b-finetome

Next Steps:
  1. Model is ready to use
  2. Update your app.py on HuggingFace Spaces
  3. Upload model.gguf to your Space

