In [None]:
# Google Colab link:
# https://colab.research.google.com/drive/1K68jDQhKwvy4IL_X48AuBC9HrieOgHDz#scrollTo=PRj1HrUTaUyV

In [None]:
# Step 1: Clean uninstall
!pip uninstall -y torch torchvision torchaudio transformers accelerate peft datasets numpy numba -q

In [None]:


# Step 2: Install compatible versions (CUDA 11.8)
!pip install --index-url https://download.pytorch.org/whl/cu118 torch==2.1.2+cu118 torchvision==0.16.2+cu118 torchaudio==2.1.2+cu118 -q
!pip install transformers==4.38.2 peft==0.7.1 accelerate==0.30.1 datasets==2.18.0 numpy==1.26.4 numba==0.60.0 -q


In [None]:
# # Install compatible libraries
# !pip uninstall -y torch torchaudio torchvision transformers datasets peft accelerate numpy -q
# !pip install -q torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 transformers==4.41.2 datasets==2.20.0 peft==0.11.1 accelerate==0.31.1 numpy==1.26.4

import json
import torch
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model
from datasets import Dataset
from google.colab import files

# --- Precaution 1: Set Output Directory ---
# Single location used by the rest of the notebook for checkpoints, the final
# adapter/tokenizer files, and the training log. Created up front; an existing
# directory is left as-is.
OUTPUT_DIR = "/content/finetuned_qwen"
os.makedirs(name=OUTPUT_DIR, exist_ok=True)

# --- Precaution 2: Validate Dataset ---
DATA_PATH = "/content/finetune_data.jsonl"

# Lower-cased substrings that identify rows to drop as "index data".
# The SAME tuple drives both detection and filtering, so a row is only removed
# when it matches one of these exact markers.
INDEX_TERMS = ("a/b testing, 359", "accuracy, 22")
# Only the first few lines get per-line diagnostics, to keep the output short.
PREVIEW_LINES = 10
# Below this many lines a warning is printed (training still proceeds).
MIN_RECOMMENDED_LINES = 100


def _is_index_row(raw_line):
    """Return True when `raw_line` is a JSON object whose "text" contains an index marker.

    Malformed JSON, non-object records, and records without a string "text"
    field are never classified as index rows (they return False), so this
    helper cannot raise on bad input.
    """
    try:
        record = json.loads(raw_line.strip())
    except json.JSONDecodeError:
        return False
    if not isinstance(record, dict):
        return False
    text = record.get("text")
    if not isinstance(text, str):
        return False
    lowered = text.lower()
    return any(term in lowered for term in INDEX_TERMS)


def _validate_dataset(path):
    """Check the JSONL file at `path`, drop index rows in place, and return the line count.

    Steps:
      1. Warn when the file has fewer than MIN_RECOMMENDED_LINES lines.
      2. Report malformed JSON / empty "text" for the first PREVIEW_LINES lines.
      3. Scan EVERY line for index markers; if any are found, rewrite the file
         without them.

    Raises:
        FileNotFoundError: if `path` does not exist (propagated from `open`).
    """
    # Read everything first and close the handle before any rewrite happens.
    with open(path, "r", encoding="utf-8") as source:
        lines = source.readlines()

    num_lines = len(lines)
    if num_lines < MIN_RECOMMENDED_LINES:
        print(f"Warning: dataset has only {num_lines} lines. Consider adding more data.")

    for i, line in enumerate(lines[:PREVIEW_LINES]):
        try:
            entry = json.loads(line.strip())
        except json.JSONDecodeError:
            print(f"Error: malformed JSON in line {i+1}")
            continue
        if not isinstance(entry, dict) or not entry.get("text"):
            print(f"Error: empty text field in line {i+1}")

    clean_lines = [line for line in lines if not _is_index_row(line)]
    removed = num_lines - len(clean_lines)
    if removed:
        print(f"Warning: index data detected in {removed} line(s). Filtering now...")
        print("Filtering index data...")
        with open(path, "w", encoding="utf-8") as target:
            target.writelines(clean_lines)
        num_lines = len(clean_lines)
        print(f"Cleaned dataset: {num_lines} lines remain.")

    print(f"Dataset validation complete: {num_lines} lines found.")
    return num_lines


print("Validating dataset...")
try:
    num_lines = _validate_dataset(DATA_PATH)
except FileNotFoundError:
    print("Error: finetune_data.jsonl not found. Uploading now...")
    uploaded = files.upload()
    if not os.path.exists(DATA_PATH):
        raise FileNotFoundError("Upload failed. Please upload finetune_data.jsonl")
    # The freshly uploaded file gets the same checks as a pre-existing one.
    num_lines = _validate_dataset(DATA_PATH)

# --- Precaution 3: Check for Existing Checkpoints ---
# `latest_checkpoint` ends up as the FULL path of the highest-numbered
# "checkpoint-<step>" directory inside OUTPUT_DIR, or None when there is none.
# A full path is required: a bare directory name would be resolved against the
# current working directory when passed to `trainer.train(resume_from_checkpoint=...)`.
latest_checkpoint = None
if os.path.isdir(OUTPUT_DIR):
    checkpoint_steps = {}
    for entry in os.listdir(OUTPUT_DIR):
        prefix, _, step = entry.partition("-")
        entry_path = os.path.join(OUTPUT_DIR, entry)
        # Ignore anything that is not exactly "checkpoint-<digits>" or is not a
        # directory, so stray names such as "checkpoint-tmp" cannot crash int().
        if prefix == "checkpoint" and step.isdigit() and os.path.isdir(entry_path):
            checkpoint_steps[entry_path] = int(step)
    if checkpoint_steps:
        latest_checkpoint = max(checkpoint_steps, key=checkpoint_steps.get)
        print(f"Found checkpoint: {latest_checkpoint}. Will resume training.")

# --- Precaution 4: Load Dataset ---
# Reads DATA_PATH line by line into `data` (a list of dicts) and wraps it in a
# `Dataset`. Blank lines are ignored; malformed JSON and records without a
# non-empty string "text" field are skipped with a warning that names the line,
# because the tokenization step reads example["text"].
print("Loading dataset...")
try:
    # Load JSONL manually to avoid caching issues
    data = []
    with open(DATA_PATH, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                continue  # a blank line is not a record, and not an error either
            try:
                record = json.loads(stripped)
            except json.JSONDecodeError:
                print(f"Warning: skipping malformed JSON line {line_no}")
                continue
            text = record.get("text") if isinstance(record, dict) else None
            if not isinstance(text, str) or not text.strip():
                print(f"Warning: skipping line {line_no} without a usable 'text' field")
                continue
            data.append(record)
    if not data:
        raise ValueError("Dataset is empty. Check finetune_data.jsonl content.")
    dataset = Dataset.from_list(data)
    print(f"Loaded dataset with {len(dataset)} examples.")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# Hold out 10% of the examples for evaluation (fixed seed), unless the dataset
# has 10 or fewer examples — then everything is used for training and
# `eval_dataset` is None.
if len(dataset) <= 10:
    train_dataset, eval_dataset = dataset, None
    print("Warning: dataset too small for validation split. Skipping early stopping.")
else:
    dataset = dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset, eval_dataset = dataset["train"], dataset["test"]

# --- Precaution 5: Load Model and Tokenizer ---
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

# Options for the base-model load, grouped in one place: bfloat16 weights,
# automatic device placement, and reduced CPU memory use while loading.
model_load_options = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
    "low_cpu_mem_usage": True,
}

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_options)
except Exception as load_error:
    # Report a hint, then re-raise so the notebook stops at this cell.
    print(f"Error loading model: {load_error}. Check internet connection or model name.")
    raise load_error

# --- Precaution 6: Tokenize Dataset ---
def tokenize_function(example):
    """Tokenize a batch of examples and build causal-LM labels.

    Args:
        example: mapping with a "text" entry that is either a list of strings
            (the batched form used by `Dataset.map(..., batched=True)`) or a
            single string, which is treated as a batch of one.

    Returns:
        The tokenizer output (each sequence truncated/padded to 512 tokens)
        with an added "labels" entry: a copy of "input_ids" in which every
        padding position (attention_mask == 0) is replaced by -100, so that
        padding is ignored by the loss instead of being learned as a target.
    """
    texts = example["text"]
    if isinstance(texts, str):
        texts = [texts]

    # Plain Python lists are returned (no return_tensors): Dataset.map stores
    # the columns itself, so building torch tensors here is unnecessary.
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding="max_length",
    )

    # For causal language modeling the labels mirror input_ids, except that
    # padded positions are masked out with -100.
    encoded["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded

# Apply tokenize_function in batches to the training split and, when an
# evaluation split exists, to that split as well.
try:
    tokenized_train = train_dataset.map(function=tokenize_function, batched=True)
    if eval_dataset:
        tokenized_eval = eval_dataset.map(function=tokenize_function, batched=True)
except Exception as tokenize_error:
    print(f"Error tokenizing dataset: {tokenize_error}. Check dataset content or memory.")
    raise tokenize_error

# Drop the raw "text" column now that it has been tokenized; the columns
# produced by tokenize_function (including "labels") are kept.
raw_text_columns = ["text"]
tokenized_train = tokenized_train.remove_columns(raw_text_columns)
if eval_dataset:
    tokenized_eval = tokenized_eval.remove_columns(raw_text_columns)

# --- Precaution 7: Configure LoRA ---
# Adapter hyper-parameters collected in one mapping: rank 8, alpha 16,
# dropout 0.1, applied to the "q_proj" and "v_proj" modules for a causal-LM task.
lora_settings = dict(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    task_type="CAUSAL_LM",
)
lora_config = LoraConfig(**lora_settings)

# From here on `model` refers to the PEFT-wrapped model.
model = get_peft_model(model, lora_config)

# --- Precaution 8: Training Arguments ---
import inspect  # local import: only used to probe the TrainingArguments signature

# The evaluation-schedule keyword has different names across transformers
# releases ("evaluation_strategy" in older ones, "eval_strategy" in newer ones).
# Passing the wrong one raises TypeError, so the installed signature is probed
# instead of hard-coding either name.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Mixed-precision flags must agree with the dtype the model was loaded in:
#   * bfloat16 weights -> bf16 autocast, but only when the GPU supports it
#     (otherwise no AMP flag is set and training runs directly in the weights' dtype);
#   * float32 weights  -> fp16 autocast with loss scaling;
#   * anything else    -> no AMP flag.
# Enabling fp16 loss scaling on bfloat16 weights is a known failure mode.
_cuda_available = torch.cuda.is_available()
_model_dtype = getattr(model, "dtype", None)
use_bf16 = bool(
    _cuda_available and _model_dtype == torch.bfloat16 and torch.cuda.is_bf16_supported()
)
use_fp16 = bool(_cuda_available and _model_dtype == torch.float32)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size per device: 2 * 4 = 8
    num_train_epochs=3,
    learning_rate=2e-5,
    save_strategy="epoch",  # must match the evaluation schedule for load_best_model_at_end
    logging_steps=10,
    save_total_limit=2,
    fp16=use_fp16,
    bf16=use_bf16,
    logging_dir="/content/logs",
    report_to="none",
    # Evaluation-dependent options are switched off when there is no eval split.
    load_best_model_at_end=True if eval_dataset else False,
    metric_for_best_model="loss" if eval_dataset else None,
    greater_is_better=False if eval_dataset else None,
    **{eval_strategy_key: "epoch" if eval_dataset else "no"},
)

# --- Precaution 9: Initialize Trainer with Early Stopping ---
# Early stopping is only registered when an evaluation split exists; without
# one the callback list stays empty.
callbacks = (
    [EarlyStoppingCallback(early_stopping_patience=1, early_stopping_threshold=0.01)]
    if eval_dataset
    else []
)

# Arguments for the Trainer, assembled first so the evaluation set choice is
# visible on its own line.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval if eval_dataset else None,
    tokenizer=tokenizer,
    callbacks=callbacks,
)
trainer = Trainer(**trainer_options)

# --- Precaution 10: Resume Training ---
# `latest_checkpoint` is None when no checkpoint was found earlier, in which
# case training starts from scratch; otherwise training resumes from it.
resume_target = latest_checkpoint
try:
    trainer.train(resume_from_checkpoint=resume_target)
except Exception as train_error:
    # Surface a pointer to the log directory, then re-raise to stop the notebook.
    print(f"Error during training: {train_error}. Check logs in /content/logs for details.")
    raise train_error

# --- Precaution 11: Save Final Model ---
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# --- Precaution 12: Save Logs ---
with open(f"{OUTPUT_DIR}/training_log.txt", "w", encoding="utf-8") as f:
    json.dump(trainer.state.log_history, f, indent=2)

# --- Precaution 13: Zip and Download Model ---
!zip -r finetuned_qwen.zip {OUTPUT_DIR}
files.download("finetuned_qwen.zip")
print(f"Model saved to {OUTPUT_DIR}. Download finetuned_qwen.zip for use.")

In [None]:
# Inference check: load the base model, attach the fine-tuned LoRA adapter
# saved in /content/finetuned_qwen, and sample one completion for a test prompt.
# The cell carries its own imports so it also runs on a fresh kernel.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")
model = PeftModel.from_pretrained(model, "/content/finetuned_qwen")
tokenizer = AutoTokenizer.from_pretrained("/content/finetuned_qwen")

# Send the inputs to wherever device_map="auto" placed the model instead of
# assuming a CUDA device is present.
inputs = tokenizer("what is OvO in multiclass classification,do not hellucinate", return_tensors="pt").to(model.device)

# max_length bounds prompt + generated tokens together; sampling uses nucleus
# (top-p) filtering at 0.9.
outputs = model.generate(**inputs, max_length=400, do_sample=True, top_p=0.9)
print(tokenizer.decode(outputs[0]))