In [None]:
# %pip installs into the environment of the running kernel (a bare `!pip` may
# hit a different interpreter). Pinned to the version this notebook was last run with.
%pip install -q datasets==3.3.2

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [None]:
# Explicit %pip magic (does not rely on automagic being enabled); pinned to the
# version this notebook was last run with.
%pip install -q trl==0.15.2

Collecting trl
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.15.2-py3-none-any.whl (318 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.15.2


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset

# Training data: a local JSON-lines file (change the path if it lives elsewhere).
dataset_path = "lamini_fraud_detection.jsonl"
dataset = load_dataset(path="json", data_files=dataset_path)

# Tokenizer that ships with the base checkpoint being fine-tuned.
model_name = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ✅ Tokenization function
def tokenize_function(example):
    """Tokenize one example, or one batch of examples, for causal-LM training.

    ``Dataset.map`` hands this function a mapping of plain values when called
    with ``batched=False`` and a mapping of equal-length lists (one entry per
    row) when called with ``batched=True``. Both are accepted: a list-valued
    ``"instruction"`` entry is treated as a batch, anything else as one row.

    Args:
        example: Mapping with ``"instruction"``, ``"input"`` and ``"output"``
            entries. A row's value may be a string, a list of strings (joined
            with single spaces) or ``None`` (treated as empty text).

    Returns:
        The tokenizer output for ``"<instruction> <input> <output>"``,
        truncated and padded to 256 tokens; one encoding per row for a batch.
    """
    def _as_text(value):
        # Normalise a single row's field to a plain string.
        if value is None:
            return ""
        if isinstance(value, (list, tuple)):
            return " ".join(str(part) for part in value)
        return str(value)

    def _row_text(instruction, input_text, output_text):
        return _as_text(instruction) + " " + _as_text(input_text) + " " + _as_text(output_text)

    instructions = example["instruction"]
    inputs = example["input"]
    outputs = example["output"]

    if isinstance(instructions, (list, tuple)):
        # Batched call: build one text per row. Joining the column lists
        # themselves would merge the whole batch into a single training text.
        texts = [
            _row_text(instruction, input_text, output_text)
            for instruction, input_text, output_text in zip(instructions, inputs, outputs)
        ]
    else:
        texts = _row_text(instructions, inputs, outputs)

    return tokenizer(texts, truncation=True, padding="max_length", max_length=256)


# ✅ Apply tokenization
# Rows are mapped one at a time so every call sees the plain field values of a
# single example rather than lists holding a whole batch of rows.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=False,
    remove_columns=["instruction", "input", "output"],
)

# ✅ Split into train & eval (90% train, 10% eval)
# A fixed seed keeps the split identical across kernel restarts.
split_dataset = tokenized_dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]


Map:   0%|          | 0/119028 [00:00<?, ? examples/s]

In [None]:

# 4-bit NF4 quantisation with double quantisation; matmuls are computed in fp16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", quantization_config=bnb_config)

# ✅ Disable caching during training (the KV cache is only useful for generation)
model.config.use_cache = False

# ✅ Make the quantised base model trainable before adapters are attached.
# See the PEFT quantization guide for what this helper changes on the model.
model = prepare_model_for_kbit_training(model)

# ✅ Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# NOTE: the checkpoint's rotary / position settings are intentionally left as
# loaded. Overwriting `rope_theta` or shrinking `max_position_embeddings` on the
# config after the model has been built makes the config disagree with the
# loaded weights; the sequence length is already capped by the tokenizer.

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from transformers import DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./gemma-lora-finetuned",
    run_name="gemma_finetuned_run",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=100,
    save_strategy="epoch",
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy`
    fp16=True,  # Enable mixed precision for better performance
    report_to="none"  # Change to ["wandb"] if using Weights & Biases
)

# The tokenized rows only carry input_ids / attention_mask. With mlm=False this
# collator adds `labels` copied from input_ids (padding positions ignored), which
# the causal-LM head needs in order to return a loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# ✅ Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# ✅ Start training
trainer.train()



RuntimeError: The size of tensor a (8) must match the size of tensor b (256) at non-singleton dimension 3

In [None]:
# Install required packages (run this cell once).
# %pip targets the environment of the running kernel.
# TODO: pin versions once a known-good combination is settled.
%pip install -q transformers datasets accelerate peft bitsandbytes trl torch

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig
)
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model

# --- Step 1: Load the Dataset ---
# The complete dataset lives in a single JSONL file, registered as the "train" split.
dataset = load_dataset(path="json", data_files=dict(train="lamini_fraud_detection.jsonl"))

# Hold out 10% of the rows for evaluation; the seed makes the split repeatable.
split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

# --- Step 2: Load the Tokenizer ---
model_name = "google/gemma-2b"  # Update as needed
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# --- Step 3: Define the Tokenization Function ---
def tokenize_function(example):
    """Tokenize a single example (use with ``Dataset.map(..., batched=False)``).

    Args:
        example: Mapping that may contain ``"instruction"``, ``"input"`` and
            ``"output"``. Each value is expected to be a string; a list is
            joined with single spaces, and a missing or ``None`` value is
            treated as empty text.

    Returns:
        Dict with ``"input_ids"`` and ``"attention_mask"`` for the text
        ``"<instruction> <input> <output>"`` (outer whitespace stripped),
        truncated and padded to 256 tokens.
    """
    def _field(name):
        value = example.get(name)
        # `.get(name, "")` would still return None for a key that is present
        # with a null value, and formatting None yields the literal "None".
        if value is None:
            return ""
        # If a field is accidentally a list, join it.
        if isinstance(value, list):
            return " ".join(value)
        return value

    instruction = _field("instruction")
    input_text = _field("input")
    output_text = _field("output")

    # Combine into one full text.
    full_text = f"{instruction} {input_text} {output_text}".strip()

    # Tokenize the text.
    tokens = tokenizer(
        full_text,
        truncation=True,
        padding="max_length",
        max_length=256,
    )

    # Keep only the two columns that are fed to the trainer.
    return {
        "input_ids": tokens["input_ids"],
        "attention_mask": tokens["attention_mask"]
    }

# --- Step 4: Tokenize the Dataset (Process Each Example Individually) ---
# Use batched=False so that each sample is processed individually.
text_columns = ["instruction", "input", "output"]
tokenized_train = train_dataset.map(tokenize_function, batched=False, remove_columns=text_columns)
tokenized_eval = eval_dataset.map(tokenize_function, batched=False, remove_columns=text_columns)

# Sanity-check one sample without dumping the full padded token lists.
sample = tokenized_train[0]
assert len(sample["input_ids"]) == len(sample["attention_mask"]) == 256
print({name: len(sample[name]) for name in ("input_ids", "attention_mask")})
# Expected output: {'input_ids': 256, 'attention_mask': 256}

# --- Step 5: Load the Model with 4-bit Quantization ---
import gc

from peft import prepare_model_for_kbit_training

# Best-effort release of GPU memory still held by a model/trainer created in an
# earlier cell. When the GPU is already occupied, device_map="auto" places some
# layers on the CPU, and loading a quantized model then fails with
# "Some modules are dispatched on the CPU or the disk".
# If that error persists, restart the kernel and run only this cell.
model = trainer = None
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config
)

# The KV cache is only useful for generation; turn it off for training.
model.config.use_cache = False

# Make the quantised base model trainable before adapters are attached
# (see the PEFT quantization guide for what this helper changes).
model = prepare_model_for_kbit_training(model)

# --- Step 6: Apply LoRA for Fine-Tuning ---
lora_config = LoraConfig(
    r=8,                # LoRA rank
    lora_alpha=16,      # Scaling factor
    lora_dropout=0.05,  # Dropout probability
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # Verify trainable parameters

# --- Step 7: Set Up Training Arguments ---
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # effective train batch of 4 * 4 = 16 per device
    learning_rate=2e-4,
    num_train_epochs=1,
    logging_steps=500,
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy`
    save_strategy="no",     # no checkpoints are written during training
    save_total_limit=2,     # only takes effect once save_strategy is changed from "no"
    fp16=True,
    report_to="none"  # Change to ["wandb"] if you want to log to Weights & Biases
)

# --- Step 8: Initialize the Trainer ---
# The datasets are already tokenized (input_ids / attention_mask columns).
# `processing_class` is the current name of the deprecated `tokenizer` argument
# and requires trl >= 0.12.
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    args=training_args,
    processing_class=tokenizer,
)

# --- Step 9: Start Training ---
trainer.train()




{'input_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 99211, 1013, 573, 13678, 603, 73233, 3482, 611, 573, 2764, 4691, 235265, 1993, 235248, 235315, 235283, 235304, 235276, 235283, 235284, 235276, 235274, 235321, 696, 235248, 235274, 235304, 235292, 235310, 235274, 235269, 476, 13678, 576, 697, 235274, 235276, 235265, 235304, 235276, 729, 1644, 2177, 35471, 37793, 575, 5571, 190908, 235269, 22302, 235265, 714, 36818, 9701, 3409, 591, 129283, 235275, 729, 235248, 235308, 235

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 