In [None]:
!pip install transformers datasets accelerate peft
!pip install bitsandbytes




In [None]:
!pip install -U bitsandbytes



In [None]:
import json

def load_jsonl(filepath):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        filepath: Path of the ``.jsonl`` file; it is opened as UTF-8 text.

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message names the file and the 1-based line number.
    """
    records = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) hold no record;
                # passing them to json.loads would raise.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Same exception type as before, with the location added so a
                # bad row in a large file can be found.
                raise json.JSONDecodeError(
                    f"{filepath}, line {line_number}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return records

# Read the uploaded fraud-detection JSONL file into memory.
data = load_jsonl("lamini_fraud_detection.jsonl")
print(f"Loaded {len(data)} examples.")

# Build one prompt-response pair per record: 'instruction' and 'input' go on
# separate lines followed by an "Answer:" cue, and 'output' is the response.
formatted_data = [
    {
        "prompt": f"{entry['instruction']}\n{entry['input']}\nAnswer:",
        "response": entry["output"],
    }
    for entry in data
]

print(f"Formatted data into {len(formatted_data)} prompt-response pairs.")


Loaded 119028 examples.
Formatted data into 119028 prompt-response pairs.


In [None]:
import os
from google.colab import userdata
# Fetch the Hugging Face access token from the Colab secrets store.
hf_token = userdata.get('HF_TOKEN')

# Report only whether a non-empty token came back; the value itself is never printed.
print(
    "HF token retrieved successfully!"
    if hf_token
    else "HF token not found. Please check your Colab secrets."
)


HF token retrieved successfully!


In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

 # Ensure your HF token is set
model_name = "google/gemma-7b"  # Hugging Face Hub id of the base checkpoint to fine-tune

# Load the tokenizer that belongs to the checkpoint; hf_token comes from the Colab-secrets cell
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

# bitsandbytes settings applied while the weights are loaded (4-bit NF4 storage, FP16 compute)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # Store the weights in 4-bit precision
    bnb_4bit_quant_type="nf4",              # Use the NF4 4-bit data type
    bnb_4bit_use_double_quant=True,         # Nested quantization: also quantizes the quantization constants to save memory
    bnb_4bit_compute_dtype=torch.float16    # Compute in FP16
)

# Load the causal-LM weights with the quantization settings above
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="sequential",  # NOTE(review): presumably fills devices in order, first GPU first; confirm placement on this runtime
    token=hf_token
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
from peft import LoraConfig, get_peft_model

# Imported here so this cell stays runnable on its own.
from peft import prepare_model_for_kbit_training

# Prepare the 4-bit base model for training BEFORE attaching adapters. This is
# the step PEFT documents for k-bit (QLoRA-style) fine-tuning. By default it
# also enables gradient checkpointing, which trades some speed for memory.
model = prepare_model_for_kbit_training(model)

# LoRA hyperparameters; r, lora_alpha and lora_dropout can be tuned.
lora_config = LoraConfig(
    r=8,                   # Rank of the adaptation matrices
    lora_alpha=32,         # Scaling factor
    lora_dropout=0.1,      # Dropout rate for LoRA layers
    bias="none",           # Do not update bias terms
    task_type="CAUSAL_LM"  # Task: Causal Language Modeling
)

# Wrap the prepared, quantized model with the LoRA adapters.
model = get_peft_model(model, lora_config)


In [None]:
import json
from datasets import Dataset

# Function to load JSONL data from a file
# NOTE(review): load_jsonl is also defined in the earlier data-loading cell;
# this later definition shadows it. Consider keeping a single copy.
def load_jsonl(filepath):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        filepath: Path of the ``.jsonl`` file; it is opened as UTF-8 text.

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message names the file and the 1-based line number.
    """
    records = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) hold no record;
                # passing them to json.loads would raise.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Same exception type as before, with the location added so a
                # bad row in a large file can be found.
                raise json.JSONDecodeError(
                    f"{filepath}, line {line_number}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return records

# Load the fraud-detection records from the uploaded JSONL file
data = load_jsonl("lamini_fraud_detection.jsonl")
print(f"Loaded {len(data)} examples.")

# Turn every record into a prompt/response pair: the prompt is the instruction,
# then the input, then an "Answer:" cue; the response is the record's 'output'.
formatted_data = [
    dict(
        prompt=f"{entry['instruction']}\n{entry['input']}\nAnswer:",
        response=entry["output"],
    )
    for entry in data
]

# Wrap the pairs in a Hugging Face Dataset and show the first example
dataset = Dataset.from_list(formatted_data)
print(dataset[0])


Loaded 119028 examples.
{'prompt': 'Determine if the transaction is fraudulent based on the given details.\nOn 2/2/2004 at 09:50, a transaction of $122.08 was made using Online Transaction in  ONLINE, nan. The merchant category code (MCC) was 5712. \nAnswer:', 'response': 'Fraudulent'}


In [None]:
def tokenize_function(examples, max_length=256):
    """Tokenize a batch of prompt/response pairs for causal-LM training.

    Args:
        examples: Batch mapping with parallel "prompt" and "response" lists of
            strings (the shape ``Dataset.map(..., batched=True)`` passes in).
        max_length: Length every sequence is truncated or padded to.

    Returns:
        The tokenizer output with an added "labels" entry. Labels equal
        ``input_ids`` on real tokens and -100 on padding positions.
    """
    # Concatenate prompt and response element-wise
    full_text = [p + " " + r for p, r in zip(examples["prompt"], examples["response"])]
    tokenized = tokenizer(full_text, truncation=True, padding="max_length", max_length=max_length)
    # Labels start as the input ids, but padded positions are set to -100 (the
    # index the loss ignores) so the model is not trained to predict padding.
    tokenized["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

# Tokenize the whole dataset batch by batch
tokenized_dataset = dataset.map(
    function=tokenize_function,
    batched=True,
)
# Show which columns the first tokenized example carries
print(tokenized_dataset[0].keys())




Map:   0%|          | 0/119028 [00:00<?, ? examples/s]

dict_keys(['prompt', 'response', 'input_ids', 'attention_mask', 'labels'])


In [None]:
# Sanity check: list the columns of the first tokenized example.
# NOTE(review): this repeats the print at the end of the tokenization cell.
print(tokenized_dataset[0].keys())


dict_keys(['prompt', 'response', 'input_ids', 'attention_mask', 'labels'])


In [None]:
from transformers import TrainingArguments, Trainer

# The key/value cache only speeds up generation; turn it off for training.
model.config.use_cache = False

training_args = TrainingArguments(
    output_dir="./gemma-lora-finetuned",
    run_name="gemma_finetuned_run",
    report_to=["wandb"],
    per_device_train_batch_size=2,  # Small batch to fit in GPU memory
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=500,
    save_strategy="no",             # No intermediate checkpoints; weights are saved once after training
    eval_strategy="epoch",
    fp16=True,  # Use fp16 instead of bf16
    bf16=False,
    dataloader_num_workers=4,
)

# Hold out 10% for evaluation. The split is seeded with the Trainer's own seed
# so re-running the notebook reproduces the same train/eval partition.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=training_args.seed)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

trainer.train()

# save_strategy="no" means nothing is written during training, so persist the
# trained weights explicitly or they are lost when the runtime ends.
trainer.save_model(training_args.output_dir)




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 