In [None]:
!pip install transformers datasets peft bitsandbytes accelerate


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset, ClassLabel
import numpy as np
import json

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise fall back to CPU.
# (This only detects a GPU; it does not fail when none is present.)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)


In [None]:

# Load the JSON file (the "train" split of the loaded dataset) and, in the same chain,
# re-encode the text sentiment labels as a ClassLabel feature so that
# "positive", "negative", "neutral" become the class ids 0, 1, 2.
raw = load_dataset("json", data_files="spanglish_dataset.json")["train"].cast_column(
    "sentiment_label",
    ClassLabel(names=["positive", "negative", "neutral"]),
)

# Hold out 20% for testing; stratifying on the ClassLabel column keeps the label
# proportions the same in both parts, and the fixed seed makes the split repeatable.
split = raw.train_test_split(test_size=0.2, seed=42, stratify_by_column="sentiment_label")
train_dataset, test_dataset = split["train"], split["test"]

print(f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}")
print(train_dataset.features)  # sentiment_label should now show up as a ClassLabel


In [None]:
# Hugging Face identifier of the base checkpoint (a LLaMA-based 3B model)
model_name = "openlm-research/open_llama_3b_v2"

# OpenLLaMA recommends the slow LlamaTokenizer, hence use_fast=False
tokenizer = LlamaTokenizer.from_pretrained(
    model_name,
    use_fast=False,
)
# Batching needs a padding token; reuse the end-of-sequence id for that purpose
tokenizer.pad_token_id = tokenizer.eos_token_id

# Prompt template for the sentiment task. It is rendered with
# PROMPT_TEMPLATE_DETAILED.format(sentence=...), so literal JSON braces are doubled
# ({{ and }}) and {sentence} is the only substitution field.
# Each tuple entry below is one line of the final prompt; "" produces a blank line.
PROMPT_TEMPLATE_DETAILED = "\n".join((
    "Perform sentiment analysis on this sentence, which may contain a mix of English and Spanish (Spanglish). Consider both languages and cultural context to determine sentiment. Choose from:",
    '{{"positive", "negative", "neutral"}}',
    "",
    "Sentence: {sentence}",
    "",
    "Respond in JSON format with:",
    '- "prediction": the predicted category',
    '- "confidence": your confidence score (0-1)',
    '- "reason": include any bilingual or cultural cues that influenced your decision',
    "",
    "Example response:",
    '{{"prediction": "positive", "confidence": 0.92, "reason": "The sentence contains affectionate language and positive slang common in both cultures"}}',
)) + "\n"

In [None]:
# Load the base model with 8-bit quantized weights; device_map="auto" lets the
# library decide where the weights are placed.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# Prepare the quantized model for k-bit training before adapters are attached
model = prepare_model_for_kbit_training(model)

# The generation cache must be switched off before gradient checkpointing is enabled
model.config.use_cache = False
model.gradient_checkpointing_enable()

# LoRA setup: rank-16 adapters (scaling factor 32, dropout 0.05) on the attention
# query/key/value projections only; bias terms are left untouched.
lora_config = LoraConfig(
    r=16,             # Rank of LoRA decomposition
    lora_alpha=32,    # LoRA scaling factor
    target_modules=["q_proj", "k_proj", "v_proj"],  # Target certain transformer sub-modules
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
# Wrap the model so that the LoRA adapters are added to it
model = get_peft_model(model, lora_config)

# Report what will actually be trained. The previous version only collected the
# names and never displayed anything, so the verification step was silent.
trainable = [n for n, p in model.named_parameters() if p.requires_grad]
print(f"Trainable parameter tensors: {len(trainable)}")
model.print_trainable_parameters()


In [None]:
training_args = TrainingArguments(
    # --- output and checkpointing ---
    output_dir="spanglish_sentiment_model",
    overwrite_output_dir=True,
    save_steps=5000,                 # checkpoint every 5000 steps
    save_total_limit=2,              # keep only the 2 most recent checkpoints
    # --- optimisation ---
    optim="adamw_torch",
    learning_rate=1e-4,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,   # 4 x 8 = 32 examples per optimizer step per device
    fp16=True,
    # --- logging ---
    logging_steps=50,
    report_to="none",                # do not report to any external logging integration
)


# Data collator to pad sequences and mask labels properly
def data_collator(batch):
    """Right-pad a list of examples to a common length and stack them into tensors.

    Args:
        batch: list of dicts, each holding "input_ids", "attention_mask" and
            "labels" as Python lists.

    Returns:
        Dict with "input_ids", "attention_mask" and "labels" as ``torch.long``
        tensors, one row per example, all as wide as the longest "input_ids".

    Padding values: the module-level ``tokenizer.pad_token_id`` for input ids,
    0 for the attention mask, and -100 for labels so padded positions are
    ignored in the loss.
    """
    longest = max(len(example["input_ids"]) for example in batch)
    pad_id = tokenizer.pad_token_id

    def _padded(example, key, fill):
        # The amount of padding is always derived from the length of input_ids.
        return example[key] + [fill] * (longest - len(example["input_ids"]))

    fill_values = (("input_ids", pad_id), ("attention_mask", 0), ("labels", -100))
    return {
        key: torch.tensor([_padded(example, key, fill) for example in batch], dtype=torch.long)
        for key, fill in fill_values
    }

# Wire the LoRA-wrapped model, the training arguments and the custom collator into a Trainer.
# NOTE(review): data_collator reads "input_ids", "attention_mask" and "labels" from every
# example, but no tokenization of train_dataset is visible in this notebook — confirm that
# such a preprocessing step runs before this cell.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)


In [None]:
# Train the model
trainer.train()

# Write the final model (the LoRA adapters) into the Trainer's output directory right
# away. The cells that follow zip that directory and reload the adapters from it, so
# the files must exist before those cells run; relying on a save cell further down
# breaks a top-to-bottom run on a fresh kernel.
trainer.save_model(trainer.args.output_dir)


In [None]:
# Recursively archive the training output directory into /content/data.zip.
# NOTE(review): both paths are absolute under /content while output_dir in the training
# arguments is relative — presumably this assumes the notebook's working directory is
# /content (e.g. Colab); confirm before running elsewhere.
!zip -r /content/data.zip /content/spanglish_sentiment_model

In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# The adapters must be attached to the same base checkpoint they were trained on,
# i.e. the `model_name` set earlier (e.g. "openlm-research/open_llama_3b_v2").
print("Base model identifier:", model_name)

# Reload the base model with 8-bit quantized weights
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# Attach the fine-tuned adapters stored on disk under "spanglish_sentiment_model",
# then switch to evaluation mode for inference
model = PeftModel.from_pretrained(base_model, "spanglish_sentiment_model")
model.eval()

# Quick check
print("Model loaded with adapters. Trainable params:")
# NOTE(review): the adapters are loaded without is_trainable=True, so they are presumably
# frozen and this may report 0 trainable parameters — confirm against the PEFT docs.
model.print_trainable_parameters()


In [None]:
import torch, time, re, json, ast
from datasets import ClassLabel

# 1) Ensure the model is in eval mode and look up its device once (not per example)
model_device = next(model.parameters()).device
print("Using device:", model_device)
model.eval()

# 2) Build integer→string map if sentiment_label is a ClassLabel
test_set = split["test"]
label_feat = test_set.features["sentiment_label"]
if isinstance(label_feat, ClassLabel):
    int2str = dict(enumerate(label_feat.names))
else:
    int2str = None


def _normalize_label(value, int2str=None):
    """Turn a label (class id or text) into a lowercase label name.

    Args:
        value: an int class id, a label string, or anything else.
        int2str: optional mapping from class id to label name.

    Returns:
        The lowercase, whitespace-stripped label name, or None when the value
        cannot be interpreted as a label.
    """
    # bool is a subclass of int; a JSON true/false is not a class id.
    if isinstance(value, bool):
        return None
    if isinstance(value, int):
        return int2str.get(value) if int2str else None
    if isinstance(value, str):
        return value.strip().lower()
    return None


def _parse_prediction(generated_text, int2str=None):
    """Extract the predicted label name from the model's generated text.

    The last ``{...}`` block of the text is parsed as JSON, then as a Python
    literal. If neither yields a dict (for example because generation stopped
    before the closing brace), a plain ``"prediction": "<label>"`` pattern is
    searched for in the text instead.

    Returns:
        The normalized label name, or None when no prediction can be recovered.
    """
    blocks = re.findall(r"\{.*?\}", generated_text, flags=re.DOTALL)
    candidate = blocks[-1] if blocks else generated_text

    payload = None
    try:
        payload = json.loads(candidate)
    except json.JSONDecodeError:
        try:
            payload = ast.literal_eval(candidate)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            payload = None

    # json.loads / literal_eval may legitimately return a str, number or list;
    # only a dict can carry a "prediction" key.
    if isinstance(payload, dict):
        return _normalize_label(payload.get("prediction"), int2str)

    # Fallback for truncated or otherwise malformed JSON output.
    field = re.search(r'"prediction"\s*:\s*"([^"]+)"', generated_text)
    return _normalize_label(field.group(1), int2str) if field else None


# 3) Run evaluation
correct = 0
total = len(test_set)
start = time.time()

for idx, ex in enumerate(test_set):
    sentence = ex["sentence"]
    # The dataset stores the ClassLabel as an int while the model answers with a
    # label name; convert the ground truth to its name so the comparison below is
    # between like types (comparing int with str made accuracy always 0%).
    true_label = _normalize_label(ex["sentiment_label"], int2str)

    # a) Generate (greedy decoding, at most 50 new tokens)
    prompt = PROMPT_TEMPLATE_DETAILED.format(sentence=sentence)
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model_device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=50, do_sample=False, use_cache=True)

    # b) Strip off prompt tokens & decode only the newly generated part
    gen_tokens = output[0, inputs["input_ids"].shape[1]:]
    txt = tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()

    # c) Parse the predicted label out of the generated text
    pred_label = _parse_prediction(txt, int2str)

    # d) Print first 5 examples
    if idx < 5:
        print(f"\n--- Example {idx} ---")
        print("Sentence:  ", sentence)
        print("True label:", true_label)
        print("Raw gen:   ", txt)
        print("Parsed →   ", pred_label)

    # e) Compare to true label; an unparseable prediction never counts as correct
    if pred_label is not None and pred_label == true_label:
        correct += 1

# 4) Report
elapsed = time.time() - start
accuracy = correct / total * 100 if total else 0.0  # guard against an empty test set
print(f"\nEvaluated {total} examples in {elapsed:.1f}s")
print(f"Accuracy: {accuracy:.2f}% ({correct}/{total})")


In [None]:
# Persist the Trainer's model (with its LoRA adapters) and the tokenizer side by side
# in one directory so they can be reloaded together.
save_path = "spanglish_sentiment_model"

trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model saved to {save_path}/")
