In [2]:
%%capture
# Install necessary libraries
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U peft
%pip install -U accelerate
%pip install -U trl
%pip install -U torchao
%pip install -U evaluate

In [3]:
import os
import torch
import wandb
import numpy as np
import collections
import evaluate
from tqdm.auto import tqdm
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    BitsAndBytesConfig, 
    TrainingArguments, 
    logging
)
from peft import LoraConfig, get_peft_model
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login
from trl import SFTTrainer, setup_chat_format
import bitsandbytes as bnb

In [4]:
# Kaggle secrets setup
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("roberta")
login(token=hf_token)
wb_token = user_secrets.get_secret("robertaw")

# Wandb initialization for tracking
wandb.login(key=wb_token)
run = wandb.init(project='Fine-tune_Roberta', job_type="training", anonymous="allow")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mkobinatha-20[0m ([33mkobinatha-20-student[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
# Load Dataset and Tokenizer
model_checkpoint = "deepset/xlm-roberta-large-squad2"
dataset_name = "RajeevanL/tamil_squad-2.0"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [6]:
# Load the dataset from Hugging Face Hub
dataset = load_dataset(dataset_name)
train_dataset = dataset["train"]
validation_dataset = dataset["validation"]

README.md:   0%|          | 0.00/573 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/59.2M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/5.15M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/5.30M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/66277 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5848 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5848 [00:00<?, ? examples/s]

In [49]:
print(train_dataset[0]['Answer'])


KeyError: 'Answer'

In [45]:
# Preprocessing Constants
max_length = 384
stride = 128


In [46]:
def preprocess_training_examples(examples):
    # Ensure questions are properly formatted as strings
    questions = [str(q).strip() for q in examples["Question"]]
    
    # Ensure context is properly formatted as strings
    contexts = [str(c).strip() for c in examples["Context"]]
    
    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation=True,
        padding="max_length",
        return_offsets_mapping=True,
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["Answer"]

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        answer = answers[i]

        # Handle the case where answer is None
        if answer is None:
            start_char = end_char = -1  # Set to -1 for invalid answer
        elif isinstance(answer, dict):  # Check if answer is a dictionary
            start_char = answer["answer_start"]
            end_char = start_char + len(answer["text"])
        else:  # If answer is a string, adjust accordingly
            start_char = answer.find(answer) if answer else -1  # Set to -1 if answer is empty
            end_char = start_char + len(answer) if start_char != -1 else -1

        # Find start and end token positions
        sequence_ids = inputs.sequence_ids(i)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - sequence_ids[::-1].index(1) - 1

        # Adjust the start and end positions
        start_token = context_start
        end_token = context_end
        for idx, (start, end) in enumerate(offsets):
            if start <= start_char < end:
                start_token = idx
            if start < end_char <= end:
                end_token = idx
                break

        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


In [47]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["Question"]]
    inputs = tokenizer(
        questions,
        examples["Context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [48]:
# Apply Preprocessing
train_dataset = train_dataset.map(preprocess_training_examples, batched=True, remove_columns=train_dataset.column_names)
validation_dataset = validation_dataset.map(preprocess_validation_examples, batched=True, remove_columns=validation_dataset.column_names)


Map:   0%|          | 0/66277 [00:00<?, ? examples/s]

KeyError: 'Question'

In [23]:
print(train_dataset[0]['Answer'])


விடுதலை செய்பவர்கள்.


In [None]:
# Define Metrics
metric = evaluate.load("squad")

def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["Context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["Answer"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

# Load Pretrained Model
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)


In [None]:
# Training Arguments
args = TrainingArguments(
    "xlm-roberta-finetuned-tamil-squad",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    fp16=True,
    push_to_hub=False,
)

# Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)


In [None]:
# Training
trainer.train()

In [7]:
def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["Question"]]
    inputs = tokenizer(
        questions,
        examples["Context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["Answer"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [13]:
from datasets import load_dataset

# Load the dataset
dataset_name = "RajeevanL/tamil_squad-2.0"
df_train = load_dataset(dataset_name, split="train")


In [14]:
df_test = load_dataset(dataset_name, split="test")

In [16]:
df_train.column_names

['Question', 'Context', 'Answer']

In [18]:
# Access the first row and get the 'question' field
df_train[0]['Question']


'பெரும்பான்மையான எஸ்டோனியர்கள் ஜெர்மானியர்களை எப்படிக் கருதினர்?'

In [19]:
import numpy as np
from datasets import Dataset

# Add an 'id' column to df_train and df_test using map
df_train = df_train.add_column('id', np.linspace(0, len(df_train) - 1, len(df_train)).astype(str))
df_test = df_test.add_column('id', np.linspace(0, len(df_test) - 1, len(df_test)).astype(str))


In [None]:
from datasets import load_dataset

# Load the dataset
dataset_name = "RajeevanL/tamil_squad-2.0"
dataset = load_dataset(dataset_name, split="train")

# Format each row
def format_row(row):
    return {
        "input_text": f"Question: {row['Question']} Context: {row['Context']}",
        "target_text": row["Answer"],
    }

# Apply formatting to the entire dataset
formatted_dataset = dataset.map(format_row)

# Preview the formatted dataset
print(formatted_dataset[0])


In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

# Load the model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForQuestionAnswering.from_pretrained(
    base_model, 
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# Prepare for LoRA
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "key", "value"],
    bias="none",
    task_type="QUESTION_ANS",
)

lora_model = get_peft_model(model, lora_config)


In [None]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "deepset/xlm-roberta-large-squad2",
    trust_remote_code=True
)

In [None]:
# Format dataset
dataset = load_dataset(dataset_name, split="train")
def format_row(row):
    # Ensure 'Context' and 'Answer' are not None
    if row['Context'] is None or row['Answer'] is None:
        return None  # Skip rows with missing data
    
    # Find the position of the answer in the context
    answer_start = row['Context'].find(row['Answer'])
    if answer_start == -1:
        return None  # Skip rows where the answer is not found in the context
    
    answer_end = answer_start + len(row['Answer'])

    # Tokenize the context
    encoding = tokenizer(row['Context'], truncation=True, padding=True, max_length=512)

    # Ensure start and end positions are valid
    start_token = encoding.char_to_token(answer_start)
    end_token = encoding.char_to_token(answer_end - 1)

    # If the positions are invalid, skip the row
    if start_token is None or end_token is None:
        return None

    return {
        "input_text": f"Question: {row['Question']} Context: {row['Context']}",
        "target_text": row["Answer"],
        "start_position": start_token,
        "end_position": end_token,
    }

# Apply formatting to the dataset
formatted_dataset = dataset.map(format_row)




In [None]:
print(formatted_dataset.column_names)


In [None]:
def tokenize_function(examples):
    encoding = tokenizer(
        examples["Question"],  # Tokenize 'Question' column
        padding=True,
        truncation=True,
        max_length=512,
        add_special_tokens=True,
    )

    # No need to calculate start and end positions if they are not used
    return encoding


In [None]:
tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True)


In [None]:
# Define training arguments
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    optim="adamw_torch_4bit",
    save_steps=1000,
    logging_steps=100,
    learning_rate=2e-4,
    weight_decay=0.01,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.1,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="wandb"
)

# Initialize the trainer with tokenized dataset
trainer = SFTTrainer(
    model=lora_model,
    args=training_arguments,
    train_dataset=tokenized_dataset,  # Use the tokenized dataset
    tokenizer=tokenizer,
)

# Start the training process
trainer.train()


In [None]:
# Start training
trainer.train()

In [None]:
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    optim="adamw_torch_4bit",  # Change to a valid 4-bit optimizer
    save_steps=1000,
    logging_steps=100,
    learning_rate=2e-4,
    weight_decay=0.01,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.1,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="wandb"
)


In [None]:
print(formatted_dataset[0])


In [None]:
from transformers import AutoTokenizer

def format_row(row):
    # Ensure 'Context' and 'Answer' are not None
    if row['Context'] is None or row['Answer'] is None:
        return None  # Skip rows with missing data
    
    # Find the position of the answer in the context
    answer_start = row['Context'].find(row['Answer'])
    if answer_start == -1:
        return None  # Skip rows where the answer is not found in the context
    
    answer_end = answer_start + len(row['Answer'])

    # Tokenize the context
    encoding = tokenizer(row['Context'], truncation=True, padding=True, max_length=512)

    # Ensure start and end positions are valid
    start_token = encoding.char_to_token(answer_start)
    end_token = encoding.char_to_token(answer_end - 1)

    # If the positions are invalid, skip the row
    if start_token is None or end_token is None:
        return None

    return {
        "input_text": f"Question: {row['Question']} Context: {row['Context']}",
        "target_text": row["Answer"],
        "start_position": start_token,
        "end_position": end_token,
    }

# Apply formatting to the dataset
formatted_dataset = dataset.map(format_row)


In [None]:
def tokenize_function(examples):
    # Tokenize the input text
    encoding = tokenizer(
        examples["input_text"],  # Use the 'input_text' for tokenization
        padding=True,
        truncation=True,
        max_length=512,
        add_special_tokens=True,
    )

    # Prepare lists for the start and end positions
    start_positions = []
    end_positions = []

    for i, answer in enumerate(examples["Answer"]):
        start_pos = examples['start_position'][i]
        end_pos = examples['end_position'][i]

        # Get the token positions for the start and end positions
        start_token = encoding.char_to_token(start_pos)
        end_token = encoding.char_to_token(end_pos - 1)

        # Check if valid tokens are found
        if start_token is None or end_token is None:
            # If any token is invalid, set a default value (-1 or None)
            start_positions.append(-1)  # or use None
            end_positions.append(-1)    # or None
        else:
            # Append valid token positions
            start_positions.append(start_token)
            end_positions.append(end_token)

    encoding.update({
        'start_positions': start_positions,
        'end_positions': end_positions,
    })

    return encoding


In [None]:
trainer.train()