<a href="https://colab.research.google.com/github/jtlagumbay/cebqa/blob/main/reader/cebqa_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CebQA Reader Component**
Pretrained model: RoBERTa

# **Libraries**

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [1]:
import pandas as pd
import torch
from datasets import Dataset
from huggingface_hub import login
from sklearn.model_selection import train_test_split
from transformers import RobertaForSequenceClassification
from transformers import RobertaTokenizer
from transformers import Trainer
from transformers import TrainingArguments


ModuleNotFoundError: No module named 'datasets'

# **Loading Dataset**

In [None]:
# Read the raw examples from CSV into a pandas DataFrame.
# NOTE(review): 'path/to/your-dataset.csv' looks like a placeholder — point it at the real file.
data = pd.read_csv('path/to/your-dataset.csv')

# Wrap the DataFrame in a Hugging Face `Dataset` so it can be tokenized with `.map`.
dataset = Dataset.from_pandas(data)

# **Tokenize Dataset**

In [None]:
# Tokenizer for the 'roberta-base' checkpoint (RobertaTokenizer comes from the
# top-of-notebook import cell).
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def tokenize_function(examples):
    """Tokenize one batch of examples.

    Args:
        examples: Batch handed over by `Dataset.map(..., batched=True)`; its
            'text' entry is passed to the tokenizer.
            NOTE(review): assumes the CSV has a 'text' column — TODO confirm.

    Returns:
        The tokenizer output for the batch, padded to the maximum length and
        truncated where necessary.
    """
    return tokenizer(examples['text'], padding="max_length", truncation=True)


# Apply tokenization to the whole dataset, batch by batch.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# **Dataset Splitting**

In [None]:
# Hold out 20% of the tokenized examples for validation.
# `Dataset.train_test_split` keeps the result as `Dataset` objects (no round-trip
# through plain dicts) and the fixed seed makes the split reproducible on re-run.
SPLIT_SEED = 42
splits = tokenized_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

train_dataset = splits["train"]
val_dataset = splits["test"]

# **Load Pre-Trained RoBERTa**

In [None]:
# Load the "roberta-base" checkpoint with a sequence-classification head for 2 labels.
# NOTE(review): the notebook title describes a QA reader, but this is a classification
# head — confirm that classification (not span extraction) is the intended task.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)


# **Model Training**

## Training Arguments

In [None]:
# Fine-tuning hyperparameters; checkpoints go to ./results and logs to ./logs.
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # keyword, which is deprecated in recent transformers releases.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,  # keep at most two checkpoints on disk
)

## Trainer

In [None]:
# Wire the model, the training arguments and the train/validation splits into a Trainer.
# NOTE(review): no `compute_metrics` is passed here — confirm whether task metrics are wanted.
# NOTE(review): presumably the datasets carry a label column the model can consume — TODO confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
# Start fine-tuning.
trainer.train()


## Saving the fine-tuned model

In [None]:
# Persist the model and the tokenizer to the same directory so they can be reloaded together.
# NOTE(review): "path/to/save-directory" looks like a placeholder — replace before running.
model.save_pretrained("path/to/save-directory")
tokenizer.save_pretrained("path/to/save-directory")

# **Evaluating the model**

## Evaluating

In [None]:
# Run the Trainer's evaluation on the eval dataset it was constructed with and show the result.
results = trainer.evaluate()
print(results)

## Inference

In [None]:
# Classify a single piece of text with the fine-tuned model.
model.eval()  # make sure dropout is off for prediction

# One sequence needs no padding; truncation keeps it within the model's length limit.
inputs = tokenizer("Your input text here", return_tensors="pt", truncation=True)
# After training the model may sit on an accelerator, so move the inputs to the
# same device to avoid a device-mismatch error.
inputs = inputs.to(model.device)

with torch.no_grad():  # inference only: skip gradient tracking
    outputs = model(**inputs)

logits = outputs.logits
# Pick the highest-scoring class along the label axis (batch of one).
predicted_class = logits.argmax(dim=-1).item()
print(f"Predicted class: {predicted_class}")