In [None]:
import pandas as pd
import numpy as np
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk

# Downloading NLTK data files used further down in this notebook:
#   stopwords -> stopwords.words('english')
#   punkt_tab -> tokenizer data; presumably what nltk.word_tokenize loads
#                (TODO confirm for the installed NLTK version)
#   wordnet   -> WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Colab-only step: mount Google Drive at /content/drive.
# NOTE(review): the dataset is later read through a relative path rather than
# from under /content/drive — confirm whether this mount is actually needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Seed the random number generators so that batch shuffling and dropout are
# repeatable between runs (some GPU kernels may still be non-deterministic).
# The value matches the random_state used for the train/validation split.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Define the device: use the GPU when one is visible, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load dataset
data = pd.read_csv('facts_opinions.csv')

# Initialize stop-words and lemmatizer
# (a set gives constant-time membership checks inside preprocess_text)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    """Normalize a raw text into a space-joined string of lemmatized tokens.

    The text is lowercased, stripped of every character that is not an ASCII
    letter or whitespace, tokenized with NLTK, filtered (stop-words and tokens
    shorter than three characters are dropped) and finally lemmatized.

    Args:
        text: The raw text. Any non-string value (for example ``None`` or a
            float NaN standing in for a missing value) is treated as empty.

    Returns:
        str: The surviving lemmas joined by single spaces, or ``''`` when
        nothing survives the cleaning.
    """
    # Missing values are not strings; calling .lower() on them would raise.
    if not isinstance(text, str):
        return ''

    cleaned = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    # Nothing but whitespace left: the result is empty, skip tokenization.
    if not cleaned.strip():
        return ''

    tokens = nltk.word_tokenize(cleaned)
    lemmas = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) >= 3
    ]
    return ' '.join(lemmas)

# Clean every raw text and store the result next to it in a new column
data['Processed_Text'] = data['Text'].map(preprocess_text)

In [None]:
# Hold out 10% of the samples for validation; the fixed random_state keeps
# the partition identical from run to run
all_texts = data['Processed_Text'].tolist()
all_labels = data['Label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts, all_labels, test_size=0.1, random_state=42
)

In [None]:
# Load the tokenizer that belongs to the checkpoint being fine-tuned
tokenizer = AutoTokenizer.from_pretrained('lighteternal/fact-or-opinion-xlmr-el')

# Both splits are encoded with the same settings: truncation and padding
# enabled, with max_length set to 256
encode_options = {'truncation': True, 'padding': True, 'max_length': 256}
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
class FactOpinionDataset(Dataset):
    """Dataset pairing tokenizer encodings with their class labels.

    Attributes:
        encodings: Mapping from field name to a per-sample sequence of values.
        labels: Per-sample labels, indexable by position.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The sample count is taken from the labels
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors with an added ``labels`` entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Convert the label to an int64 tensor
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [None]:
# Wrap each tokenized split together with its labels
train_dataset = FactOpinionDataset(encodings=train_encodings, labels=train_labels)
val_dataset = FactOpinionDataset(encodings=val_encodings, labels=val_labels)

In [None]:
# Load the pre-trained sequence-classification checkpoint
pretrained_name = 'lighteternal/fact-or-opinion-xlmr-el'
model = AutoModelForSequenceClassification.from_pretrained(pretrained_name)
# Move the model to the selected device; as the last expression of the cell
# this call also makes the notebook display the module summary
model.to(device)

config.json:   0%|          | 0.00/778 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [None]:
# Defining optimizer
# The AdamW class shipped with transformers is deprecated in favour of the
# PyTorch implementation, so torch.optim.AdamW is used here. eps and
# weight_decay are set explicitly to keep the previous optimizer's defaults
# (torch's own defaults are eps=1e-8 and weight_decay=0.01).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-6,
    weight_decay=0.0,
)

# Use CrossEntropyLoss for classification since model outputs logits
# NOTE(review): the training loop in this notebook reads the loss from
# outputs.loss, so this criterion is not referenced there — confirm whether
# it is still needed.
criterion = torch.nn.CrossEntropyLoss()



In [None]:
# Same batch size for both splits; only the training data is shuffled
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
epochs = 3
# Fine-tune for `epochs` passes over the training data, validating after each
for epoch in range(epochs):
    # --- Training pass ---
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Move the batch tensors to the device the model lives on
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # The labels are handed to the model and the loss is read from its output
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss:.4f}')

    # --- Validation pass ---
    model.eval()
    val_labels_list, val_preds_list = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # No labels are passed here; only the logits are needed
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # Predicted class = index of the largest logit in each row
            preds = logits.argmax(dim=1)

            val_labels_list.extend(labels.cpu().numpy())
            val_preds_list.extend(preds.cpu().numpy())

    # Calculate evaluation metrics over the whole validation split
    accuracy = accuracy_score(val_labels_list, val_preds_list)
    precision, recall, f1, _ = precision_recall_fscore_support(
        val_labels_list, val_preds_list, average='binary'
    )
    metrics_report = (
        f'Validation Accuracy: {accuracy:.4f}, '
        f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}'
    )
    print(metrics_report)

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Epoch 1/3, Training Loss: 0.0450
Validation Accuracy: 0.9908, Precision: 0.9888, Recall: 0.9917, F1 Score: 0.9902
Epoch 2/3, Training Loss: 0.0280
Validation Accuracy: 0.9912, Precision: 0.9921, Recall: 0.9892, F1 Score: 0.9907
Epoch 3/3, Training Loss: 0.0233
Validation Accuracy: 0.9896, Precision: 0.9825, Recall: 0.9957, F1 Score: 0.9891
