In [None]:
!pip install huggingface_hub
!pip install nltk
!pip install -U pip setuptools wheel
!pip install text2num
!pip install transformers
!pip install tensorboard
!pip install datasets
!pip install transformers[sentencepiece]
!pip install sentencepiece
!pip install huggingface
!pip install --upgrade accelerate
!pip install scikit-learn
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
!sudo apt-get install git-lfs
!git lfs install
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
import torch

# Hub checkpoint id shared by the tokenizer and the model.
model_name = "cardiffnlp/twitter-xlm-roberta-base"

# Sequence-classification model configured with three output labels.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Tokenizer loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
from datasets import DatasetDict, load_dataset

# Fetch the dataset from the Hub using the locally stored login token.
# NOTE(review): newer `datasets` releases deprecate `use_auth_token` in favour
# of `token` - confirm against the installed version.
raw_dataset = load_dataset(
    "javilonso/rest23_sentiment_data_v3_oversampling",
    use_auth_token=True,
)

In [None]:
# Drop five columns, then expose 'Title_Review' as 'text' and 'Country' as 'label'.
# The cell rebinds `raw_dataset`, so re-running it without reloading the dataset
# fails: the original column names no longer exist.
# NOTE(review): assumes 'Country' already holds integer class ids - confirm.
raw_dataset = (
    raw_dataset
    .remove_columns(['Title', 'Review', 'Polarity', 'Type', '__index_level_0__'])
    .rename_column('Title_Review', 'text')
    .rename_column('Country', 'label')
)

In [None]:
# Display the train split; as the cell's last expression its repr is rendered.
raw_dataset['train']

In [None]:
# Define the function to preprocess the dataset
def preprocess_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Sequences are truncated to at most 512 tokens and are deliberately NOT
    padded here. Padding every example to 512 tokens up front makes the
    `DataCollatorWithPadding` handed to the Trainer a no-op and forces every
    batch to the maximum length; leaving padding to the collator pads each
    batch only to its own longest sequence.

    Args:
        examples: mapping with a 'text' entry holding the raw string(s) to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    return tokenizer(examples['text'], truncation=True, max_length=512)

# Prepare the dataset: tokenize the 'train' and 'test' splits in batches,
# in that order.
train_dataset, test_dataset = (
    raw_dataset[split_name].map(preprocess_function, batched=True)
    for split_name in ('train', 'test')
)

In [None]:
from transformers import DataCollatorWithPadding

# Batch collator built on the tokenizer; batches come back as PyTorch tensors.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="pt",
)

In [None]:
import datasets
import transformers
from transformers import TrainingArguments, Trainer
from sklearn.metrics import f1_score
import torch
from torch.utils.data import Dataset, DataLoader


# Define the training arguments
training_args = TrainingArguments(
    # Where checkpoints are written, and whether they are uploaded to the Hub.
    output_dir='rm23_ctry_v2lr',
    push_to_hub=True,
    # Optimisation settings.
    num_train_epochs=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    seed=42,
    # Save, evaluate and log once per epoch.
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` - confirm against the installed version.
    save_strategy='epoch',
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    # After training, reload the checkpoint with the highest eval_f1.
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
)


# Define the function to compute the metrics
def compute_metrics(pred):
    """Compute the macro-averaged F1 score for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the index of the highest score
            along the last axis is taken as the predicted class).

    Returns:
        dict with a single entry, ``'f1'``.
    """
    reference, scores = pred.label_ids, pred.predictions
    return {'f1': f1_score(reference, scores.argmax(-1), average='macro')}

# Create the trainer and train the model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    # The 'test' split is scored every epoch and drives best-checkpoint selection.
    eval_dataset=test_dataset,
)
trainer.train()

# Evaluate the model; as the cell's last expression, the metrics are displayed.
trainer.evaluate()

In [None]:
# Push the trainer's model to the Hugging Face Hub.
trainer.push_to_hub()

In [None]:
# Push the tokenizer separately: the Trainer was built without it.
# The repo name matches the `output_dir` given to TrainingArguments.
tokenizer.push_to_hub("javilonso/rm23_ctry_v2lr")