## **SieBERT: Leveraging Transfer Learning for Sentiment Analysis**



In [None]:
# install Hugging Face's transformers and datasets libraries
!pip install transformers
!pip install datasets

In [None]:
# check GPU status by running the `nvidia-smi` command-line tool in a shell
!nvidia-smi

### **Example 1:** Applying SieBERT, a pretrained sentiment analysis model, with *3 lines of code*

In [None]:
# load pipeline() function from transformers library
from transformers import pipeline
# build a "sentiment-analysis" pipeline backed by the pretrained SieBERT model ("Sentiment in English")
sentiment_analysis = pipeline("sentiment-analysis", model="siebert/sentiment-roberta-large-english")

In [None]:
# apply pretrained model to example sentence; as the last expression of the cell, the result is displayed
sentiment_analysis("This is super helpful. I love it!")

### **Example 2:** Classifying multiple sentences using SieBERT

In [None]:
# load dependencies
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# specify path of pretrained model (identifier passed to from_pretrained below)
checkpoint = "siebert/sentiment-roberta-large-english"  # SieBERT

# load pretrained tokenizer and sequence-classification model;
# both come from the same checkpoint so they belong together
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
# provide 3 example sentences
sequences = ["This is amazing", "I don't think it's useless.", "I hate this!"]

# tokenize sequences as one padded batch of PyTorch tensors
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# predict with model; this is pure inference, so gradient tracking is
# switched off to avoid building an autograd graph that is never used
with torch.no_grad():
    output = model(**tokens)

# transform logits to class probabilities, then to confidences, class ids and class labels
predictions = torch.nn.functional.softmax(output.logits, dim=-1)
confidences = predictions.max(dim=-1).values.tolist()  # probability of the winning class
classes = predictions.argmax(dim=-1).tolist()  # index of the winning class
labels = pd.Series(classes).map(model.config.id2label)  # class index -> label name

In [None]:
# consolidate results: one row per input sentence
df = pd.DataFrame({
    'text': sequences,
    'class': classes,
    'class_label': list(labels),
    'confidence': confidences,
})

# return dataframe (a bare last expression renders as a table, as in the final comparison cell)
df

### **Example 3:** Fine-tuning SieBERT for multi-class sentiment analysis in a different domain

In [None]:
# load three-class sentiment data set from Hugging Face
from datasets import load_dataset
sentiment = load_dataset('sentiment140')  # source: https://huggingface.co/datasets/sentiment140/viewer/sentiment140/test
# print the loaded dataset object to inspect its contents
print(sentiment)

In [None]:
# show the first example of the training split
first_train_row = sentiment['train'][0]
print(first_train_row)

# collect the distinct sentiment codes of the test split and count them
test_label_values = set(sentiment['test']['sentiment'])
NUM_LABELS = len(test_label_values)
print(test_label_values)
print(NUM_LABELS)

In [None]:
# define preprocessing function
def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the notebook-level tokenizer.

    Truncation is enabled; no padding is requested here.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [None]:
# tokenize dataset: apply preprocess_function to batches of examples
tokenized_sentiment = sentiment.map(preprocess_function, batched=True)

# use dynamic padding: preprocess_function does not pad, so padding is left
# to this collator, which is handed to the Trainer cells below
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# define evaluation metrics
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """Score model predictions against the reference labels.

    Args:
        pred: object exposing ``label_ids`` (reference class ids) and
            ``predictions`` (per-class scores; the argmax over the last
            axis is taken as the predicted class id).

    Returns:
        dict with ``accuracy`` plus macro-averaged ``f1``, ``precision``
        and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='macro'
    )
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': macro_f1,
        'precision': macro_precision,
        'recall': macro_recall,
    }

In [None]:
# initialize pretrained model with updated classification head:
# num_labels=NUM_LABELS requests one output per sentiment class counted above
# NOTE(review): ignore_mismatched_sizes=True is presumably needed because the checkpoint's
# own head has a different number of outputs — confirm against the checkpoint config
model2 = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=NUM_LABELS, ignore_mismatched_sizes=True)

In [None]:
# set number of epochs and size of the training subset
NUM_EPOCHS = 1  # passed to TrainingArguments as num_train_epochs
NUM_EXAMPLES = 400  # the first NUM_EXAMPLES rows of the test split are used for training, the rest for evaluation

In [None]:
# rename label column from "sentiment" to "label"
# NOTE(review): presumably because Trainer expects the target column to be called "label" — confirm
tokenized_sentiment = tokenized_sentiment.rename_column("sentiment", "label")

In [None]:
from datasets import ClassLabel, Value

# update labels
def update_labels(example):
    """Halve the ``label`` of one example in place and return the example.

    Integer floor division (``//``) is used so the label stays an ``int``
    (e.g. 4 -> 2); true division (``/``) would produce floats such as 2.0,
    which are not proper class ids.

    Args:
        example: mapping with an integer ``label`` entry.

    Returns:
        The same ``example`` object with its ``label`` halved.
    """
    example['label'] = example['label'] // 2
    return example

# apply update_labels to the examples of the tokenized dataset
tokenized_sentiment = tokenized_sentiment.map(update_labels)

# declare the label column of the test split as a ClassLabel named neg/neu/pos:
# copy the split's current features, swap in the new label feature, then cast.
# Only the test split is cast here; it is the split the Trainer cells below use.
new_features = tokenized_sentiment['test'].features.copy()
new_features["label"] = ClassLabel(names=['neg', 'neu', 'pos'])
tokenized_sentiment['test'] = tokenized_sentiment['test'].cast(new_features)

In [None]:
# check features of the test split after the cast (the label should now be a ClassLabel)
tokenized_sentiment['test'].features

In [None]:
# train SieBERT
from transformers import TrainingArguments, Trainer

# hyperparameters shared by both fine-tuning runs in this notebook
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    # NOTE(review): recent transformers releases rename `evaluation_strategy` to
    # `eval_strategy` (and Trainer's `tokenizer` to `processing_class`) — confirm
    # against the installed version
    evaluation_strategy="epoch",
)

# the labelled test split is divided into a training part (first NUM_EXAMPLES rows)
# and an evaluation part (all remaining rows); the end of the evaluation range is
# taken from the data instead of being hard-coded
labelled_split = tokenized_sentiment["test"]
assert NUM_EXAMPLES < len(labelled_split), "NUM_EXAMPLES leaves no rows for evaluation"

trainer = Trainer(
    model=model2,
    args=training_args,
    train_dataset=labelled_split.select(range(NUM_EXAMPLES)),
    eval_dataset=labelled_split.select(range(NUM_EXAMPLES, len(labelled_split))),
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

In [None]:
# store evaluations for SieBERT now, because `trainer` is rebound to the RoBERTa run further below
siebert_eval = trainer.evaluate()

In [None]:
# specify path of pretrained baseline model; a dedicated name keeps `checkpoint`
# pointing at SieBERT, so re-running the earlier SieBERT cells still loads SieBERT
roberta_checkpoint = "roberta-large"  # RoBERTa-large

# load pretrained tokenizer (rebinds `tokenizer`, which the RoBERTa Trainer cell reads)
tokenizer = AutoTokenizer.from_pretrained(roberta_checkpoint)

# initialize pretrained model with updated classification head
model3 = AutoModelForSequenceClassification.from_pretrained(roberta_checkpoint, num_labels=NUM_LABELS, ignore_mismatched_sizes=True)

In [None]:
# train RoBERTa on the same rows as SieBERT: the first NUM_EXAMPLES rows of the
# labelled test split for training, all remaining rows for evaluation (the end of
# the evaluation range is taken from the data instead of being hard-coded)
labelled_split = tokenized_sentiment["test"]
assert NUM_EXAMPLES < len(labelled_split), "NUM_EXAMPLES leaves no rows for evaluation"

trainer = Trainer(
    model=model3,
    args=training_args,
    train_dataset=labelled_split.select(range(NUM_EXAMPLES)),
    eval_dataset=labelled_split.select(range(NUM_EXAMPLES, len(labelled_split))),
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

In [None]:
# evaluate the RoBERTa run, then line up both models' scores in matching order
roberta_eval = trainer.evaluate()
models = ['SieBERT', 'RoBERTa']
accuracies = [scores['eval_accuracy'] for scores in (siebert_eval, roberta_eval)]
f1_scores = [scores['eval_f1'] for scores in (siebert_eval, roberta_eval)]

In [None]:
# consolidate results: one row per model
# (named `eval_results` so the builtin `eval` is not shadowed for the rest of the session)
eval_results = pd.DataFrame({
    'model': models,
    'accuracy': accuracies,
    'f1_score': f1_scores,
})

# return dataframe
eval_results

Source: https://huggingface.co/
