# **Fine-tuning different pretrained models to perform sentiment analysis of Yelp reviews**

In [None]:
# Needed python packages
!pip install accelerate
!pip install datasets
!pip install transformers
!pip install pandas
!pip install altair

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import pipeline
from datasets import load_dataset
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, EarlyStoppingCallback
from sklearn.metrics import f1_score
from transformers import DataCollatorWithPadding
from sklearn.metrics import f1_score
import numpy as np

In [None]:
# Locations of the semicolon-separated CSV files, one per data split
URL_training = "https://raw.githubusercontent.com/laurenzbrahner/BigDataTask2/main/data/Sentiment_training_extended.csv"
URL_test = "https://raw.githubusercontent.com/laurenzbrahner/BigDataTask2/main/data/Sentiment_Test.csv"
URL_val = "https://raw.githubusercontent.com/laurenzbrahner/BigDataTask2/main/data/Sentiment_Val.csv"

# Read every split into its own DataFrame (train, test, val — in that order)
df_train, df_test, df_val = (
    pd.read_csv(url, sep=";") for url in (URL_training, URL_test, URL_val)
)

# Collapse the five original labels into two classes: labels 0-3 become 0 and
# label 4 becomes 1.
# NOTE(review): presumably the labels are zero-based star ratings, i.e. label 4
# is the 5-star class — confirm against the CSVs.
star_mapping = {label: int(label == 4) for label in range(5)}

# Apply the binary mapping to the label column of each split, in place
for split_df in (df_train, df_test, df_val):
    split_df['label'] = split_df['label'].map(star_mapping)

In [None]:
# Count the training rows per binary label to see how skewed the classes are
label_counts = df_train["label"].value_counts()
label_counts

In [None]:
# Undersample class 0 to balance the distribution: draw 1300 of its rows at
# random (fixed seed, so the same rows are chosen on every run) and keep their
# index labels
is_class_zero = df_train['label'] == 0
indices_to_remove = df_train.loc[is_class_zero].sample(n=1300, random_state=42).index

# Drop the selected rows from the training frame
df_train = df_train.drop(index=indices_to_remove)

In [None]:
# Model id of the checkpoint that will be fine-tuned
checkpoint = "gilf/english-yelp-sentiment"
# The tokenizer normally comes from the same checkpoint; point this somewhere
# else only if the model has problems with its own tokenizer
checkpoint_tokenizer = "gilf/english-yelp-sentiment"

# Other checkpoint that was used for fine-tuning:
#   "distilbert-base-uncased-finetuned-sst-2-english"

# Load the tokenizer, then build the padding collator on top of it
tokenizer = AutoTokenizer.from_pretrained(checkpoint_tokenizer)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Wrap each pandas split in a datasets.Dataset, keyed by split name
raw_datasets = {
    'train': Dataset.from_pandas(df_train),
    'test': Dataset.from_pandas(df_test),
    'val': Dataset.from_pandas(df_val),
}

In [None]:
# Re-count the training rows per label to double-check that the data is balanced
balanced_counts = df_train["label"].value_counts()
balanced_counts

In [None]:

# Load the pretrained checkpoint with a two-label classification head.
# ignore_mismatched_sizes=True lets loading continue when weights in the
# checkpoint do not match the requested shapes.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
    ignore_mismatched_sizes=True,
)

# Training configuration: output goes to "test-trainer", evaluation runs on a
# step schedule, training lasts at most 5 epochs, and the checkpoint with the
# best "accuracy" is reloaded once training ends.
training_args = TrainingArguments(
    output_dir="test-trainer",
    evaluation_strategy="steps",
    num_train_epochs=5,
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
)

# tokenize data
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Over-long inputs are truncated. No padding is applied here on purpose:
    the DataCollatorWithPadding that is handed to the Trainer pads each batch
    when it is assembled, so padding every example to the maximum length up
    front would only cost extra memory and compute.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; only its
            "text" entry is read.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], truncation=True)

# Tokenize every split in batches, keeping the same split names as keys
tokenized_datasets = {
    split_name: split_data.map(tokenize_function, batched=True)
    for split_name, split_data in raw_datasets.items()
}

In [None]:
# Metric callback handed to the Trainer
def compute_metrics(p):
    """Score one evaluation pass with accuracy and macro-averaged F1.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the arg-max over
            axis 1 is taken as the predicted class) and ``label_ids`` (the
            reference labels).

    Returns:
        A dict with the keys "accuracy" and "f1".
    """
    labels = p.label_ids
    # Predicted class = index of the highest score in each row
    preds = np.argmax(p.predictions, axis=1)

    return {
        "accuracy": np.mean(preds == labels),
        "f1": f1_score(labels, preds, average='macro'),
    }

In [None]:
# Build the Trainer.
# Evaluation during training drives both early stopping and the choice of the
# best checkpoint (load_best_model_at_end), so it must run on the validation
# split. Using the test split here would tune the model on the very data that
# is meant to give an unbiased final score; keep tokenized_datasets["test"]
# for a final evaluation after training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Stop once the monitored metric has failed to improve by at least 0.01
    # for 3 consecutive evaluations
    callbacks=[
        EarlyStoppingCallback(
            early_stopping_patience=3,
            early_stopping_threshold=0.01,
        )
    ],
)

In [None]:
# Train the model: run the fine-tuning loop of the Trainer configured above
trainer.train()