In [1]:
# Mount Google Drive at /content/drive. Later cells write the training
# checkpoints and the saved model/tokenizer under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##### Libraries installation

In [None]:
!pip install datasets transformers[torch] accelerate -U evaluate

In [3]:
from datasets import load_dataset
import numpy as np
import evaluate
from transformers import AutoModelForSequenceClassification,AutoTokenizer,TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
from sklearn.metrics import f1_score

##### Loading dataset

In [None]:
# Load the "yelp_review_full" dataset; the result is a DatasetDict with
# 'train' and 'test' splits whose rows carry an integer `label` and a `text`.
dataset = load_dataset("yelp_review_full")

# Show a single example (index 100) from the training split
dataset["train"][100]

In [5]:
# Peek at the first five training rows; slicing returns a dict of column lists
# ('label' and 'text').
dataset["train"][:5]

{'label': [4, 1, 3, 3, 0],
 'text': ["dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.",
  "Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You 

In [6]:
# Display the DatasetDict summary: available splits, their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

##### Dataset subsampling for faster processing

In [14]:
# Subset sizes: training on the full 650,000 / 50,000 rows would be slow,
# so only a fixed-size random sample of each split is kept.
num_rows_train = 10000
num_rows_test = 5000

# Shuffle the training split with a fixed seed, then keep the first
# num_rows_train (10,000) rows.
train_shuffled = dataset['train'].shuffle(seed=42)
train_ds_small = train_shuffled.select(range(num_rows_train))

# Same procedure for the test split: keep num_rows_test (5,000) rows.
test_shuffled = dataset['test'].shuffle(seed=42)
eval_ds_small = test_shuffled.select(range(num_rows_test))

# Bundle both samples; the sampled test rows are exposed under 'validation'.
small_dataset = DatasetDict({
    'train': train_ds_small,
    'validation': eval_ds_small,
})

##### Tokenizing

In [15]:
# Load the tokenizer that belongs to the same checkpoint as the model that is
# fine-tuned below ("LiYuan/amazon-review-sentiment-analysis").
tokenizer = AutoTokenizer.from_pretrained("LiYuan/amazon-review-sentiment-analysis")

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Intended as the callback for ``Dataset.map``. Sequences are padded with
    ``padding="max_length"`` and over-long ones are truncated, so that
    variable-length reviews all come out with the same length.

    Args:
        examples: Mapping with a ``"text"`` entry holding the review text(s).

    Returns:
        The tokenizer's output for those texts.
    """
    review_texts = examples["text"]
    return tokenizer(review_texts, padding="max_length", truncation=True)

In [16]:
# Tokenize every split of small_dataset with .map() (built into the HF datasets
# classes); batched=True passes tokenize_function a batch of examples per call.
tokenized_datasets = small_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

#### Training

In [None]:
# Evaluation metric: accuracy, loaded through the `evaluate` library.
# compute_metrics below reads this module-level object.

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Turn the Trainer's evaluation output into accuracy and weighted F1.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each
            sample is the argmax of its logits over the last axis.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"`` (F1 averaged with
        ``average='weighted'``).
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    accuracy_result = metric.compute(predictions=predicted_classes, references=labels)
    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_score(labels, predicted_classes, average='weighted'),
    }

In [18]:
# Load the pretrained sequence-classification checkpoint that is fine-tuned on
# the Yelp subset.
# NOTE(review): assumes the checkpoint's classification head has as many outputs
# as the dataset has label values (labels 0-4 appear in the sample above) — confirm.
model = AutoModelForSequenceClassification.from_pretrained("LiYuan/amazon-review-sentiment-analysis")

In [19]:
# Hyperparameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/my_model",  # checkpoints are written to the mounted Google Drive
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,  # a single pass over the training subset
    weight_decay=0.01,
    eval_strategy="epoch",  # run evaluation at the end of each epoch
    save_strategy="epoch",  # save a checkpoint on the same schedule as evaluation
    load_best_model_at_end=True,  # reload the best checkpoint once training finishes
    push_to_hub=False,  # nothing is uploaded to the Hugging Face Hub
)

# Build the Trainer that ties together the model, the run configuration, the
# tokenized train/validation splits and the metric function.
import inspect

# Transformers 4.46 renamed Trainer's `tokenizer` argument to `processing_class`;
# the old name is deprecated and scheduled for removal in 5.0. Because the
# install cell pulls the latest release, choose whichever keyword the installed
# version actually accepts so this cell runs on both older and newer versions.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

In [None]:
# Fine-tune the model on the training subset using the settings in training_args.
trainer.train()

In [None]:
# Save the trained model and tokenizer to Google Drive.
# Note: they are written to two separate directories, so each has to be
# loaded back from its own path.
model.save_pretrained('/content/drive/MyDrive/data-project-llm/my_llm_model_001')
tokenizer.save_pretrained('/content/drive/MyDrive/data-project-llm/my_llm_tokenizer_001')

('/content/drive/MyDrive/data-project-llm/my_llm_tokenizer_01/tokenizer_config.json',
 '/content/drive/MyDrive/data-project-llm/my_llm_tokenizer_01/special_tokens_map.json',
 '/content/drive/MyDrive/data-project-llm/my_llm_tokenizer_01/vocab.txt',
 '/content/drive/MyDrive/data-project-llm/my_llm_tokenizer_01/added_tokens.json',
 '/content/drive/MyDrive/data-project-llm/my_llm_tokenizer_01/tokenizer.json')

In [None]:
# Evaluate on the validation split given to the Trainer. The returned dict holds
# the loss, the values from compute_metrics (prefixed with "eval_") and
# runtime statistics.
trainer.evaluate()

{'eval_loss': 0.8168434500694275,
 'eval_accuracy': 0.6378,
 'eval_f1': 0.6376379297844168,
 'eval_runtime': 158.9344,
 'eval_samples_per_second': 31.46,
 'eval_steps_per_second': 1.969,
 'epoch': 1.0}

### **Insights**

- The **eval_loss: 0.8168** is the model's loss on the evaluation set (the 5,000 reviews sampled from the test split); lower is better.

- The **eval_accuracy: 63.78%** is the accuracy of the model on the evaluation dataset: 63.78% of its predictions were correct.

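- **eval_f1: 0.6376** — The F1 score is the harmonic mean of precision and recall; here it is the weighted average of the per-class F1 scores (`average='weighted'`). A weighted F1 (0.6376) almost equal to the accuracy (0.6378) suggests that precision and recall are at a similar level, i.e. no class is strongly over- or under-predicted.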