In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import kagglehub
import numpy as np
import pandas as pd
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# download data usng Kaggle api
path = kagglehub.dataset_download("snap/amazon-fine-food-reviews", "Reviews.csv")

# read file
df = pd.read_csv(path, index_col=0)

In [4]:
df

Unnamed: 0_level_0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...
568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [5]:
df.isna().any()

ProductId                 False
UserId                    False
ProfileName                True
HelpfulnessNumerator      False
HelpfulnessDenominator    False
Score                     False
Time                      False
Summary                    True
Text                      False
dtype: bool

In [6]:
df['Score'].value_counts()

Score
5    363122
4     80655
1     52268
3     42640
2     29769
Name: count, dtype: int64

In [7]:
def label_score(score: int) -> int:
    '''
    Label the score rating of 1-5 based on the following:
        1-2: 0 (negative)
        3: 1 (neutral)
        4-5: 2 (positive)
    '''
    assert 1 <= score <= 5, "Invalid score. Score should be between 1 and 5."
    if score <= 2:
        return 0
    elif score == 3:
        return 1
    else:
        return 2

In [8]:
df['label'] = df['Score'].apply(label_score)

In [9]:
df

Unnamed: 0_level_0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,label
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,2
2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,0
3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,2
4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,0
5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,2
...,...,...,...,...,...,...,...,...,...,...
568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...,2
568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...,0
568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o...",2
568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...,2


# Spliting data
- Split train test data.
- Further split train data to train, validation data.

In [10]:
from sklearn.model_selection import train_test_split


# split into train+validation data and test data
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(df['Text'].tolist(),
                                                                              df['label'].tolist(),
                                                                              test_size=0.1,
                                                                              stratify=df['label'],
                                                                              random_state=42)

# split into train and validation data
train_texts, val_texts, train_labels, val_labels = train_test_split(train_val_texts, train_val_labels,
                                                                     test_size=0.1,
                                                                     stratify=train_val_labels,
                                                                     random_state=42)

# Tokenisation
- Tokenising using RoBERTa pretrained vocab.
- Using Hugging Face's Dataset which is optimised for multi-processing data.

In [11]:
from transformers import RobertaTokenizerFast


def tokenize(example: dict) -> dict:
    '''
    Tokenise and preapre the data to be suitable to pass into Trainer for
    a RoBERTA model.

    Arguments:
        example: dict containing the keys ("texts", labels"), mapping to
            a list of values

    Returns:
        dict: dict containing the keys ("input_ids", "attention_masks", "labels"),
            mapping to a list of values
    '''
    tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")
    encodings = tokenizer(example['texts'], padding=True, truncation=True)
    encodings["labels"] = example["labels"]
    return encodings

In [12]:
from datasets import Dataset


train_dataset = Dataset.from_dict({"texts": train_texts, "labels": train_labels})
train_dataset = train_dataset.map(tokenize, batched=True, remove_columns="texts")

val_dataset = Dataset.from_dict({"texts": val_texts, "labels": val_labels})
val_dataset = val_dataset.map(tokenize, batched=True, remove_columns="texts")

Map: 100%|██████████| 460447/460447 [05:23<00:00, 1423.94 examples/s]
Map: 100%|██████████| 51161/51161 [00:37<00:00, 1364.82 examples/s]


# Model
- Use pretrained RoBERTa model.

In [13]:
from transformers import RobertaForSequenceClassification


model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels=3)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training
- Use Hugging Face's Trainer for training pipeline.
  - Will detect GPU internally.

In [14]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def compute_metrics(eval_pred):
    '''
    Compute metrics at evaluation. Returns a dictionary string to metric values.
    '''
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

In [15]:
from transformers import Trainer, TrainingArguments


training_args = TrainingArguments(
    output_dir="./amazon_review_classifier",
    eval_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6677,0.668227,0.780673,0.60945,0.780673,0.684517
2,0.6679,0.667045,0.780673,0.60945,0.780673,0.684517
3,0.396,0.386248,0.862649,0.806565,0.862649,0.832539


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=86334, training_loss=0.6057306460625937, metrics={'train_runtime': 27950.5699, 'train_samples_per_second': 49.421, 'train_steps_per_second': 3.089, 'total_flos': 3.6344935155781325e+17, 'train_loss': 0.6057306460625937, 'epoch': 3.0})

# Evaluation
- Evaluate on test dataset.

In [16]:
test_dataset = Dataset.from_dict({"texts": test_texts, "labels": test_labels})
test_dataset = test_dataset.map(tokenize, batched=True, remove_columns="texts")

Map: 100%|██████████| 56846/56846 [00:39<00:00, 1439.34 examples/s]


In [None]:
output = trainer.predict(test_dataset)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
from sklearn.metrics import classification_report

preds = np.argmax(output.predictions, axis=1)
print(classification_report(test_labels, preds, target_names=["Negative", "Neutral", "Positive"]))

              precision    recall  f1-score   support

    Negative       0.62      0.81      0.70      8204
     Neutral       0.00      0.00      0.00      4264
    Positive       0.92      0.95      0.94     44378

    accuracy                           0.86     56846
   macro avg       0.51      0.59      0.54     56846
weighted avg       0.81      0.86      0.83     56846



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
