# Import packages

In [1]:
import os

# Set before torch is imported (next cell) so the value is already in the
# environment when CUDA starts up.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid — confirm it
# is still wanted for full training runs.
os.environ.update({'CUDA_LAUNCH_BLOCKING': "1"})

In [2]:
import torch
import pandas as pd
import numpy as np
from transformers import RobertaTokenizer, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, RobertaForSequenceClassification
from datasets import load_dataset, ClassLabel, Value, load_metric

# load source dataset

In [3]:
# Use the first 60% of the Video reviews split for fine-tuning.
# NOTE(review): 'train[:60%]' is a contiguous slice, not a random sample —
# confirm the row order does not bias the subset (e.g. by review date).
train = load_dataset(
    path='amazon_us_reviews',
    name='Video_v1_00',
    split='train[:60%]',
)

Reusing dataset amazon_us_reviews (/home/zh2095/.cache/huggingface/datasets/amazon_us_reviews/Video_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563)


In [4]:
# Inspect the first raw record to see which fields the dataset provides
train[0]

{'marketplace': 'US',
 'customer_id': '49033728',
 'review_id': 'R1P1G5KZ05H6RD',
 'product_id': '6302503213',
 'product_parent': '748506413',
 'product_title': 'The Night They Saved Christmas [VHS]',
 'product_category': 'Video',
 'star_rating': 5,
 'helpful_votes': 0,
 'total_votes': 0,
 'vine': 0,
 'verified_purchase': 1,
 'review_headline': 'Very satisfied!!',
 'review_body': 'Fast shipping. Pleasure to deal with. Would recommend. A+++. Thanks!',
 'review_date': '2015-08-31'}

# preprocessing

In [5]:
# Drop the metadata columns that are not used later in this notebook
train = train.remove_columns([
    'marketplace', 'review_id', 'product_parent', 'product_title',
    'product_category', 'helpful_votes', 'total_votes', 'vine',
    'verified_purchase', 'review_date',
])

In [6]:
# Confirm that only the ids, the star rating and the review text remain
train[0]

{'customer_id': '49033728',
 'product_id': '6302503213',
 'star_rating': 5,
 'review_headline': 'Very satisfied!!',
 'review_body': 'Fast shipping. Pleasure to deal with. Would recommend. A+++. Thanks!'}

# Encoding 

In [7]:
# Load the tokenizer that belongs to the pretrained roberta-base checkpoint
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [8]:
# Encode each review body as a single sequence (no sentence pairs are built).
# Every example is padded/truncated to exactly 32 tokens as a trade-off between
# run time and how much of the review the model gets to see.
# batched=True hands the tokenizer a list of texts per call instead of one
# example at a time, which is what the `batch` argument name already implies.
train_tokenized = train.map(
    lambda batch: tokenizer(batch['review_body'], padding='max_length', truncation=True, max_length=32),
    batched=True,
)

  0%|          | 0/228362 [00:00<?, ?ex/s]

In [9]:
# Rename the rating column to "labels" and the raw review text to "text"
train_tokenized = (
    train_tokenized
    .rename_column("star_rating", "labels")
    .rename_column("review_body", "text")
)

In [10]:
# Star ratings run from 1 to 5; class labels must run from 0 to 4
def to_label(x):
    """Shift the value stored under 'labels' down by one.

    The example `x` is modified in place and the same object is returned,
    which is the shape `Dataset.map` expects from a per-example function.
    """
    star_rating = x['labels']
    x['labels'] = star_rating - 1
    return x

# Apply the 1-5 -> 0-4 shift to every example.
# NOTE: not idempotent — the result is assigned back to the same name, so
# re-running this cell subtracts 1 from the labels again.
train_tokenized = train_tokenized.map(to_label)

  0%|          | 0/228362 [00:00<?, ?ex/s]

In [16]:
# Return PyTorch tensors and expose only the columns listed here
train_tokenized.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

In [17]:
# Sanity-check one encoded example: label index, token ids and attention mask
train_tokenized[0]

{'labels': tensor(4),
 'input_ids': tensor([    0, 35515,  6738,     4, 18486, 24669,     7,   432,    19,     4,
         11258,  5940,     4,    83, 42964, 27079,  4557,   328,     2,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0])}

# Fine-tuning

In [32]:
# Load pretrained roberta-base with a sequence-classification head sized for
# the 5 rating classes (labels 0-4)

model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=5)

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/zh2095/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "

In [33]:
# Training configuration. Edit the values below to tune the run; any argument
# that is not listed keeps the TrainingArguments default.
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./output',
    logging_dir='./logs',
    logging_steps=1000,               # report the training loss every 1000 steps
    # optimisation schedule
    num_train_epochs=3,
    learning_rate=5e-5,
    warmup_steps=0,                   # no learning-rate warm-up
    # batch sizes, per device
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [34]:
def compute_metrics(eval_pred):
    """Score model outputs with the notebook-level `metric` object.

    `eval_pred` unpacks into (logits, labels). The predicted class of each row
    is the index of its largest logit along the last axis. Returns whatever
    `metric.compute` returns for those predictions and labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    result = metric.compute(predictions=predicted_classes, references=references)
    return result

In [35]:
# Accuracy metric object that compute_metrics looks up by the name `metric`
metric = load_metric("accuracy")

In [36]:
# Build the Trainer from the model, the training arguments and the training set.
# No evaluation dataset is supplied here. compute_metrics is registered anyway
# so that later trainer.predict(...) calls on labelled data also report
# accuracy in their `metrics` field.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    compute_metrics=compute_metrics,
)

In [37]:
# clean up gpu cache before training
import gc

# collect unreachable Python objects first so that whatever they hold can be released
gc.collect()

# then ask PyTorch to release its cached, currently unused GPU memory
torch.cuda.empty_cache()

In [38]:
# Run fine-tuning with the Trainer configured above
trainer.train()    
# Alternative for a run that already produced checkpoints in output_dir:
# trainer.train(resume_from_checkpoint=True) # True if already trained, to save time by continuing on a checkpoint

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: product_id, review_headline, text, customer_id. If product_id, review_headline, text, customer_id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 228362
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 10707


Step,Training Loss
1000,0.968
2000,0.9113
3000,0.8925
4000,0.8566
5000,0.8296
6000,0.8193
7000,0.8079
8000,0.7502
9000,0.7334
10000,0.7271


Saving model checkpoint to ./output/checkpoint-500
Configuration saved in ./output/checkpoint-500/config.json
Model weights saved in ./output/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./output/checkpoint-1000
Configuration saved in ./output/checkpoint-1000/config.json
Model weights saved in ./output/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./output/checkpoint-1500
Configuration saved in ./output/checkpoint-1500/config.json
Model weights saved in ./output/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./output/checkpoint-2000
Configuration saved in ./output/checkpoint-2000/config.json
Model weights saved in ./output/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./output/checkpoint-2500
Configuration saved in ./output/checkpoint-2500/config.json
Model weights saved in ./output/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./output/checkpoint-3000
Configuration saved in ./output/checkpoint-3000/config.json
M

TrainOutput(global_step=10707, training_loss=0.822625502692091, metrics={'train_runtime': 2146.8103, 'train_samples_per_second': 319.118, 'train_steps_per_second': 4.987, 'total_flos': 1.1266159449997312e+16, 'train_loss': 0.822625502692091, 'epoch': 3.0})

In [39]:
# Save the fine-tuned model together with its tokenizer so that the
# "Roberta-senti" directory can be reloaded on its own with from_pretrained().
# The tokenizer is saved first so the cell still ends on a call that returns
# None and prints no extra output.

tokenizer.save_pretrained("Roberta-senti")
model.save_pretrained("Roberta-senti")

Configuration saved in Roberta-senti/config.json
Model weights saved in Roberta-senti/pytorch_model.bin


# Predict on the whole dataset with the fine-tuned model

In [40]:
# load the entire dataset
# NOTE: this full split contains the first 60% of rows that were used for
# fine-tuning, so metrics computed on it are not held-out estimates.
dataset = load_dataset('amazon_us_reviews', 'Video_v1_00', split='train')

# remove the metadata columns that are not needed
dataset = dataset.remove_columns(['marketplace', 'review_id', 'product_parent', 'product_title', 'product_category',
                      'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_date'])

# load the roberta-base tokenizer again; this is the same checkpoint that was
# used to encode the training data (only the model was fine-tuned)
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# encode each review body as a single sequence, padded/truncated to 32 tokens
# exactly as for the training data; batched=True tokenizes a list of texts per call
dataset_tokenized = dataset.map(
    lambda batch: tokenizer(batch['review_body'], padding='max_length', truncation=True, max_length=32),
    batched=True,
)
dataset_tokenized = dataset_tokenized.rename_column("star_rating", "labels")
dataset_tokenized = dataset_tokenized.rename_column("review_body", "text")

# convert star ratings (1-5) to class labels (0-4)
dataset_tokenized = dataset_tokenized.map(to_label)

dataset_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Reusing dataset amazon_us_reviews (/home/zh2095/.cache/huggingface/datasets/amazon_us_reviews/Video_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563)
loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at /home/zh2095/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at /home/zh2095/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache

  0%|          | 0/380604 [00:00<?, ?ex/s]

  0%|          | 0/380604 [00:00<?, ?ex/s]

In [41]:
# Run inference over every review; the result is indexed below as
# [0] = logits and [1] = true labels
pred_output = trainer.predict(dataset_tokenized)

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: product_id, review_headline, text, customer_id. If product_id, review_headline, text, customer_id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 380604
  Batch size = 64


In [42]:
# Turn the logits into predicted classes and score them against the true labels
accuracy = load_metric('accuracy')
f1 = load_metric('f1')
pred = np.argmax(pred_output.predictions, axis=1)
truth = pred_output.label_ids
accuracy.compute(predictions=pred, references=truth)

{'accuracy': 0.7102894346880222}

In [43]:
# F1 over the five rating classes, combined with average='weighted'
f1.compute(predictions=pred, references=truth, average='weighted')

{'f1': 0.6851160478312943}

# Save the dataset expanded with the rating predicted by sentiment analysis

In [44]:
# Materialise every column of the original dataset and append the predicted
# rating, shifted from class index 0-4 back to the 1-5 star scale
data = {**dataset[:], 'senti_rating_finetune': pred + 1}

In [45]:
# Keep only the variables needed by the CF recommender and give them the
# user / item / rating names
df = (
    pd.DataFrame.from_dict(data)
    .loc[:, ['customer_id', 'product_id', 'star_rating', 'senti_rating_finetune']]
    .rename(columns={'customer_id': 'user', 'product_id': 'item', 'star_rating': 'rating'})
)
df.head()

Unnamed: 0,user,item,rating,senti_rating_finetune
0,49033728,6302503213,5,5
1,17857748,B000059PET,5,5
2,25551507,0788812807,4,5
3,21025041,6302509939,5,5
4,40943563,B00JENS2BI,3,4


In [46]:
# Save the user / item / rating / predicted-rating table as a CSV file.
# NOTE(review): no `index=` argument is passed, so pandas' default applies
# (the row index is written as an unnamed first column) — confirm that
# downstream readers expect it.
df.to_csv('../data/amazon_video_roberta.csv')