# Import packages

In [1]:
import os
# Make CUDA kernel launches synchronous so that a device-side error is reported
# at the call that triggered it instead of at some later, unrelated call.
# The variable is set here, before torch is imported in the next cell.
# NOTE(review): synchronous launches slow training down; this looks like a
# debugging aid and can probably be removed for normal runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer,BertForSequenceClassification
from datasets import load_dataset, ClassLabel, Value, load_metric

# Load source dataset

In [3]:
# Fine-tune on a sample: only the first 60% of the "Video" reviews are loaded.
train = load_dataset(
    path='amazon_us_reviews',
    name='Video_v1_00',
    split='train[:60%]',
)

Reusing dataset amazon_us_reviews (/home/zh2095/.cache/huggingface/datasets/amazon_us_reviews/Video_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563)


In [4]:
# Inspect the first raw record to see which fields each review carries.
train[0]

{'marketplace': 'US',
 'customer_id': '49033728',
 'review_id': 'R1P1G5KZ05H6RD',
 'product_id': '6302503213',
 'product_parent': '748506413',
 'product_title': 'The Night They Saved Christmas [VHS]',
 'product_category': 'Video',
 'star_rating': 5,
 'helpful_votes': 0,
 'total_votes': 0,
 'vine': 0,
 'verified_purchase': 1,
 'review_headline': 'Very satisfied!!',
 'review_body': 'Fast shipping. Pleasure to deal with. Would recommend. A+++. Thanks!',
 'review_date': '2015-08-31'}

# Preprocessing

In [5]:
# Drop the metadata fields that are not needed downstream; the ids, the star
# rating, the headline and the review body are kept.
columns_to_drop = [
    'marketplace', 'review_id', 'product_parent', 'product_title', 'product_category',
    'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_date',
]
train = train.remove_columns(columns_to_drop)

In [6]:
# Confirm that only the retained columns are left on the first record.
train[0]

{'customer_id': '49033728',
 'product_id': '6302503213',
 'star_rating': 5,
 'review_headline': 'Very satisfied!!',
 'review_body': 'Fast shipping. Pleasure to deal with. Would recommend. A+++. Thanks!'}

# Encoding 

In [7]:
# Load the tokenizer that belongs to the pretrained 'bert-base-cased' checkpoint,
# i.e. the same checkpoint the classification model is initialised from below.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [8]:
# Tokenize each review body as a single sequence (not a sentence pair).
# Every review is padded or truncated to exactly 32 tokens, trading some
# information coverage for shorter training time.
# batched=True hands the tokenizer a list of review bodies per call, which is
# considerably faster than one call per example and yields the same encodings.
train_tokenized = train.map(
    lambda batch: tokenizer(batch['review_body'], padding='max_length', truncation=True, max_length=32),
    batched=True,
)

Loading cached processed dataset at /home/zh2095/.cache/huggingface/datasets/amazon_us_reviews/Video_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563/cache-47ca873d215123f9.arrow


In [9]:
# Rename the target column to "labels" (the name the model's forward pass
# accepts) and the review body to "text".
train_tokenized = (
    train_tokenized
    .rename_column("star_rating", "labels")
    .rename_column("review_body", "text")
)

In [10]:
# Convert star ratings in the range 1-5 to class labels in the range 0-4.
def to_label(x):
    """Shift the 'labels' entry of one example down by one.

    The example mapping is modified in place and the same object is returned,
    which is the shape `Dataset.map` expects from a per-example function.
    """
    zero_based = x['labels'] - 1
    x['labels'] = zero_based
    return x

# Apply the 1-5 -> 0-4 shift to every example.
# NOTE: this cell is not idempotent. It overwrites train_tokenized, so running
# it a second time subtracts 1 again and produces labels in the range -1..3.
train_tokenized = train_tokenized.map(to_label)

Loading cached processed dataset at /home/zh2095/.cache/huggingface/datasets/amazon_us_reviews/Video_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563/cache-33dac99741212798.arrow


In [11]:
# Return the listed columns as torch tensors when rows are accessed; the other
# columns (ids, headline, text) are left out of the formatted output.
train_tokenized.set_format("torch", columns=["input_ids", "token_type_ids", "attention_mask", "labels"])

In [12]:
# Sanity check: one formatted example with a 0-based label and 32-token tensors.
train_tokenized[0]

{'labels': tensor(4),
 'input_ids': tensor([  101, 13227,  8629,   119, 23786,  1106,  2239,  1114,   119,  5718,
         18029,   119,   138,   116,   116,   116,   119,  5749,   106,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0])}

# Fine-tuning

In [13]:
# Load pretrained bert-base-cased with a 5-class sequence-classification head
# (one class per star rating, matching the 0-4 labels prepared above).

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
# For this checkpoint the Auto class resolves to BertForSequenceClassification, as the loading log shows.

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [14]:
# Training configuration; any argument not listed here keeps the library default.
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./output',
    logging_dir='./logs',
    logging_steps=1000,
    # optimisation schedule
    num_train_epochs=3,
    learning_rate=5e-5,
    warmup_steps=0,
    # batch sizes per device
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
)

In [15]:
def compute_metrics(eval_pred):
    """Score a (logits, labels) pair with the notebook-level `metric`.

    The predicted class of each example is the index of its largest logit
    along the last axis. `metric` is looked up at call time, so it must be
    defined before this function is invoked.
    """
    scores, references = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=references,
    )

In [16]:
# Accuracy metric object; compute_metrics reads this global when it is called.
metric = load_metric("accuracy")

In [17]:
# Create a Trainer from the model, the training arguments and the training set.
# compute_metrics is passed so that evaluate()/predict() calls on labelled data
# also report accuracy; without it the function defined above is never used.
# No eval_dataset is given, so no evaluation runs during training itself.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    compute_metrics=compute_metrics,
)



In [18]:
# Free as much GPU memory as possible before training starts.
import gc

# First drop unreachable Python objects so that tensors they hold are released ...
gc.collect()

# ... then ask torch to release its cached, now-unused CUDA memory.
torch.cuda.empty_cache()

In [19]:
# Resume from the newest checkpoint in the output directory when one exists;
# otherwise train from scratch. Passing resume_from_checkpoint=True
# unconditionally raises on a fresh run because there is nothing to resume from.
from transformers.trainer_utils import get_last_checkpoint

# get_last_checkpoint lists the directory, so guard against it not existing yet.
last_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

Loading model from ./output/checkpoint-10500).
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: review_headline, text, product_id, customer_id. If review_headline, text, product_id, customer_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 228362
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 10707
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 2
  Continuing training from global step 10500
  Will skip the first 2 epochs then the first 3362 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training o

  0%|          | 0/3362 [00:00<?, ?it/s]

Didn't find an RNG file for process 0, if you are resuming a training that wasn't launched in a distributed fashion, reproducibility is not guaranteed.


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=10707, training_loss=0.01228112053045029, metrics={'train_runtime': 154.6328, 'train_samples_per_second': 4430.404, 'train_steps_per_second': 69.241, 'total_flos': 1.1266159449997312e+16, 'train_loss': 0.01228112053045029, 'epoch': 3.0})

In [20]:
# Save the fine-tuned model (config and weights) to the local "bert-senti" directory.

model.save_pretrained("bert-senti")

Configuration saved in bert-senti/config.json
Model weights saved in bert-senti/pytorch_model.bin


# Predict on the whole dataset with the fine-tuned model

In [21]:
# Load the entire dataset. Note that this includes the first 60% of the split,
# which is the portion the model was fine-tuned on.
dataset = load_dataset('amazon_us_reviews', 'Video_v1_00', split='train')

# Drop the metadata columns that are not needed downstream
dataset = dataset.remove_columns(['marketplace', 'review_id', 'product_parent', 'product_title', 'product_category', \
                      'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_date'])

# Reload the pretrained bert-base-cased tokenizer, the same one used to encode
# the training data (this is not a fine-tuned tokenizer)
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

# Tokenize each review body as a single sequence, padded/truncated to 32 tokens
# exactly as for training. batched=True passes a list of reviews per tokenizer
# call, which is faster and yields the same encodings.
dataset_tokenized = dataset.map(
    lambda batch: tokenizer(batch['review_body'], padding='max_length', truncation=True, max_length=32),
    batched=True,
)
dataset_tokenized = dataset_tokenized.rename_column("star_rating", "labels")
dataset_tokenized = dataset_tokenized.rename_column("review_body", "text")

# Convert star ratings in the range 1-5 to class labels in the range 0-4
dataset_tokenized = dataset_tokenized.map(to_label)

# Expose only the model inputs and the labels, as torch tensors
dataset_tokenized.set_format("torch", columns=["input_ids", "token_type_ids", "attention_mask", "labels"])

Reusing dataset amazon_us_reviews (/home/zh2095/.cache/huggingface/datasets/amazon_us_reviews/Video_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563)
loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /home/zh2095/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "

In [22]:
# Run the fine-tuned model over every review in the full dataset.
# NOTE(review): the full split contains the 60% slice used for fine-tuning, so
# metrics computed from this output are not a held-out estimate.
pred_output = trainer.predict(dataset_tokenized)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: review_headline, text, product_id, customer_id. If review_headline, text, product_id, customer_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 380604
  Batch size = 64


In [91]:
# Turn the logits into class predictions and score them against the true labels.
pred = pred_output.predictions.argmax(axis=1)
truth = pred_output.label_ids
accuracy = load_metric('accuracy')
f1 = load_metric('f1')
accuracy.compute(predictions=pred, references=truth)

{'accuracy': 0.7418340322224675}

In [89]:
# Weighted F1: per-class F1 averaged with weights proportional to class support.
f1.compute(predictions=pred, references=truth, average='weighted')

{'f1': 0.7268553841761681}

# Save the dataset augmented with the rating predicted by sentiment analysis

In [65]:
# Pull all rows of the (untokenized) dataset and append the predicted rating as
# a new column.
data = dataset[:]
# pred holds 0-4 class indices; adding 1 maps them back to the 1-5 star scale.
data['senti_rating_finetune'] = pred + 1

In [66]:
# Build the table for the CF recommender: keep only the user id, item id, true
# rating and predicted rating, using the column names the recommender expects.
cf_columns = ['customer_id', 'product_id', 'star_rating', 'senti_rating_finetune']
df = (
    pd.DataFrame.from_dict(data)[cf_columns]
    .rename(columns={'customer_id': 'user', 'product_id': 'item', 'star_rating': 'rating'})
)
df.head()

Unnamed: 0,user,item,rating,senti_rating_finetune
0,49033728,6302503213,5,5
1,17857748,B000059PET,5,5
2,25551507,0788812807,4,5
3,21025041,6302509939,5,5
4,40943563,B00JENS2BI,3,3


In [67]:
# Save the recommender table as a CSV file.
# NOTE(review): index is not disabled, so the DataFrame index is written as an
# extra leading column; confirm that downstream readers expect it.
df.to_csv('../data/amazon_video.csv')