In [1]:
!pip install openprompt

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/ext3/conda/bootcamp/bin/python3.8 -m pip install --upgrade pip' command.[0m


# Import packages

In [24]:
import torch
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

from transformers import  AdamW, get_linear_schedule_with_warmup
from transformers import BertTokenizer, BertModel, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer,BertForSequenceClassification
from datasets import load_dataset, ClassLabel, Value, load_metric

from openprompt import PromptForClassification, PromptDataLoader
from openprompt.data_utils import InputExample
from openprompt.plms import load_plm
from openprompt.prompts import ManualTemplate, ManualVerbalizer

# Preprocess to obtain the prompt ready dataset

In [3]:
# sampling the dataset for fine-tuning
train = load_dataset('amazon_us_reviews', 'Video_v1_00', split='train[:60%]') 

Reusing dataset amazon_us_reviews (/home/zh2095/.cache/huggingface/datasets/amazon_us_reviews/Video_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563)


In [4]:
# remove unuseful columns
train = train.remove_columns(['customer_id','product_id','marketplace', 'review_id', 'product_parent', 'product_title', 'product_category', \
                      'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_date', 'review_headline'])

In [5]:
# rename columns
train = train.rename_column("star_rating", "labels")
train = train.rename_column("review_body", "text")

In [6]:
# convert star rating that ranging from 1-5 to labels that ranging from 0-4
def to_label(x):
    x['labels']  = x['labels'] - 1
    return x

train = train.map(to_label)

Loading cached processed dataset at /home/zh2095/.cache/huggingface/datasets/amazon_us_reviews/Video_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563/cache-0d7a52179b50824f.arrow


In [7]:
# create dataset for prompting
dataset = []
for i in range(len(train)):
    original_data = train[i]
    data = InputExample(label = original_data['labels'],
                        text_a = original_data['text'])
    dataset.append(data)

In [8]:
dataset[0]

{
  "guid": null,
  "label": 4,
  "meta": {},
  "text_a": "Fast shipping. Pleasure to deal with. Would recommend. A+++. Thanks!",
  "text_b": "",
  "tgt_text": null
}

# Prompt Learning

### 1. Define the task

In [9]:
classes = [0, 1, 2, 3, 4]
dataset = dataset

### 2. Obtain a PLM

In [10]:
plm, tokenizer, model_config, WrapperClass = load_plm("bert", "bert-base-cased")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### 3. Define a Template

In [11]:
promptTemplate = ManualTemplate(
    text = '{"placeholder":"text_a"} Overall, it was a {"mask"} movie',
    tokenizer = tokenizer,
)

### 4. Define a Verbalizer

In [12]:
promptVerbalizer = ManualVerbalizer(
    classes = classes,
    label_words = {
        0: ["awful"],
        1: ["bad"],
        2: ["fair"],
        3: ["good"],
        4: ["wonderful"]
    },
    tokenizer = tokenizer,
)

### 5. Construct a PromptModel

In [13]:
use_cuda = True
promptModel = PromptForClassification(
    template = promptTemplate,
    plm = plm,
    verbalizer = promptVerbalizer,
)

if use_cuda:
    promptModel=  promptModel.cuda()

### 6. Define a DataLoader

In [14]:
# keep the same configuration as the fine-tune one
data_loader = PromptDataLoader(
    dataset = dataset,
    tokenizer = tokenizer,
    template = promptTemplate,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=32, 
    batch_size=64,
    truncate_method="tail")


tokenizing: 0it [00:00, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (755 > 512). Running this sequence through the model will result in indexing errors
tokenizing: 228362it [07:59, 476.42it/s]


### 7. Train the PromptModel

In [15]:
loss_func = torch.nn.CrossEntropyLoss()
no_decay = ['bias', 'LayerNorm.weight']

# it's always good practice to set no decay to biase and LayerNorm parameters
optimizer_grouped_parameters = [
    {'params': [p for n, p in promptModel.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in promptModel.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5) 

for epoch in range(10):
    tot_loss = 0
    for step, inputs in enumerate(data_loader):
        if use_cuda:
            inputs = inputs.cuda()
        logits = promptModel(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        tot_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()
        if step %1000 ==1:
            print("Epoch {}, average loss: {}".format(epoch, tot_loss/(step+1)), flush=True)




Epoch 0, average loss: 2.7009607553482056
Epoch 0, average loss: 0.8218350152293603
Epoch 0, average loss: 0.9048118121497757
Epoch 0, average loss: 0.9386626807035882
Epoch 1, average loss: 0.7404760122299194
Epoch 1, average loss: 0.7070092210274732
Epoch 1, average loss: 0.768560427662495
Epoch 1, average loss: 0.7898571948501605
Epoch 2, average loss: 0.562580019235611
Epoch 2, average loss: 0.5261934837419354
Epoch 2, average loss: 0.5469722600279749
Epoch 2, average loss: 0.5600425761632328
Epoch 3, average loss: 0.4192337393760681
Epoch 3, average loss: 0.37574160427687886
Epoch 3, average loss: 0.38805878180991876
Epoch 3, average loss: 0.40077389897822063
Epoch 4, average loss: 0.3250827193260193
Epoch 4, average loss: 0.27596486138250537
Epoch 4, average loss: 0.28203774682738325
Epoch 4, average loss: 0.2929028686456169
Epoch 5, average loss: 0.24261796474456787
Epoch 5, average loss: 0.2052813686668397
Epoch 5, average loss: 0.20597376037281115
Epoch 5, average loss: 0.2131

### 8. Evaluation and Prediction

In [16]:
# load the entire dataset
all_data = load_dataset('amazon_us_reviews', 'Video_v1_00', split='train')

# remove unuseful columns
all_data = all_data.remove_columns(['customer_id','product_id','marketplace', 'review_id', 'product_parent', 'product_title', 'product_category', \
                      'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_date', 'review_headline'])

# rename columns
all_data = all_data.rename_column("star_rating", "labels")
all_data = all_data.rename_column("review_body", "text")

# convert star rating that ranging from 1-5 to labels that ranging from 0-4
all_data = all_data.map(to_label)

# create dataset for prompting
all_dataset = []
for i in range(len(all_data)):
    original_data = all_data[i]
    data = InputExample(label = original_data['labels'],
                        text_a = original_data['text'])
    all_dataset.append(data)

all_dataset[0]

Reusing dataset amazon_us_reviews (/home/zh2095/.cache/huggingface/datasets/amazon_us_reviews/Video_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563)
Loading cached processed dataset at /home/zh2095/.cache/huggingface/datasets/amazon_us_reviews/Video_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563/cache-35bc8af46f828eb2.arrow


{
  "guid": null,
  "label": 4,
  "meta": {},
  "text_a": "Fast shipping. Pleasure to deal with. Would recommend. A+++. Thanks!",
  "text_b": "",
  "tgt_text": null
}

In [17]:
# create validation dataloader
validation_dataloader = PromptDataLoader(
    dataset = all_dataset,
    tokenizer = tokenizer,
    template = promptTemplate,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=32, 
    batch_size=64,
    truncate_method="tail")

tokenizing: 380604it [12:40, 500.45it/s]


In [18]:
# get predictions with the trained prompt model
allpreds = []
alllabels = []

for step, inputs in enumerate(validation_dataloader):
    if use_cuda:
        inputs = inputs.cuda()
    logits = promptModel(inputs)
    labels = inputs['label']
    alllabels.extend(labels.cpu().tolist())
    allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

In [25]:
# evaluate with accuracy and f1
acc = accuracy_score(alllabels, allpreds)
f1 = f1_score(alllabels, allpreds, average='weighted')

print(f'Accuracy: {acc}')
print(f'F1 score: {f1}')

Accuracy: 0.7600655799728852
F1 score: 0.7641498343395504


# Save the dataset expanded with prompt based sentiment rating

In [20]:
# expand the original dataset with the predicted rating
data = load_dataset('amazon_us_reviews', 'Video_v1_00', split='train')
data = data.remove_columns(['marketplace', 'review_id', 'product_parent', 'product_title', 'product_category', \
                      'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_date'])

data = data[:]
allpreds = np.array(allpreds)
data['senti_rating_prompt'] = allpreds + 1

Reusing dataset amazon_us_reviews (/home/zh2095/.cache/huggingface/datasets/amazon_us_reviews/Video_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563)


In [21]:
df = pd.DataFrame.from_dict(data)

# extract variables needed for CF recommender
df= df[['customer_id', 'product_id', 'star_rating', 'senti_rating_prompt']]
df.rename(columns = {'customer_id' : 'user', 'product_id' : 'item', 'star_rating' : 'rating'}, inplace = True)
df.head()

Unnamed: 0,user,item,rating,senti_rating_prompt
0,49033728,6302503213,5,5
1,17857748,B000059PET,5,5
2,25551507,0788812807,4,5
3,21025041,6302509939,5,5
4,40943563,B00JENS2BI,3,3


In [22]:
# save as csv files
df.to_csv('../data/amazon_video_prompt.csv')