In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import BertModel
import datasets
from datasets import load_dataset

from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

We split the data into five categories of clothing (Clothes, Women Clothing, Men Clothing, Baby Clothing, Shoes) and two categories of entertainment products (Music, Movies).

Please pick one among the available configs: ['Wireless_v1_00', 'Watches_v1_00', 'Video_Games_v1_00', 'Video_DVD_v1_00', 'Video_v1_00', 'Toys_v1_00', 'Tools_v1_00', 'Sports_v1_00', 'Software_v1_00', 'Shoes_v1_00', 'Pet_Products_v1_00', 'Personal_Care_Appliances_v1_00', 'PC_v1_00', 'Outdoors_v1_00', 'Office_Products_v1_00', 'Musical_Instruments_v1_00', 'Music_v1_00', 'Mobile_Electronics_v1_00', 'Mobile_Apps_v1_00', 'Major_Appliances_v1_00', 'Luggage_v1_00', 'Lawn_and_Garden_v1_00', 'Kitchen_v1_00', 'Jewelry_v1_00', 'Home_Improvement_v1_00', 'Home_Entertainment_v1_00', 'Home_v1_00', 'Health_Personal_Care_v1_00', 'Grocery_v1_00', 'Gift_Card_v1_00', 'Furniture_v1_00', 'Electronics_v1_00', 'Digital_Video_Games_v1_00', 'Digital_Video_Download_v1_00', 'Digital_Software_v1_00', 'Digital_Music_Purchase_v1_00', 'Digital_Ebook_Purchase_v1_00', 'Camera_v1_00', 'Books_v1_00', 'Beauty_v1_00', 'Baby_v1_00', 'Automotive_v1_00', 'Apparel_v1_00', 'Digital_Ebook_Purchase_v1_01', 'Books_v1_01', 'Books_v1_02']
Example of usage:
	`load_dataset('amazon_us_reviews', 'Wireless_v1_00')`

In [2]:
#data_path = '/home/ubuntu/NLP-brain-biased-robustness/data/amazon/'
#amazon_baby = load_dataset(data_path+'Baby_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563/')

amazon_baby = load_dataset('amazon_us_reviews','Baby_v1_00')
amazon_shoes = load_dataset('amazon_us_reviews','Shoes_v1_00')
amazon_clothes = load_dataset('amazon_us_reviews','Apparel_v1_00')
amazon_music = load_dataset('amazon_us_reviews','Music_v1_00')
amazon_video = load_dataset('amazon_us_reviews','Video_v1_00')

Reusing dataset amazon_us_reviews (/home/ubuntu/.cache/huggingface/datasets/amazon_us_reviews/Baby_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563)


  0%|          | 0/1 [00:00<?, ?it/s]

Reusing dataset amazon_us_reviews (/home/ubuntu/.cache/huggingface/datasets/amazon_us_reviews/Shoes_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563)


  0%|          | 0/1 [00:00<?, ?it/s]

Reusing dataset amazon_us_reviews (/home/ubuntu/.cache/huggingface/datasets/amazon_us_reviews/Apparel_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563)


  0%|          | 0/1 [00:00<?, ?it/s]

Reusing dataset amazon_us_reviews (/home/ubuntu/.cache/huggingface/datasets/amazon_us_reviews/Music_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563)


  0%|          | 0/1 [00:00<?, ?it/s]

Reusing dataset amazon_us_reviews (/home/ubuntu/.cache/huggingface/datasets/amazon_us_reviews/Video_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563)


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
baby_small = amazon_baby['train'].select(range(200000, len(amazon_baby['train']))).shuffle(seed=42).select(range(10000))
shoes_small = amazon_shoes['train'].shuffle(seed=42).select(range(50000)).select(range(10000))
clothes_small = amazon_clothes['train'].shuffle(seed=42).select(range(50000)).select(range(10000))
music_small = amazon_music['train'].shuffle(seed=42).select(range(50000)).select(range(10000))
video_small = amazon_video['train'].shuffle(seed=42).select(range(50000)).select(range(10000))

Loading cached shuffled indices for dataset at /home/ubuntu/.cache/huggingface/datasets/amazon_us_reviews/Baby_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563/cache-e4c4210a5c1dbf68.arrow


In [None]:
print(len(baby_small))
print(len(amazon_shoes['train']))
print(len(amazon_clothes['train']))
print(len(amazon_music['train']))
print(len(amazon_video['train']))

In [4]:

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

#tokenize function
def tokenize_data(examples):
    return tokenizer(examples['review_body'], padding="max_length", truncation=True)

#pre-tokenize entire dataset
tokenized_baby = baby_small.map(tokenize_data, batched=True)
tokenized_shoes = shoes_small.map(tokenize_data, batched=True)
tokenized_clothes = clothes_small.map(tokenize_data, batched=True)
tokenized_music = music_small.map(tokenize_data, batched=True)
tokenized_video = video_small.map(tokenize_data, batched=True)

delete_list = ['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date']
tokenized_baby = tokenized_baby.remove_columns(delete_list)
tokenized_baby = tokenized_baby.rename_column("star_rating", "labels")
tokenized_baby.set_format("torch")

tokenized_shoes = tokenized_shoes.remove_columns(delete_list)
tokenized_shoes = tokenized_shoes.rename_column("star_rating", "labels")
tokenized_shoes.set_format("torch")

tokenized_clothes = tokenized_clothes.remove_columns(delete_list)
tokenized_clothes = tokenized_clothes.rename_column("star_rating", "labels")
tokenized_clothes.set_format("torch")

tokenized_music = tokenized_music.remove_columns(delete_list)
tokenized_music = tokenized_music.rename_column("star_rating", "labels")
tokenized_music.set_format("torch")

tokenized_video = tokenized_video.remove_columns(delete_list)
tokenized_video = tokenized_video.rename_column("star_rating", "labels")
tokenized_video.set_format("torch")


baby_dataloader = DataLoader(tokenized_baby, shuffle=True, batch_size=8)
shoes_dataloader = DataLoader(tokenized_shoes, shuffle=True, batch_size=8)
clothes_dataloader = DataLoader(tokenized_clothes, shuffle=True, batch_size=8)
music_dataloader = DataLoader(tokenized_music, shuffle=True, batch_size=8)
video_dataloader = DataLoader(tokenized_video, shuffle=True, batch_size=8)

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

In [9]:
import torch.nn.functional as F
import wandb

def change_all_keys(pre_odict):
    def change_key(odict, old, new):
        for _ in range(len(odict)):
            k, v = odict.popitem(False)
            odict[new if old == k else k] = v
            return odict
    for key in pre_odict.keys():
        if key[:5] == 'bert.':
            post_odict = change_key(pre_odict, key, key[5:])
            return change_all_keys(post_odict)
        if key[:7] == 'linear.':
            del pre_odict[key]
            return change_all_keys(pre_odict)
    return pre_odict

class PlaceHolderBERT(nn.Module):
    def __init__(self, num_out=5, sigmoid=False, return_CLS_representation=False, brain=True):
        super().__init__()
        #self.tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
        self.bert = BertModel.from_pretrained('bert-base-cased')
        if brain == True:
            state_path = '/home/ubuntu/nlp-brain-biased-robustness/state_dicts/NSD_model_prime_prime_epoch_10'
            pre_odict = torch.load(state_path)
            filtered_odict = change_all_keys(pre_odict)
            self.bert.load_state_dict(filtered_odict, strict=True)
        self.linear = nn.Linear(768,num_out)
        self.return_CLS_representation = return_CLS_representation
        self.sigmoid_bool = sigmoid
        self.sigmoid = nn.Sigmoid()
    def forward(self, x):
        #embeddings = self.tokenizer(x, return_tensors='pt', padding=True)
        #embeddings.to(device)
        representations = self.bert(**x).last_hidden_state
        cls_representation = representations[:,0,:]
        pred = self.linear(cls_representation)
        if self.return_CLS_representation:
            return cls_representation
        if self.sigmoid_bool:
            return self.sigmoid(pred)
        return pred
    
    
def train(model, dataloader, num_epochs=10): #can scrap keyword
    wandb.init(project="preliminary results just in case", entity="nlp-brain-biased-robustness")
    wandb.run.name = 'amazon bb bert'
    wandb.config = {
      "learning_rate": 5e-5,
      "epochs": 10,
      "batch_size": 8
    }
    #optimizer as usual
    optimizer = AdamW(model.parameters(), lr=5e-5)
    loss_function = torch.nn.MSELoss()
    #learning rate scheduler
    num_training_steps = num_epochs * len(dataloader)
    lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    #auto logging; progress bar
    progress_bar = tqdm(range(num_training_steps))

    #training loop
    model.train()
    for epoch in range(num_epochs):
        for batch in dataloader: #tryin unpacking text from 'labels' as in model development
            #batch = {k: v.to(device) for k, v in batch.items()}
            features = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            preds = model(features)
            targets = F.one_hot((batch['labels']-1).to(torch.int64), num_classes=5).to(device)
            loss = loss_function(preds, targets.float()) #replace .loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
        baby_score = evaluate(model, baby_dataloader)
        print(baby_score)
        wandb.log({"baby": baby_score})
        shoes_score = evaluate(model, shoes_dataloader)
        print(shoes_score)
        wandb.log({"shoes": shoes_score})
        clothes_score = evaluate(model, clothes_dataloader)
        print(clothes_score)
        wandb.log({"clothes": clothes_score})
        music_score = evaluate(model, music_dataloader)
        print(music_score)
        wandb.log({"music": music_score})
        video_score = evaluate(model, video_dataloader)
        print(video_score)
        wandb.log({"video": video_score})
        print("_________________________________________________")
            

def evaluate(model, dataloader):
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    model.eval()
    num_correct = 0
    num_samples = 0
    for batch in dataloader:
        #batch = {k: v.to(device) for k, v in batch.items()}
        features = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        with torch.no_grad():
            preds = model(features)
            preds = torch.argmax(preds, axis=1)
            labels = F.one_hot((batch['labels']-1).to(torch.int64), num_classes=5).to(device)
            labels = torch.argmax(labels, axis=1)
            num_correct += (preds==labels).sum()
            num_samples += preds.size(0)
    return float(num_correct)/float(num_samples)*100 

In [None]:
model = PlaceHolderBERT()
train(model, baby_dataloader)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
baby,▁▄█
clothes,█▃▁
music,█▁█
shoes,█▁▁
video,█▁

0,1
baby,92.69
clothes,65.52
music,70.57
shoes,67.72
video,59.7


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/12500 [00:00<?, ?it/s]

74.86
67.22
66.36
70.75
62.660000000000004
_________________________________________________
83.72
67.38
65.31


In [17]:
evaluate(model, baby_dataloader)

62.4