In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import BertModel
from datasets import load_dataset

from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

from datasets import load_metric

class PlaceHolderBERT(nn.Module):
    """'bert-base-cased' encoder followed by a single linear head.

    The head reads the encoder's last hidden state at sequence position 0.

    Args:
        num_out: Number of output features of the linear head.
        sigmoid: If true, ``forward`` applies a sigmoid to the head output.
        return_CLS_representation: If true, ``forward`` returns the
            position-0 hidden state instead of the head output.
    """

    def __init__(self, num_out=1, sigmoid=False, return_CLS_representation=False):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        # The head expects 768 input features from the encoder output.
        self.linear = nn.Linear(768, num_out)
        self.return_CLS_representation = return_CLS_representation
        self.sigmoid_bool = sigmoid
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Encode a batch and return the head output (or the CLS vector).

        Args:
            x: Mapping of keyword arguments unpacked into the BERT encoder.

        Returns:
            The position-0 hidden state when ``return_CLS_representation`` is
            set; otherwise the linear head output, passed through a sigmoid
            when ``sigmoid`` was requested.
        """
        encoder_out = self.bert(**x)
        cls_vector = encoder_out.last_hidden_state[:, 0, :]
        head_out = self.linear(cls_vector)
        if self.return_CLS_representation:
            return cls_vector
        return self.sigmoid(head_out) if self.sigmoid_bool else head_out
    
    
def train(model, dataloader, num_epochs=1):
    """Fine-tune ``model`` in place with AdamW and a mean-squared-error loss.

    The model is moved to CUDA when available (CPU otherwise) and switched to
    training mode. The learning rate starts at 5e-5 and follows a linear
    schedule with no warmup over ``num_epochs * len(dataloader)`` steps.

    Args:
        model: Module called as ``model(features)`` where ``features`` is the
            batch dict without its ``'labels'`` entry.
        dataloader: Sized iterable of dict batches; every value must support
            ``.to(device)`` and each batch must contain a ``'labels'`` entry.
        num_epochs: Number of passes over ``dataloader``.

    Returns:
        None. The model's parameters are updated in place.
    """
    optimizer = AdamW(model.parameters(), lr=5e-5)
    loss_function = torch.nn.MSELoss()

    # Linear learning-rate schedule across every optimizer step.
    num_training_steps = num_epochs * len(dataloader)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    # One progress-bar tick per optimizer step.
    progress_bar = tqdm(range(num_training_steps))

    model.train()
    for _ in range(num_epochs):
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            features = {k: v for k, v in batch.items() if k != 'labels'}
            preds = model(features)
            # Align the targets with the prediction shape (e.g. (B,) -> (B, 1)).
            # Without this, MSELoss broadcasts (B, 1) against (B,) into a
            # (B, B) grid and compares every prediction with every label.
            targets = batch['labels'].float().reshape(preds.shape)
            loss = loss_function(preds, targets)
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            

def evaluate(model, dataloader):
    """Compute binary classification accuracy of ``model`` as a percentage.

    The model is moved to CUDA when available (CPU otherwise) and left in
    evaluation mode. Model outputs below 0.5 count as class 0, everything
    else as class 1.

    Args:
        model: Module called as ``model(features)`` where ``features`` is the
            batch dict without its ``'labels'`` entry.
        dataloader: Iterable of dict batches; every value must support
            ``.to(device)`` and each batch must contain a ``'labels'`` entry
            that can be reshaped to the prediction shape.

    Returns:
        Accuracy in the range 0-100 as a Python float, or 0.0 when the
        dataloader yields no samples.
    """
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    model.eval()
    num_correct = 0
    num_samples = 0
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            features = {k: v for k, v in batch.items() if k != 'labels'}
            preds = model(features)
            # Threshold the raw outputs into hard 0/1 predictions.
            preds = torch.where(preds < .5, 0, 1)
            labels = batch['labels'].reshape(preds.shape)
            # .item() keeps the running count a plain Python int.
            num_correct += (preds == labels).sum().item()
            num_samples += preds.size(0)
    # Guard against division by zero for an empty dataloader.
    if num_samples == 0:
        return 0.0
    return float(num_correct) / float(num_samples) * 100



# Shared 'bert-base-cased' tokenizer; the tokenize_* helpers below call it.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

#tokenize function
def tokenize_imdb(examples):
    """Tokenize the 'text' field of ``examples`` with the shared tokenizer.

    Inputs are truncated and padded to the tokenizer's maximum length.
    """
    reviews = examples['text']
    return tokenizer(reviews, truncation=True, padding="max_length")

def tokenize_sst2(examples):
    """Tokenize the 'sentence' field of ``examples`` with the shared tokenizer.

    Inputs are truncated and padded to the tokenizer's maximum length.
    """
    sentences = examples['sentence']
    return tokenizer(sentences, truncation=True, padding="max_length")

# Load the raw corpora here so the cell works on a fresh kernel; previously
# `imdb` and `sst2` had to exist already in the session.
# NOTE(review): dataset ids inferred from the columns/splits used below
# ('text'/'label' with train/test; 'sentence'/'label'/'idx' with
# train/validation) - confirm they match the originally loaded data.
imdb = load_dataset("imdb")
sst2 = load_dataset("glue", "sst2")

# Pre-tokenize both datasets in full.
tokenized_imdb = imdb.map(tokenize_imdb, batched=True)
tokenized_sst2 = sst2.map(tokenize_sst2, batched=True)

# Drop the raw-text columns, expose the target under the 'labels' key that
# train()/evaluate() read, and make the datasets return torch tensors.
tokenized_imdb = tokenized_imdb.remove_columns(["text"])
tokenized_imdb = tokenized_imdb.rename_column("label", "labels")
tokenized_imdb.set_format("torch")

tokenized_sst2 = tokenized_sst2.remove_columns(["sentence","idx"])
tokenized_sst2 = tokenized_sst2.rename_column("label", "labels")
tokenized_sst2.set_format("torch")


### Small subsets, for practice runs only
imdb_small_train = tokenized_imdb['train'].shuffle(seed=42).select(range(1000))
imdb_small_test = tokenized_imdb['test'].shuffle(seed=42).select(range(500))
###
imdb_train_loader = DataLoader(imdb_small_train, shuffle=True, batch_size=8)
imdb_test_loader = DataLoader(imdb_small_test, shuffle=True, batch_size=8)

sst2_small_train = tokenized_sst2["train"].shuffle(seed=42).select(range(1000))
# Evaluate on 'validation': the SST-2 'test' split is not usable for scoring
# (NOTE(review): presumably because its labels are withheld - confirm).
sst2_small_test = tokenized_sst2["validation"].shuffle(seed=42).select(range(500))

sst2_train_loader = DataLoader(sst2_small_train, shuffle=True, batch_size=8)
sst2_test_loader = DataLoader(sst2_small_test, shuffle=True, batch_size=8)

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import BertModel
from datasets import load_dataset

from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

We split the data into five categories of clothing (Clothes, Women Clothing, Men Clothing, Baby Clothing, Shoes) and two categories of entertainment products (Music, Movies).

Please pick one among the available configs: ['Wireless_v1_00', 'Watches_v1_00', 'Video_Games_v1_00', 'Video_DVD_v1_00', 'Video_v1_00', 'Toys_v1_00', 'Tools_v1_00', 'Sports_v1_00', 'Software_v1_00', 'Shoes_v1_00', 'Pet_Products_v1_00', 'Personal_Care_Appliances_v1_00', 'PC_v1_00', 'Outdoors_v1_00', 'Office_Products_v1_00', 'Musical_Instruments_v1_00', 'Music_v1_00', 'Mobile_Electronics_v1_00', 'Mobile_Apps_v1_00', 'Major_Appliances_v1_00', 'Luggage_v1_00', 'Lawn_and_Garden_v1_00', 'Kitchen_v1_00', 'Jewelry_v1_00', 'Home_Improvement_v1_00', 'Home_Entertainment_v1_00', 'Home_v1_00', 'Health_Personal_Care_v1_00', 'Grocery_v1_00', 'Gift_Card_v1_00', 'Furniture_v1_00', 'Electronics_v1_00', 'Digital_Video_Games_v1_00', 'Digital_Video_Download_v1_00', 'Digital_Software_v1_00', 'Digital_Music_Purchase_v1_00', 'Digital_Ebook_Purchase_v1_00', 'Camera_v1_00', 'Books_v1_00', 'Beauty_v1_00', 'Baby_v1_00', 'Automotive_v1_00', 'Apparel_v1_00', 'Digital_Ebook_Purchase_v1_01', 'Books_v1_01', 'Books_v1_02']
Example of usage:
	`load_dataset('amazon_us_reviews', 'Wireless_v1_00')`

In [4]:
# Load one 'amazon_us_reviews' config per product category of interest.
amazon_baby = load_dataset('amazon_us_reviews','Baby_v1_00')
amazon_shoes = load_dataset('amazon_us_reviews','Shoes_v1_00')
amazon_clothes = load_dataset('amazon_us_reviews','Apparel_v1_00')
amazon_music = load_dataset('amazon_us_reviews','Music_v1_00')
# Dataset id fixed: it was misspelled 'amazon_us_reivews'.
amazon_video = load_dataset('amazon_us_reviews','Video_v1_00')

Reusing dataset amazon_us_reviews (/home/ubuntu/.cache/huggingface/datasets/amazon_us_reviews/Baby_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563)


  0%|          | 0/1 [00:00<?, ?it/s]

Reusing dataset amazon_us_reviews (/home/ubuntu/.cache/huggingface/datasets/amazon_us_reviews/Shoes_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563)


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading and preparing dataset amazon_us_reviews/Apparel_v1_00 (download: 618.59 MiB, generated: 2.10 GiB, post-processed: Unknown size, total: 2.71 GiB) to /home/ubuntu/.cache/huggingface/datasets/amazon_us_reviews/Apparel_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563...


Generating train split:   0%|          | 0/5906333 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [3]:
# Display the Apparel dataset; `amazon_clothes` must already be defined by the
# cell that loads the Amazon review datasets, otherwise this raises NameError.
amazon_clothes

NameError: name 'amazon_clothes' is not defined

In [None]:
# NOTE(review): `Out[2]` is the cached result of execution #2 in the live
# kernel session; it will not exist after a restart. Bind the intended
# dataset to a named variable instead once it is confirmed which one this was.
val = Out[2]['train']
# Distinct values found in the 'product_category' column.
s = set(val['product_category'])

s