In [61]:
from torch.utils.data import DataLoader,Dataset
import pandas as pd
import torch
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers import BertWordPieceTokenizer
from transformers import AutoTokenizer
from transformers import BertForSequenceClassification, BertTokenizer

In [40]:
with open("./data/amazon_sample.csv") as f:
    reader = pd.read_csv(f,chunksize=64,iterator=True)
    
    for chunk in reader.get_chunk():
        print(chunk)
        break
        

reviewText


In [18]:
df = pd.read_csv("./data/amazon_book_reviews_sample.csv",compression='gzip')
    

In [229]:
from sklearn.model_selection import train_test_split


In [263]:
X_train, X_test, y_train, y_test = train_test_split(df[['reviewText']], df[['overall']], test_size=0.01, stratify=df[['overall']],random_state=32)

In [264]:
train_ds = pd.DataFrame({"reviewText":X_train.reviewText,"overall":y_train.overall})
test_ds = pd.DataFrame({"reviewText":X_test.reviewText,"overall":y_test.overall})

In [265]:
train_ds.to_csv("./data/amazon_reviews_train.csv",index=None)
test_ds.to_csv("./data/amazon_reviews_test.csv",index=None)

In [26]:
df[['reviewText','overall']].to_csv("./data/amazon_sample.csv",index=None)

In [61]:
df['reviewText'].tolist()[:10]

['This once of the best books I have ever read. It was also delivered in time and in great condition.',
 "This book is great, a real experience. The author has a real connection to nature and canines and the end, of course, is sad, as the story of all man/dog relations must be. But the author can evoke his and Merle's world (and what a world it is) so completely that, for the week or so I spent with this book, I was more in the wilds of Wyoming than in my own home. But it wasn't completely a pleasant experience for me; as a vegetarian, I found the author's description of his and Merle's hunting experiences hard to take at times, even though the descriptions were not graphic. But it is a wonderful book.",
 "It's a good book. Just a book requirement for class. The book came on time and in superb condition. Recommend? If you need this book then go for it. It was so much cheaper here than the copy in my school.",
 "My least favorite book of the series so far. I didn't find the mystery very

In [173]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)
tokenizer.batch_encode_plus(df.reviewText.tolist()[:5],
                                                add_special_tokens=True, 
                                                return_attention_mask=True, 
                                                pad_to_max_length=True, 
                                                max_length=50, 
                                                return_tensors='pt')

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'input_ids': tensor([[  101,  2023,  2320,  1997,  1996,  2190,  2808,  1045,  2031,  2412,
          3191,  1012,  2009,  2001,  2036,  5359,  1999,  2051,  1998,  1999,
          2307,  4650,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2023,  2338,  2003,  2307,  1010,  1037,  2613,  3325,  1012,
          1996,  3166,  2038,  1037,  2613,  4434,  2000,  3267,  1998, 28735,
          2015,  1998,  1996,  2203,  1010,  1997,  2607,  1010,  2003,  6517,
          1010,  2004,  1996,  2466,  1997,  2035,  2158,  1013,  3899,  4262,
          2442,  2022,  1012,  2021,  1996,  3166,  2064, 23408, 11045,   102],
        [  101,  2009,  1005,  1055,  1037,  2204,  2338,  1012,  2074,  1037,
          2338,  9095,  2005,  2465,  1012,  1996,  2338,  2234,  2006,  2051,
          1998,  1999, 21688,  4650,

In [280]:
# Create Dataset
import numpy as np
from torch.utils.data import TensorDataset
class CSVDataset(Dataset):
    def __init__(self, path, chunksize, nb_samples):
        self.path = path
        self.chunksize = chunksize
        self.len = int(nb_samples / self.chunksize)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)
    def __getitem__(self, index):
        inputs = next(
            pd.read_csv(
                self.path,
                skiprows=index * self.chunksize + 1,  #+1, since we skip the header
                chunksize=self.chunksize,
                names=['reviewText','label']))
        y = torch.from_numpy(inputs.label.values)
        #print(inputs.reviewText.tolist())
        texts = self.tokenizer.batch_encode_plus(inputs.reviewText.tolist(),
                                                add_special_tokens=True, 
                                                return_attention_mask=True, 
                                                pad_to_max_length=True, 
                                                max_length=512,truncation=True, 
                                                return_tensors='pt'
                                                )
        ids = texts['input_ids'][0]
        mask = texts['attention_mask'][0]
        token_type_ids = texts["token_type_ids"][0]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(inputs.label.values-1, dtype=torch.long)[0]}
        #print(texts)
        #text = [np.array(t.ids) for t in texts]
        #text = (text)
        #print(text[0])
        #return {"text":texts,"y":y}

    def __len__(self):
        return self.len


train_dataset = CSVDataset('./data/amazon_reviews_train.csv', chunksize=64, nb_samples=train_ds.shape[0])
train_loader = DataLoader(train_dataset, batch_size=2, num_workers=1, shuffle=True)
test_dataset = CSVDataset('./data/amazon_reviews_test.csv', chunksize=64, nb_samples=test_ds.shape[0])
test_loader = DataLoader(test_dataset, batch_size=2, num_workers=1, shuffle=True)

for batch_idx, text in enumerate(train_loader):
    print('batch: {}\tinput: {},\ty: {}'.format(batch_idx, text['ids'],text['targets']))
    print(len(text['ids']),len(text['targets']))
    if batch_idx>2:
        break



batch: 0	input: tensor([[  101,  1045,  1005,  ...,     0,     0,     0],
        [  101,  6156, 11417,  ...,     0,     0,     0]]),	y: tensor([1, 2])
2 2
batch: 1	input: tensor([[  101,  2009,  2001,  ...,     0,     0,     0],
        [  101,  1037, 26380,  ...,     0,     0,     0]]),	y: tensor([2, 4])
2 2
batch: 2	input: tensor([[  101,  2079,  2017,  ...,     0,     0,     0],
        [  101, 15654,  2139,  ...,     0,     0,     0]]),	y: tensor([4, 4])
2 2
batch: 3	input: tensor([[ 101, 1045, 2001,  ...,    0,    0,    0],
        [ 101, 2138, 2027,  ...,    0,    0,    0]]),	y: tensor([3, 3])
2 2


In [151]:
len(df['overall'].unique())

5

In [281]:
from transformers import BertModel
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained('distilbert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 500)
        self.l4 = torch.nn.Linear(500, 5)
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output_2 = self.l2(output_1)
        output_3 = self.l3(output_2)
        output = self.l4(output_3)
        return output

model = BERTClass()
model.to(device)
criterion = torch.nn.CrossEntropyLoss()


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertModel: ['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn.lin2.weight', 'd

In [199]:
model = BertForSequenceClassification.from_pretrained("distilbert-base-uncased",
                                                      num_labels=5,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertForSequenceClassification: ['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.

In [276]:
from transformers import AdamW, get_linear_schedule_with_warmup


optimizer = torch.optim.Adam(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)
                  
epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=train_dataset.len*epochs)


In [241]:

from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    #label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [202]:
from tqdm.notebook import tqdm

In [214]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    progress_bar = tqdm(dataloader_val, desc='Evaluation: ', leave=False, disable=False)
    for batch in progress_bar:
        b = tuple((batch['ids'].to('cpu'),batch['token_type_ids'].to('cpu'),batch['mask'].to('cpu'),batch['targets'].to('cpu')))
        #print(b)
        inputs = {'input_ids':      b[0],
                  'token_type_ids': b[1],
                  'attention_mask': b[2],
                  'labels': b[3]
                 }   

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
        progress_bar.set_postfix({'eval_loss': '{:.3f}'.format(loss.item()/len(batch))})
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [222]:

import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
#torch.cuda.manual_seed_all(seed_val)


    
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(loader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        #print(batch)
        b = tuple((batch['ids'].to('cpu'),batch['token_type_ids'].to('cpu'),batch['mask'].to('cpu'),batch['targets'].to('cpu')))
        #print(b)
        inputs = {'input_ids':      b[0],
                  'token_type_ids': b[1],
                  'attention_mask': b[2],
                  'labels': b[3]
                 }   

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'data/finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(df)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(loader)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=781.0, style=ProgressStyle(description_widt…




Epoch 1
Training loss: 0.010064628523290157


HBox(children=(FloatProgress(value=0.0, description='Evaluation: ', max=781.0, style=ProgressStyle(description…



Validation loss: 1.2368654566942194
F1 Score (Weighted): 0.36436189337503344


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=781.0, style=ProgressStyle(description_widt…






KeyboardInterrupt: 

Training loss: 0.00032171043276786804


HBox(children=(FloatProgress(value=0.0, description='Evaluation: ', max=500.0, style=ProgressStyle(description…



Validation loss: 1.1362975490093232
F1 Score (Weighted): 0.5193510002245469


In [277]:
device='cpu'
def train(epoch,train_loader):
    model.train()
    progress_bar = tqdm(train_loader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for data in progress_bar:
        model.zero_grad()
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        #print(len(ids[0]),len(mask),len(token_type_ids),len(targets[0]))
        outputs = model(ids,mask,token_type_ids)
        #print(outputs)
        #print(targets)
        optimizer.zero_grad()
        loss = criterion(outputs, targets)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})   
        #optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
    torch.save(model.state_dict(), f'data/finetuned_BERT_epoch_{epoch}.model')
    
    
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    progress_bar = tqdm(dataloader_val, desc='Evaluation: ', leave=False, disable=False)
    for data in progress_bar:
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        #print(len(ids[0]),len(mask),len(token_type_ids),len(targets[0]))
        with torch.no_grad():        
            outputs = model(ids,mask,token_type_ids)
            
        loss = criterion(outputs,targets)
        #logits = outputs[1]
        loss_val_total += loss.item()

        logits = outputs.detach().cpu().numpy()
        label_ids = targets.detach().cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
        progress_bar.set_postfix({'eval_loss': '{:.3f}'.format(loss.item()/len(batch))})
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [278]:
train_ds.shape

(99000, 2)

In [282]:

for epoch in range(epochs):
    #if epoch != 0:
    train(epoch,train_loader)
    val_loss, predictions, true_vals = evaluate(test_loader)
    val_f1 = f1_score_func(predictions, true_vals)
    
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Accuracy by class: {accuracy_per_class(predictions, true_vals)}')

HBox(children=(FloatProgress(value=0.0, description='Epoch 0', max=773.0, style=ProgressStyle(description_widt…



HBox(children=(FloatProgress(value=0.0, description='Evaluation: ', max=8.0, style=ProgressStyle(description_w…



Validation loss: 1.5799352079629898
F1 Score (Weighted): 0.41739130434782606
Class: 1
Accuracy: 0/1

Class: 2
Accuracy: 0/1

Class: 3
Accuracy: 0/4

Class: 4
Accuracy: 8/9

Accuracy by class: None


HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=773.0, style=ProgressStyle(description_widt…



HBox(children=(FloatProgress(value=0.0, description='Evaluation: ', max=8.0, style=ProgressStyle(description_w…



Validation loss: 1.5793904662132263
F1 Score (Weighted): 0.41739130434782606
Class: 1
Accuracy: 0/1

Class: 2
Accuracy: 0/1

Class: 3
Accuracy: 0/4

Class: 4
Accuracy: 8/9

Accuracy by class: None


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=773.0, style=ProgressStyle(description_widt…



HBox(children=(FloatProgress(value=0.0, description='Evaluation: ', max=8.0, style=ProgressStyle(description_w…



Validation loss: 1.579997032880783
F1 Score (Weighted): 0.41739130434782606
Class: 1
Accuracy: 0/1

Class: 2
Accuracy: 0/1

Class: 3
Accuracy: 0/4

Class: 4
Accuracy: 8/9

Accuracy by class: None


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=773.0, style=ProgressStyle(description_widt…



HBox(children=(FloatProgress(value=0.0, description='Evaluation: ', max=8.0, style=ProgressStyle(description_w…



Validation loss: 1.5881664007902145
F1 Score (Weighted): 0.41739130434782606
Class: 1
Accuracy: 0/1

Class: 2
Accuracy: 0/1

Class: 3
Accuracy: 0/4

Class: 4
Accuracy: 8/9

Accuracy by class: None


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=773.0, style=ProgressStyle(description_widt…



HBox(children=(FloatProgress(value=0.0, description='Evaluation: ', max=8.0, style=ProgressStyle(description_w…



Validation loss: 1.578774943947792
F1 Score (Weighted): 0.41739130434782606
Class: 1
Accuracy: 0/1

Class: 2
Accuracy: 0/1

Class: 3
Accuracy: 0/4

Class: 4
Accuracy: 8/9

Accuracy by class: None
