In [1]:
# imports
import pandas as pd
from transformers import BertTokenizer
import torch
import numpy as np
import pandas as pd
import gzip
from nltk import TweetTokenizer
import json

Following: https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f

# Loading BERT and trying stuff out

In [2]:
# loading the pretrained model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [3]:
example_text = 'I will watch Memento tonight'

bert_input = tokenizer(example_text, # specifying the text/file
                       padding='max_length', # to pad each sequence to a length we specify with max_length
                       max_length=10, # The max length of each sequence. We should use 512 since that's the max length with BERT
                       truncation=True, # If True, then any tokens over our max length will be cut-off
                       return_tensors='pt') # The type of tensors that will be returned pt for pytorch, tf for tensorflow

print(bert_input['input_ids'])
print(bert_input['token_type_ids'])
print(bert_input['attention_mask'])

tensor([[  101,   146,  1209,  2824,  2508, 26173,  3568,   102,     0,     0]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])


# What is input_ids?

This is the id representation of each token, which can be decoded into actual tokens like so:

In [4]:
example_text = tokenizer.decode(bert_input.input_ids[0])

print(example_text)

[CLS] I will watch Memento tonight [SEP] [PAD] [PAD]


As we can see, the BertTokenizer takes care of the necessary transformations of the input text such that it's ready to be used as an input for our BERT model.

It adds the [CLS], [SEP], and [PAD] tokens that we need.

Since the max_length = 10 we get the two [PAD] tokens at the end to make the length 10.

# What is token_type_ids?

This is a binary mask which identifies in which sequence a token belongs. If we have only a single sequence, then all of the token type ids will be 0. For a text classification task, token_type_ids is an optional input for our BERT model.

# What is attention_mask

This is a binary mask which identifies whether a word is a real word or just padding. If the token contains [CLS], [SEP], or any real word, then the mask would be 1. If the token is just [PAD], then the mask would be 0.

# Quick Note

We are using the BertTokenizer from bert-base-cased, which is a pre-trained BERT model that works well on English data.

# Loading in our datasets

In [5]:
def build_vocab(filepath, padding = False):
    train_vocab = {}
    train = gzip.open(filepath)
    counter1 = 0
    counter2 = 0
    counter3 = 0
    counter = 0
    if padding: 
        train_vocab['<PAD>'] = 0
        counter2 += 1
    no_reviewText = []
    labels = {}
    sentences = {}
    tokenizer = TweetTokenizer()
    for line in train:
        counter1 +=1
        #print(line)
        if 'reviewText' in json.loads(line).keys():
            a = json.loads(line)
            sentences[counter3] = a['reviewText']
            counter3 += 1
            if a['sentiment'] == 'positive':
                labels[counter] = 1
            elif a['sentiment'] == 'negative': 
                labels[counter] = 0
            counter +=1
            for word in tokenizer.tokenize(json.loads(line)['reviewText']):
                if word not in train_vocab.keys():
                    train_vocab[word] = counter2
                    counter2 += 1
        else:
            no_reviewText.append(counter1)
    final_dict = {'line_count' : counter1,
                 'review_count' : counter3,
                 'vocab_size' : counter2,
                 'no_text_reviews' : no_reviewText,
                 'labels' : labels,
                 'vocabulary' : train_vocab,
                 'sentences' : sentences}
    return final_dict

In [6]:
train = build_vocab('../classification/music_reviews_train.json.gz')

In [7]:
def zipped_to_pandas(filepath):
    data = gzip.open(filepath)
    df = pd.DataFrame(columns=['reviewText', 'sentiment'])

    for line in data:
        dicted = json.loads(line)
        if 'reviewText' not in dicted:
            reviewText = 'Null'
        else:
            reviewText = dicted['reviewText']

        if 'sentiment' not in dicted:
            sentiment = 'Null'
        else:
            sentiment = dicted['sentiment']

        filtered_dict = {'reviewText': [reviewText], 'sentiment': [sentiment]}
        temp_df = pd.DataFrame.from_dict(filtered_dict)
        df = pd.concat([df, temp_df])
    return df

In [14]:
train = zipped_to_pandas('../classification/music_reviews_train.json.gz')
train.head()

Unnamed: 0,reviewText,sentiment
0,"So creative! Love his music - the words, the ...",positive
0,This tape can hardly be understood and it was ...,negative
0,Buy the CD. Do not buy the MP3 album. Downlo...,negative
0,I love Dallas Holms music and voice! Thank Yo...,positive
0,Great memories of my early years in Christ,positive


In [15]:
dev = zipped_to_pandas('../classification/music_reviews_dev.json.gz')
dev.head()

Unnamed: 0,reviewText,sentiment
0,My dentist recommended this as a relaxation te...,positive
0,I am personally acquainted with a member of th...,negative
0,The Cd cover was broken when I got it,negative
0,"This is an uplifting, keep going, motivating s...",positive
0,I bought this vinyl 2 times and they won't exc...,negative


In [10]:
def build_vocab(filepath, padding = False):
    train_vocab = {}
    train = gzip.open(filepath)
    counter1 = 0
    counter2 = 0
    counter3 = 0
    counter = 0
    if padding: 
        train_vocab['<PAD>'] = 0
        counter2 += 1
    no_reviewText = []
    labels = {}
    sentences = {}
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    for line in train:
        counter1 +=1
        #print(line)
        if 'reviewText' in json.loads(line).keys():
            a = json.loads(line)
            sentences[counter3] = a['reviewText']
            counter3 += 1
            if a['sentiment'] == 'positive':
                labels[counter] = 1
            elif a['sentiment'] == 'negative': 
                labels[counter] = 0
            counter +=1
            for word in tokenizer.tokenize(json.loads(line)['reviewText']):
                if word not in train_vocab.keys():
                    train_vocab[word] = counter2
                    counter2 += 1
        else:
            no_reviewText.append(counter1)
    final_dict = {'line_count' : counter1,
                 'review_count' : counter3,
                 'vocab_size' : counter2,
                 'no_text_reviews' : no_reviewText,
                 'labels' : labels,
                 'vocabulary' : train_vocab,
                 'sentences' : sentences}
    return final_dict

In [11]:
train = build_vocab('../classification/music_reviews_train.json.gz')

# Dataset Class

Now that we know how shit works, let's build a Dataset class for our dataset.

In [28]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
labels = {'negative': 0,
          'positive': 1,
          'Null': 2}

class Dataset(torch.utils.data.Dataset):
    
    def __init__(self, df):
        self.labels = [labels[label] for label in df['sentiment']]
        self.texts = [tokenizer(text,
                                padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['reviewText']]

    def classes(self):
        return self.labels
    
    def __len__(self):
        return len(self.labels)
    
    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])
    
    def get_batch_texts(self, idx):
        # Fetch a batch of labels
        return self.texts[idx]
    
    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
    
        return batch_texts, batch_y

Above, the variable "labels" is a dictionary that maps the "sentiment" in the dataframe into the id representation of our label.

Note that we also call the BertTokenizer in the __init__ function above to transform our input texts into the format BERT expects.

After defining dataset class, let's split our dataframe into training, validation, and test set with the proportion of 80:10:10

In [17]:
np.random.seed(112)

# only doing it this way to follow the guide, need to combine our train and dev and then re-split
frames = [train, dev]
df = pd.concat(frames)

df.head()

Unnamed: 0,reviewText,sentiment
0,"So creative! Love his music - the words, the ...",positive
0,This tape can hardly be understood and it was ...,negative
0,Buy the CD. Do not buy the MP3 album. Downlo...,negative
0,I love Dallas Holms music and voice! Thank Yo...,positive
0,Great memories of my early years in Christ,positive


In [18]:
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train),len(df_val), len(df_test))

88000 11000 11000


## Model Building

So far, we have built a dataset class to generate our data. Now let's build the actual model using a pre-trained BERT base model which has 12 layers of Transformer encoder.

If your dataset is not in English, it would be best if you use bert-base-multilingual-cased model. blah blah read the article more if we want multilingual / specific other languages.

In [35]:
df_train

Unnamed: 0,reviewText,sentiment
0,Atr fans are obviously metal fans since they'r...,negative
0,Love that song. I watched the video as well an...,positive
0,It didn't download to my phone I want my money...,negative
0,Are you aware that Amazon policy on poor quail...,negative
0,The collection arrived on schedule and the pac...,negative
...,...,...
0,"My first (and standing) thought is ""They've tu...",negative
0,this song get u dancing and love watching the ...,positive
0,A very good song.,positive
0,Nice,positive


In [21]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        
        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 2)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

From the above code, the BERT model outputs two variables:

 - The first variable, which we name _ in the code, contains the embedding vectors of all of the tokens in a sequence.
 - The second variable, which we named "pooled_output", contains the embedding vector of [CLS] token. For a text classification task, it is enough to use this embedding as an input for our classifier.

We then pass the "pooled_output" variable into a linear layer with ReLU activation function. At the end of the linear layer, we have a vector of size 5, each corresponds to a category of our labels (sport, business, politics, entertainment, and tech).

# CHANGE ABOVE TO WHATEVER WE END UP WITH

## Training loop

Now it's time for us to train the model. The training loop will be a standard PyTorch training loop.

In [38]:
print(df_train.shape)
df_train.head()

(88000, 2)


Unnamed: 0,reviewText,sentiment
0,Atr fans are obviously metal fans since they'r...,negative
0,Love that song. I watched the video as well an...,positive
0,It didn't download to my phone I want my money...,negative
0,Are you aware that Amazon policy on poor quail...,negative
0,The collection arrived on schedule and the pac...,negative


In [39]:
print(df_val.shape)
df_val.head()

(11000, 2)


Unnamed: 0,reviewText,sentiment
0,"""Winds of Samsara"" is a glorious collaboration...",positive
0,Great song!,positive
0,One of my All-Time Favorite songs!!,positive
0,"I am a big fan of TSO, but this album, after 5...",negative
0,He is his mothers son in the song department. ...,positive


In [41]:
from torch.optim import Adam
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs):
    
    train_data

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)
    
    #train_dataloader = train_dataloader.type(torch.LongTensor)
    #val_dataloader = val_dataloader.type(torch.LongTensor)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device, dtype=torch.long)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                #print("train_input: ", train_input)
                #print("train_label: ", train_label)
                #print("output: ", output)
                
                batch_loss = criterion(output, train_label)
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)
                    
                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc

            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')

EPOCHS = 5
model = BertClassifier()
LR = 1e-6

# taking only 5% of train data to see if it can run
df_train_test = df_train.head(10)

train(model, df_train_test, df_val, LR, EPOCHS)

KeyboardInterrupt: 