In [1]:
# imports
import pandas as pd
from transformers import BertTokenizer
import torch
import numpy as np
import pandas as pd
import gzip
from nltk import TweetTokenizer
import json

Following: https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f

# Loading BERT and trying stuff out

In [2]:
# loading the pretrained model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [3]:
example_text = 'I will watch Memento tonight'

bert_input = tokenizer(example_text, # specifying the text/file
                       padding='max_length', # to pad each sequence to a length we specify with max_length
                       max_length=10, # The max length of each sequence. We should use 512 since that's the max length with BERT
                       truncation=True, # If True, then any tokens over our max length will be cut-off
                       return_tensors='pt') # The type of tensors that will be returned pt for pytorch, tf for tensorflow

print(bert_input['input_ids'])
print(bert_input['token_type_ids'])
print(bert_input['attention_mask'])

tensor([[  101,   146,  1209,  2824,  2508, 26173,  3568,   102,     0,     0]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])


# What is input_ids?

This is the id representation of each token, which can be decoded into actual tokens like so:

In [4]:
example_text = tokenizer.decode(bert_input.input_ids[0])

print(example_text)

[CLS] I will watch Memento tonight [SEP] [PAD] [PAD]


As we can see, the BertTokenizer takes care of the necessary transformations of the input text such that it's ready to be used as an input for our BERT model.

It adds the [CLS], [SEP], and [PAD] tokens that we need.

Since the max_length = 10 we get the two [PAD] tokens at the end to make the length 10.

# What is token_type_ids?

This is a binary mask which identifies in which sequence a token belongs. If we have only a single sequence, then all of the token type ids will be 0. For a text classification task, token_type_ids is an optional input for our BERT model.

# What is attention_mask

This is a binary mask which identifies whether a word is a real word or just padding. If the token contains [CLS], [SEP], or any real word, then the mask would be 1. If the token is just [PAD], then the mask would be 0.

# Quick Note

We are using the BertTokenizer from bert-base-cased, which is a pre-trained BERT model that works well on English data.

# Loading in our datasets

In [5]:
def build_vocab(filepath, padding = False):
    train_vocab = {}
    train = gzip.open(filepath)
    counter1 = 0
    counter2 = 0
    counter3 = 0
    counter = 0
    if padding: 
        train_vocab['<PAD>'] = 0
        counter2 += 1
    no_reviewText = []
    labels = {}
    sentences = {}
    tokenizer = TweetTokenizer()
    for line in train:
        counter1 +=1
        #print(line)
        if 'reviewText' in json.loads(line).keys():
            a = json.loads(line)
            sentences[counter3] = a['reviewText']
            counter3 += 1
            if a['sentiment'] == 'positive':
                labels[counter] = 1
            elif a['sentiment'] == 'negative': 
                labels[counter] = 0
            counter +=1
            for word in tokenizer.tokenize(json.loads(line)['reviewText']):
                if word not in train_vocab.keys():
                    train_vocab[word] = counter2
                    counter2 += 1
        else:
            no_reviewText.append(counter1)
    final_dict = {'line_count' : counter1,
                 'review_count' : counter3,
                 'vocab_size' : counter2,
                 'no_text_reviews' : no_reviewText,
                 'labels' : labels,
                 'vocabulary' : train_vocab,
                 'sentences' : sentences}
    return final_dict

In [6]:
train = build_vocab('../classification/music_reviews_train.json.gz')

In [7]:
def zipped_to_pandas(filepath):
    data = gzip.open(filepath)
    df = pd.DataFrame(columns=['reviewText', 'sentiment'])

    for line in data:
        dicted = json.loads(line)
        if 'reviewText' not in dicted:
            reviewText = 'Null'
        else:
            reviewText = dicted['reviewText']

        if 'sentiment' not in dicted:
            sentiment = 'Null'
        else:
            sentiment = dicted['sentiment']

        filtered_dict = {'reviewText': [reviewText], 'sentiment': [sentiment]}
        temp_df = pd.DataFrame.from_dict(filtered_dict)
        df = pd.concat([df, temp_df])
    return df

In [8]:
train = zipped_to_pandas('../classification/music_reviews_train.json.gz')
train.head()

Unnamed: 0,reviewText,sentiment
0,"So creative! Love his music - the words, the ...",positive
0,This tape can hardly be understood and it was ...,negative
0,Buy the CD. Do not buy the MP3 album. Downlo...,negative
0,I love Dallas Holms music and voice! Thank Yo...,positive
0,Great memories of my early years in Christ,positive


In [9]:
dev = zipped_to_pandas('../classification/music_reviews_dev.json.gz')
dev.head()

Unnamed: 0,reviewText,sentiment
0,My dentist recommended this as a relaxation te...,positive
0,I am personally acquainted with a member of th...,negative
0,The Cd cover was broken when I got it,negative
0,"This is an uplifting, keep going, motivating s...",positive
0,I bought this vinyl 2 times and they won't exc...,negative


In [10]:
def build_vocab(filepath, padding = False):
    train_vocab = {}
    train = gzip.open(filepath)
    counter1 = 0
    counter2 = 0
    counter3 = 0
    counter = 0
    if padding: 
        train_vocab['<PAD>'] = 0
        counter2 += 1
    no_reviewText = []
    labels = {}
    sentences = {}
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    for line in train:
        counter1 +=1
        #print(line)
        if 'reviewText' in json.loads(line).keys():
            a = json.loads(line)
            sentences[counter3] = a['reviewText']
            counter3 += 1
            if a['sentiment'] == 'positive':
                labels[counter] = 1
            elif a['sentiment'] == 'negative': 
                labels[counter] = 0
            counter +=1
            for word in tokenizer.tokenize(json.loads(line)['reviewText']):
                if word not in train_vocab.keys():
                    train_vocab[word] = counter2
                    counter2 += 1
        else:
            no_reviewText.append(counter1)
    final_dict = {'line_count' : counter1,
                 'review_count' : counter3,
                 'vocab_size' : counter2,
                 'no_text_reviews' : no_reviewText,
                 'labels' : labels,
                 'vocabulary' : train_vocab,
                 'sentences' : sentences}
    return final_dict

In [11]:
train = build_vocab('../classification/music_reviews_train.json.gz')

# Dataset Class

Now that we know how shit works, let's build a Dataset class for our dataset.

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
labels = {'negative': 0,
          'positive': 1,
          'Null': 2}

class Dataset(torch.utils.data.Dataset):
    
    def __init__(self, df):
        self.labels = [labels[label] for label in df['sentiment']]
        self.texts = [tokenizer(text,
                                padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['reviewText']]

    def classes(self):
        return self.labels
    
    def __len__(self):
        return len(self.labels)
    
    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])
    
    def get_batch_text(self, idx):
        # Fetch a batch of labels
        return self.texts[idx]
    
    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
    
        return batch_texts, batch_y