In [58]:
import torchvision
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from transformers import BertModel, BertTokenizer
import torch.nn as nn
from torch.utils.data import DataLoader, SubsetRandomSampler
import torch.optim as optim
import sklearn as sk
import time
from sklearn.model_selection import train_test_split

# to convert our data to the BERT language representation & use its vocabulary
class OurDataset(Dataset):
  def __init__(self, data, len_max):
    # expected data input is a pandas dataframe
    self.data = data
    self.data.reset_index(drop=True, inplace=True)
    # self.reviews = self.data['summary']
    # self.ratings = self.data['overall']
    self.len_max = len_max # the maximum length of a review to consider
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # add do_lower_case, if you want to do all lowercase text

  def __len__(self):
    return len(self.data)

  def __getitem__(self, ind):
    review = self.data.loc[ind, 'reviewText']
    rating = int(self.data.loc[ind, 'overall']) - 1 # Ratings=1,2,3,4,5 to 0,1,2,3,4
    # use the BERT Tokenizer to ensure review is represented similarly
    tokens = self.tokenizer.tokenize(review)
    # recall that BERT uses additional token embeddings
    tokens = ['[CLS]'] + tokens + ['[SEP]']
    # [CLS] should be added to the beginning of the input
    # [SEP] should be added to the end of the input
    # to add [PAD] if sentence is too short
    if len(tokens) < self.len_max:
      # At the end of the tokens add PAD
      tokens = tokens + ['[PAD]' for i in range(self.len_max - len(tokens))]
    else:
      # tokens list is too long, need to cut off the tokens and then re-add SEP
      tokens = tokens[:self.len_max - 1] + ['[SEP]']

    # Converts tokens to an id using the vocabulary
    token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
    # convert to PyTorch tensor
    tokens_to_tensors = torch.tensor(token_ids)
    # MLM to distinguish between the PAD and the important tokens
    attention_mask = (tokens_to_tensors != 0).long()
    # convert our labels into tensors
    # label = torch.tensor(rating).long()
    return tokens_to_tensors, attention_mask, rating


# to import the pretrained BERT Model
# purpose: multi-class classification where we try to predict the ratings
class RatingPredictor(nn.Module):
  def __init__(self, rating_scale):
    super(RatingPredictor, self).__init__()
    # load the BERT Model configuration, and update to match our dataset
    # self.bert_config = BertConfig(hidden_size=768,
    #                               num_hidden_layers=12,
    #                               num_attention_heads=12,
    #                               intermediate_size=3072,
    #                               num_labels=rating_scale) # change to multi-class
    # load in pretrained model
    self.bert = BertModel.from_pretrained('bert-base-uncased')
    # self.bert = BertForSequenceClassification('bert-base-uncased', do_lower_case=True, num_labels=rating_scale) # this one has a linear layer after the pooled layer
    # because we want to fine-tune, make sure that the weights from BERT aren't updated
    for param in self.bert.parameters():
      param.requires_grad = False
    
    # our classifer on top of the BERT Model
    self.linear1 = nn.Linear(768, 500)
    self.relu = nn.ReLU()
    self.drop = nn.Dropout(0.2) # dropout with 50%
    self.linear2 = nn.Linear(500, rating_scale)
    # self.fc = nn.LogSoftmax(dim=0) # to calculate the probabilities
    self.fc = nn.LogSoftmax(dim=1)

  def forward(self, tokens, attention_mask):
    # grab the BERT Model outputs after forward pass
    outputs = self.bert.forward(input_ids=tokens, attention_mask=attention_mask) 
    # forward pass for Bert will return two outputs
    # 12 layers to one pooled output, so grab last output layer
    # pooled output size should be (1, 768) as we pass one review at a time
    pooled_output = outputs.pooler_output
    #print(outputs.pooler_output)
    lin_output1 = self.linear1(pooled_output)
    relu_output = self.relu(lin_output1)
    dropped_output = self.drop(relu_output)
    lin_output2 = self.linear2(dropped_output)
    result = self.fc(lin_output2)
    return result

"""
  def forward(self, tokens, attention_mask):
    # grab the BERT Model outputs after forward pass
    output = self.bert(input_ids=tokens, attention_mask=attention_mask, return_dict=True) 
    # forward pass for Bert will return two outputs
    # 12 layers to one pooled output, so grab last output layer
    # pooled output size should be (1, 768) as we pass one review at a time
    # print(output.pooler_output[0].size())
    pooled_output = output['pooler_output'][0]
    lin_output1 = self.linear1(pooled_output)
    print("through first linear layer")
    relu_output = self.relu(lin_output1)
    print("ReLu'ed!!")
    dropped_output = self.drop(relu_output)
    print("dropped bish")
    lin_output2 = self.linear2(dropped_output)
    print("Linear 2, almost there")
    print(lin_output2.size())
    result = self.fc(lin_output2)
    print("Log soft max completo")
    print(result.size())
    return result
  """


# training and validation functions
def dataloader(fileName, bs):
  """Load the data into PyTorch DataLoader for train, val, test.
  """
  np.random.seed(42)
  # load data from fileName
  raw_data = pd.read_json(fileName, lines=True, orient='columns', dtype=True)
  raw_data = raw_data[['reviewText', 'overall']]
  raw_data = raw_data.dropna()
  # split the data into train, val, and test
  # reduce the original 200,000 to 50,000, with the original proportions of each class
  X_temp1, X_temp2, y_temp1, y_temp2 = train_test_split(raw_data['reviewText'], raw_data['overall'], test_size=0.25, stratify=raw_data['overall'], random_state=42)
  # now we have 50,000 examples in  X_temp2, y_temp2
  X_train, X_temp3, y_train, y_temp3 = train_test_split(X_temp2, y_temp2, test_size=0.5, stratify=y_temp2, random_state=42)
  # X_train now has 25,000 examples
  X_val, X_test, y_val, y_test = train_test_split(X_temp3, y_temp3, test_size=0.5, stratify=y_temp3, random_state=42)
  # X_val & X_test have 12,500 examples each
  # merge X and ys to feed into OurDataset
  train_set = pd.DataFrame(X_train)
  train_set['overall'] = y_train
  val_set = pd.DataFrame(X_val)
  val_set['overall'] = y_val
  test_set = pd.DataFrame(X_test)
  test_set['overall'] = y_test
  # train_set = raw_data.sample(frac=0.5, random_state=42)
  # temp = raw_data.drop(train_set.index)
  # val_set = temp.sample(frac=0.3, random_state=42)
  # test_set = temp.drop(val_set.index)
  # train_split = 0.5
  # val_split = 0.3
  # fullsize = len(raw_data)
  # indices = list(range(fullsize))
  # split1 = int(np.floor(train_split * fullsize))
  # split2 = int(np.floor(val_split * fullsize))
  # np.random.shuffle(indices)
  # train_ind, val_ind, test_ind = indices[:split1], indices[split1:split1+split2], indices[split1+split2:]
  # using the split indices, get the samples
  # train_sampler = torch.utils.data.SubsetRandomSampler(train_ind)
  # val_sampler = torch.utils.data.SubsetRandomSampler(val_ind)
  # test_sampler = torch.utils.data.SubsetRandomSampler(test_ind)
  # utilize OurDataset class to create & tokenize the data
  # all_data = OurDataset(raw_data, 186)
  train_data = OurDataset(train_set, 512)
  val_data = OurDataset(val_set, 512)
  test_data = OurDataset(test_set, 512)
  # use DataLoader
  train_loader = DataLoader(train_data, batch_size=bs, shuffle=False)
  val_loader = DataLoader(val_data, batch_size=bs, shuffle=False)
  test_loader = DataLoader(test_data, batch_size=bs) # ??
  return train_loader, val_loader, test_loader, len(train_set), len(val_set), len(test_set)


def get_accuracy(pred, label):
  # determine the index of the most likely rating
  index = torch.argmax(pred, dim = 1)
  # return the number of correctly predicted ratings / number of total examples in a batch
  return (index==label).sum().item()


def evaluate(model, loader, loader_len, criterion):
  """Evaluate the network model based on validation set.
  """
  #model.train(False)
  model.eval() # go into evaluation mode
  acc, err = 0, 0
  with torch.no_grad():
    total_loss = 0.0
    total_acc = 0.0
    for iter, (tokens, attention_mask, rating) in enumerate(loader):
      pred = model(tokens, attention_mask)
      # loss = criterion(nn.LogSoftmax(pred, dim=1), rating)
      loss = criterion(pred, rating)
      total_loss += loss.item()
      total_acc += get_accuracy(pred, rating)
      if (iter + 1) % 100 == 0:
        print("Iter {}     -     Loss: {}     -      Accuracy: {}".format(iter+1, total_loss / (iter+1), total_acc / (iter*32)))

    err = (total_loss) / (iter + 1)
    acc = (total_acc) / (loader_len) # the total number of correctly predicted 
  return err, acc


def train(model, train_loader, train_len, val_loader, val_len, epochs, learning_rate):
  """Use training and validation, train the model.
  """
  print("I'm at the start")
  model.train(True)
  torch.manual_seed(42) # for reproducibility
  # define loss function and optimizer for weight updates
  target_weights = torch.FloatTensor([20/41, 19/41, 8/41, 3/41, 1/41]) # Weights that should be multiplied to the learning rate of the optimizer
  criterion = nn.NLLLoss(weight=target_weights)
  optimizer = optim.Adam(model.parameters(), lr=learning_rate)
  # to store values later
  train_acc, train_loss, val_acc, val_loss = [], [], [], []
  startTime = time.time()
  for epoch in range(epochs):
    total_loss = 0.0
    total_acc = 0.0
    # iterate through the batches
    # iter = 0
    for iter, (tokens, attention_mask, rating) in enumerate(train_loader):
      optimizer.zero_grad()
      pred = model(tokens, attention_mask) # predict using tokens & attention mask
      # compute loss
      # loss = criterion(nn.LogSoftmax(pred, dim=1), rating)
      loss = criterion(pred, rating)
      # backprop
      loss.backward()
      # weight updates
      optimizer.step()
      # add in loss and accuracy (number of correctly predicted ratings)
      total_loss += (loss)
      total_acc += (get_accuracy(pred, rating))
    
      # for us to see where we are in training
      if (iter+1) % 10 == 0:
        print("Epoch {}  - Iter {}  - Training Time: {} -     Loss: {}      -    Accuracy: {}".format(epoch+1, iter+1, time.time()-startTime,
        total_loss / (iter+1), total_acc / (iter*32)))
    train_loss.append(total_loss / (iter+1)) # calculate the average loss across all iterations per epoch
    train_acc.append(total_acc / train_len) # calculate the number of correctly predicted ratings / total training examples
    total_loss = 0.0
    # compute validation loss at the end of each epoch
    val_err, val_avg_acc = evaluate(model, val_loader, val_len, criterion)
    val_loss.append(val_err)
    val_acc.append(val_avg_acc)

    print("END  ---  Epoch {}  ---  Training Error: {}  ---   Validation Error: {}".format(
        epoch, train_loss[epoch], val_err))
  return train_loss, train_acc, val_loss, val_acc

In [59]:
# load the data for training and validation, set aside the test
train_loader, val_loader, test_loader, train_len, val_len, test_len = dataloader('train.json', bs=32)
print("Finish loading our data splits!")

Finish loading our data splits!


In [12]:
mod = RatingPredictor(rating_scale=5)
print("Initiated Instance of Our Network")

Initiated Instance of Our Network


In [13]:
# train and check validation
# mod.train(True)
train_loss, train_acc, val_loss, val_acc = train(mod, train_loader, train_len, val_loader, val_len, epochs=3, learning_rate=0.00005)

I'm at the start
Epoch 1  - Iter 10  - Training Time: 407.6044840812683 -     Loss: 1.6166388988494873      -    Accuracy: 0.28125
Epoch 1  - Iter 20  - Training Time: 817.4286618232727 -     Loss: 1.607580542564392      -    Accuracy: 0.26151315789473684
Epoch 1  - Iter 30  - Training Time: 1234.8846859931946 -     Loss: 1.6130540370941162      -    Accuracy: 0.2273706896551724
Epoch 1  - Iter 40  - Training Time: 1641.68985581398 -     Loss: 1.6126466989517212      -    Accuracy: 0.22275641025641027
Epoch 1  - Iter 50  - Training Time: 2053.19322681427 -     Loss: 1.6148546934127808      -    Accuracy: 0.22385204081632654
Epoch 1  - Iter 60  - Training Time: 2450.1118021011353 -     Loss: 1.6114085912704468      -    Accuracy: 0.21927966101694915
Epoch 1  - Iter 70  - Training Time: 2871.865550994873 -     Loss: 1.6120229959487915      -    Accuracy: 0.213768115942029
Epoch 1  - Iter 80  - Training Time: 3272.555946826935 -     Loss: 1.6111360788345337      -    Accuracy: 0.226265822

Epoch 1  - Iter 660  - Training Time: 26714.764316082 -     Loss: 1.5793476104736328      -    Accuracy: 0.41952769347496205
Epoch 1  - Iter 670  - Training Time: 27132.267019987106 -     Loss: 1.5790681838989258      -    Accuracy: 0.4198430493273543
Epoch 1  - Iter 680  - Training Time: 27541.97731399536 -     Loss: 1.5784904956817627      -    Accuracy: 0.41927466863033874
Epoch 1  - Iter 690  - Training Time: 27959.15756201744 -     Loss: 1.5781468152999878      -    Accuracy: 0.4187681422351234
Epoch 1  - Iter 700  - Training Time: 28375.442598104477 -     Loss: 1.5778794288635254      -    Accuracy: 0.4182761087267525
Epoch 1  - Iter 710  - Training Time: 28785.748197078705 -     Loss: 1.5774352550506592      -    Accuracy: 0.41797425952045136
Epoch 1  - Iter 720  - Training Time: 29192.068790912628 -     Loss: 1.5775600671768188      -    Accuracy: 0.4184196801112656
Epoch 1  - Iter 730  - Training Time: 29614.55724787712 -     Loss: 1.577069640159607      -    Accuracy: 0.41919

Epoch 2  - Iter 510  - Training Time: 62278.38876724243 -     Loss: 1.5187349319458008      -    Accuracy: 0.48428290766208254
Epoch 2  - Iter 520  - Training Time: 62609.72949695587 -     Loss: 1.518999695777893      -    Accuracy: 0.4836825626204239
Epoch 2  - Iter 530  - Training Time: 62941.54171991348 -     Loss: 1.5190911293029785      -    Accuracy: 0.4820415879017013
Epoch 2  - Iter 540  - Training Time: 63272.95648813248 -     Loss: 1.518230676651001      -    Accuracy: 0.48208487940630795
Epoch 2  - Iter 550  - Training Time: 63604.127594947815 -     Loss: 1.5168609619140625      -    Accuracy: 0.48372040072859745
Epoch 2  - Iter 560  - Training Time: 63935.13967990875 -     Loss: 1.516809105873108      -    Accuracy: 0.48300536672629696
Epoch 2  - Iter 570  - Training Time: 64266.55780386925 -     Loss: 1.5164817571640015      -    Accuracy: 0.48259007029876977
Epoch 2  - Iter 580  - Training Time: 64597.98266005516 -     Loss: 1.5154531002044678      -    Accuracy: 0.483430

Epoch 3  - Iter 360  - Training Time: 98156.7338719368 -     Loss: 1.462003231048584      -    Accuracy: 0.5164519498607242
Epoch 3  - Iter 370  - Training Time: 98517.7891061306 -     Loss: 1.4615613222122192      -    Accuracy: 0.517699864498645
Epoch 3  - Iter 380  - Training Time: 98868.4282488823 -     Loss: 1.4614312648773193      -    Accuracy: 0.5194591029023746
Epoch 3  - Iter 390  - Training Time: 99206.91242098808 -     Loss: 1.4612977504730225      -    Accuracy: 0.5204048843187661
Epoch 3  - Iter 400  - Training Time: 99544.8468902111 -     Loss: 1.4608362913131714      -    Accuracy: 0.5187186716791979
Epoch 3  - Iter 410  - Training Time: 99881.82496094704 -     Loss: 1.4607396125793457      -    Accuracy: 0.519789119804401
Epoch 3  - Iter 420  - Training Time: 100217.7894320488 -     Loss: 1.4596943855285645      -    Accuracy: 0.5214051312649165
Epoch 3  - Iter 430  - Training Time: 100552.69215393066 -     Loss: 1.4588743448257446      -    Accuracy: 0.521925990675990

In [46]:
print(val_loss)
print(val_acc)

[1.5332056866277515, 1.4746790392624447, 1.4243862720401697]
[0.5885741718674988, 0.5621699471915507, 0.5566490638502161]


In [18]:
PATH = 'finetuned_BERT_lr_5e-5_bs_32_epochs_3.mod'
torch.save(mod, PATH)

In [19]:
from sklearn.metrics import mean_squared_error

In [23]:
def evaluate_test(model, loader, loader_len):
  """Evaluate the network model based on validation set.
  """
  #model.train(False)
  model.eval() # go into evaluation mode
  acc, err = 0, 0
  with torch.no_grad():
    total_loss = 0.0
    total_acc = 0.0
    for iter, (tokens, attention_mask, rating) in enumerate(loader):
      pred = model(tokens, attention_mask)
      # loss = criterion(nn.LogSoftmax(pred, dim=1), rating)
      pred_temp1 = pred.argmax(dim=1)
      loss = mean_squared_error(pred_temp1.numpy(), rating.numpy())
      total_loss += loss
      total_acc += get_test_accuracy(pred_temp1, rating)
      if (iter + 1) % 100 == 0:
        print("Iter {}     -     Loss: {}     -       Accuracy: {}".format(iter+1, total_loss / (iter+1), total_acc / (iter+1)))
    
    err = (total_loss) / (iter + 1)
    acc = (total_acc) / (loader_len) # the total number of correctly predicted 
    return err, acc

In [24]:
def get_test_accuracy(pred, label):
  # determine the index of the most likely rating
  # index = torch.argmin(pred, dim = 1)
  # return the number of correctly predicted ratings / number of total examples in a batch
  return (pred==label).sum().item()

In [25]:
# to test the trained model on the test set
test_err, test_acc = evaluate_test(mod, test_loader, test_len)

Iter 100     -     Loss: 2.5253125     -       Accuracy: 18.15
Iter 200     -     Loss: 2.59296875     -       Accuracy: 17.8
Iter 300     -     Loss: 2.6044791666666667     -       Accuracy: 17.866666666666667


In [35]:
print(test_err)
print(test_acc)

2.646703608979824
0.5587293967034725


In [42]:
# import the Yelp dataset
df = pd.read_csv('yelp.csv')
df = df[['text', 'stars']]
df = df.rename(columns={'text':'reviewText', 'stars':'overall'})
df_len = len(df)
# limit to only 1000 examples
yelp_inds = np.random.randint(0, df_len, size=1000) # pick
df = df.iloc[yelp_inds] # now 1000 random examples
# load df for Yelp into OurDataset
yelp_data = OurDataset(df, 512)
yelp_loader = DataLoader(yelp_data, batch_size=32)
test_yelp_err, test_yelp_acc = evaluate_test(mod, yelp_loader, 1000)

In [43]:
print(test_yelp_err)
print(test_yelp_acc)

3.6171875
0.336


In [44]:
# import SST dataset
sst_test = pd.read_csv('sst_test.txt', sep='\t', header=None, names=['truth', 'text'])
sst_test['truth'] = sst_test['truth'].str.replace('__label__', '')
sst_test['truth'] = sst_test['truth'].astype(int).astype('category')
sst_test = sst_test.rename(columns={'text':'reviewText', 'truth':'overall'})
# limit to only 1000 examples
sst_len = len(sst_test)
sst_inds = np.random.randint(0, sst_len, size=1000)
sst_test = sst_test.iloc[sst_inds] # now 1000 random examples
# load df for SST into OurDataset
sst_data = OurDataset(sst_test, 512)
sst_loader = DataLoader(sst_data, batch_size=32)
test_sst_err, test_sst_acc = evaluate_test(mod, sst_loader, 1000)

In [45]:
print(test_sst_err)
print(test_sst_acc)

3.7314453125
0.25
