In [1]:
!pip install transformers
!pip install nlpaug
from google.colab import drive
from google.colab import files
drive.mount('/content/gdrive')
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score
import transformers
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
from torch import nn
import torch.nn.functional as F
#from tqdm import trange
from tqdm.notebook import tqdm, trange
import os
from transformers import AdamW, get_linear_schedule_with_warmup
import nlpaug


#DECLARE USER HERE
#Hardeep: set to 0
#Edmund: set to 1
set_user = 1

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
cuda


In [2]:
PATH = None
if set_user == 0:
  PATH = '/content/gdrive/MyDrive/182proj/'
elif set_user == 1:
  PATH = '/content/gdrive/MyDrive/nlp_proj/'
import sys
import os
sys.path.append(os.path.abspath(PATH))
from tcn import TemporalConvNet

In [3]:
data = pd.read_json(PATH + 'yelp_review_training_dataset.jsonl', lines=True)

In [4]:
X = data['text'][:256]
y = data['stars'][:256]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.1, random_state=123)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.4, random_state=123)


In [5]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')

In [6]:
class ProcessData(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_len, labels):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.labels = labels
    
    def __getitem__(self, index):
        review = self.data[index]
        label = self.labels[index]
        return self.tokenizer.encode_plus(review, max_length=self.max_len, padding='max_length', return_attention_mask=True, return_tensors='pt', truncation=True), review, torch.tensor([label-1]).to(torch.long)
    
    def __len__(self):
        review_length = len(self.data)
        return review_length

In [7]:
tokenized_training_data = ProcessData(X_train.to_numpy().tolist(), tokenizer, 150, y_train.to_numpy())
tokenized_validation_data = ProcessData(X_val.to_numpy().tolist(), tokenizer, 150, y_val.to_numpy())
tokenized_test_data = ProcessData(X_test.to_numpy().tolist(), tokenizer, 150, y_test.to_numpy())

In [8]:
#Create dataloader
params = {'batch_size': 16,
          'num_workers': 0}
loader_tokenized_training_data = torch.utils.data.DataLoader(tokenized_training_data, **params)
loader_tokenized_validation_data = torch.utils.data.DataLoader(tokenized_validation_data, **params)
loader_tokenized_test_data = torch.utils.data.DataLoader(tokenized_test_data, **params)

### Data Augmentations

***ONLY RUN BELOW CELL ONCE OR ELSE DATA WILL BE CORRUPTED AND YOU WILL NEED TO RESTART THE RUNTIME***

In [9]:
delete_aug_data = pd.read_csv(PATH + 'augmentations/delete_aug.csv', header=None)
swap_aug_data = pd.read_csv(PATH + 'augmentations/swap_aug.csv', header=None)
typo_aug_data = pd.read_csv(PATH + 'augmentations/typo_aug.csv', header=None)

delete_aug_data.columns = delete_aug_data.iloc[0]
delete_aug_data = delete_aug_data[1:]

swap_aug_data.columns = swap_aug_data.iloc[0]
swap_aug_data = swap_aug_data[1:]

typo_aug_data.columns = typo_aug_data.iloc[0]
typo_aug_data = typo_aug_data[1:]

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
class TransTCN(nn.Module):
    def __init__(self, classes, num_augmentations, input_size, num_channels, kernel_size=2, dropout=0.3, hidden_state=768):
        super(TransTCN, self).__init__()
        self.berts = []
        self.num_augmentations = num_augmentations
        for i in range(num_augmentations):
          self.berts.append(transformers.BertModel.from_pretrained('bert-base-cased').to(device))
        self.tcn = TemporalConvNet(input_size, num_channels, kernel_size, dropout=dropout)
        self.finalLinear = nn.Linear(hidden_state, classes)

    #Return shape (batch_size, num_channels[0], bert_output_dimension)
    def concatBerts(self, bert_outputs):
      concatBert = bert_outputs[0][1].unsqueeze(dim=1)
      for i in range(1, len(bert_outputs)):
        concatBert = torch.cat((concatBert, bert_outputs[i][1].unsqueeze(dim=1)), dim=1)
      return concatBert
    #(bert -> tcn) * n -> bert -> linear -> softmax
    def forward(self, input_ids, attention_masks):
        bert_outputs = []
        for i in range(len(self.berts)):
          bert_outputs.append(self.berts[i](input_ids[i], attention_masks[i]))
        output = self.concatBerts(bert_outputs)
        output = self.tcn(output).squeeze(dim=1)
        output = self.finalLinear(output)
        return output


In [11]:
tokenized_training_data_typo = ProcessData(typo_aug_data['text'][:256].to_numpy().tolist(), tokenizer, 150, y_train.to_numpy())
tokenized_training_data_swap = ProcessData(swap_aug_data['text'][:256].to_numpy().tolist(), tokenizer, 150, y_train.to_numpy())
tokenized_training_data_del = ProcessData(delete_aug_data['text'][:256].to_numpy().tolist(), tokenizer, 150, y_train.to_numpy())
tokenized_training_data= ProcessData(X_train.to_numpy().tolist(), tokenizer, 150, y_train.to_numpy())
#Create dataloader

loader_tokenized_training_data = torch.utils.data.DataLoader(tokenized_training_data, **params)
loader_tokenized_training_data_typo = torch.utils.data.DataLoader(tokenized_training_data_typo, **params)
loader_tokenized_training_data_swap = torch.utils.data.DataLoader(tokenized_training_data_swap, **params)
loader_tokenized_training_data_del = torch.utils.data.DataLoader(tokenized_training_data_del, **params)
loaders = [iter(loader_tokenized_training_data), iter(loader_tokenized_training_data_typo), iter(loader_tokenized_training_data_swap), iter(loader_tokenized_training_data_del)]

In [12]:
num_tcn_layers = 3
model = TransTCN(classes=5, num_augmentations=4, input_size=4,num_channels=[4,4] * num_tcn_layers + [1])
model = model.to(device)
from torch.optim import Adam
criterion = nn.CrossEntropyLoss().to(device)
optimizer = AdamW(model.parameters(), lr=5e-5, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,num_training_steps=len(loader_tokenized_training_data) * 5)

In [13]:
def training(model, data_loaders, size):
    model = model.train()
    losses = []
    training_acc = 0
    for _ in tqdm(data_loaders[0]):

      batch_ids_list = []
      batch_masks_list = []
      labels = None
      stopped = False
      for data_loader in data_loaders:
        try:
          data = next(data_loader)
        except StopIteration:
          stopped = True
          print("stopped data_loader")
          break
        #import pdb; pdb.set_trace()

        batch_ids = data[0]['input_ids']
        batch_ids = batch_ids.flatten().reshape((batch_ids.shape[0], batch_ids.shape[2]))
        batch_ids_list.append(batch_ids.to(device))

        batch_masks = data[0]['attention_mask']
        batch_masks = batch_masks.flatten().reshape((batch_masks.shape[0], batch_masks.shape[2]))
        batch_masks_list.append(batch_masks.to(device))
        
        labels = data[2]
      if stopped:
        break
      labels = labels.to(device)
      output = model(batch_ids_list, batch_masks_list)
      #print(output)
  
      prediction = torch.max(output, 1)[1]

      training_loss = criterion(output, torch.flatten(labels))
      training_acc += torch.sum(prediction == torch.flatten(labels))

      losses.append(training_loss.item())
      training_loss.backward()
      #nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
      
      optimizer.step()
      scheduler.step()
      optimizer.zero_grad()
      torch.cuda.empty_cache()
      
    return training_acc / size, np.mean(losses)

In [14]:
torch.cuda.empty_cache()
highest_acc = 0
for epoch in trange(5):
    print('Epoch: ' , str(epoch))
    print('==================================')
    training_accuracy, training_loss = training(model,loaders,len(X_train))
    #validation_accuracy, validation_loss = evaluate(model, loader_tokenized_validation_data, len(X_val))
    
    print('Training accuracy: ', training_accuracy )
    print('Training loss: ', training_loss)
    #print('Validation accuracy: ', validation_accuracy)
    #print('Validation loss: ', validation_loss)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch:  0


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))

stopped data_loader
Training accuracy:  tensor(0.0696, device='cuda:0')
Training loss:  1.6457522085734777
Epoch:  1


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


Training accuracy:  0.0
Training loss:  nan
Epoch:  2


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


Training accuracy:  0.0
Training loss:  nan
Epoch:  3


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


Training accuracy:  0.0
Training loss:  nan
Epoch:  4


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


Training accuracy:  0.0
Training loss:  nan

