In [1]:
!pip install transformers
!pip install nlpaug
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score
import transformers
import torch
# Pick the compute device once; later cells move models and batches onto it.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)
from torch import nn
import torch.nn.functional as F
#from tqdm import trange
from tqdm.notebook import tqdm, trange
import os
from transformers import AdamW, get_linear_schedule_with_warmup
import nlpaug
from google.colab import drive
# Mount Google Drive at /content/gdrive; the dataset file and the augmentation
# directory used by later cells are both addressed through this mount point.
drive.mount('/content/gdrive')

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 20.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 51.8MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 42.3MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1
Collecting nlpaug
[?25l  Downloading https://files.pythonh

In [2]:
# Directory on the mounted Drive where the augmented-review CSV files are
# written by the augmentation cells and read back before training.
PATH = '/content/gdrive/MyDrive/182proj/augmentations/'

In [3]:
# Load the Yelp review training set; lines=True parses the file as JSON Lines
# (one JSON record per line).
rev = pd.read_json('/content/gdrive/MyDrive/182proj/yelp_review_training_dataset.jsonl',lines=True)

In [4]:
# Preview the first rows of the loaded frame (columns shown: review_id, text, stars).
rev.head()

Unnamed: 0,review_id,text,stars
0,Q1sbwvVQXV2734tPgoKj4Q,Total bill for this horrible service? Over $8G...,1
1,GJXCdrto3ASJOqKeVWPi6Q,I *adore* Travis at the Hard Rock's new Kelly ...,5
2,2TzJjDVDEuAW6MR5Vuc1ug,I have to say that this office really has it t...,5
3,yi0R0Ugj_xUx_Nek0-_Qig,Went in for a lunch. Steak sandwich was delici...,5
4,11a8sVPMUFtaC7_ABRkmtw,Today was my second out of three sessions I ha...,1


In [5]:
# Restrict the experiment to the first 1000 reviews.
X, y = rev['text'].iloc[:1000], rev['stars'].iloc[:1000]

# Stage 1: hold out 10% of the subset; the remaining 90% is the training set.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.1, random_state=123
)

# Stage 2: split the hold-out portion 60/40 into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.4, random_state=123
)


In [6]:
# Tokenizer for the 'bert-base-cased' checkpoint, i.e. the same checkpoint name
# the BERT encoders in the model classes below are loaded from.
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')

In [7]:
class ProcessData(torch.utils.data.Dataset):
    """Dataset that pairs review texts with star labels and tokenizes on access.

    Args:
        data: indexable collection of review texts.
        tokenizer: object exposing ``encode_plus`` (a BERT tokenizer in this notebook).
        max_len: length every encoded review is padded / truncated to.
        labels: indexable collection of star ratings aligned with ``data``.
    """

    def __init__(self, data, tokenizer, max_len, labels):
        self.data, self.labels = data, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of reviews held by the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(encoding, raw_review, target)`` for the review at ``index``.

        ``target`` is a one-element long tensor holding ``label - 1`` so that the
        star ratings become zero-based class indices.
        """
        text = self.data[index]
        stars = self.labels[index]
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        target = torch.tensor([stars - 1]).to(torch.long)
        return encoding, text, target

In [8]:
# Wrap each split in a ProcessData dataset; every split uses the shared
# tokenizer and a fixed encoded length of 150 tokens.
tokenized_training_data, tokenized_validation_data, tokenized_test_data = (
    ProcessData(texts.to_numpy().tolist(), tokenizer, 150, stars.to_numpy())
    for texts, stars in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [9]:
# DataLoader settings shared by every loader in the notebook (re-used by the
# augmented-data loaders further down).
params = {'batch_size': 32, 'num_workers': 0}

# One loader per split, built in the order train, validation, test.
(
    loader_tokenized_training_data,
    loader_tokenized_validation_data,
    loader_tokenized_test_data,
) = (
    torch.utils.data.DataLoader(dataset, **params)
    for dataset in (tokenized_training_data, tokenized_validation_data, tokenized_test_data)
)

In [10]:
class BERTNet(nn.Module):
    """Single-encoder baseline: BERT -> dropout -> linear classification head.

    Args:
        classes: number of output classes of the linear head.
    """

    def __init__(self, classes):
        super().__init__()
        self.bert_model = transformers.BertModel.from_pretrained('bert-base-cased')
        self.drop = nn.Dropout(p=0.3)
        # 768 is the input width of the head; it has to match the encoder's output width.
        self.linear = nn.Linear(768, classes)
        # Defined but not applied in forward(); forward returns the linear layer's raw output.
        self.sm = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Return the linear head's output for a batch of encoded reviews."""
        encoder_outputs = self.bert_model(input_ids, attention_mask)
        # Element 1 of the encoder output is used as the sentence representation
        # (the pooled output in transformers' BertModel).
        pooled = encoder_outputs[1]
        return self.linear(self.drop(pooled))

### Data Augmentations

In [None]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

In [None]:
# Look at one training review. NOTE(review): X_train keeps the original row
# labels after the shuffled split, so [0] appears to select the row labelled 0
# rather than the first row of X_train — confirm that is what is intended.
X_train[0]

'Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.'

In [None]:
# Character-level augmentation: run nlpaug's KeyboardAug over every training
# review (the displayed sample shows typo-like character substitutions).
aug = nac.KeyboardAug()
augmented_text = X_train.apply(aug.augment)
# Display the augmented Series (pandas abbreviates it).
augmented_text

875    Th7s shop combines coffee, breakfast and coffe...
933    Played an 18 - hole round on a Saturday during...
861    They say Fhey are opSn until 7pm. It ' s is 61...
868    Had the lemon g#ass chJcMen. It was anaziJg. T...
792    Being a re0eat customer, I can h*Jestly say th...
                             ...                        
988    I wanted to like this salon - it seemed cute a...
322    I went Yere for lKnch and got the grilled caju...
382    Came in with my girl at 8: 05 on Tuesday, stor...
365    I bought a house that had a saff with a dial. ...
510    Very rude staff. Slow little to no communicati...
Name: text, Length: 900, dtype: object

In [None]:
# Persist the typo-augmented reviews (row index omitted) so they can be
# reloaded later without re-running the augmentation.
# NOTE(review): Series.to_csv writes a header line by default — make sure the
# code that reads this file back accounts for it, otherwise texts and labels
# end up shifted by one row.
augmented_text.to_csv(PATH+'typo1000.csv', index = False)

In [None]:
# Word-level augmentation: substitute words with synonyms drawn from WordNet.
aug_syn = naw.SynonymAug(aug_src='wordnet')
augmented_text2 = X_train.apply(aug_syn.augment)
# Save the synonym-augmented reviews next to the other augmentation files.
augmented_text2.to_csv(PATH+'synonym1000.csv', index=False)
# Display the augmented Series.
augmented_text2

875    This shop combines coffee, breakfast and coffe...
933    Played an 18 - hole round on a Saturday during...
861    They say they be overt until 7pm. It ' s is 61...
868    Had the lemon grass chicken. Information techn...
792    Being a repeat customer, One can honestly pron...
                             ...                        
988    I wanted to wish this salon - it seemed cute a...
322    I blend in here for lunch and got the grilled ...
382    Came in with my girl at 8: 05 on Tuesday, stor...
365    I bought a house that had a safe with a dial. ...
510    Very rude staff. Dull little to no communicati...
Name: text, Length: 900, dtype: object

In [None]:
# Word-level augmentation: RandomWordAug with action='swap' (the displayed
# sample shows neighbouring words exchanged).
aug_rws = naw.RandomWordAug(action='swap')
augmented_text3 = X_train.apply(aug_rws.augment)
# Save the swap-augmented reviews next to the other augmentation files.
augmented_text3.to_csv(PATH+'swap1000.csv', index=False)
# Display the augmented Series.
augmented_text3

875    This combines shop coffee, breakfast and break...
933    Played an 18 - hole round on a Saturday during...
861    They say they are until open. 7pm It ' s is an...
868    The lemon had grass. chicken It was amazing. a...
792    A being repeat customer, I can say honestly th...
                             ...                        
988    I wanted to like this salon - it seemed cute a...
322    I here for went lunch and got the grilled caju...
382    Came in with my girl at 8: 05 on Tuesday, stor...
365    I bought a house that had a safe with dial a. ...
510    Very rude staff. little Slow to no between com...
Name: text, Length: 900, dtype: object

In [None]:
# Word-level augmentation: RandomWordAug with action='delete' (the displayed
# sample shows words removed from the reviews).
aug_del = naw.RandomWordAug(action='delete')
augmented_text4 = X_train.apply(aug_del.augment)
# Save the deletion-augmented reviews next to the other augmentation files.
augmented_text4.to_csv(PATH+'delete1000.csv', index=False)
# Display the augmented Series.
augmented_text4

875    Shop coffee, breakfast coffee break items and ...
933    Played an 18 - hole round on a Saturday the mi...
861    They say they are open. It ' is 615pm and the ...
868    Had chicken. It was amazing. The great! I high...
792    Being repeat customer, I can honestly say the ...
                             ...                        
988    I wanted to like this salon - it seemed cute a...
322    Went here for lunch grilled chicken salad, bot...
382    Came in with my girl at 8: 05 on Tuesday, stor...
365    I bought a house that had a safe with a dial. ...
510    Rude staff. Slow little to no communication be...
Name: text, Length: 900, dtype: object

In [None]:
# Save the unaugmented training reviews alongside the augmented versions.
X_train.to_csv(PATH+'original1000.csv', index=False)
# Display the original training reviews for comparison with the augmented ones.
X_train

875    This shop combines coffee, breakfast and coffe...
933    Played an 18-hole round on a Saturday during t...
861    They say they are open until 7pm.  It's is 615...
868    Had the lemon grass chicken. It was amazing. T...
792    Being a repeat customer, I can honestly say th...
                             ...                        
988    I wanted to like this salon- it seemed cute an...
322    I went here for lunch and got the grilled caju...
382    Came in with my girl at 8:05 on Tuesday, store...
365    I bought a house that had a safe with a dial.t...
510    Very rude staff. Slow little to no communicati...
Name: text, Length: 900, dtype: object

In [11]:
class TransTCN(nn.Module):
    """Multi-view classifier with one BERT encoder per text view.

    A "view" is the original review or one of its augmented versions.  The TCN
    fusion stage sketched in the design note below is not implemented yet, so
    ``input_size``, ``num_channels``, ``kernel_size`` and ``dropout`` are
    accepted but currently unused.  Until it exists, the per-view sentence
    representations are averaged and fed to the linear head, so that
    ``forward`` returns class logits that a loss function can consume.

    Args:
        classes: number of output classes.
        augmentations: number of BERT encoders to create (one per view);
            ``forward`` takes five views, so this must be at least 5.
        input_size: reserved for the TCN stage (unused).
        num_channels: reserved for the TCN stage (unused).
        kernel_size: reserved for the TCN stage (unused).
        dropout: reserved for the TCN stage (unused).
        hidden_state: input width of the linear head; has to match the width
            of the encoders' pooled output.
    """

    def __init__(self, classes, augmentations, input_size, num_channels, kernel_size=2, dropout=0.3, hidden_state=768):
        super(TransTCN, self).__init__()
        # nn.ModuleList (not a plain list) registers the encoders as sub-modules,
        # so model.parameters(), model.to(device), train()/eval() and
        # state_dict() all include them.  With a plain list the optimizer only
        # ever saw finalLinear.
        self.berts = nn.ModuleList([
            transformers.BertModel.from_pretrained('bert-base-cased')
            for _ in range(augmentations)
        ])
        #self.tcn = TemporalConvNet(input_size, num_channels, kernel_size, dropout=dropout)
        self.finalLinear = nn.Linear(hidden_state, classes)

    #(bert -> tcn) * n -> bert -> linear -> softmax
    def forward(self, a,b,c,d,e,f,g,h,m,n):
        """Return class logits for a batch seen through five views.

        Args:
            a, b, c, d, e: input-id tensors, one per view.
            f, g, h, m, n: attention-mask tensors matching ``a`` .. ``e`` in order.

        Returns:
            Output of the linear head applied to the mean of the per-view
            sentence representations.

        Raises:
            ValueError: if the model was built with fewer than five encoders.
        """
        token_ids = (a, b, c, d, e)
        attention_masks = (f, g, h, m, n)
        if len(self.berts) < len(token_ids):
            raise ValueError(
                'TransTCN.forward needs %d encoders but the model was built with %d'
                % (len(token_ids), len(self.berts))
            )

        # Element 1 of each encoder output is used as the sentence
        # representation, the same element BERTNet reads.
        pooled_views = [
            bert(input_ids=ids, attention_mask=mask)[1]
            for bert, ids, mask in zip(self.berts, token_ids, attention_masks)
        ]

        # Placeholder fusion until the TCN stage is implemented: average the views.
        fused = torch.stack(pooled_views, dim=0).mean(dim=0)
        return self.finalLinear(fused)


In [12]:
def _load_augmented_texts(filename, expected_rows):
    """Read one augmented-review CSV from PATH as a Series of review texts.

    The files were written with ``Series.to_csv(index=False)``, which emits a
    header line by default.  Reading with ``header=None`` returns that line as
    the first data row, which would shift every review by one against the
    labels.  The extra leading row is dropped when the file is exactly one row
    longer than expected, so files written with or without a header both load.

    Args:
        filename: file name inside ``PATH``.
        expected_rows: number of reviews the file must contain.

    Returns:
        pandas Series with ``expected_rows`` entries and a fresh 0..n-1 index.

    Raises:
        ValueError: if the row count cannot be reconciled with ``expected_rows``.
    """
    texts = pd.read_csv(PATH + filename, header=None)[0]
    if len(texts) == expected_rows + 1:
        texts = texts.iloc[1:]
    if len(texts) != expected_rows:
        raise ValueError(
            '%s holds %d rows but %d labels were given' % (filename, len(texts), expected_rows)
        )
    return texts.reset_index(drop=True)


augmented_text = _load_augmented_texts('typo1000.csv', len(y_train))
augmented_text2 = _load_augmented_texts('synonym1000.csv', len(y_train))
augmented_text3 = _load_augmented_texts('swap1000.csv', len(y_train))
augmented_text4 = _load_augmented_texts('delete1000.csv', len(y_train))

# Pass flat lists of strings: DataFrame.to_numpy().tolist() yields one-element
# lists per row, so the tokenizer would receive a list instead of a string.
tokenized_training_data_typo = ProcessData(augmented_text.tolist(), tokenizer, 150, y_train.to_numpy())
tokenized_training_data_syn = ProcessData(augmented_text2.tolist(), tokenizer, 150, y_train.to_numpy())
tokenized_training_data_swap = ProcessData(augmented_text3.tolist(), tokenizer, 150, y_train.to_numpy())
tokenized_training_data_del = ProcessData(augmented_text4.tolist(), tokenizer, 150, y_train.to_numpy())
tokenized_training_data= ProcessData(X_train.to_numpy().tolist(), tokenizer, 150, y_train.to_numpy())

# Create the dataloaders. They are intentionally NOT shuffled: batch k of every
# loader has to contain the same reviews (in different views) with the same labels.
loader_tokenized_training_data = torch.utils.data.DataLoader(tokenized_training_data, **params)
loader_tokenized_training_data_typo = torch.utils.data.DataLoader(tokenized_training_data_typo, **params)
loader_tokenized_training_data_syn = torch.utils.data.DataLoader(tokenized_training_data_syn, **params)
loader_tokenized_training_data_swap = torch.utils.data.DataLoader(tokenized_training_data_swap, **params)
loader_tokenized_training_data_del = torch.utils.data.DataLoader(tokenized_training_data_del, **params)

# Iterators in the order the model expects its views: original, typo, synonym, swap, delete.
loaders = [iter(loader_tokenized_training_data), iter(loader_tokenized_training_data_typo), iter(loader_tokenized_training_data_syn), iter(loader_tokenized_training_data_swap), iter(loader_tokenized_training_data_del)]

In [13]:
from torch.optim import Adam

# Five-class model with five encoders, placed on the selected device.
model = TransTCN(classes=5, augmentations=5, input_size=1, num_channels=[5, 5]).to(device)

# Loss over class indices, kept on the same device as the model.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

# Optimizer plus a linear decay schedule (no warm-up) spanning five passes over
# the training loader.
optimizer = AdamW(model.parameters(), lr=5e-5, correct_bias=False)
total_training_steps = 5 * len(loader_tokenized_training_data)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [14]:
def training(model, data_loaders, size):
    """Run one training epoch over several aligned views of the training set.

    Relies on the notebook-level ``criterion``, ``optimizer``, ``scheduler``
    and ``device`` objects.

    Args:
        model: module called with all input-id tensors followed by all
            attention-mask tensors (one of each per view); expected to return
            ``(batch, classes)`` logits.
        data_loaders: sequence of DataLoaders, or iterators over them, one per
            view.  Batch k of every loader must hold the same reviews with the
            same labels.  Each batch is the collated
            ``(encoding, review, label)`` triple produced by ProcessData.
        size: number of training examples; denominator of the accuracy.

    Returns:
        ``(accuracy, mean_loss)`` for the epoch.  ``mean_loss`` is NaN when the
        loaders yield no batch (e.g. already-exhausted iterators).
    """
    model = model.train()
    losses = []
    training_acc = 0

    # zip() has no length, so give the progress bar its total when it is known.
    try:
        steps = len(data_loaders[0])
    except TypeError:
        steps = None

    # zip() advances every view exactly once per step.  Iterating the first
    # loader in the `for` statement AND calling next() on it inside the loop
    # consumed it twice per step, misaligning it with the other views.
    for view_batches in tqdm(zip(*data_loaders), total=steps):
        batch_ids_list = []
        batch_masks_list = []
        for batch in view_batches:
            encoding = batch[0]

            # Collapse (batch, 1, seq_len) to (batch, seq_len).
            batch_ids = encoding['input_ids']
            batch_ids = batch_ids.reshape((batch_ids.shape[0], batch_ids.shape[2]))
            batch_ids_list.append(batch_ids.to(device))

            batch_masks = encoding['attention_mask']
            batch_masks = batch_masks.reshape((batch_masks.shape[0], batch_masks.shape[2]))
            batch_masks_list.append(batch_masks.to(device))

        # All views carry the same labels; take them from the last view.
        labels = torch.flatten(view_batches[-1][2].to(device))

        output = model(*batch_ids_list, *batch_masks_list)
        prediction = torch.max(output, 1)[1]

        training_loss = criterion(output, labels)
        training_acc += torch.sum(prediction == labels)

        losses.append(training_loss.item())
        training_loss.backward()
        #nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    mean_loss = np.mean(losses) if losses else float('nan')
    return training_acc / size, mean_loss

In [15]:
# Loaders for the five aligned views, in the order training() hands them to the
# model: original, typo, synonym, swap, delete.
training_view_loaders = [
    loader_tokenized_training_data,
    loader_tokenized_training_data_typo,
    loader_tokenized_training_data_syn,
    loader_tokenized_training_data_swap,
    loader_tokenized_training_data_del,
]

highest_acc = 0
for epoch in trange(5):
    print('Epoch: ' , str(epoch))
    print('==================================')
    # Build fresh iterators for every epoch: a DataLoader iterator is exhausted
    # after one pass, so re-using the same iterators would leave every epoch
    # after the first without data.
    epoch_iterators = [iter(loader) for loader in training_view_loaders]
    training_accuracy, training_loss = training(model, epoch_iterators, len(X_train))
    #validation_accuracy, validation_loss = evaluate(model, loader_tokenized_validation_data, len(X_val))

    print('Training accuracy: ', training_accuracy )
    print('Training loss: ', training_loss)
    #print('Validation accuracy: ', validation_accuracy)
    #print('Validation loss: ', validation_loss)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch:  0


HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))

RuntimeError: ignored