In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score
import transformers
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
from torch import nn
import torch.nn.functional as F
#from tqdm import trange
from tqdm.notebook import tqdm, trange
import os
from transformers import AdamW, get_linear_schedule_with_warmup
import nlpaug
from google.colab import drive
drive.mount('/content/gdrive')

cuda
Mounted at /content/gdrive


In [23]:
PATH = '/content/gdrive/MyDrive/182proj/augmentations/'

In [9]:
rev = pd.read_json('/content/gdrive/MyDrive/182proj/yelp_review_training_dataset.jsonl',lines=True)

In [10]:
rev.head()

Unnamed: 0,review_id,text,stars
0,Q1sbwvVQXV2734tPgoKj4Q,Total bill for this horrible service? Over $8G...,1
1,GJXCdrto3ASJOqKeVWPi6Q,I *adore* Travis at the Hard Rock's new Kelly ...,5
2,2TzJjDVDEuAW6MR5Vuc1ug,I have to say that this office really has it t...,5
3,yi0R0Ugj_xUx_Nek0-_Qig,Went in for a lunch. Steak sandwich was delici...,5
4,11a8sVPMUFtaC7_ABRkmtw,Today was my second out of three sessions I ha...,1


In [12]:
X = rev['text'][:1000]
y = rev['stars'][:1000]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.1, random_state=123)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.4, random_state=123)


In [13]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




In [14]:
class ProcessData(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_len, labels):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.labels = labels
    
    def __getitem__(self, index):
        review = self.data[index]
        label = self.labels[index]
        return self.tokenizer.encode_plus(review, max_length=self.max_len, padding='max_length', return_attention_mask=True, return_tensors='pt', truncation=True), review, torch.tensor([label-1]).to(torch.long)
    
    def __len__(self):
        review_length = len(self.data)
        return review_length

In [15]:
tokenized_training_data = ProcessData(X_train.to_numpy().tolist(), tokenizer, 150, y_train.to_numpy())
tokenized_validation_data = ProcessData(X_val.to_numpy().tolist(), tokenizer, 150, y_val.to_numpy())
tokenized_test_data = ProcessData(X_test.to_numpy().tolist(), tokenizer, 150, y_test.to_numpy())

In [16]:
#Create dataloader
params = {'batch_size': 32,
          'num_workers': 0}
loader_tokenized_training_data = torch.utils.data.DataLoader(tokenized_training_data, **params)
loader_tokenized_validation_data = torch.utils.data.DataLoader(tokenized_validation_data, **params)
loader_tokenized_test_data = torch.utils.data.DataLoader(tokenized_test_data, **params)

In [17]:
class BERTNet(nn.Module):
    def __init__(self, classes):
        super(BERTNet, self).__init__()
        self.bert_model = transformers.BertModel.from_pretrained('bert-base-cased')
        self.drop = nn.Dropout(p=0.3)
        self.linear = nn.Linear(768, classes)
        self.sm = nn.Softmax(dim=1)
    
    def forward(self, input_ids, attention_mask):
        #print(input_ids,attention_mask )
        output = self.bert_model(input_ids, attention_mask)
        output = output[1]
        output = self.drop(output)
        output = self.linear(output)
        return output

### Data Augmentations

In [18]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

In [19]:
X_train[0]

'Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.'

In [20]:
aug = nac.KeyboardAug()
augmented_text = X_train.apply(aug.augment)
augmented_text

875    This shop combines coffee, vreZkfast and coffe...
933    Played an 18 - hole round on a Saturday during...
861    Thej say they are opwn until 7pm. It ' s is &1...
868    Had the lemon grass cTickeJ. It was aKazinv. T...
792    Being a repeat customer, I can honestly say th...
                             ...                        
988    I wanted to like this salon - it seemed cuRe a...
322    I wDnt heEe for ljnch and got the grilled Vaju...
382    Came in with my girl at 8: 05 on Tuesday, stor...
365    I bought a house that had a safe wLth a dial. ...
510    Very rude staff. Slow little to no communicati...
Name: text, Length: 900, dtype: object

In [24]:
augmented_text.to_csv(PATH+'typo1000.csv', index = False)

In [28]:
aug_syn = naw.SynonymAug(aug_src='wordnet')
augmented_text2 = X_train.apply(aug_syn.augment)
augmented_text2.to_csv(PATH+'synonym1000.csv', index=False)
augmented_text2

875    This shop flux coffee, breakfast and umber bre...
933    Played an 18 - hole round on a Saturday during...
861    They say they are assailable until 7pm. Inform...
868    Make the lemon grass chicken. It be amazing. T...
792    Being a repeat customer, I can honestly say th...
                             ...                        
988    I wanted to like this salon - it seemed cute a...
322    1 go hither for lunch and got the grilled caju...
382    Came in with my girl at 8: 05 on Tuesday, stor...
365    I bought a house that had a safe with a dial. ...
510    Very rude staff. Slow petty to no communicatin...
Name: text, Length: 900, dtype: object

In [29]:
aug_rws = naw.RandomWordAug(action='swap')
augmented_text3 = X_train.apply(aug_rws.augment)
augmented_text3.to_csv(PATH+'swap1000.csv', index=False)
augmented_text3

875    This shop coffee combines, breakfast and coffe...
933    Played an 18 - hole round on a Saturday during...
861    Say they are they open until 7pm. It ' s 615pm...
868    Had the grass lemon chicken. It was amazing. T...
792    Being a repeat customer, I can honestly say th...
                             ...                        
988    I wanted to like this - salon it seemed cute a...
322    I went here lunch for got the and cajun grille...
382    Came in with my girl at 8: 05 on Tuesday, stor...
365    I bought a house that a had safe with a dial. ...
510    Very rude staff. Slow little to communication ...
Name: text, Length: 900, dtype: object

In [31]:
aug_del = naw.RandomWordAug(action='delete')
augmented_text4 = X_train.apply(aug_del.augment)
augmented_text4.to_csv(PATH+'delete1000.csv', index=False)
augmented_text4

875    This shop combines coffee, coffee and a yarn s...
933    Played an 18 - hole round on a Saturday during...
861    They say they are open. It ' is 615pm and the ...
868    Had grass chicken. was amazing. The ambience i...
792    Being a repeat customer, I can honestly say th...
                             ...                        
988    I wanted to like this salon - it cute and tren...
322    I went here for and got grilled cajun sandwich...
382    Came in my girl at 8: 05 on Tuesday, store at ...
365    I bought a house that had a safe with a dial. ...
510    Very staff. Slow no communication between the ...
Name: text, Length: 900, dtype: object

In [32]:
X_train.to_csv(PATH+'original1000.csv', index=False)
X_train

875    This shop combines coffee, breakfast and coffe...
933    Played an 18-hole round on a Saturday during t...
861    They say they are open until 7pm.  It's is 615...
868    Had the lemon grass chicken. It was amazing. T...
792    Being a repeat customer, I can honestly say th...
                             ...                        
988    I wanted to like this salon- it seemed cute an...
322    I went here for lunch and got the grilled caju...
382    Came in with my girl at 8:05 on Tuesday, store...
365    I bought a house that had a safe with a dial.t...
510    Very rude staff. Slow little to no communicati...
Name: text, Length: 900, dtype: object

In [38]:
class TransTCN(nn.Module):
    def __init__(self, classes, augmentations, input_size, num_channels, kernel_size=2, dropout=0.3, hidden_state=768):
        super(TransTCN, self).__init__()
        self.berts = []
        for i in range(augmentations):
          self.berts.append(transformers.BertModel.from_pretrained('bert-base-cased'))
        #self.tcn = TemporalConvNet(input_size, num_channels, kernel_size, dropout=dropout)
        self.finalLinear = nn.Linear(hidden_state, classes)

    #(bert -> tcn) * n -> bert -> linear -> softmax
    def forward(self, input_ids, attention_masks):
       bert_outputs = []
       for i in range(len(self.berts)):
         bert_outputs.append(self.berts[i](input_ids[i], attention_masks[i]))
       for j in bert_outputs:
         print(j)
         print(j.shape)
       #concat bert
       #tcn something
       return -1


In [42]:
tokenized_training_data_typo = ProcessData(augmented_text.to_numpy().tolist(), tokenizer, 150, y_train.to_numpy())
tokenized_training_data_syn = ProcessData(augmented_text2.to_numpy().tolist(), tokenizer, 150, y_train.to_numpy())
tokenized_training_data_swap = ProcessData(augmented_text3.to_numpy().tolist(), tokenizer, 150, y_train.to_numpy())
tokenized_training_data_del = ProcessData(augmented_text4.to_numpy().tolist(), tokenizer, 150, y_train.to_numpy())
tokenized_training_data= ProcessData(X_train.to_numpy().tolist(), tokenizer, 150, y_train.to_numpy())
#Create dataloader
loader_tokenized_training_data = torch.utils.data.DataLoader(tokenized_training_data, **params)
loader_tokenized_training_data_typo = torch.utils.data.DataLoader(tokenized_training_data_typo, **params)
loader_tokenized_training_data_syn = torch.utils.data.DataLoader(tokenized_training_data_syn, **params)
loader_tokenized_training_data_swap = torch.utils.data.DataLoader(tokenized_training_data_swap, **params)
loader_tokenized_training_data_del = torch.utils.data.DataLoader(tokenized_training_data_del, **params)
loaders = [loader_tokenized_training_data, loader_tokenized_training_data_typo, loader_tokenized_training_data_syn, loader_tokenized_training_data_swap, loader_tokenized_training_data_del]

In [39]:
model = TransTCN(5,5,1,[5,5]).to(device)
from torch.optim import Adam
criterion = nn.CrossEntropyLoss().to(device)
optimizer = AdamW(model.parameters(), lr=5e-5, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,num_training_steps=len(loader_tokenized_training_data) * 5)

In [41]:
def training(model, data_loaders, size):
    model = model.train()
    losses = []
    training_acc = 0
    for data_loader in tqdm(data_loaders):
      for data in tqdm(data_loader):
          batch_ids = data[0]['input_ids']
          batch_ids = batch_ids.flatten().reshape((batch_ids.shape[0], batch_ids.shape[2]))
          batch_masks = data[0]['attention_mask']
          batch_masks = batch_masks.flatten().reshape((batch_masks.shape[0], batch_masks.shape[2]))
          data[2] = data[2].to(device)
        
        
          output = model(batch_ids.to(device), batch_masks.to(device))
          #print(output)
      
          prediction = torch.max(output, 1)[1]
  
          training_loss = criterion(output, torch.flatten(data[2]))
          training_acc += torch.sum(prediction == torch.flatten(data[2]))
    
          losses.append(training_loss.item())
          training_loss.backward()
          #nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
          
          optimizer.step()
          scheduler.step()
          optimizer.zero_grad()
        
    return training_acc / size, np.mean(losses)

In [43]:
highest_acc = 0
for epoch in trange(5):
    print('Epoch: ' , str(epoch))
    print('==================================')
    training_accuracy, training_loss = training(model,loaders,len(X_train) )
    #validation_accuracy, validation_loss = evaluate(model, loader_tokenized_validation_data, len(X_val))
    
    print('Training accuracy: ', training_accuracy )
    print('Training loss: ', training_loss)
    #print('Validation accuracy: ', validation_accuracy)
    #print('Validation loss: ', validation_loss)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch:  0


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))

ValueError: ignored