# BERT for yelp prediction
Recall our goals: 1). Find what attributes are ``important'' in sentiment analysis; 2). Predict ratings based on features.

# Preprocessing
We preprocess data into numpy format

In [0]:
import numpy as np
import pandas as pd

class data:
    def __init__(self, fileName):
        df = pd.read_csv(fileName)
        values = df.values
        # Star ratings, notice for test/validation data, these are actually ids
        self.stars = np.array(values[:, 0], dtype = int)
        # Name of restaurants
        self.names = values[:, 1]
        # Text of review
        self.text = values[:, 2]
        # Date of review
        self.date = values[:, 3]
        # City of restaurant
        self.city = values[:, 7]
        # Misc categories, might not use it
        self.category = values[:, 10]
        # Sentiment score
        self.sentiment = np.nan_to_num(values[:, 13])
        s = {0, 1, 2, 3, 7, 10}
        cols = []
        for i in range(values.shape[1]):
            if i not in s: cols.append(i)
        # Other numerical measurements
        self.numerical = np.array(values[:, cols], dtype = np.float64)
        self.text_df = None
    
    def center(self):
        # Center all numerical attributes to mean 0
        mean = np.nanmean(self.numerical, axis = 0)
        self.numerical -= mean
        self.numerical = np.nan_to_num(self.numerical)

    '''def transfer_text(self):
        self.text_df = pd.DataFrame({
            "id": range(len(self.text)),
            "label": self.stars,
            "alpha": ["a"] * len(self.text),
            "text": self.text
        })
        self.text_df["text"] = self.text_df["text"].replace(r"\n", " ", regex = True)'''

    

train_data = data("/content/Yelp_train.csv")
validation_data = data("/content/Yelp_validate.csv")
test_data = data("/content/Yelp_test.csv")

# Setup BERT
Here we import all modules BERT needs, preprocess text data into desired
format

In [3]:
!pip install pytorch_pretrained_bert

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

Collecting pytorch_pretrained_bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |████████████████████████████████| 133kB 3.5MB/s 
[?25hCollecting regex
[?25l  Downloading https://files.pythonhosted.org/packages/e3/8e/cbf2295643d7265e7883326fb4654e643bfc93b3a8a8274d8010a39d8804/regex-2019.11.1-cp36-cp36m-manylinux1_x86_64.whl (643kB)
[K     |████████████████████████████████| 645kB 42.8MB/s 
Installing collected packages: regex, pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.6.2 regex-2019.11.1


Using TensorFlow backend.


RuntimeError: ignored

In [4]:
# Add proper prefix/suffix to each sentence
sentences = ["[CLS] " + text + " [SEP]" for text in train_data.text]
# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased", do_lower_case = False)
# Tokenize each sentence with respect to BERT vocabulary
tokenized_text = [tokenizer.tokenize(sent) for sent in sentences]

100%|██████████| 213450/213450 [00:00<00:00, 2392840.75B/s]


In [0]:
# Convert all sequences to BERT ids
# NOTE: BERT has a maximum sequence length of 512, so many review texts have
# been truncated
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_text],
                          maxlen = 512, dtype = "long", truncating = "post", padding = "post")

In [0]:
# Attention masks for BERT, in our case, we wish to predict all words, so no 
# mask needed
attentions = []
for seq in input_ids:
  attentions.append([float(i > 0) for i in seq])

In [0]:
attentions

In [0]:
# Split training and validation data 8:2
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, train_data.stars, random_state = 2019,
                                                                                    test_size = 0.2)
train_masks, validation_masks, _, _ = train_test_split(attentions, input_ids, random_state = 2019, test_size = 0.2)

# Turn all data into tensors
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Set up iterators for tensors
batch_size = 32
train_tensor = TensorDataset(train_inputs, train_masks, train_labels)
validation_tensor = TensorDataset(validation_inputs, validation_masks, validation_labels)
train_sampler = RandomSampler(train_tensor)
validation_sampler = SequentialSampler(validation_tensor)
train_dataloader = DataLoader(train_tensor, sampler = train_sampler, batch_size = batch_size)
validation_dataloader = DataLoader(validation_tensor, sampler = validation_sampler,
                                   batch_size = batch_size)

In [0]:
model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels = 5)
model.cuda()

In [1]:
parameters = list(model.named_parameters())
no_decay = ["bias", "gamma", "beta"]
grouped_paras = [
                 {"params": [p for n, p in parameters if not any(nd in n for nd in no_decay)],
                  "weight_decay_rate": 0.01},
                 {"params": [p for n, p in parameters if any(nd in n for nd in no_decay)],
                  "weight_decay_rate": 0.0}
]

optimizer = BertAdam(grouped_paras, lr = 2e-5, warmup = 0.01)

train_loss_list = []
epochs = 4

for _ in trange(epochs, desc = "Epoch"):
  model.train()
  train_loss = 0
  number_train_examples, number_train_steps = 0, 0
  for step, batch in enumerate(train_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_masks, b_labels = batch
    optimizer.zero_grad()
    loss = model(b_input_ids, token_type_ids = None, attention_mask = b_masks,
                 labels = b_labels)
    train_loss_list.append(loss.item())
    loss.backward()
    optimizer.step()
    train_loss += loss.item()
    number_train_examples += b_input_ids.size(0)
    number_train_steps += 1




NameError: ignored