In [0]:
# Mount Google Drive at /content/drive so later cells can read the data and
# helper modules stored under "My Drive" (interactive: asks for an auth code).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
!pip install pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 18.6MB/s eta 0:00:01[K     |█████▎                          | 20kB 5.0MB/s eta 0:00:01[K     |████████                        | 30kB 6.9MB/s eta 0:00:01[K     |██████████▋                     | 40kB 8.6MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 5.7MB/s eta 0:00:01[K     |███████████████▉                | 61kB 6.6MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 7.0MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 7.6MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 8.3MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 7.2MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 7.2MB/s eta 0:00:01[K     |██████████████████████

In [0]:
!cp /content/drive/'My Drive'/test_task/bert.py .
!cp /content/drive/'My Drive'/test_task/utils.py .

In [0]:
from bert import *
from utils import *

100%|██████████| 231508/231508 [00:00<00:00, 959086.77B/s]


First, I will define the BERT model the same way as before:

In [0]:
from pytorch_pretrained_bert import BertConfig
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
import pickle
import torch.optim as optim

In [0]:
# Number of target classes for the classifier head.
num_labels = 5

# BERT-base sized configuration.
# NOTE(review): the model printed further down shows a 30522-row word
# embedding, so this vocab size of 32000 does not seem to determine the
# embedding table -- confirm how bert.py consumes `config`.
config = BertConfig(
    vocab_size_or_config_json_file=32000,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
)

In [0]:
# Build the classifier with `num_labels` output classes.
# BertForSequenceClassification is not imported explicitly in this notebook;
# presumably it comes from the star import of bert.py -- confirm.
model = BertForSequenceClassification(config, num_labels)

100%|██████████| 407873900/407873900 [00:14<00:00, 28026816.57B/s]


Loading the data and the saved split indexes, since we want a clean comparison:

In [0]:
import pandas as pd
# The reviews file is in JSON Lines format (one JSON object per line), hence lines=True.
df = pd.read_json("/content/drive/My Drive/test_task/reviews.json", lines=True)

In [0]:
# Load the pickled split indexes (unpacked into train/validation in the next cell).
# NOTE: pickle.load can execute arbitrary code while deserialising -- only load
# files that you created yourself or otherwise trust.
with open('/content/drive/My Drive/test_task/indexes.pickle', 'rb') as f:
    indexes = pickle.load(f)

In [0]:
# The pickle holds exactly two index collections: train first, validation second.
# They are used positionally (via .iloc) when the frames are built below.
train_idxs, validation_idxs = indexes

Getting train and val datasets:

In [0]:
# Keep only the two text fields and the rating. Every selected column is cast
# to str here; the rating is converted back to an integer label afterwards.
columns = ['reviewText', 'summary', 'overall']
train, val = (
    df.iloc[split_idxs][columns].astype(str)
    for split_idxs in (train_idxs, validation_idxs)
)

As in the previous notebooks, I will shift the labels down by 1 so that they start at 0, which is the format the loss function expects:

In [0]:
# Shift the ratings down by one so that class labels start at 0.
# The labels are recomputed from the untouched `df` rather than from the column
# being overwritten, so re-running this cell cannot shift them a second time.
# .to_numpy() makes the assignment positional (same row order as the frames
# built with df.iloc[...]), and the subtraction is vectorised.
train['overall'] = df.iloc[train_idxs]['overall'].astype(int).to_numpy() - 1
val['overall'] = df.iloc[validation_idxs]['overall'].astype(int).to_numpy() - 1

At this step I will create lists of the training and validation data for both text columns, plus the labels:

In [0]:
# Plain Python lists (review texts, summary texts, labels) for each split;
# the dataset class indexes into these directly.
X_train_rev, X_train_sum, y_train = (
    train[name].to_list() for name in ('reviewText', 'summary', 'overall')
)

X_val_rev, X_val_sum, y_val = (
    val[name].to_list() for name in ('reviewText', 'summary', 'overall')
)

I will extend `text_dataset` so that it can feed the model two texts at once, separated by a [SEP] token:

In [0]:
class text_pairs_dataset(text_dataset):
    """Dataset that encodes a (review, summary) pair as one fixed-length id sequence.

    Every item is laid out as::

        <review ids, truncated/padded to max_review_length>
        + [SEP]
        + <summary ids, truncated/padded to max_summ_length>

    so each sequence has exactly ``max_review_length + max_summ_length + 1`` ids.

    NOTE(review): no [CLS] id is prepended, the review padding sits before the
    separator, and no attention mask or segment ids are produced. This layout is
    kept exactly as it was so results stay comparable -- confirm it is intended.

    Args:
        x_y_list: indexable holding three parallel sequences, in order:
            review texts, summary texts, labels.
        max_review_length: number of id positions reserved for the review.
        max_summ_length: number of id positions reserved for the summary.
        transform: stored on the instance; not used by this class itself.
    """

    # Id written into unused positions after a text's own token ids.
    PAD_TOKEN_ID = 0
    # Id of the [SEP] token in the tokenizer's vocabulary (102, as hard-coded
    # originally) -- re-check this if the tokenizer/vocabulary is ever changed.
    SEP_TOKEN_ID = 102

    def __init__(self, x_y_list, max_review_length, max_summ_length, transform=None):
        self.x_y_list = x_y_list
        self.transform = transform
        self.max_review_length = max_review_length
        self.max_summ_length = max_summ_length

    def _encode_fixed_length(self, text, max_length):
        """Tokenize ``text`` and return a list of exactly ``max_length`` token ids.

        Longer texts are truncated on the right; shorter ones are right-padded
        with ``PAD_TOKEN_ID``.
        """
        tokens = tokenizer.tokenize(text)[:max_length]
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        return token_ids + [self.PAD_TOKEN_ID] * (max_length - len(token_ids))

    def __getitem__(self, index):
        """Return ``(ids, label)`` for the pair at ``index``.

        ``ids`` is a 1-D tensor of token ids in the layout described on the
        class; ``label`` is the label at ``index`` converted to a tensor.
        """
        # Encode the first sequence (review) and the second one (summary).
        ids_review = self._encode_fixed_length(self.x_y_list[0][index],
                                               self.max_review_length)
        ids_sum = self._encode_fixed_length(self.x_y_list[1][index],
                                            self.max_summ_length)

        # Join them with the [SEP] token in between.
        ids = ids_review + [self.SEP_TOKEN_ID] + ids_sum

        assert len(ids) == self.max_review_length + self.max_summ_length + 1

        # Same conversion as before (via numpy), so the label dtype is unchanged.
        label = torch.from_numpy(np.array(self.x_y_list[2][index]))

        return torch.tensor(ids), label

I will use the same lengths as I used before for review and summary fields.

In [0]:
batch_size = 32

# Token budget per field; each encoded pair has
# max_review_length + max_summ_length + 1 ids (the extra one is [SEP]).
max_review_length = 60
max_summ_length = 60

# Parallel lists in the order the dataset expects: reviews, summaries, labels.
train_lists = [X_train_rev, X_train_sum, y_train]
val_lists = [X_val_rev, X_val_sum, y_val]

training_dataset = text_pairs_dataset(x_y_list=train_lists,
                                      max_review_length=max_review_length,
                                      max_summ_length=max_summ_length)

# Despite the name, this wraps the validation split.
test_dataset = text_pairs_dataset(x_y_list=val_lists,
                                  max_review_length=max_review_length,
                                  max_summ_length=max_summ_length)

dataloaders_dict = {
    # Training batches are reshuffled every epoch.
    'train': torch.utils.data.DataLoader(training_dataset,
                                         batch_size=batch_size,
                                         shuffle=True,
                                         num_workers=0),
    # Validation is iterated in a fixed order: loss and F1 do not depend on
    # the order, and a stable order keeps any predictions collected during
    # validation aligned with `val_lists` (assuming train_model gathers them
    # in loader order -- confirm).
    'val': torch.utils.data.DataLoader(test_dataset,
                                       batch_size=batch_size,
                                       shuffle=False,
                                       num_workers=0),
}

dataset_sizes = {'train': len(train_lists[0]),
                 'val': len(val_lists[0])}

# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [0]:
# Move the model's parameters to the selected device. As the last expression
# of the cell, the returned module is also echoed, which prints the full
# architecture below.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

The same hyperparameters:

In [0]:
# Two learning rates: a small one for the BERT encoder and a larger one for
# the classification head.
lrmain = 1e-5
lrlast = 1e-3

optimizer = optim.Adam(
    [
        dict(params=model.bert.parameters(), lr=lrmain),
        dict(params=model.classifier.parameters(), lr=lrlast),
    ]
)

criterion = nn.CrossEntropyLoss()

# StepLR multiplies each learning rate by 0.1 after every 4 scheduler steps.
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.1)

# Identifier handed to train_model together with the model.
model_name = 'bert2texts'

In [0]:
# Fine-tune for 4 epochs. train_model is not defined in this notebook
# (presumably it comes from the star import of utils.py); it returns the model
# and `raw_preds`, which is pickled at the end of the notebook.
# NOTE(review): the recorded log shows this run took roughly 226 minutes.
model, raw_preds = train_model(model, model_name, criterion, optimizer, exp_lr_scheduler, dataloaders_dict, dataset_sizes, device, num_epochs=4)

starting
Epoch 1/4
----------




train total loss: 0.4862 
train F-1 score : 0.8193 
val total loss: 0.4133 
val F-1 score : 0.8437 
saving with loss of 0.4132876981496811 improved over previous 100

Epoch 2/4
----------
train total loss: 0.3853 
train F-1 score : 0.8525 
val total loss: 0.3997 
val F-1 score : 0.8465 
saving with loss of 0.3996911370277405 improved over previous 0.4132876981496811

Epoch 3/4
----------
train total loss: 0.3330 
train F-1 score : 0.8710 
val total loss: 0.4183 
val F-1 score : 0.8505 

Epoch 4/4
----------
train total loss: 0.2549 
train F-1 score : 0.9031 
val total loss: 0.4521 
val F-1 score : 0.8457 

Training complete in 226m 14s
Best val loss: 0.399691


As we can see, this approach outperformed the two separate models with a third model built on their predictions.

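Four epochs were too many: the model started to overfit after the second epoch (the validation loss went up while the training loss kept falling). But the weights with the best validation loss were saved, so it doesn't matter.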

In [0]:
# Persist the raw predictions returned by train_model to Drive for later use.
with open('/content/drive/My Drive/test_task/two_sent.pickle', 'wb') as out_file:
    pickle.dump(raw_preds, out_file)