In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter, defaultdict, OrderedDict

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig, DistilBertModel

import time
import torch
from torch import optim
import pickle
import warnings
warnings.filterwarnings('ignore')

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Loading Data

In [2]:
reviews = pd.read_csv("data/reviews2.csv")
reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,cAQMvg8vdIkXFAmFdmtf8Q,SJ7IbI1QVvia5muxByAv7w,3BzwagIBPQEf_Ic44oZtYQ,5,0,0,0,Very upscale and romantic place. Good was exce...,2019-02-15 03:10:26
1,sjm_uUcQVxab_EeLCqsYLg,0kA0PAJ8QFMeveQWHFqz2A,8zehGz9jnxPqXtOc7KaJxA,4,0,0,0,The food is always great here. The service fro...,2011-07-28 18:05:01
2,3_oBHUOlAC33347F8_vYKg,l8reACV0ZmCyjBZy4wtq-w,FxveeHL_B0Kkz1KjPKyF3A,1,2,2,2,Literally the slowest and worst service I have...,2016-02-05 01:13:15
3,Gi5LSRmTXoL9Bp4jNGPjLw,hn0ZbitvmlHnF--KJGJ6_A,TA1KUSCu8GkWP9w0rmElxw,4,0,0,0,I have been here twice and have had really goo...,2011-10-27 14:32:57
4,R9SFR1FgssHATWd9PpQEHg,BjckP4AW2FXivEAUmh5d3g,PUZSvR-nEHlhEi0gSADu7w,4,0,0,0,Went here with an friend visiting from Italy: ...,2017-09-10 16:16:58


### Star Counts
- Positive (star >= 4), Negative (star < 4)

In [3]:
print("stars count:  ", Counter(reviews['stars']))
reviews['label'] = 0
reviews.loc[reviews['stars'] >= 4, 'label'] = 1
print("labels count:  ", Counter(reviews['label']))

stars count:   Counter({5: 722908, 4: 503086, 3: 234943, 2: 134182, 1: 112742})
labels count:   Counter({1: 1225994, 0: 481867})


### Sub-sampling each sentiment (5000 Each)
- Using WordPiece to tokenize the strings

In [4]:
np.random.seed(48)
selected_idx = np.array([])
for label in range(2):
    selected_idx=np.append(selected_idx, np.random.choice(reviews[reviews['label'] == label].index, size=5000, replace=False), axis=0)
    
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
selected_text = reviews.loc[selected_idx, ['text', 'stars', 'label']]
encoded_text = tokenizer(selected_text['text'].tolist(), padding='max_length', truncation=True, return_tensors='pt')

### Load DistillBert Pre-training Weights
- 10 Epochs
- 1000 Batch (batch size: 10)

In [13]:
configuration = DistilBertConfig()
configuration.num_labels = 2
configuration._name_or_path = 'distilbert-base-uncased'
encoder = DistilBertForSequenceClassification(configuration)
encoder = encoder.to(device)

In [7]:
np.random.seed(48)
labels = torch.tensor((selected_text['label']).tolist())
idx = np.random.permutation(selected_text.shape[0])

batch_size = 10
total_num_batch = selected_text.shape[0] // batch_size
print(total_num_batch)
optimizer = torch.optim.Adam(encoder.parameters(), lr=1e-5)
for i in range(10):
    all_loss = 0
    for batch in range(total_num_batch):
        start = time.time() 
        optimizer.zero_grad()
        input_ids = torch.tensor(encoded_text['input_ids'][idx[batch_size * batch:batch_size * (batch+1)], :]).to(device)
        attention_mask = torch.tensor(encoded_text['attention_mask'][idx[batch_size * batch:batch_size * (batch+1)], :]).to(device)
        label = torch.tensor(labels[idx[batch_size * batch:batch_size * (batch+1)]]).to(device)
        enc = encoder(input_ids, attention_mask, labels = label)
        loss = enc.loss
        all_loss += float(loss)
        loss.backward()
        optimizer.step()
        end = time.time()
        if batch % 50 == 0:
            print('Batch: {}, time: {}, loss: {}'.format(batch, end-start, loss))
    print("Average loss: ", all_loss / total_num_batch)
    print(loss)

1000
Batch: 0, time: 0.40135788917541504, loss: 0.6513352394104004
Batch: 50, time: 0.2838094234466553, loss: 0.739883542060852
Batch: 100, time: 0.285632848739624, loss: 0.7026276588439941
Batch: 150, time: 0.2759861946105957, loss: 0.6837993860244751
Batch: 200, time: 0.28923869132995605, loss: 0.6517378091812134
Batch: 250, time: 0.2873342037200928, loss: 0.42001065611839294
Batch: 300, time: 0.289722204208374, loss: 0.5119680166244507
Batch: 350, time: 0.2913084030151367, loss: 0.4387996792793274
Batch: 400, time: 0.28182029724121094, loss: 0.5223572850227356
Batch: 450, time: 0.2658567428588867, loss: 0.27973473072052
Batch: 500, time: 0.26551175117492676, loss: 0.6251158714294434
Batch: 550, time: 0.2666037082672119, loss: 0.7200733423233032
Batch: 600, time: 0.2940635681152344, loss: 0.3364659249782562
Batch: 650, time: 0.2943398952484131, loss: 0.30579668283462524
Batch: 700, time: 0.29401350021362305, loss: 0.3601791262626648
Batch: 750, time: 0.29522180557250977, loss: 0.4647

### Save model

In [8]:
# torch.save(encoder.state_dict(), "temp/encoder.pt")
# encoder.config.to_json_file("temp/encoder_config.pt")

### Load Model

In [4]:
config = DistilBertConfig.from_json_file("bert/encoder_config.pt")
encoder = DistilBertForSequenceClassification(config)
state_dict = torch.load("bert/encoder.pt")
encoder.load_state_dict(state_dict)
encoder = encoder.to(device)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

### Get Sentiment

In [5]:
batch_size = 256
num_batch = reviews.shape[0] // batch_size
print("Total number of batch:  ", num_batch)
all_pred = np.array([]).reshape(0, 2)
with torch.no_grad():
    start = time.time()
    for batch in range(num_batch):
        encoded_reviews_batch = tokenizer(reviews.iloc[batch*batch_size:(batch+1)*batch_size]['text'].tolist(), padding='max_length', truncation=True, return_tensors='pt')
        results = encoder(encoded_reviews_batch['input_ids'].to(device), 
                          encoded_reviews_batch['attention_mask'].to(device), 
                          labels = torch.tensor(reviews.iloc[batch*batch_size:(batch+1)*batch_size]['label'].to_list()).to(device))
        pred = torch.nn.Softmax(dim=1)(results.logits).cpu().data.numpy()
        all_pred = np.concatenate((all_pred, pred))
        end = time.time()
        if batch % 1000 == 0:
            print("{}  Time elapsed:  {}".format(batch, end - start) )
            start = time.time()
    encoded_reviews_batch = tokenizer(reviews.iloc[(batch+1)*batch_size:]['text'].tolist(), padding='max_length', truncation=True, return_tensors='pt')
    results = encoder(encoded_reviews_batch['input_ids'].to(device), 
                      encoded_reviews_batch['attention_mask'].to(device), 
                      labels = torch.tensor(reviews.iloc[(batch+1)*batch_size:]['label'].to_list()).to(device))
    pred = torch.nn.Softmax(dim=1)(results.logits).cpu().data.numpy()
    all_pred = np.concatenate((all_pred, pred))
    end = time.time()
    print("Finished. Time elapsed:  {}".format(end-start))

Total number of batch:   6671
0  Time elapsed:  2.6457459926605225
1000  Time elapsed:  2692.9258897304535
2000  Time elapsed:  2696.823832988739
3000  Time elapsed:  2698.4334688186646
4000  Time elapsed:  2693.5135509967804
5000  Time elapsed:  2695.0844798088074
6000  Time elapsed:  2700.5155498981476
Finished. Time elapsed:  1810.7902421951294


In [8]:
reviews[['negative', 'positive']] = all_pred

In [11]:
reviews.to_csv('reviews_with_sentiment.csv', index=False)

In [16]:
(reviews['label'] == all_pred.argmax(1)).sum() / reviews.shape[0]

0.8499286534442791

## Self fine tuning
- Fix pre-training weights
- Add dense layers on top
- Only tune dense layers weights

In [6]:
encoder = DistilBertModel.from_pretrained('distilbert-base-uncased')
encoder = encoder.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Obtain embedding from pre-trained models

In [7]:
np.random.seed(48)
idx = np.random.permutation(selected_text.shape[0])

batch_size = 32
total_num_batch = selected_text.shape[0] // batch_size
print(total_num_batch)
all_embedding = np.zeros(shape=(selected_text.shape[0], 768))

with torch.no_grad():
    for batch in range(total_num_batch):
        start = time.time()
        input_ids = torch.tensor(encoded_text['input_ids'][idx[batch_size * batch:batch_size * (batch+1)], :]).to(device)
        attention_mask = torch.tensor(encoded_text['attention_mask'][idx[batch_size * batch:batch_size * (batch+1)], :]).to(device)
        enc = encoder(input_ids, attention_mask)
        all_embedding[idx[batch_size * batch:batch_size * (batch+1)], :] = enc[0][:,0,:].cpu().data.numpy()
        end = time.time()
        if batch % 50 == 0:
            print(batch, end-start)
    input_ids = torch.tensor(encoded_text['input_ids'][idx[batch_size * (batch+1):], :]).to(device)
    attention_mask = torch.tensor(encoded_text['attention_mask'][idx[batch_size * (batch+1):], :]).to(device)
    enc = encoder(input_ids, attention_mask)
    all_embedding[idx[batch_size * (batch+1):], :] = enc[0][:,0,:].cpu().data.numpy()

312
0 0.42522501945495605
50 0.27116918563842773
100 0.28605151176452637
150 0.28875064849853516
200 0.2722632884979248
250 0.28982996940612793
300 0.29233789443969727


### Fine-Tune

In [15]:
class Model(torch.nn.Module):
    def __init__(self, H):
        super(Model, self).__init__()
        self.linear1 = torch.nn.Linear(768, H)
        self.linear2 = torch.nn.Linear(H, 1)

    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred
    
labels = torch.tensor((selected_text['label']).tolist())

model = Model(128)
model = model.to(device)

In [30]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()
for i in range(1000):
    all_loss = 0
    for batch in range(total_num_batch):
        start = time.time() 
        optimizer.zero_grad()
        pred = model(torch.tensor(all_embedding[batch*batch_size:(batch+1)*batch_size, :]).float().to(device))
        label = torch.tensor(labels[idx[batch_size * batch:batch_size * (batch+1)]])
        loss = criterion(pred.cpu(), label.float().unsqueeze(1))
        all_loss += float(loss)
        loss.backward()
        optimizer.step()
        end = time.time()
    print("Epoch: {}  Average loss: {}".format(i, float(all_loss / total_num_batch)))


Epoch: 0  Average loss: 0.767658793200285
Epoch: 1  Average loss: 0.6684566416228429
Epoch: 2  Average loss: 0.62086895843729
Epoch: 3  Average loss: 0.5883290941516558
Epoch: 4  Average loss: 0.56410362251485
Epoch: 5  Average loss: 0.5470325753856928
Epoch: 6  Average loss: 0.5347868076597269
Epoch: 7  Average loss: 0.525824815273667
Epoch: 8  Average loss: 0.5190742890804242
Epoch: 9  Average loss: 0.5141656713512464
Epoch: 10  Average loss: 0.5105705606058623
Epoch: 11  Average loss: 0.5078865797378314
Epoch: 12  Average loss: 0.5058081769981445
Epoch: 13  Average loss: 0.5040901992470026
Epoch: 14  Average loss: 0.502592199553664
Epoch: 15  Average loss: 0.5012805259858186
Epoch: 16  Average loss: 0.500253104724181
Epoch: 17  Average loss: 0.4992480758482065
Epoch: 18  Average loss: 0.4984326846897602
Epoch: 19  Average loss: 0.4976827699977618
Epoch: 20  Average loss: 0.4969380006003074
Epoch: 21  Average loss: 0.49625840472678345
Epoch: 22  Average loss: 0.49557243258907246
Epoc

KeyboardInterrupt: 