In [22]:
from transformers import BertTokenizer, BertModel
from transformers.trainer_utils import set_seed
import torch
from torch import nn
import pandas as pd
import random
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import torch.nn.functional as F

set_seed(1234)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [23]:
from transformers.models.bert.modeling_bert import BertConfig, BertEncoder

In [24]:
class CustomBertTokenClassifier(nn.Module):
    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.embedding_dim = bert.config.to_dict()['hidden_size']
        self.num_labels = 3
        self.fc = nn.Linear(self.embedding_dim, 3)
        self.fc2 = nn.Linear(self.embedding_dim, 5)
        self.dropout = nn.Dropout(0.3)
        self.loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 1.0, 1.0]).to(device))
        self.sent_loss_fct = nn.CrossEntropyLoss()
        decoder_config = BertConfig(num_hidden_layers=6, add_cross_attention=True, is_decoder=True)
        self.decoder = BertEncoder(decoder_config)
        
        
    def forward(self, input_ids, attention_mask, bio_labels=None, sent_labels=None):
        # text = [batch size, sent len]
        bert_outs = self.bert(input_ids, attention_mask)  # outputs = {last_hidden_state, hidden_states}
        embedding = bert_outs.last_hidden_state  # embedding = [batch size, sent len, emb dim=768]
        lower_embed = bert_outs.hidden_states[6]
        logits = self.fc(self.dropout(embedding))
        
        sent_embedding = self.decoder(embedding, encoder_hidden_states=lower_embed)[0]
        sent_logits = self.fc2(self.dropout(sent_embedding))
        
        lossA = None
        lossB = None
        if bio_labels is not None:
            active_loss = attention_mask.view(-1) == 1
            active_logits = logits.view(-1, 3)
            active_labels = torch.where(
                active_loss, bio_labels.view(-1), torch.tensor(self.loss_fct.ignore_index).type_as(bio_labels)
            )
            
            active_loss2 = attention_mask.view(-1) == 1
            active_logits2 = sent_logits.view(-1, 5)
            active_labels2 = torch.where(
                active_loss2, sent_labels.view(-1), torch.tensor(self.sent_loss_fct.ignore_index).type_as(sent_labels)
            )
            
            lossA = self.loss_fct(active_logits, active_labels)
            lossB = self.sent_loss_fct(active_logits2, active_labels2)
            
        return logits, sent_logits, lossA, lossB


def train_model(model, optimizer, train_dataloader):
    model.train()
    total_lossA = 0
    total_lossB = 0
    for step, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_bio_labels, b_sent_labels = batch

        optimizer.zero_grad()
        logits, sent_logits, lossA, lossB = model(b_input_ids, attention_mask=b_input_mask, 
                                                  bio_labels=b_bio_labels, sent_labels=b_sent_labels)
        #label_ids = b_labels.to('cpu').numpy()
        total_lossA += lossA.item()
        total_lossB += lossB.item()
        loss = lossA + lossB
        # Backward pass
        loss.backward()
        optimizer.step()
    print(f'LossA = {total_lossA / len(train_dataloader)}, LossB = {total_lossB / len(train_dataloader)}')
    print(f'Loss = {(total_lossA + total_lossB) / len(train_dataloader)}')

        
def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()
    total_lossA = 0
    total_lossB = 0
    for step, batch in tqdm(enumerate(val_dataloader), total=len(val_dataloader)):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_bio_labels, b_sent_labels = batch
        with torch.no_grad():
            logits, sent_logits, lossA, lossB = model(b_input_ids, attention_mask=b_input_mask, 
                                                      bio_labels=b_bio_labels, sent_labels=b_sent_labels)
        #label_ids = b_labels.to('cpu').numpy()
        total_lossA += lossA.item()
        total_lossB += lossB.item()
        loss = lossA + lossB
    print(f'LossA = {total_lossA / len(val_dataloader)}, LossB = {total_lossB / len(val_dataloader)}')
    print(f'Loss = {(total_lossA + total_lossB) / len(val_dataloader)}')
    return (total_lossA + total_lossB) / len(val_dataloader)

In [25]:
df = pd.read_csv('preproc_bert_16_restaurant_train.csv')
df['token_ids'] = df['token_ids'].apply(eval)
df['labels'] = df['labels'].apply(eval)
df.head(5)

Unnamed: 0,sid,token_ids,labels
0,1004293:0,"[101, 13325, 2013, 3025, 8466, 2023, 2109, 200...","[O, O, O, O, O, O, O, O, O, O, O, B-neg, O, O,..."
1,1004293:1,"[101, 2057, 1010, 2045, 2020, 2176, 1997, 2149...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,1004293:2,"[101, 2027, 2196, 2716, 2149, 19394, 5649, 271...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,1004293:3,"[101, 1996, 2833, 2001, 10223, 6508, 1011, 220...","[O, O, B-neg, O, O, O, O, O, O, O, O, O, O, O,..."
4,1004293:4,"[101, 2044, 2035, 2008, 1010, 2027, 10865, 200...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"


In [26]:
df_test = pd.read_csv('preproc_bert_16_restaurant_test.csv')
df_test['token_ids'] = df_test['token_ids'].apply(eval)
df_test['labels'] = df_test['labels'].apply(eval)
df_test.head(5)

Unnamed: 0,sid,token_ids,labels
0,en_BlueRibbonSushi_478218171:0,"[101, 9805, 2213, 999, 102]","[O, O, O, O, O]"
1,en_BlueRibbonSushi_478218171:1,"[101, 4240, 2428, 2204, 10514, 6182, 1012, 102]","[O, O, O, O, B-pos, I-pos, O, O]"
2,en_BlueRibbonSushi_478218171:2,"[101, 2025, 1996, 5221, 8810, 2021, 11706, 101...","[O, O, O, O, B-neu, O, O, O, O]"
3,en_BlueRibbonSushi_478218171:3,"[101, 2665, 5572, 13675, 21382, 7987, 9307, 20...","[O, B-pos, I-pos, I-pos, I-pos, I-pos, I-pos, ..."
4,en_BlueRibbonSushi_478218171:4,"[101, 2123, 1005, 1056, 2681, 1996, 4825, 2302...","[O, O, O, O, O, O, O, O, O, O, O]"


In [None]:
from collections import defaultdict

label_count = defaultdict(int)
for x in df['labels']:
    for e in x:
        label_count[e] += 1

for k,v in label_count.items():
    print(f"{k}: {v}")

del label_count

In [27]:
# Simple output mapping:
#label_mapping = {'O': 0, 'B-pos': 1, 'B-neg': 2, 'B-neu': 3, 'B-con': 4, 'I-pos': 5, 'I-neg': 6, 'I-neu': 7, 'I-con': 8}
bio_label_mapping = {'O': 0, 'B-pos': 1, 'B-neg': 1, 'B-neu': 1, 'B-con': 1, 'I-pos': 2, 'I-neg': 2, 'I-neu': 2, 'I-con': 2}
sent_label_mapping = {'O': 0, 'B-pos': 1, 'B-neg': 2, 'B-neu': 3, 'B-con': 4, 'I-pos': 1, 'I-neg': 2, 'I-neu': 3, 'I-con': 4}

def generate_input_masks_labels(df, MAX_LEN=128):
    input_ids  = list(df['token_ids'])
    attention_masks = [torch.ones((len(x)), dtype=torch.long) for x in input_ids]
    bio_labels = [[bio_label_mapping[x] for x in seq] for seq in df['labels']]
    sent_labels = [[sent_label_mapping[x] for x in seq] for seq in df['labels']]
    
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post", value=0)
    attention_masks = pad_sequences(attention_masks, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post", value=0)
    bio_labels = pad_sequences(bio_labels, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post", value=0)
    sent_labels = pad_sequences(sent_labels, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post", value=0)
    
    input_ids = torch.tensor(input_ids).long()
    attention_masks = torch.tensor(attention_masks).long()
    bio_labels = torch.tensor(bio_labels).long()
    sent_labels = torch.tensor(sent_labels).long()

    return input_ids, attention_masks, bio_labels, sent_labels

In [28]:
df_train = df

In [29]:
input_ids, attention_masks, bio_labels, sent_labels = generate_input_masks_labels(df_train, MAX_LEN=128)
print(input_ids.shape)
print(attention_masks.shape)
print(bio_labels.shape)
print(sent_labels.shape)

torch.Size([2000, 128])
torch.Size([2000, 128])
torch.Size([2000, 128])
torch.Size([2000, 128])


In [30]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 5
train_data = TensorDataset(input_ids, attention_masks, bio_labels, sent_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

In [31]:
test_batch_size = 5
input_ids_test, attention_masks_test, bio_labels_test, sent_labels_test = generate_input_masks_labels(df_test, MAX_LEN=128)
test_data = TensorDataset(input_ids_test, attention_masks_test, bio_labels_test, sent_labels_test)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=test_batch_size)

In [11]:
from transformers import AdamW
pred_log = []
epochs = 8

bert = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
model = CustomBertTokenClassifier(bert)
model.to(device)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay_rate': 0.05},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay_rate': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-6)

best_loss = 1e10
model_path = 'bertv6'
for ep in range(epochs):
    print(f"=== Training phase {ep+1} ====")
    train_model(model, optimizer, train_dataloader)
    print(f"=== Eval phase {ep+1} ====")
    loss = evaluate(model,test_dataloader)
    if loss < best_loss:
        best_loss = loss
        torch.save(model.state_dict(), f"bert_v6_{loss:.3f}.pth")

  0%|          | 0/400 [00:00<?, ?it/s]

=== Training phase 1 ====


100%|██████████| 400/400 [02:15<00:00,  2.95it/s]
  0%|          | 0/136 [00:00<?, ?it/s]

LossA = 0.45173813568428156, LossB = 0.2851531721930951
Loss = 0.7368913078773767
=== Eval phase 1 ====


100%|██████████| 136/136 [00:11<00:00, 11.76it/s]


LossA = 0.24818020481068423, LossB = 0.18255043502294405
Loss = 0.43073063983362825


  0%|          | 1/400 [00:00<01:09,  5.75it/s]

=== Training phase 2 ====


100%|██████████| 400/400 [02:16<00:00,  2.93it/s]
  0%|          | 0/136 [00:00<?, ?it/s]

LossA = 0.19644657303579152, LossB = 0.1651101989345625
Loss = 0.361556771970354
=== Eval phase 2 ====


100%|██████████| 136/136 [00:11<00:00, 11.72it/s]


LossA = 0.18372528687776887, LossB = 0.15576656606486616
Loss = 0.339491852942635


  0%|          | 1/400 [00:00<01:09,  5.75it/s]

=== Training phase 3 ====


100%|██████████| 400/400 [02:16<00:00,  2.93it/s]
  0%|          | 0/136 [00:00<?, ?it/s]

LossA = 0.1485727001260966, LossB = 0.1250163272349164
Loss = 0.273589027361013
=== Eval phase 3 ====


100%|██████████| 136/136 [00:11<00:00, 11.68it/s]


LossA = 0.1640767667661695, LossB = 0.15265462496523363
Loss = 0.31673139173140313


  0%|          | 1/400 [00:00<01:08,  5.81it/s]

=== Training phase 4 ====


100%|██████████| 400/400 [02:16<00:00,  2.94it/s]
  0%|          | 0/136 [00:00<?, ?it/s]

LossA = 0.12543529354035854, LossB = 0.10994601860817056
Loss = 0.23538131214852911
=== Eval phase 4 ====


100%|██████████| 136/136 [00:11<00:00, 11.71it/s]


LossA = 0.15024539037361084, LossB = 0.1495252715814037
Loss = 0.2997706619550145


  0%|          | 1/400 [00:00<01:08,  5.81it/s]

=== Training phase 5 ====


100%|██████████| 400/400 [02:16<00:00,  2.94it/s]
  0%|          | 0/136 [00:00<?, ?it/s]

LossA = 0.10228885822813026, LossB = 0.09169660675514024
Loss = 0.19398546498327052
=== Eval phase 5 ====


100%|██████████| 136/136 [00:11<00:00, 11.66it/s]


LossA = 0.1425217282924089, LossB = 0.15272462667952127
Loss = 0.2952463549719302


  0%|          | 1/400 [00:00<01:11,  5.55it/s]

=== Training phase 6 ====


100%|██████████| 400/400 [02:16<00:00,  2.94it/s]
  0%|          | 0/136 [00:00<?, ?it/s]

LossA = 0.08539053940970916, LossB = 0.07882187742419773
Loss = 0.1642124168339069
=== Eval phase 6 ====


100%|██████████| 136/136 [00:11<00:00, 11.67it/s]


LossA = 0.13795327739995522, LossB = 0.15429735901638839
Loss = 0.2922506364163436


  0%|          | 1/400 [00:00<01:08,  5.85it/s]

=== Training phase 7 ====


100%|██████████| 400/400 [02:16<00:00,  2.93it/s]
  0%|          | 0/136 [00:00<?, ?it/s]

LossA = 0.07094804522581398, LossB = 0.06977066178194946
Loss = 0.14071870700776345
=== Eval phase 7 ====


100%|██████████| 136/136 [00:11<00:00, 11.50it/s]
  0%|          | 1/400 [00:00<01:05,  6.06it/s]

LossA = 0.13742422497564213, LossB = 0.16638184420641183
Loss = 0.303806069182054
=== Training phase 8 ====


100%|██████████| 400/400 [02:16<00:00,  2.93it/s]
  0%|          | 0/136 [00:00<?, ?it/s]

LossA = 0.0568046206003055, LossB = 0.0587187351405737
Loss = 0.1155233557408792
=== Eval phase 8 ====


100%|██████████| 136/136 [00:11<00:00, 11.53it/s]

LossA = 0.13788569080027038, LossB = 0.16394268926181693
Loss = 0.30182838006208734





In [32]:
bert = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
model = CustomBertTokenClassifier(bert)
model.load_state_dict(torch.load("bert_v6_0.292.pth"))
model.to(device)
model.eval()


CustomBertTokenClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise

In [33]:
def evaluate_prob(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()
    total_lossA = 0
    total_lossB = 0
    all_pred = []
    all_pred_sent = []
    for step, batch in tqdm(enumerate(val_dataloader), total=len(val_dataloader)):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_bio_labels, b_sent_labels = batch
        with torch.no_grad():
            logits, sent_logits, lossA, lossB = model(b_input_ids, attention_mask=b_input_mask, 
                                                      bio_labels=b_bio_labels, sent_labels=b_sent_labels)

        logits = logits.cpu().numpy()
        all_pred.append(np.argmax(logits,axis=2))
        sent_logits = sent_logits.cpu().numpy()
        all_pred_sent.append(np.argmax(sent_logits,axis=2))
        total_lossA += lossA.item()
        total_lossB += lossB.item()
        
    print(f'LossA = {total_lossA / len(val_dataloader)}, LossB = {total_lossB / len(val_dataloader)}')
    print(f'Loss = {(total_lossA + total_lossB) / len(val_dataloader)}')
    # Return BIO predictions, Sentiment predictions : [eval_set_size, seq_len, 3], [eval_set_size, seq_len, 5]
    return np.concatenate(all_pred, axis=0), np.concatenate(all_pred_sent, axis=0)

In [34]:
bio_pred, sent_pred = evaluate_prob(model,test_dataloader)
print(bio_pred.shape)
print(sent_pred.shape)

100%|██████████| 136/136 [00:11<00:00, 12.13it/s]

LossA = 0.13795327739995522, LossB = 0.15429735901638839
Loss = 0.2922506364163436
(676, 128)
(676, 128)





In [35]:
from sklearn.metrics import confusion_matrix

test_labels_np = bio_labels_test.cpu().numpy().flatten()
bio_pred = bio_pred.flatten()

att_mask = attention_masks_test.cpu().numpy().flatten() == 1
test_labels_final = test_labels_np[att_mask]
pred_final = bio_pred[att_mask]

pd.DataFrame(confusion_matrix(test_labels_final, pred_final))

Unnamed: 0,0,1,2
0,10910,139,135
1,128,457,29
2,150,41,407


In [36]:
from sklearn.metrics import classification_report
print(classification_report(test_labels_final, pred_final, target_names=['O', 'B', 'I']))

              precision    recall  f1-score   support

           O       0.98      0.98      0.98     11184
           B       0.72      0.74      0.73       614
           I       0.71      0.68      0.70       598

    accuracy                           0.95     12396
   macro avg       0.80      0.80      0.80     12396
weighted avg       0.95      0.95      0.95     12396



In [37]:
test_labels_np = sent_labels_test.cpu().numpy().flatten()
sent_pred = sent_pred.flatten()

att_mask = attention_masks_test.cpu().numpy().flatten() == 1
test_labels_final = test_labels_np[att_mask]
pred_final = sent_pred[att_mask]

pd.DataFrame(confusion_matrix(test_labels_final, pred_final))

Unnamed: 0,0,1,2,3
0,10901,225,58,0
1,143,830,23,0
2,59,13,88,0
3,23,16,17,0


In [19]:
# If the dataset has CONFLICT polarity:
# print(classification_report(test_labels_final, pred_final, target_names=['O', 'POS', 'NEG', 'NEU', 'CON']))
print(classification_report(test_labels_final, pred_final, target_names=['O', 'POS', 'NEG', 'NEU']))

              precision    recall  f1-score   support

           O       0.98      0.97      0.98     11184
         POS       0.77      0.83      0.80       996
         NEG       0.47      0.55      0.51       160
         NEU       0.00      0.00      0.00        56

    accuracy                           0.95     12396
   macro avg       0.55      0.59      0.57     12396
weighted avg       0.95      0.95      0.95     12396



  'precision', 'predicted', average, warn_for)


In [38]:
def combine_labels(bio, sent):
    if bio == 0 or sent == 0:
        return 0
    return (bio-1)*4 + sent

f=np.vectorize(combine_labels)
c1 = f(bio_pred[att_mask], sent_pred[att_mask])
c2 = f(bio_labels_test.cpu().numpy().flatten()[att_mask], sent_labels_test.cpu().numpy().flatten()[att_mask])
pd.DataFrame(confusion_matrix(c2, c1))

Unnamed: 0,0,1,2,3,4,5,6
0,10951,79,33,0,115,6,0
1,87,334,20,0,28,0,0
2,42,9,63,0,1,0,0
3,9,9,12,0,0,0,0
4,113,32,1,0,380,1,0
5,29,0,5,0,1,10,0
6,15,0,1,0,7,3,0


In [40]:
print(classification_report(c2, c1,
                            labels = [1, 2, 3, 5, 6, 7],
                            target_names=['B-pos', 'B-neg', 'B-neu', 'I-pos', 'I-neg', 'I-neu']))

              precision    recall  f1-score   support

       B-pos       0.72      0.71      0.72       469
       B-neg       0.47      0.55      0.50       115
       B-neu       0.00      0.00      0.00        30
       I-pos       0.71      0.72      0.72       527
       I-neg       0.50      0.22      0.31        45
       I-neu       0.00      0.00      0.00        26

   micro avg       0.68      0.65      0.67      1212
   macro avg       0.40      0.37      0.37      1212
weighted avg       0.65      0.65      0.65      1212



In [None]:
 # torch.cuda.empty_cache()