In [1]:
from transformers import get_linear_schedule_with_warmup, AdamW
from sklearn import metrics, model_selection
from model import BERTSentiment
import config
import torch 
from torch.utils.data import DataLoader
from engine import train_fn, eval_fn
import dataset
import pandas as pd
from engine import calculate_accuracy

2021-10-16 14:58:17.469524: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include:/usr/lib/cuda/lib64:
2021-10-16 14:58:17.469587: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:

# Load the pre-processed train/test CSVs. Rows with any missing value are
# dropped and the index is renumbered from zero.
train_data = (
    pd.read_csv(config.PROCESSED_TRAIN_FILE_PATH)
    .dropna()
    .reset_index(drop=True)
)
test_data = (
    pd.read_csv(config.PROCESSED_TEST_FILE_PATH)
    .dropna()
    .reset_index(drop=True)
)

# Hold out 10% of the training rows for validation, stratified on the
# Sentiment column; the fixed random_state makes the split repeatable.
df_train, df_val = model_selection.train_test_split(
    train_data,
    test_size=0.1,
    random_state=42,
    stratify=train_data["Sentiment"].values,
)

In [3]:
def _make_dataset(frame):
    """Wrap the Sentence, Sentiment and Entity columns of `frame` in an ExtractDataset."""
    return dataset.ExtractDataset(
        sentences=frame.Sentence.values,
        sentiments=frame.Sentiment.values,
        entity=frame.Entity.values,
    )


# Renumber both splits from zero before handing their columns to the datasets.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

train_dataset = _make_dataset(df_train)
val_dataset = _make_dataset(df_val)

In [4]:
# Seed PyTorch's global RNG so the shuffled batch order and any random weight
# initialisation are the same on every Restart & Run All (42 matches the
# random_state used for the train/validation split).
torch.manual_seed(42)

train_loader = DataLoader(
    train_dataset,
    batch_size=config.TRAIN_BATCH_SIZE,
    shuffle=True,  # training batches are drawn in a new random order each epoch
)
# Validation batches keep the dataset order (no shuffling).
eval_loader = DataLoader(val_dataset, batch_size=config.VALID_BATCH_SIZE)

# Everything in this notebook is placed on the CPU.
device = torch.device("cpu")

model = BERTSentiment()

Freeze BERT: keep the weights in `model.bert_layer` fixed and train only the remaining layers

In [6]:
# Stage 1: keep the BERT weights fixed so only the remaining parameters learn.
for param in model.bert_layer.parameters():
    param.requires_grad = False

model.to(device)

# Two optimizer groups: parameters whose name contains any `no_decay`
# substring get weight_decay 0, every other parameter gets 1e-4.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 1e-4},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0},
]

# Total scheduler steps = batches per epoch * epochs. len(train_loader) also
# counts the final partial batch; the previous int(len(df_train) / batch_size
# * epochs) rounded down, so the linear decay hit zero before training ended.
# Assumes train_fn steps the scheduler once per batch - TODO confirm in engine.py.
num_training_steps = len(train_loader) * config.FROZEN_BERT_EPOCHS

optimizer = AdamW(optimizer_parameters, lr=1e-4)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps
)

# Tracking state for checkpointing and early stopping in the training loop.
best_accuracy = 0
early_stopping_counter = 0

In [None]:
# Frozen-BERT stage: each epoch runs one training pass, then scores the
# validation loader.
for epochs in range(config.FROZEN_BERT_EPOCHS):
    print("Epoch :", epochs)

    loss, train_accuracy = train_fn(train_loader, model, optimizer, device, scheduler)
    print(f"Total Epoch Train Accuracy : {train_accuracy} with loss : {loss}")

    predicted, labels = eval_fn(eval_loader, model, device)
    val_accuracy = calculate_accuracy(predicted, labels, 'epoch')
    print(f"Total Epoch Eval Accuracy : {val_accuracy}")

    if not (val_accuracy > best_accuracy):
        # No improvement: count the epoch and stop once the counter exceeds
        # the configured limit.
        early_stopping_counter += 1
        if early_stopping_counter > config.FROZEN_BERT_EARLY_STOPPING:
            break
        continue

    # New best validation accuracy: reset the counter and checkpoint the weights.
    early_stopping_counter = 0
    best_accuracy = val_accuracy
    torch.save(model.state_dict(), config.MODEL_SAVE_PATH)

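Unfreeze BERT for training: make the weights in `model.bert_layer` trainable again and continue training the whole model at a lower learning rate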

In [None]:
# Stage 2: make the BERT weights trainable again so the whole model is tuned.
for param in model.bert_layer.parameters():
    param.requires_grad = True

model.to(device)

# Two optimizer groups: parameters whose name contains any `no_decay`
# substring get weight_decay 0, every other parameter gets 1e-4.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 1e-4},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0},
]

# Total scheduler steps = batches per epoch * epochs. len(train_loader) also
# counts the final partial batch; the previous int(len(df_train) / batch_size
# * epochs) rounded down, so the linear decay hit zero before training ended.
# Assumes train_fn steps the scheduler once per batch - TODO confirm in engine.py.
num_training_steps = len(train_loader) * config.BERT_EPOCHS

# Fresh optimizer and schedule for this stage, with a 10x smaller learning
# rate than the 1e-4 used while BERT was frozen.
optimizer = AdamW(optimizer_parameters, lr=1e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps
)

# Only the early-stopping counter is reset here; best_accuracy is left as is.
early_stopping_counter = 0

In [None]:
# Unfrozen-BERT stage: each epoch runs one training pass, then scores the
# validation loader.
for epochs in range(config.BERT_EPOCHS):
    print("Epoch :", epochs)

    loss, train_accuracy = train_fn(train_loader, model, optimizer, device, scheduler)
    print(f"Total Epoch Train Accuracy : {train_accuracy} with loss : {loss}")

    predicted, labels = eval_fn(eval_loader, model, device)
    val_accuracy = calculate_accuracy(predicted, labels, 'epoch')
    print(f"Total Epoch Eval Accuracy : {val_accuracy}")

    if not (val_accuracy > best_accuracy):
        # No improvement: count the epoch and stop once the counter exceeds
        # the configured limit.
        early_stopping_counter += 1
        if early_stopping_counter > config.BERT_EARLY_STOPPING:
            break
        continue

    # New best validation accuracy: reset the counter and checkpoint the weights.
    early_stopping_counter = 0
    best_accuracy = val_accuracy
    torch.save(model.state_dict(), config.MODEL_SAVE_PATH)

In [None]:
from transformers import AutoTokenizer
import config
import torch
import numpy as np

class TestDataset:
    """Map-style dataset producing model inputs for unlabeled (sentence, entity) pairs.

    All sentences are tokenized once in ``__init__``. Indexing returns
    tensors of exactly ``max_len`` positions, including a boolean mask over
    the tokens that spell out the entity inside the sentence.
    """

    def __init__(self, sentences, entity):
        """Normalise whitespace in both inputs and tokenize every sentence.

        Args:
            sentences: iterable of sentence texts.
            entity: iterable of entity texts, parallel to ``sentences``.
        """
        self.sentences = list(map(self.remove_extra_space, sentences))
        self.entity = list(map(self.remove_extra_space, entity))
        self.tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
        # Offsets give each token's (start, end) character span in the sentence.
        self.sentence_encodings = self.tokenizer(self.sentences, return_offsets_mapping=True)
        self.max_len = config.MAX_LEN

    def sentiment_encoder(self, sentiment):
        """Return 1 for the label 'positive' and 0 for anything else."""
        if sentiment == 'positive':
            return 1
        else:
            return 0

    def remove_extra_space(self, text):
        """Collapse whitespace runs to single spaces and strip both ends.

        Non-string values (e.g. a numeric CSV cell) are converted with
        ``str`` first instead of raising ``AttributeError``.
        """
        return " ".join(str(text).split())

    def __len__(self):
        """Number of (sentence, entity) pairs."""
        return len(self.entity)

    def _fit(self, values):
        """Right-pad ``values`` with zeros, or cut it, to exactly ``max_len`` items."""
        # NOTE(review): padding with 0 assumes the tokenizer's pad id is 0 - confirm.
        padded = list(values) + [0] * (self.max_len - len(values))
        return padded[:self.max_len]

    @staticmethod
    def _aspect_mask(token_offsets, entity_starts, entity_len, n_tokens):
        """Flag the tokens that cover an occurrence of the entity.

        Args:
            token_offsets: (start, end) character spans of the tokens between
                the first and last token of the encoded sentence.
            entity_starts: ascending character positions where the entity
                text begins in the sentence.
            entity_len: number of characters in the entity text.
            n_tokens: length of the full token id sequence.

        Returns:
            A list of ``n_tokens`` ints (0/1). Entry ``i + 1`` belongs to
            ``token_offsets[i]``, so the first position is always 0.

        An occurrence is matched only if some token starts exactly at its
        first character. Marking then runs up to and including the first
        token that reaches the occurrence's end. Occurrences with no token
        starting on them are skipped rather than blocking later ones, and an
        occurrence ending inside a token closes on that token instead of
        flagging the rest of the sentence.
        """
        mask = [0] * n_tokens
        occurrence = 0
        inside = False
        for position, (tok_start, tok_end) in enumerate(token_offsets):
            if not inside:
                # Occurrences that begin before this token can no longer line up
                # with a token start; move past them.
                while occurrence < len(entity_starts) and entity_starts[occurrence] < tok_start:
                    occurrence += 1
                if occurrence == len(entity_starts):
                    break
                if entity_starts[occurrence] != tok_start:
                    continue
                inside = True
            mask[position + 1] = 1
            if tok_end >= entity_starts[occurrence] + entity_len:
                occurrence += 1
                inside = False
        return mask

    def __getitem__(self, idx):
        """Build the model inputs for pair ``idx``.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``token_type_ids``
            (long tensors) and ``aspect_word_masking`` (bool tensor), each
            with ``max_len`` elements.
        """
        sentence = self.sentences[idx]
        entity = self.entity[idx]

        input_ids = self.sentence_encodings.input_ids[idx]
        token_type_ids = self.sentence_encodings.token_type_ids[idx]
        attention_mask = self.sentence_encodings.attention_mask[idx]
        # Drop the first and last offsets. NOTE(review): assumes the tokenizer adds
        # exactly one special token at each end of the sequence - confirm for
        # config.MODEL_NAME.
        token_offsets = self.sentence_encodings.offset_mapping[idx][1:-1]

        # Every character position where the entity text occurs in the sentence.
        entity_starts = [i for i in range(len(sentence)) if sentence.startswith(entity, i)]
        aspect_word_masking = self._aspect_mask(
            token_offsets, entity_starts, len(entity), len(input_ids)
        )

        return {
            'input_ids' : torch.tensor(self._fit(input_ids), dtype=torch.long),
            'attention_mask' : torch.tensor(self._fit(attention_mask), dtype=torch.long),
            'aspect_word_masking' : torch.tensor(self._fit(aspect_word_masking), dtype=torch.bool),
            'token_type_ids' : torch.tensor(self._fit(token_type_ids), dtype=torch.long)
        }

In [None]:
# Wrap the unlabeled test rows; the loader serves them unshuffled, 8 per batch.
test_dataset = TestDataset(sentences=test_data.Sentence.values,
                           entity=test_data.Entity.values)
test_loader = DataLoader(test_dataset, batch_size=8)

In [None]:
from tqdm import tqdm

# Inference only: switch the model to evaluation mode before scoring.
model.eval()

# NOTE(review): this scores with the weights currently in memory, not the best
# checkpoint written to config.MODEL_SAVE_PATH - confirm which one is intended.

# Sigmoid scores for every test row, in loader order.
predictions = []

# The loop variable is named `batch`, not `dataset`: the latter would overwrite
# the imported `dataset` module and break re-running the cell that builds
# dataset.ExtractDataset.
with torch.no_grad():
    for batch in tqdm(test_loader, total=len(test_loader)):
        outputs = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
            batch['token_type_ids'].to(device),
            batch['aspect_word_masking'].to(device),
        )
        predictions.extend(torch.sigmoid(outputs).cpu().numpy().tolist())

In [None]:
# Flatten the sigmoid scores and turn them into hard labels: 1 when the score
# is at least 0.5, otherwise 0.
predictions = (np.asarray(predictions).reshape(-1) >= 0.5).astype(int)

In [None]:
# Display the predicted label array as the cell output.
predictions

In [None]:
# Add the predicted labels to the test frame as the `final_sentiment` column.
test_data = test_data.assign(final_sentiment=predictions)

In [None]:
# Write the test frame to the configured CSV path, leaving out the row index.
output_path = config.TEST_FILE_SAVE_PATH
test_data.to_csv(output_path, index=False)