In [3]:
import os, sys, re, datetime, random, gzip, json
from tqdm.autonotebook import tqdm
import pandas as pd
import numpy as np
import glob
from pathlib import Path
from itertools import accumulate
import argparse
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertModel, BertForSequenceClassification

from time import time
from math import ceil
from multiprocessing import Pool
from sentence_transformers import SentenceTransformer, models, losses, InputExample

import pytorch_lightning as pl
from pytorch_lightning.trainer.trainer import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.utilities.seed import seed_everything

# Locate the repository root: strip everything from the first "/BERT_ABSA" in the
# working directory onwards, then re-append the repository folder name.
cwd_prefix = re.sub("/BERT_ABSA.*$", '', os.getcwd())
PROJ_PATH = Path(os.path.join(cwd_prefix, 'BERT_ABSA'))
print(f'PROJ_PATH={PROJ_PATH}')

# Make the repository root and its src/ folder importable. Both are inserted at
# index 1, so src/ (inserted last) ends up ahead of the root on sys.path.
for import_root in (PROJ_PATH, PROJ_PATH/'src'):
    sys.path.insert(1, str(import_root))
import utils

PROJ_PATH=/home/hoang/github/BERT_ABSA


In [5]:
# List the .ckpt files found under ../model/restaurants (path is relative to the working directory).
glob.glob('../model/restaurants/*.ckpt')

['../model/restaurants/epoch=4-val_loss=0.6392-val_acc=0.7826-val_macro_f1=0.6944-val_micro_f1=0.7826.ckpt',
 '../model/restaurants/epoch=2-val_loss=0.6676-val_acc=0.7673-val_macro_f1=0.6605-val_micro_f1=0.7673.ckpt',
 '../model/restaurants/epoch=8-val_loss=0.6323-val_acc=0.8034-val_macro_f1=0.7262-val_micro_f1=0.8034.ckpt']

## Parser

In [2]:
import os
import json
import pandas as pd
import re
import xml.etree.ElementTree as ET
from pathlib import Path

In [3]:
def parseXML(data_path):
    """Parse an XML file of annotated sentences into a list of dicts.

    Every child of the XML root is treated as one sentence element. For each
    sentence the result dict holds:
      - 'id': the sentence's ``id`` attribute,
      - 'text': the text of its <text> child (when present),
      - 'aspects': attribute dicts of its <aspectTerm> children whose
        polarity is not 'conflict',
      - 'category': attribute dicts of its <aspectCategories> children
        (when present).

    Sentences without at least one non-conflict aspect term are dropped.

    Args:
        data_path: file name or file object accepted by ``ET.ElementTree``.

    Returns:
        list of dict, in document order.
    """
    root = ET.ElementTree(file=data_path).getroot()
    parsed = []
    for sentence_node in root:
        record = {'id': sentence_node.attrib['id']}
        for child in sentence_node:
            tag = child.tag
            if tag == 'text':
                record['text'] = child.text
            elif tag == 'aspectTerms':
                # 'conflict' polarities are excluded from the aspect list
                record['aspects'] = [
                    term_node.attrib
                    for term_node in child
                    if term_node.attrib['polarity'] != 'conflict'
                ]
            elif tag == 'aspectCategories':
                record['category'] = [category_node.attrib for category_node in child]
        # keep the sentence only if it has a non-empty aspect list
        if record.get('aspects'):
            parsed.append(record)
    return parsed

def convert_to_dataframe(objs):
    """Flatten parsed sentences into a table with one row per aspect term.

    Args:
        objs: iterable of dicts carrying 'id', 'text' and 'aspects', where
            each aspect is a mapping with 'term' and 'polarity'.

    Returns:
        pd.DataFrame with columns ['id', 'text', 'term', 'label'], rows
        sorted by the sentence id (stable, so aspects of one sentence keep
        their original order).
    """
    rows = []
    for sentence in objs:
        sentence_id, text = sentence['id'], sentence['text']
        rows.extend(
            [sentence_id, text, aspect['term'], aspect['polarity']]
            for aspect in sentence['aspects']
        )
    rows.sort(key=lambda row: row[0])
    return pd.DataFrame(rows, columns=['id', 'text', 'term', 'label'])

In [4]:
# Raw XML file names, keyed first by dataset name and then by split
# ('train' / 'test' / 'trial'); e.g. dataset_files['laptop']['test'] is
# 'Laptops_Test.xml'.
dataset_files = {
    dataset_name: {
        split: f'{file_prefix}_{split.capitalize()}.xml'
        for split in ('train', 'test', 'trial')
    }
    for dataset_name, file_prefix in (('restaurant', 'Restaurants'), ('laptop', 'Laptops'))
}

In [6]:
# Convert every raw XML file into two artifacts under dataset/preprocessed_data:
# a pickle of the parsed sentence dicts and a flat CSV with one row per aspect term.
raw_dir = PROJ_PATH/ 'dataset/raw_data'
preprocessed_dir = PROJ_PATH/ 'dataset/preprocessed_data'

for dsname, fnames in dataset_files.items():
    for fname in fnames.values():
        input_path = str(raw_dir / fname)
        output_path01 = str(preprocessed_dir / fname.replace('.xml', '.pkl'))  # parsed objects
        output_path02 = str(preprocessed_dir / fname.replace('.xml', '.csv'))  # flat table

        print(f'Load: {input_path}')
        print(f'Save: {output_path01}\n')

        objs = parseXML(input_path)
        df = convert_to_dataframe(objs)

        pd.to_pickle(objs, output_path01)
        df.to_csv(output_path02, index=False)

Load: /home/hoang/github/BERT_ABSA/dataset/raw_data/Restaurants_Train.xml
Save: /home/hoang/github/BERT_ABSA/dataset/preprocessed_data/Restaurants_Train.pkl

Load: /home/hoang/github/BERT_ABSA/dataset/raw_data/Restaurants_Test.xml
Save: /home/hoang/github/BERT_ABSA/dataset/preprocessed_data/Restaurants_Test.pkl

Load: /home/hoang/github/BERT_ABSA/dataset/raw_data/Restaurants_Trial.xml
Save: /home/hoang/github/BERT_ABSA/dataset/preprocessed_data/Restaurants_Trial.pkl

Load: /home/hoang/github/BERT_ABSA/dataset/raw_data/Laptops_Train.xml
Save: /home/hoang/github/BERT_ABSA/dataset/preprocessed_data/Laptops_Train.pkl

Load: /home/hoang/github/BERT_ABSA/dataset/raw_data/Laptops_Test.xml
Save: /home/hoang/github/BERT_ABSA/dataset/preprocessed_data/Laptops_Test.pkl

Load: /home/hoang/github/BERT_ABSA/dataset/raw_data/Laptops_Trial.xml
Save: /home/hoang/github/BERT_ABSA/dataset/preprocessed_data/Laptops_Trial.pkl



In [58]:
# Spot-check the preprocessed restaurant training CSV.
# The file is resolved from PROJ_PATH (defined in the first cell) instead of a
# machine-specific absolute path, so the cell works on any checkout.
df = pd.read_csv(PROJ_PATH / 'dataset/preprocessed_data' / 'Restaurants_Train.csv')
df.head()

Unnamed: 0,id,text,term,label
0,1000,"The food is good, especially their more basic ...",food,positive
1,1000,"The food is good, especially their more basic ...",dishes,positive
2,1000,"The food is good, especially their more basic ...",drinks,positive
3,1002,"The view is spectacular, and the food is great.",view,positive
4,1002,"The view is spectacular, and the food is great.",food,positive


## Dataset

In [3]:
# Load the laptop training CSV and turn it into a list of row dicts
# (one dict per CSV row, keyed by column name).
laptops_train_df = pd.read_csv('../dataset/preprocessed_data/Laptops_Train.csv')
data = list(laptops_train_df.T.to_dict().values())

In [2]:
class Dataset(torch.utils.data.Dataset):
    """Aspect-term sentiment samples read from a preprocessed CSV.

    Each CSV row (columns read here: ``text``, ``term``, ``label``) is turned
    into a sentence pair (review text, auxiliary sentence built from the
    aspect term) and encoded with the supplied BERT tokenizer.

    The base class is spelled out as ``torch.utils.data.Dataset`` because this
    class reuses the name ``Dataset``: with ``class Dataset(Dataset)``,
    re-running the notebook cell would make the class inherit from its own
    previous definition instead of the torch base class.
    """

    def __init__(self, data_dir, transformation='QA_M', num_classes=3, bert_tokenizer=None, max_length=0, seed=0):
        """Load the CSV and store the encoding settings.

        Args:
            data_dir: path of the CSV file, passed to ``pd.read_csv``.
            transformation: one of 'QA_M', 'QA_B', 'MLI_M', 'MLI_B'. Only
                'QA_M' and 'MLI_M' are implemented by ``transform``.
            num_classes: 2 or 3; with 2, rows labelled 'neutral' are dropped.
            bert_tokenizer: tokenizer exposing ``encode_plus``; needed by
                ``encode_text`` / ``__getitem__``.
            max_length: length every encoded pair is padded/truncated to.
            seed: seed applied to Python's global ``random`` module.
        """
        random.seed(seed)
        assert transformation in ['QA_M', 'QA_B', 'MLI_M', 'MLI_B'], 'Invalid transformation method'
        assert num_classes in [2, 3], 'Invalid num_classes'

        self.transformation = transformation
        self.bert_tokenizer = bert_tokenizer
        self.max_length = max_length
        self.polarity_dict = {'positive': 0, 'negative': 1, 'neutral': 2}

        # load data: one dict per CSV row, keyed by column name
        self.data = list(pd.read_csv(data_dir).T.to_dict().values())
        if num_classes == 2:
            self.data = [d for d in self.data if d['label'] != 'neutral']

    def transform(self, sample):
        """Build the sentence pair and integer label for one sample.

        Args:
            sample: mapping with 'text', 'term' and 'label'.

        Returns:
            (seq1, seq2, label): lower-cased text, auxiliary sentence, and the
            label index looked up in ``self.polarity_dict``.

        Raises:
            NotImplementedError: for transformations without an implementation
                ('QA_B', 'MLI_B'). Previously these fell through and failed
                with an UnboundLocalError on the return statement.
        """
        seq1 = sample['text'].lower()
        term = sample['term'].lower()

        if self.transformation == 'QA_M':
            seq2 = f'what is the polarity of {term} ?'
        elif self.transformation == 'MLI_M':
            seq2 = term
        else:
            raise NotImplementedError(
                f'Transformation {self.transformation!r} is not implemented; use QA_M or MLI_M')

        label = self.polarity_dict[sample['label']]
        return seq1, seq2, label

    def encode_text(self, seq1, seq2):
        """Encode a sentence pair with ``self.bert_tokenizer.encode_plus``.

        Returns the tokenizer output with PyTorch tensors, padded/truncated to
        ``self.max_length``.
        """
        encoded_text = self.bert_tokenizer.encode_plus(
            seq1,
            seq2,
            add_special_tokens=True,  # Add [CLS] and [SEP]
            max_length=self.max_length,  # maximum length of a sentence
            padding='max_length',  # Add [PAD]s
            truncation=True, # Truncate up to maximum length
            return_attention_mask=True,  # Generate the attention mask
            return_tensors='pt',  # Ask the function to return PyTorch tensors
        )
        return encoded_text

    def __getitem__(self, item):
        '''
        Return the encoded sample at index ``item``.

        A raw sample looks like:
        example = {
            'id': 1000,
            'text': 'The food is good, especially their more basic dishes, and the drinks are delicious.',
            'term': 'food',
            'label': 'positive',
            }

        The returned dict carries the two sequences, the original term, the
        integer label, and the flattened 'input_ids', 'token_type_ids' and
        'attention_mask' tensors.
        '''
        sample = self.data[item]
        seq1, seq2, label = self.transform(sample)
        encoded_text = self.encode_text(seq1, seq2)

        single_input = {
            'seq1': seq1,
            'seq2': seq2,
            'term': sample['term'],
            'label': label,
            # encode_plus returns tensors with a leading dimension; flatten them per sample
            'input_ids': encoded_text['input_ids'].flatten(),
            'token_type_ids': encoded_text['token_type_ids'].flatten(),
            'attention_mask': encoded_text['attention_mask'].flatten(),
        }
        return single_input

    def __len__(self):
        """Number of samples kept after the optional neutral filter."""
        return len(self.data)
    
class DataModule(pl.LightningDataModule):
    """Lightning data module that serves the train/val/test loaders.

    Hyper-parameters read from ``params`` (via ``self.hparams``): ``bert_name``,
    ``data_train_dir``, ``data_test_dir``, ``transformation``, ``num_classes``,
    ``max_length``, ``seed`` and ``batch_size``.
    """

    # Fraction of the training file used for fitting; the remainder is the validation split.
    TRAIN_FRACTION = 0.8

    def __init__(self, params):
        super().__init__()
        self.save_hyperparameters(params)

    def _build_dataset(self, data_dir, bert_tokenizer):
        """Create a ``Dataset`` for ``data_dir`` with the shared hyper-parameters."""
        return Dataset(
            data_dir=data_dir,
            transformation=self.hparams.transformation,
            num_classes=self.hparams.num_classes,
            bert_tokenizer=bert_tokenizer,
            max_length=self.hparams.max_length,
            seed=self.hparams.seed)

    def setup(self, stage=None):
        """Build the datasets needed for ``stage`` ('fit', 'test', or None for both)."""
        bert_tokenizer = BertTokenizer.from_pretrained(self.hparams.bert_name)

        # Assign train/val datasets for use in dataloaders
        if stage == "fit" or stage is None:
            data_fit = self._build_dataset(self.hparams.data_train_dir, bert_tokenizer)

            total_samples = len(data_fit)
            train_samples = int(total_samples * self.TRAIN_FRACTION)
            val_samples = total_samples - train_samples
            # A seeded generator keeps the train/val partition reproducible.
            self.data_train, self.data_val = random_split(
                data_fit, [train_samples, val_samples], generator=torch.Generator().manual_seed(self.hparams.seed))

        # Assign test dataset for use in dataloader(s)
        if stage == "test" or stage is None:
            self.data_test = self._build_dataset(self.hparams.data_test_dir, bert_tokenizer)

    def train_dataloader(self):
        """Training loader; reshuffles every epoch and drops the last partial batch."""
        return DataLoader(
            self.data_train,
            batch_size=self.hparams.batch_size,
            num_workers=4,
            # random_split() only fixes one permutation. Without shuffling here every
            # epoch would see identical batches, and drop_last would discard the same
            # trailing samples each time.
            shuffle=True,
            drop_last=True,
        )

    def val_dataloader(self):
        """Validation loader; fixed order, keeps the last partial batch."""
        return DataLoader(
            self.data_val,
            batch_size=self.hparams.batch_size,
            num_workers=4,
            shuffle=False,
        )

    def test_dataloader(self):
        """Test loader; fixed order, keeps the last partial batch."""
        return DataLoader(
            self.data_test,
            batch_size=self.hparams.batch_size,
            num_workers=4,
            shuffle=False,
        )

## Model

In [33]:
class SentimentClassifier(pl.LightningModule):
    """BERT sequence classifier trained with cross-entropy on encoded sentence pairs.

    Hyper-parameters read from ``params`` (via ``self.hparams``):
    ``pretrained_bert_name`` and ``lr``.

    Batches are expected to be dicts with 'input_ids', 'attention_mask',
    'token_type_ids' and 'label' entries.
    """

    # Keys of the per-batch metric dicts; epoch-level logs prefix them with 'val_' / 'test_'.
    _METRIC_KEYS = ('loss', 'acc', 'macro_f1', 'micro_f1')

    def __init__(self, params):
        super().__init__()
        self.save_hyperparameters(params)
        self.tokenizer = BertTokenizer.from_pretrained(self.hparams.pretrained_bert_name)
        self.bert = BertForSequenceClassification.from_pretrained(
            self.hparams.pretrained_bert_name, num_labels=3, output_hidden_states=True, output_attentions=True, return_dict=False)
        self.hidden_size = self.bert.config.hidden_size
        self.cross_entropy_loss = nn.CrossEntropyLoss()

    def configure_optimizers(self):
        """Optimise all parameters with Adam at learning rate ``self.hparams.lr``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
        return optimizer

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        """Return the classification logits for a batch.

        ``labels`` is still accepted so existing callers keep working, but it is
        no longer forwarded to BERT: the loss BERT computed from it was
        discarded, and every step recomputes the loss with
        ``self.cross_entropy_loss``. Making it optional also lets the model be
        called for inference without labels.
        """
        # With return_dict=False and no labels, the first element of the output tuple is the logits.
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return outputs[0]

    def margin_loss(self,  embedding_query, embedding_pos, embedding_neg):
        """Dot-product score of (query, pos) minus the scaled score of (query, neg).

        Not called anywhere in this class. The body previously referenced
        undefined names (``embeddings_*`` instead of the parameters) and an
        undefined ``self.scale``, so any call raised.
        """
        # NOTE(review): no `scale` attribute is set in __init__; fall back to 1.0 — confirm the intended value.
        scale = getattr(self, 'scale', 1.0)
        scores_pos = (embedding_query * embedding_pos).sum(dim=-1)
        scores_neg = (embedding_query * embedding_neg).sum(dim=-1) * scale
        return scores_pos - scores_neg

    def _logits_and_loss(self, batch):
        """Run the forward pass on a batch and return (logits, labels, cross-entropy loss)."""
        # ['seq1', 'seq2', 'term', 'label', 'input_ids', 'token_type_ids', 'attention_mask']
        labels = batch['label']
        logits = self.forward(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            token_type_ids=batch['token_type_ids'],
            labels=labels,
        )
        return logits, labels, self.cross_entropy_loss(logits, labels)

    def _evaluate_batch(self, batch):
        """Compute loss, accuracy and macro/micro F1 for one batch."""
        logits, labels, ce_loss = self._logits_and_loss(batch)
        return {
            'loss': ce_loss,
            'acc': utils.calc_accuracy(logits, labels).squeeze(),
            'macro_f1': utils.calc_f1(logits, labels, avg_type='macro').squeeze(),
            'micro_f1': utils.calc_f1(logits, labels, avg_type='micro').squeeze(),
        }

    def _average_outputs(self, step_outputs, prefix):
        """Average each per-batch metric over the epoch and prefix the keys.

        This is a plain mean of per-batch values, so every batch carries the
        same weight regardless of its size.
        """
        return {
            f'{prefix}_{key}': torch.stack([x[key] for x in step_outputs]).mean().cpu()
            for key in self._METRIC_KEYS
        }

    def training_step(self, batch, batch_idx):
        """Return the cross-entropy loss of the batch."""
        _, _, ce_loss = self._logits_and_loss(batch)
        return ce_loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the batch metrics; they are returned for epoch-level averaging."""
        logs = self._evaluate_batch(batch)
        self.log_dict(logs, prog_bar=True)
        return logs

    def validation_epoch_end(self, val_step_outputs):
        """Log the epoch means as 'val_loss', 'val_acc', 'val_macro_f1', 'val_micro_f1'."""
        logs = self._average_outputs(val_step_outputs, 'val')
        self.log_dict(logs, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Compute the batch metrics; nothing is logged per step."""
        return self._evaluate_batch(batch)

    def test_epoch_end(self, test_step_outputs):
        """Log and return the epoch means as 'test_loss', 'test_acc', 'test_macro_f1', 'test_micro_f1'."""
        logs = self._average_outputs(test_step_outputs, 'test')
        self.log_dict(logs, prog_bar=True)
        return logs
     

In [34]:
import commentjson
from collections import OrderedDict

def read_json(fname):
    """Load the JSON file at ``fname`` with ``commentjson``.

    JSON objects are materialised as ``OrderedDict`` instances.
    """
    with open(fname, 'rt') as config_file:
        parsed = commentjson.load(config_file, object_hook=OrderedDict)
    return parsed

def build_model(config):
    """Instantiate the data module and the classifier from a config mapping.

    Args:
        config: mapping with 'data_params' (passed to ``DataModule``) and
            'model_params' (passed to ``SentimentClassifier``).

    Returns:
        (DataModule, SentimentClassifier)
    """
    data_params = config['data_params']
    model_params = config['model_params']
    return DataModule(data_params), SentimentClassifier(model_params)

In [35]:
def build_trainder(config):
    """Create the Lightning ``Trainer`` with checkpointing and early stopping.

    Args:
        config: mapping whose 'trainer_params' entry provides 'checkpoint_dir',
            'top_k', 'metric', 'mode', 'patience' and 'max_epochs'.

    Returns:
        (trainer, trainer_kwargs): the ``Trainer`` and the keyword arguments it
        was built with.
    """
    trainer_params = config['trainer_params']

    # callbacks
    checkpoint = ModelCheckpoint(
        dirpath=trainer_params['checkpoint_dir'],
        filename='{epoch}-{val_loss:.4f}-{val_acc:.4f}-{val_macro_f1:.4f}-{val_micro_f1:.4f}',
        save_top_k=trainer_params['top_k'],
        verbose=True,
        monitor=trainer_params['metric'],
        mode=trainer_params['mode'],
    )
    early_stopping = EarlyStopping(
        monitor='val_loss',
        min_delta=0.00,
        patience=trainer_params['patience'],
        verbose=False,
        # The monitored quantity is always the validation loss, so it must be
        # minimised. trainer_params['mode'] describes trainer_params['metric']
        # (the checkpoint metric) and would track the loss in the wrong
        # direction whenever that metric is maximised.
        mode='min',
    )
    callbacks = [checkpoint, early_stopping]

    # trainer_kwargs
    trainer_kwargs = {
        'max_epochs': trainer_params['max_epochs'],
        'gpus': 1 if torch.cuda.is_available() else 0,  # one GPU when available, otherwise CPU
        'weights_summary': 'full',
        'deterministic': True,
        'callbacks': callbacks,
    }

    trainer = Trainer(**trainer_kwargs)
    return trainer, trainer_kwargs

In [36]:
# parser = argparse.ArgumentParser(description='Training.')

# parser.add_argument('-config_file', help='config file path', default='../src/restaurant_config.json', type=str)
# parser.add_argument('-f', '--fff', help='a dummy argument to fool ipython', default='1')
# args = parser.parse_args()

# args.config = read_json(args.config_file)
# seed_everything(args.config['data_params']['seed'], workers=True)
# data, clf = build_model(args.config)
# trainer, trainer_kwargs = build_trainder(args.config)
# trainer.fit(clf, data)

## Predict

In [37]:
# Command-line style configuration. Per its help text, the dummy -f/--fff option
# only exists so that argument parsing works when run under IPython.
parser = argparse.ArgumentParser(description='Training.')
parser.add_argument(
    '-config_file',
    type=str,
    default='../src/restaurant_config.json',
    help='config file path',
)
parser.add_argument(
    '-f', '--fff',
    default='1',
    help='a dummy argument to fool ipython',
)
args = parser.parse_args()

# Load the config, seed everything from it, then build the data module,
# the classifier and the trainer.
args.config = read_json(args.config_file)
seed_everything(args.config['data_params']['seed'], workers=True)
data, clf = build_model(args.config)
trainer, trainer_kwargs = build_trainder(args.config)

Global seed set to 12345
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized fro

In [38]:
# Evaluate a saved restaurants checkpoint on the test split.
# Checkpoints are located relative to PROJ_PATH (defined in the first cell)
# rather than through a machine-specific absolute path.
paths = sorted(glob.glob(str(PROJ_PATH / 'model/restaurants/*.ckpt')))
if not paths:
    raise FileNotFoundError(f"No checkpoint found under {PROJ_PATH / 'model/restaurants'}")

# NOTE(review): paths[0] is the alphabetically first file name, which is not
# necessarily the best-scoring checkpoint — confirm this is the intended one.
model_test = SentimentClassifier.load_from_checkpoint(paths[0])
result = trainer.test(model_test, datamodule=data)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Testing: 0it [00:00, ?it/s]

[{'loss': tensor(0.4673, device='cuda:0'), 'acc': tensor(0.8203, dtype=torch.float64), 'macro_f1': tensor(0.7241, dtype=torch.float64), 'micro_f1': tensor(0.8203, dtype=torch.float64)}, {'loss': tensor(0.4060, device='cuda:0'), 'acc': tensor(0.8906, dtype=torch.float64), 'macro_f1': tensor(0.7525, dtype=torch.float64), 'micro_f1': tensor(0.8906, dtype=torch.float64)}, {'loss': tensor(0.4146, device='cuda:0'), 'acc': tensor(0.8359, dtype=torch.float64), 'macro_f1': tensor(0.6356, dtype=torch.float64), 'micro_f1': tensor(0.8359, dtype=torch.float64)}, {'loss': tensor(0.6661, device='cuda:0'), 'acc': tensor(0.7734, dtype=torch.float64), 'macro_f1': tensor(0.6064, dtype=torch.float64), 'micro_f1': tensor(0.7734, dtype=torch.float64)}, {'loss': tensor(0.7138, device='cuda:0'), 'acc': tensor(0.7578, dtype=torch.float64), 'macro_f1': tensor(0.5623, dtype=torch.float64), 'micro_f1': tensor(0.7578, dtype=torch.float64)}, {'loss': tensor(0.7445, device='cuda:0'), 'acc': tensor(0.7500, dtype=torc