In [1]:
import warnings
warnings.filterwarnings('ignore')
from build_graph import BuildGraph
from py_bertgcn import BERTGCN
import torch.nn.functional as F
import torch 
from tqdm import tqdm
import torch.utils.data as Data
from torch.optim import lr_scheduler
from sklearn.metrics import accuracy_score
from ignite.engine import Events, Engine
from ignite.metrics import Accuracy, Loss, Precision, Recall

In [2]:
# Build Graph
# Construct the graph for the "UIT_VFSC" dataset; the builder exposes the
# graph as `bg.g` and the full document table as `bg.df_total`.
bg = BuildGraph("UIT_VFSC")
# Class count is the largest label value plus one
# (assumes labels are consecutive integer ids starting at 0 -- TODO confirm).
n_labels = int(bg.df_total["label"].max().item() + 1)
g = bg.g
print(g)

step pre processing
step add word doc edge


Processing documents: 16175it [00:07, 2049.62it/s]


step add word word edge


Constructing word pair count frequency: 100%|██████████| 87309/87309 [00:11<00:00, 7597.74it/s]
Adding word_word edges: 100%|██████████| 337534/337534 [00:02<00:00, 160672.79it/s]


step setup graph
Graph(num_nodes=19020, num_edges=595385,
      ndata_schemes={'x': Scheme(shape=(2845,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.float32), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={'weight': Scheme(shape=(), dtype=torch.float32)})


In [3]:
# Per-node boolean masks marking which nodes belong to each data split.
training_mask, val_mask, test_mask = (
    g.ndata[key] for key in ('train_mask', 'val_mask', 'test_mask')
)

In [4]:
# Number of nodes in each split. These are 0-dim tensors because they come
# from summing boolean masks.
nb_train, nb_val, nb_test = training_mask.sum(), val_mask.sum(), test_mask.sum()
node_features = g.ndata['x']
nb_node = node_features.shape[0]
# Word nodes are what remains once *every* document node is removed; the
# validation documents have to be subtracted as well, otherwise they are
# counted as words.
# (assumes every non-word node belongs to exactly one split -- TODO confirm)
nb_word = nb_node - nb_train - nb_val - nb_test


In [None]:
# Build model
# BERT-GCN classifier on top of the 'vinai/phobert-base-v2' checkpoint.
model = BERTGCN(
    bert_model_name='vinai/phobert-base-v2',
    classes=n_labels,
    m=0.7,
    n_hidden=300,
    dropout=0.5,
)

# Two parameter groups: the pretrained encoder gets a much smaller learning
# rate than the GCN layers.
optimizer = torch.optim.Adam(
    [
        dict(params=model.bert_model.parameters(), lr=1e-5),
        dict(params=model.gcn.parameters(), lr=1e-3),
    ],
    lr=1e-3,
)
# Multiply every learning rate by 0.1 once the scheduler has been stepped 30 times.
scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[30], gamma=0.1)
max_length = model.max_length

# Tokenise the whole corpus in one call
# (`model.bert` is called here like a Hugging Face tokenizer -- TODO confirm).
inputs = model.bert(
    bg.df_total["corpus"].tolist(),
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=max_length,
)
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
# From here on `max_length` is the padded width actually produced above.
max_length = input_ids.shape[1]

# Word nodes have no text, so they get all-zero rows; this makes both tensors
# have one row per graph node.
# NOTE(review): the zero rows are appended after all documents, which assumes
# word nodes come last in the graph's node order -- confirm against BuildGraph.
input_ids = torch.cat(
    (input_ids, torch.zeros(len(bg.vocab), max_length, dtype=torch.long)),
    dim=0,
)
attention_mask = torch.cat(
    (attention_mask, torch.zeros(len(bg.vocab), max_length, dtype=torch.long)),
    dim=0,
)

# Float copies of the split masks, stored on the graph under short names.
g.ndata['train'] = torch.FloatTensor(bg.training_mask)
g.ndata['test'] = torch.FloatTensor(bg.test_mask)
g.ndata['val'] = torch.FloatTensor(bg.val_mask)
max_acc = 0
max_f1 = 0

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Store the tokenised inputs on the graph so they can be gathered by node index.
g.ndata['input_ids'] = input_ids
g.ndata['attention_mask'] = attention_mask
# A node is a document node when it belongs to any of the three splits.
doc_mask = training_mask | val_mask | test_mask
# Per-node cache for encoder features; starts as zeros and is filled for
# document nodes by `update_feature`.
g.ndata['cls_feats'] = torch.zeros((nb_node, model.feat_dim))

In [None]:
def update_feature():
    """Recompute the cached encoder features of every document node.

    Runs ``model.bert_model`` in eval mode, without gradients, over the
    ``input_ids`` / ``attention_mask`` rows selected by ``doc_mask`` and writes
    the first-token vector of the last hidden layer into
    ``g.ndata['cls_feats']`` at those same rows.

    Returns:
        The module-level graph ``g`` (updated in place).
    """
    # Feed each batch to whatever device the encoder currently lives on instead
    # of pinning a fixed GPU index, which fails on hosts with fewer devices.
    device = next(model.bert_model.parameters()).device

    # No gradients are needed here, so documents are processed in plain batches.
    dataloader = Data.DataLoader(
        Data.TensorDataset(g.ndata['input_ids'][doc_mask], g.ndata['attention_mask'][doc_mask]),
        batch_size=64
    )

    model.eval()
    cls_list = []
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(dataloader, desc='Updating features'):
            last_hidden = model.bert_model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                output_hidden_states=True,
            ).hidden_states[-1]
            # Keep only the first token's vector and collect results on the CPU.
            cls_list.append(last_hidden[:, 0].cpu())
    cls_feat = torch.cat(cls_list, dim=0)

    g.ndata['cls_feats'][doc_mask] = cls_feat
    # Release cached allocator blocks left over from the large inference pass.
    torch.cuda.empty_cache()
    return g

In [None]:
# Training model
def train_step(engine, batch):
    """Run one optimisation step on a batch of node indices.

    Args:
        engine: The ignite engine invoking this step (unused).
        batch: A one-element sequence holding a tensor of node indices.

    Returns:
        ``(train_loss, train_acc)`` computed on the training nodes of the batch.
        When the batch contains no training node the pair is ``(0.0, 1)`` and
        no parameter update is performed.
    """
    model.train()
    optimizer.zero_grad()
    (idx, ) = batch

    train_mask = g.ndata['train'][idx].type(torch.BoolTensor)
    # The forward pass always runs, even if no training node is present, so any
    # side effects of the model call on the graph are preserved.
    y_pred = model(g, idx)[train_mask].to(torch.float32)
    y_true = g.ndata['label'][idx][train_mask].type(torch.long)

    if train_mask.any():
        loss = F.nll_loss(y_pred, y_true)
        loss.backward()
        optimizer.step()
        train_loss = loss.item()
        with torch.no_grad():
            train_acc = accuracy_score(
                y_true.detach().cpu(),
                y_pred.argmax(dim=1).detach().cpu(),
            )
    else:
        # nll_loss over zero samples is NaN; skip the update rather than
        # reporting NaN and stepping the optimizer without a real gradient.
        train_loss = 0.0
        train_acc = 1

    # Drop autograd history from the cached features (presumably attached by
    # the model's forward pass -- TODO confirm) so graphs do not accumulate.
    g.ndata['cls_feats'].detach_()
    return train_loss, train_acc


trainer = Engine(train_step)


def reset_graph(trainer):
    """End-of-epoch hook: advance the LR schedule and refresh cached features."""
    scheduler.step()
    update_feature()
    torch.cuda.empty_cache()


# Register the hook explicitly; it runs each time an epoch completes.
trainer.add_event_handler(Events.EPOCH_COMPLETED, reset_graph)


In [None]:
# test function
def test_step(engine, batch):
    """Evaluate the model on one batch of node indices.

    Args:
        engine: The ignite engine invoking this step (unused).
        batch: A one-element sequence holding a tensor of node indices.

    Returns:
        ``(y_pred, y_true)``: float32 model outputs and int64 labels for the
        nodes in the batch.
    """
    model.eval()
    (idx, ) = batch
    with torch.no_grad():
        predictions = model(g, idx).to(torch.float32)
        targets = g.ndata['label'][idx].to(torch.long)
    torch.cuda.empty_cache()
    return predictions, targets

In [None]:
# prepare index
batch_size = 64

# Take the node ids of each split straight from its boolean mask rather than
# assuming the splits occupy fixed contiguous index ranges; this stays correct
# whatever order the graph builder uses for document and word nodes.
train_idx = Data.TensorDataset(torch.nonzero(training_mask, as_tuple=True)[0])
val_idx = Data.TensorDataset(torch.nonzero(val_mask, as_tuple=True)[0])
test_idx = Data.TensorDataset(torch.nonzero(test_mask, as_tuple=True)[0])
# All document nodes together.
doc_idx = Data.ConcatDataset([train_idx, val_idx, test_idx])

idx_loader_train = Data.DataLoader(train_idx, batch_size=batch_size, shuffle=True)
idx_loader_val = Data.DataLoader(val_idx, batch_size=batch_size)
idx_loader_test = Data.DataLoader(test_idx, batch_size=batch_size)
idx_loader = Data.DataLoader(doc_idx, batch_size=batch_size, shuffle=True)

In [None]:
# Evaluation engine: each iteration calls `test_step`, and the attached
# metrics accumulate over the (y_pred, y_true) pairs it returns.
evaluator = Engine(test_step)
metrics = {
    'acc': Accuracy(),
    'nll': Loss(torch.nn.NLLLoss()),
    # average=False keeps precision and recall per class.
    'precision': Precision(average=False),
    'recall': Recall(average=False),
}
for metric_name, metric in metrics.items():
    metric.attach(evaluator, metric_name)


@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(trainer):
    """Evaluate on the train, validation and test loaders and print a summary.

    Runs at the end of every epoch. Accuracy is reported for all three splits;
    macro-averaged F1 is reported for the test split.
    """
    evaluator.run(idx_loader_train)
    train_acc = evaluator.state.metrics["acc"]

    evaluator.run(idx_loader_val)
    val_acc = evaluator.state.metrics["acc"]

    evaluator.run(idx_loader_test)
    metrics = evaluator.state.metrics
    test_acc = metrics["acc"]

    # Precision and recall are per-class tensors (the metrics were created with
    # average=False). Compute F1 per class and average it; summing precision
    # and recall over classes first gives a value that grows with the number of
    # classes and is not an F1 score.
    precision, recall = metrics["precision"], metrics["recall"]
    denom = precision + recall
    f1_per_class = torch.where(
        denom > 0,
        2 * precision * recall / denom.clamp(min=1e-12),
        torch.zeros_like(denom),  # class never predicted and never present
    )
    test_f1 = f1_per_class.mean().item()

    print(f"Epoch {trainer.state.epoch} -  train acc: {train_acc:.4f} - val acc: {val_acc:.4f} - test acc: {test_acc:.4f} - test f1: {test_f1:.4f}")
        

# Fill the cached document features once before training starts
# (update_feature returns the same module-level graph it updates in place).
g = update_feature()
# Train for 50 epochs over the loader that mixes train, validation and test
# document indices.
trainer.run(idx_loader, max_epochs=50)