# GAT Event Embedding
This notebook establishes a training pipeline for our Event Embedding model.

## Installing our libraries and required scripts

In [None]:
# Fetch the pyGAT fork; a later cell changes into this directory, and `models` /
# `event_graph_utils` are imported afterwards (presumably from this repo).
!git clone https://github.com/joaopedromattos/pyGAT
# NOTE(review): spektral is installed here but not imported in any cell visible
# in this notebook - confirm it is still required.
!pip install --quiet spektral

Cloning into 'pyGAT'...
remote: Enumerating objects: 50, done.[K
remote: Counting objects: 100% (50/50), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 213 (delta 26), reused 34 (delta 13), pack-reused 163[K
Receiving objects: 100% (213/213), 328.27 KiB | 325.00 KiB/s, done.
Resolving deltas: 100% (115/115), done.
[K     |████████████████████████████████| 112kB 11.7MB/s 
[?25h

In [None]:
# Install a fork of sentence-transformers and the gdown Google Drive downloader.
!pip install git+https://github.com/rmarcacini/sentence-transformers
!pip install gdown
# Download an archive from Google Drive (presumably the
# distiluse-base-multilingual-cased.zip unpacked on the next line).
!gdown https://drive.google.com/uc?id=1NV5t1YhyyOzMF5zAovfbSLdZZLvqrfZ_
# Unpack the archive into ./language_model.
!unzip distiluse-base-multilingual-cased.zip -d language_model
from sentence_transformers import SentenceTransformer, LoggingHandler
# NOTE(review): the model is requested by name, not by the ./language_model
# path unpacked above - confirm which copy is meant to be used.
language_model = SentenceTransformer('distiluse-base-multilingual-cased')

Collecting git+https://github.com/rmarcacini/sentence-transformers
  Cloning https://github.com/rmarcacini/sentence-transformers to /tmp/pip-req-build-id3vp4vc
  Running command git clone -q https://github.com/rmarcacini/sentence-transformers /tmp/pip-req-build-id3vp4vc
Collecting transformers<3.2.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |████████████████████████████████| 890kB 1.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 31.4MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)


100%|██████████| 504M/504M [00:08<00:00, 56.0MB/s]


In [None]:
import os

# Enter the cloned repository so that its modules and the datasets downloaded
# later resolve relative to it. The guard keeps the cell idempotent: running it
# a second time would otherwise look for ./pyGAT/pyGAT and raise.
if os.path.basename(os.getcwd()) != 'pyGAT':
    os.chdir('./pyGAT')

In [None]:
import networkx as nx
from tqdm import tqdm
import pandas as pd
from google.colab import auth
from google.cloud import bigquery
from google.cloud import bigquery_storage
import numpy as np
import logging
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer, LabelEncoder
from event_graph_utils import mount_graph, regularization, process_event_dataset_from_networkx

# auth.authenticate_user()
# print('Authenticated')

## 5W1H Graph Events

In [None]:
# Download the datasets archive from Google Drive into the working directory
# (the recorded output shows it saved as datasets.tar.gz).
!gdown --id 1RF_bIo5ndxPhu9SJw-T8HBcuHyaGQGL0

Downloading...
From: https://drive.google.com/uc?id=1RF_bIo5ndxPhu9SJw-T8HBcuHyaGQGL0
To: /content/pyGAT/datasets.tar.gz
22.7MB [00:00, 37.0MB/s]


In [None]:
# Unpack the archive; per the recorded output it expands into datasets_runs/.
!tar -xzvf datasets.tar.gz

datasets_runs/
datasets_runs/run_1_google_news_5w1h_graph_hin.nx
datasets_runs/run_6_40er_5w1h_graph_hin.nx
datasets_runs/run_4_bbc_5w1h_graph_hin.nx
datasets_runs/run_8_gold_standard_5w1h_graph_hin.nx
datasets_runs/run_5_bbc_5w1h_graph_hin.nx
datasets_runs/run_9_google_news_5w1h_graph_hin.nx
datasets_runs/run_5_gold_standard_5w1h_graph_hin.nx
datasets_runs/run_2_bbc_5w1h_graph_hin.nx
datasets_runs/run_9_news_cluster_5w1h_graph_hin.nx
datasets_runs/run_7_40er_5w1h_graph_hin.nx
datasets_runs/run_9_gold_standard_5w1h_graph_hin.nx
datasets_runs/run_8_google_news_5w1h_graph_hin.nx
datasets_runs/run_10_bbc_5w1h_graph_hin.nx
datasets_runs/run_8_news_cluster_5w1h_graph_hin.nx
datasets_runs/run_2_news_cluster_5w1h_graph_hin.nx
datasets_runs/run_8_40er_5w1h_graph_hin.nx
datasets_runs/run_6_bbc_5w1h_graph_hin.nx
datasets_runs/run_4_google_news_5w1h_graph_hin.nx
datasets_runs/run_2_google_news_5w1h_graph_hin.nx
datasets_runs/run_7_gold_standard_5w1h_graph_hin.nx
datasets_runs/run_4_gold_standard_

# Features from the Adjacency Matrix

In [None]:
import numpy as np
from tqdm.notebook import tqdm
import random
import networkx as nx


def features_by_adj(G):
    """
    Uses each node's adjacency-matrix row as that node's feature vector.
    Parameters:
        G -> Networkx graph; it is modified in place.
    Returns:
        G -> The same graph object. Every node gets an 'f' attribute holding
             its row of the adjacency matrix as a 1-D NumPy array; columns
             follow the iteration order of G.nodes().
    """
    nodes = list(G.nodes())

    # toarray() always yields a plain 2-D ndarray, whether networkx hands back
    # a SciPy sparse *matrix* or a sparse *array*. The previous
    # todense() + row.tolist()[0] idiom only worked for np.matrix rows; on an
    # ndarray it stores a single scalar per node instead of the whole row.
    adj_matrix = nx.adjacency_matrix(G, nodelist=nodes).toarray()

    for row, node in zip(adj_matrix, nodes):
        # Copy the row so each node owns an independent array, not a view.
        G.nodes[node]['f'] = np.array(row)

    return G

In [None]:
import numpy as np
import networkx as nx
import random
from tqdm.notebook import tqdm
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import logging
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split


def process_event_dataset_from_networkx(G, features_attr="f"):
    """
    Builds an event graph dataset used in GAT model
    Parameters:
        G -> Graph representation of the event network (Networkx graph).
             Every node must carry an 'id' attribute and a feature vector
             under `features_attr`. The ids are used as list/row indices, so
             they are assumed to be the integers 0..num_nodes-1.
             'train', 'test' and 'label' attributes are optional; for
             'train'/'test' only the presence of the key matters.
        features_attr -> Name of the node attribute holding the feature vector (str)
    Returns:
        adj -> Adjacency matrix from nx.adjacency_matrix, rows/columns ordered by node 'id'.
        features -> A NumPy matrix with our graph features, rows ordered by node 'id'.
        labels -> List of length num_nodes; labels[id] is the integer label
                  code of that node, or -1 when the node has no 'label'.
        idx_train -> List with the ids of the nodes that have a 'train' attribute.
        idx_test -> List with the ids of the nodes that have a 'test' attribute.
        df_labels -> DataFrame with columns ['event_id', 'label', 'label_code'],
                     one row per labelled node.
    """

    num_nodes = len(G.nodes)

    feature_rows = []   # (node id, feature vector)
    idx_train = []
    idx_test = []
    label_rows = []     # [node id, raw label, integer label code]
    label_codes = {}    # raw label -> code, numbered in order of first appearance
    for node in G.nodes():
        attrs = G.nodes[node]
        node_id = attrs['id']
        feature_rows.append((node_id, attrs[features_attr]))
        if 'train' in attrs:
            idx_train.append(node_id)
        if 'test' in attrs:
            idx_test.append(node_id)
        if 'label' in attrs:
            code = label_codes.setdefault(attrs['label'], len(label_codes))
            label_rows.append([node_id, attrs['label'], code])

    # Row i of the feature matrix belongs to the node whose id is i.
    feature_rows.sort(key=lambda pair: pair[0])
    features = np.array([vector for _, vector in feature_rows])

    # Passing the column names to the constructor keeps this working when the
    # graph has no labelled node (assigning .columns on an empty frame raises).
    df_labels = pd.DataFrame(label_rows, columns=['event_id', 'label', 'label_code'])

    labels = [-1] * num_nodes
    for event_id, _, label_code in label_rows:
        labels[event_id] = label_code

    # Features, labels and the train/test indices are all keyed by node id, so
    # the adjacency matrix has to use the same ordering instead of the graph's
    # insertion order.
    nodes_by_id = sorted(G.nodes(), key=lambda node: G.nodes[node]['id'])
    adj = nx.adjacency_matrix(G, nodelist=nodes_by_id)

    return adj, features, labels, idx_train, idx_test, df_labels

## Training

In [None]:
from __future__ import division
from __future__ import print_function

import os
import glob
import time
import random
import argparse
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import scipy.sparse as sp


from models import GAT, SpGAT


def accuracy(output, labels):
    """Fraction of rows in `output` whose highest-scoring column equals the label.

    Parameters:
        output -> 2-D tensor of per-class scores, one row per sample.
        labels -> 1-D tensor with the expected class index of each row.
    Returns:
        A 0-dim double tensor: number of matching rows divided by len(labels).
    """
    _, predicted = output.max(1)
    predicted = predicted.type_as(labels)
    hits = predicted.eq(labels).double().sum()
    return hits / len(labels)


def normalize_adj(mx):
    """Symmetrically normalize a sparse matrix by its row sums.

    With D the diagonal matrix of the row sums of `mx`, this returns
    (mx . D^-1/2)^T . D^-1/2, which equals D^-1/2 . mx . D^-1/2 when `mx` is
    symmetric. It is not a plain row normalization.

    Parameters:
        mx -> SciPy sparse matrix.
    Returns:
        The normalized sparse matrix. Rows/columns whose row sum is zero are
        scaled by 0 instead of infinity.
    """
    rowsum = np.array(mx.sum(1))
    # A zero row sum produces inf here; the warning is silenced because those
    # entries are explicitly replaced by 0 on the next line.
    with np.errstate(divide='ignore'):
        r_inv_sqrt = np.power(rowsum, -0.5).flatten()
    r_inv_sqrt[np.isinf(r_inv_sqrt)] = 0.
    r_mat_inv_sqrt = sp.diags(r_inv_sqrt)
    return mx.dot(r_mat_inv_sqrt).transpose().dot(r_mat_inv_sqrt)

class Namespace(object):
    """Attribute-style view of a mapping: every key becomes an instance attribute."""

    def __init__(self, adict):
        # Copy the entries into the instance dictionary in a single step.
        vars(self).update(adict)

class GAT_wrapper():
    """
    Thin training/evaluation wrapper around the GAT / SpGAT models.

    Typical use: build it with a dict (or namespace-like object) of
    hyper-parameters, call train_pipeline(...) once, then compute_test().
    The tensors prepared by train_pipeline are kept on the instance so that
    compute_test can reuse them.
    """

    def __init__(self, args={"alpha": 0.2, "cuda": True, "dropout": 0.6, "epochs": 10, "fastmode": False, "hidden": 8, "lr": 0.005, "nb_heads": 8, "no_cuda": False, "patience": 100, "seed": 72, "sparse": False, "weight_decay": 0.0005}):
        """
        Parameters:
            args -> dict of hyper-parameters, or any object exposing the same
                    names as attributes. A dict is copied into a Namespace,
                    so the caller's dict (and the default) is never mutated.
        """
        # NOTE(review): `cuda` is used exactly as given - there is no
        # torch.cuda.is_available() check and the "no_cuda" entry is never
        # consulted in this class.
        if isinstance(args, dict):
            args = Namespace(args)

        self.args = args

        self.model = None

        self.loss_test = 0.0
        self.acc_test = 0.0

        self.adj = None
        self.features = None
        self.labels = None
        self.idx_train = None
        self.idx_val = None
        self.idx_test = None

    @staticmethod
    def _checkpoint_epoch(filename):
        """Return the epoch encoded in '<epoch>.pkl', or None for any other file name."""
        try:
            return int(filename.split('.')[0])
        except ValueError:
            return None

    def compute_test(self):
        """
        Evaluates the trained model on the test indices.

        Must be called after train_pipeline. Prints the test loss/accuracy,
        stores them in self.loss_test / self.acc_test and returns
        (loss_test, acc_test, predicted class index for every test node).
        """
        self.model.eval()
        # Pure inference: no autograd graph is needed for the metrics.
        with torch.no_grad():
            output = self.model(self.features, self.adj)
            loss_test = F.nll_loss(
                output[self.idx_test], self.labels[self.idx_test])
            acc_test = accuracy(output[self.idx_test], self.labels[self.idx_test])
        print("Test set results:",
              "loss= {:.4f}".format(loss_test.item()),
              "accuracy= {:.4f}".format(acc_test.item()))

        self.loss_test = loss_test
        self.acc_test = acc_test

        return loss_test, acc_test, output[self.idx_test].max(1)[1]

    def train_pipeline(self, adj, features, labels, idx_train, idx_val, idx_test, *args):
        """
        Trains a GAT (or SpGAT when args.sparse is set) and restores the best epoch.

        Parameters:
            adj -> Adjacency matrix; self-loops are added and it is normalized here.
            features -> Feature matrix (dense or SciPy sparse), one row per node.
            labels -> Integer class per node; only entries referenced by the
                      index lists are used for the loss, but the number of
                      classes is taken from labels.max() + 1.
            idx_train, idx_val, idx_test -> Node indices of each split.
            *args -> Accepted for backward compatibility; unused.
        Returns:
            The trained model with the weights of the epoch that had the
            lowest validation loss (also stored in self.model).

        Side effects: one '<epoch>.pkl' checkpoint per epoch is written to the
        current working directory; numerically named '*.pkl' files other than
        the best epoch are deleted, and '<best_epoch>.pkl' is left behind.
        """

        adj = normalize_adj(adj + sp.eye(adj.shape[0]))

        # Convert to plain ndarrays: todense() may return np.matrix, which is
        # a less reliable input for the tensor constructors below.
        if (sp.issparse(adj)):
            adj = np.asarray(adj.todense())

        if (sp.issparse(features)):
            features = np.asarray(features.todense())

        # With networkx, we no longer need to convert from one-hot encoding...
        #labels = np.where(labels)[1]

        adj = torch.FloatTensor(adj)
        features = torch.FloatTensor(features)
        labels = torch.LongTensor(labels)
        idx_train = torch.LongTensor(idx_train)
        idx_val = torch.LongTensor(idx_val)
        idx_test = torch.LongTensor(idx_test)

        random.seed(self.args.seed)
        np.random.seed(self.args.seed)
        torch.manual_seed(self.args.seed)
        if self.args.cuda:
            torch.cuda.manual_seed(self.args.seed)

        # Model and optimizer
        if self.args.sparse:
            model = SpGAT(nfeat=features.shape[1],
                          nhid=self.args.hidden,
                          nclass=int(labels.max()) + 1,
                          dropout=self.args.dropout,
                          nheads=self.args.nb_heads,
                          alpha=self.args.alpha)
        else:
            model = GAT(nfeat=features.shape[1],
                        nhid=self.args.hidden,
                        nclass=int(labels.max()) + 1,
                        dropout=self.args.dropout,
                        nheads=self.args.nb_heads,
                        alpha=self.args.alpha)
        optimizer = optim.Adam(model.parameters(),
                               lr=self.args.lr,
                               weight_decay=self.args.weight_decay)

        if self.args.cuda:
            model.cuda()
            features = features.cuda()
            adj = adj.cuda()
            labels = labels.cuda()
            idx_train = idx_train.cuda()
            idx_val = idx_val.cuda()
            idx_test = idx_test.cuda()

        # Keep the prepared tensors on the instance so compute_test() can reuse them.
        self.adj = adj
        self.features = features
        self.labels = labels
        self.idx_train = idx_train
        self.idx_val = idx_val
        self.idx_test = idx_test

        def train(epoch):
            """Runs one optimisation step and returns the validation loss."""
            t = time.time()
            model.train()
            optimizer.zero_grad()
            output = model(features, adj)
            loss_train = F.nll_loss(output[idx_train], labels[idx_train])
            acc_train = accuracy(output[idx_train], labels[idx_train])
            loss_train.backward()
            optimizer.step()

            if not self.args.fastmode:
                # Evaluate validation set performance separately,
                # deactivates dropout during validation run.
                model.eval()
                output = model(features, adj)

            loss_val = F.nll_loss(output[idx_val], labels[idx_val])
            acc_val = accuracy(output[idx_val], labels[idx_val])
            print('Epoch: {:04d}'.format(epoch+1),
                  'loss_train: {:.4f}'.format(loss_train.data.item()),
                  'acc_train: {:.4f}'.format(acc_train.data.item()),
                  'loss_val: {:.4f}'.format(loss_val.data.item()),
                  'acc_val: {:.4f}'.format(acc_val.data.item()),
                  'time: {:.4f}s'.format(time.time() - t))

            return loss_val.data.item()

        # Train model
        t_total = time.time()
        loss_values = []
        bad_counter = 0
        # Start from +inf so the first epoch always counts as an improvement;
        # an arbitrary finite start (epochs + 1) would silently select epoch 0
        # whenever the validation loss never dropped below it.
        best = float('inf')
        best_epoch = 0
        for epoch in range(self.args.epochs):
            loss_values.append(train(epoch))

            torch.save(model.state_dict(), '{}.pkl'.format(epoch))
            if loss_values[-1] < best:
                best = loss_values[-1]
                best_epoch = epoch
                bad_counter = 0
            else:
                bad_counter += 1

            if bad_counter == self.args.patience:
                break

            # Drop checkpoints older than the current best; '*.pkl' files
            # whose name is not an epoch number are left alone.
            for file in glob.glob('*.pkl'):
                epoch_nb = self._checkpoint_epoch(file)
                if epoch_nb is not None and epoch_nb < best_epoch:
                    os.remove(file)

        # Drop checkpoints written after the best epoch.
        for file in glob.glob('*.pkl'):
            epoch_nb = self._checkpoint_epoch(file)
            if epoch_nb is not None and epoch_nb > best_epoch:
                os.remove(file)

        print("Optimization Finished!")
        print("Total time elapsed: {:.4f}s".format(time.time() - t_total))

        # Restore best model
        print('Loading {}th epoch'.format(best_epoch))
        model.load_state_dict(torch.load('{}.pkl'.format(best_epoch)))

        self.model = model

        return model



In [None]:
from os import listdir
from os.path import isfile, join
# Directory (relative to the working directory) holding the extracted graphs.
path_datasets = 'datasets_runs/'
# Keep only regular files; sub-directories are ignored.
network_files = list(filter(
    lambda name: isfile(join(path_datasets, name)),
    listdir(path_datasets),
))
print(network_files)

['run_1_google_news_5w1h_graph_hin.nx', 'run_6_40er_5w1h_graph_hin.nx', 'run_4_bbc_5w1h_graph_hin.nx', 'run_8_gold_standard_5w1h_graph_hin.nx', 'run_5_bbc_5w1h_graph_hin.nx', 'run_9_google_news_5w1h_graph_hin.nx', 'run_5_gold_standard_5w1h_graph_hin.nx', 'run_2_bbc_5w1h_graph_hin.nx', 'run_9_news_cluster_5w1h_graph_hin.nx', 'run_7_40er_5w1h_graph_hin.nx', 'run_9_gold_standard_5w1h_graph_hin.nx', 'run_8_google_news_5w1h_graph_hin.nx', 'run_10_bbc_5w1h_graph_hin.nx', 'run_8_news_cluster_5w1h_graph_hin.nx', 'run_2_news_cluster_5w1h_graph_hin.nx', 'run_8_40er_5w1h_graph_hin.nx', 'run_6_bbc_5w1h_graph_hin.nx', 'run_4_google_news_5w1h_graph_hin.nx', 'run_2_google_news_5w1h_graph_hin.nx', 'run_7_gold_standard_5w1h_graph_hin.nx', 'run_4_gold_standard_5w1h_graph_hin.nx', 'run_5_40er_5w1h_graph_hin.nx', 'run_3_gold_standard_5w1h_graph_hin.nx', 'run_4_40er_5w1h_graph_hin.nx', 'run_5_google_news_5w1h_graph_hin.nx', 'run_10_news_cluster_5w1h_graph_hin.nx', 'run_10_40er_5w1h_graph_hin.nx', 'run_9_40

In [None]:
# Debug aid: print the working directory that relative paths such as
# datasets_runs/ are resolved against.
!pwd

/content/pyGAT


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm

import pickle

# One tuple per processed graph:
# (file, 'f1_macro', value, 'acc', value, y_true, y_pred)
experimental_results = []

for network_file in tqdm(network_files):

  # Skipped because these graphs sometimes use up all the RAM.
  if 'news_cluster_5w1h_graph_hin.nx' in network_file: continue
  print('Networkfile',network_file)

  # NOTE(security): the .nx files are pickles - only load files from a trusted
  # source, unpickling can execute arbitrary code.
  graph_path = path_datasets+network_file
  if hasattr(nx, 'read_gpickle'):
    G = nx.read_gpickle(graph_path)
  else:
    # Newer networkx releases no longer provide read_gpickle; the file is a
    # plain pickle of the graph object.
    with open(graph_path, 'rb') as graph_file:
      G = pickle.load(graph_file)

  features_by_adj(G)
  adj, features, labels, idx_train, idx_test, df_labels = process_event_dataset_from_networkx(G)
  print(adj.shape,features.shape,len(idx_train),len(idx_test))
  gat = GAT_wrapper({"alpha": 0.2, "cuda": False, "dropout": 0.5, "epochs": 20, "fastmode": False, "hidden": 8, "lr": 0.005, "nb_heads": 8, "no_cuda": False, "patience": 100, "seed": 72, "sparse": False, "weight_decay": 0.0005})
  # NOTE(review): idx_train is passed as both the training and the validation
  # split, so the "best epoch" is selected on training-set loss - confirm this
  # is intended.
  gat.train_pipeline(adj, features, labels, idx_train, idx_train, idx_test)
  loss, acc, output = gat.compute_test()
  # .cpu() is a no-op on CPU tensors and keeps this working if cuda is enabled.
  y_pred = output.cpu().numpy()

  # Ground truth for every test id that belongs to an ':event' node, in
  # idx_test order. Set/dict lookups replace a scan over all nodes plus a
  # DataFrame filter for each test id.
  event_node_ids = {G.nodes[node]['id'] for node in G.nodes() if ':event' in node}
  label_code_by_event = dict(zip(df_labels.event_id, df_labels.label_code))
  y_true = [label_code_by_event[event_id] for event_id in idx_test if event_id in event_node_ids]

  f1_macro = f1_score(y_true, y_pred, average='macro')
  acc = accuracy_score(y_true, y_pred)

  print('--->' ,network_file,'f1_macro',f1_macro,'acc',acc)
  experimental_results.append((network_file,'f1_macro',f1_macro,'acc',acc,y_true,y_pred))
  # Release the per-graph objects before loading the next graph.
  del gat
  del adj
  del features
  del G

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Networkfile run_1_google_news_5w1h_graph_hin.nx
(227, 227) (227, 227) 7 25
Epoch: 0001 loss_train: 2.0023 acc_train: 0.1429 loss_val: 1.8686 acc_val: 0.4286 time: 0.3335s
Epoch: 0002 loss_train: 1.9403 acc_train: 0.1429 loss_val: 1.8033 acc_val: 0.5714 time: 0.1902s
Epoch: 0003 loss_train: 1.5676 acc_train: 0.5714 loss_val: 1.7377 acc_val: 0.7143 time: 0.1729s
Epoch: 0004 loss_train: 1.9834 acc_train: 0.1429 loss_val: 1.6764 acc_val: 0.7143 time: 0.1896s
Epoch: 0005 loss_train: 1.7537 acc_train: 0.2857 loss_val: 1.6108 acc_val: 0.7143 time: 0.1563s
Epoch: 0006 loss_train: 1.7703 acc_train: 0.2857 loss_val: 1.5440 acc_val: 0.7143 time: 0.1723s
Epoch: 0007 loss_train: 1.6440 acc_train: 0.5714 loss_val: 1.4786 acc_val: 0.7143 time: 0.1897s
Epoch: 0008 loss_train: 1.4287 acc_train: 0.4286 loss_val: 1.4157 acc_val: 0.7143 time: 0.1725s
Epoch: 0009 loss_train: 1.7376 acc_train: 0.5714 loss_val: 1.3508 acc_val: 0.7143 time: 0.1727s
Epoch: 0010 loss_train: 1.3853 acc_train: 0.4286 loss_val: 1.

In [None]:
# One row per processed graph. Columns are unnamed (0-6) because each result is
# a plain tuple: (file, 'f1_macro', value, 'acc', value, y_true, y_pred).
df_results = pd.DataFrame(experimental_results)
df_results

Unnamed: 0,0,1,2,3,4,5,6
0,run_1_google_news_5w1h_graph_hin.nx,f1_macro,0.442857,acc,0.48,"[1, 0, 4, 2, 2, 5, 0, 2, 5, 6, 5, 4, 2, 1, 3, ...","[4, 2, 4, 1, 2, 5, 3, 1, 1, 2, 1, 4, 2, 4, 3, ..."
1,run_6_40er_5w1h_graph_hin.nx,f1_macro,0.715666,acc,0.84375,"[0, 1, 1, 0, 0, 2, 1, 0, 1, 0, 1, 0, 1, 1, 2, ...","[0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, ..."
2,run_4_bbc_5w1h_graph_hin.nx,f1_macro,0.397116,acc,0.409091,"[1, 2, 3, 1, 2, 0, 3, 3, 0, 4, 3, 0, 3, 2, 2, ...","[1, 2, 0, 0, 0, 0, 3, 0, 0, 2, 3, 0, 0, 4, 0, ..."
3,run_8_gold_standard_5w1h_graph_hin.nx,f1_macro,0.482059,acc,0.618421,"[0, 1, 3, 4, 5, 0, 1, 6, 3, 0, 5, 4, 6, 1, 8, ...","[0, 1, 1, 8, 9, 0, 1, 6, 3, 0, 5, 8, 6, 1, 8, ..."
4,run_5_bbc_5w1h_graph_hin.nx,f1_macro,0.525696,acc,0.522727,"[1, 2, 3, 1, 2, 0, 3, 0, 4, 3, 0, 2, 2, 4, 0, ...","[1, 0, 3, 0, 0, 0, 0, 0, 2, 0, 1, 2, 0, 0, 0, ..."
5,run_9_google_news_5w1h_graph_hin.nx,f1_macro,0.505028,acc,0.56,"[0, 2, 0, 4, 2, 2, 5, 0, 5, 5, 6, 5, 4, 1, 2, ...","[4, 4, 2, 4, 2, 4, 5, 4, 5, 5, 2, 5, 4, 4, 4, ..."
6,run_5_gold_standard_5w1h_graph_hin.nx,f1_macro,0.48992,acc,0.605263,"[0, 1, 2, 3, 4, 0, 1, 6, 0, 1, 5, 7, 0, 4, 6, ...","[0, 1, 0, 0, 8, 0, 1, 6, 0, 1, 5, 7, 0, 0, 6, ..."
7,run_2_bbc_5w1h_graph_hin.nx,f1_macro,0.31338,acc,0.340909,"[1, 2, 2, 3, 1, 2, 3, 0, 4, 3, 0, 3, 2, 2, 4, ...","[0, 0, 2, 2, 0, 0, 0, 0, 0, 3, 0, 2, 0, 0, 0, ..."
8,run_7_40er_5w1h_graph_hin.nx,f1_macro,0.596618,acc,0.8125,"[0, 0, 1, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 1, 1, ...","[0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, ..."
9,run_9_gold_standard_5w1h_graph_hin.nx,f1_macro,0.546596,acc,0.684211,"[0, 0, 2, 3, 4, 5, 1, 0, 1, 6, 3, 0, 1, 5, 7, ...","[0, 0, 0, 0, 8, 1, 1, 0, 1, 6, 3, 0, 1, 0, 0, ..."


In [None]:
# Persist the results table to the working directory.
# NOTE(review): writing the legacy .xls format depends on an Excel writer
# engine being available in the installed pandas - confirm before re-running.
df_results.to_excel('gat_noreg_results_r1.xls')