# Label Propagation - Event Classification
This notebook establishes a training pipeline for our Event Embedding model.

In [None]:
import networkx as nx
from tqdm import tqdm
import pandas as pd


# auth.authenticate_user()
# print('Authenticated')

## 5W1H Graph Events

In [None]:
!gdown --id 1RF_bIo5ndxPhu9SJw-T8HBcuHyaGQGL0

Downloading...
From: https://drive.google.com/uc?id=1RF_bIo5ndxPhu9SJw-T8HBcuHyaGQGL0
To: /content/datasets.tar.gz
22.7MB [00:00, 85.7MB/s]


In [None]:
# Unpack the downloaded archive (gzip, verbose listing of every extracted file).
!tar -xzvf datasets.tar.gz

datasets_runs/
datasets_runs/run_1_google_news_5w1h_graph_hin.nx
datasets_runs/run_6_40er_5w1h_graph_hin.nx
datasets_runs/run_4_bbc_5w1h_graph_hin.nx
datasets_runs/run_8_gold_standard_5w1h_graph_hin.nx
datasets_runs/run_5_bbc_5w1h_graph_hin.nx
datasets_runs/run_9_google_news_5w1h_graph_hin.nx
datasets_runs/run_5_gold_standard_5w1h_graph_hin.nx
datasets_runs/run_2_bbc_5w1h_graph_hin.nx
datasets_runs/run_9_news_cluster_5w1h_graph_hin.nx
datasets_runs/run_7_40er_5w1h_graph_hin.nx
datasets_runs/run_9_gold_standard_5w1h_graph_hin.nx
datasets_runs/run_8_google_news_5w1h_graph_hin.nx
datasets_runs/run_10_bbc_5w1h_graph_hin.nx
datasets_runs/run_8_news_cluster_5w1h_graph_hin.nx
datasets_runs/run_2_news_cluster_5w1h_graph_hin.nx
datasets_runs/run_8_40er_5w1h_graph_hin.nx
datasets_runs/run_6_bbc_5w1h_graph_hin.nx
datasets_runs/run_4_google_news_5w1h_graph_hin.nx
datasets_runs/run_2_google_news_5w1h_graph_hin.nx
datasets_runs/run_7_gold_standard_5w1h_graph_hin.nx
datasets_runs/run_4_gold_standard_

# Regularization

In [None]:
import numpy as np
from tqdm.notebook import tqdm
import random


def LP(G, iterations=30, seed=None):
    """Propagate class labels from training nodes to every node of a graph.

    Nodes carrying a ``'train'`` attribute are treated as labelled: their
    ``'label'`` attribute is one-hot encoded into ``'y'`` and their score
    vector ``'f'`` stays clamped to ``'y'``.  Every other node repeatedly
    replaces its ``'f'`` with the weighted average of its neighbours' ``'f'``
    vectors, where each edge weight (``'weight'`` attribute, default 1.0) is
    divided by the square root of the neighbour's degree.  Nodes are visited
    in a freshly shuffled order on every iteration and updates are applied
    immediately (asynchronously).

    The graph is modified in place: ``'y'`` (training nodes) and ``'f'``
    (all nodes) are written as node attributes.

    Parameters
    ----------
    G : graph
        Graph exposing ``nodes``, ``neighbors``, ``degree`` and ``G[u][v]``
        edge-data access in the NetworkX style.
    iterations : int, optional
        Number of full passes over the nodes (default 30).
    seed : int or None, optional
        When given, the per-iteration node shuffle uses a private
        ``random.Random(seed)`` so the run is reproducible.  ``None`` keeps
        the previous behaviour of using the global ``random`` state.

    Returns
    -------
    (G, label_codes)
        The same graph object, and a dict mapping each training label to the
        index of its component in the ``'f'`` / ``'y'`` vectors.
    """
    # Give every distinct training label an integer code, in first-seen order.
    label_codes = {}
    for node in G.nodes():
        attrs = G.nodes[node]
        if 'train' in attrs:
            label_codes.setdefault(attrs['label'], len(label_codes))

    num_labels = len(label_codes)

    # One-hot target vector 'y' for each training node.
    for node in G.nodes():
        attrs = G.nodes[node]
        if 'train' in attrs:
            target = np.zeros(num_labels)
            target[label_codes[attrs['label']]] = 1.0
            attrs['y'] = target

    # Initialise the score vector 'f' on every node: a copy of 'y' for
    # labelled nodes, all zeros otherwise.
    nodes = []
    for node in G.nodes():
        attrs = G.nodes[node]
        attrs['f'] = attrs['y'] * 1.0 if 'y' in attrs else np.zeros(num_labels)
        nodes.append(node)

    # A private generator keeps seeded runs independent of global random state.
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle

    pbar = tqdm(range(0, iterations))

    for iteration in pbar:
        shuffle(nodes)
        energy = 0.0

        # Visit every node once per iteration.
        for node in nodes:
            attrs = G.nodes[node]
            f_old = np.array(attrs['f']) * 1.0

            if 'y' in attrs:
                # Labelled nodes are clamped to their target, so the
                # neighbour average would be discarded anyway.
                f_new = attrs['y'] * 1.0
            else:
                f_new = np.zeros(num_labels)
                sum_w = 0.0

                # Accumulate the degree-normalised weighted neighbour scores.
                for neighbor in G.neighbors(node):
                    w = G[node][neighbor].get('weight', 1.0)
                    w = w / np.sqrt(G.degree[neighbor])
                    f_new += w * G.nodes[neighbor]['f']
                    sum_w += w

                if sum_w != 0:
                    f_new /= sum_w
                else:
                    # Isolated node (or weights cancelling out): dividing
                    # would yield NaN, so keep the previous scores instead.
                    f_new = f_old

            attrs['f'] = f_new

            # Energy measures how much the stored scores actually moved, so
            # it tends to zero once the propagation has converged.
            energy += np.linalg.norm(f_new - f_old)

        message = 'Iteration '+str(iteration + 1)+' | Energy = '+str(energy)
        pbar.set_description(message)

    return G, label_codes

In [None]:
import pickle

# nx.read_gpickle was removed in NetworkX 3.0. Gpickle files are ordinary
# pickles, so the standard library loads them on any NetworkX version.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('datasets_runs/run_3_40er_5w1h_graph_hin.nx', 'rb') as graph_file:
    G = pickle.load(graph_file)
G

<networkx.classes.graph.Graph at 0x7fa49dfc6b70>

In [None]:
# Run label propagation with the default number of iterations. LP writes its
# results onto G's node attributes and returns the label -> code mapping.
G, label_codes = LP(G)

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [None]:
# Inspect the label -> vector index mapping returned by LP.
label_codes

{'business': 0, 'technology': 2, 'world': 1}

In [None]:
# Gather the gold label and the predicted label of every node flagged 'test'.
# The prediction is the label whose code equals the arg-max of the node's 'f'
# vector (None when no label carries that code).
code_to_label = {code: name for name, code in label_codes.items()}

y_true = []
y_pred = []
for node in G.nodes():
    attrs = G.nodes[node]
    if 'test' not in attrs:
        continue
    y_true.append(attrs['label'])
    y_pred.append(code_to_label.get(int(np.argmax(attrs['f']))))

In [None]:
# Print gold labels and predictions one above the other for a quick visual check.
print(y_true)
print(y_pred)

['business', 'world', 'world', 'business', 'technology', 'world', 'business', 'world', 'business', 'world', 'world', 'world', 'world', 'technology', 'world', 'business', 'world', 'technology', 'technology', 'business', 'business', 'business', 'technology', 'business', 'world', 'world', 'world', 'business', 'world', 'world', 'world', 'business']
['business', 'world', 'world', 'business', 'world', 'world', 'business', 'world', 'business', 'world', 'world', 'world', 'world', 'technology', 'world', 'business', 'world', 'world', 'world', 'business', 'business', 'business', 'world', 'business', 'world', 'world', 'world', 'business', 'world', 'world', 'world', 'business']


In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Score the single-graph predictions: plain accuracy and macro-averaged F1.
acc = accuracy_score(y_true, y_pred)
f1_macro = f1_score(y_true, y_pred, average='macro')

# No file name is tracked for this one-off run, so the name slot stays empty.
network_file = ''
print('--->', network_file, 'f1_macro', f1_macro, 'acc', acc)
#experimental_results.append((network_file,'f1_macro',f1_macro,'acc',acc,y_true,y_pred))

--->  f1_macro 0.7407407407407408 acc 0.875


In [None]:
from os import listdir
from os.path import isfile, join

path_datasets = 'datasets_runs/'

# listdir() returns entries in arbitrary, platform-dependent order, so sort
# them to make the run order reproducible. Keep only regular '.nx' files so a
# stray entry in the directory is never handed to the graph loader.
network_files = sorted(
    f for f in listdir(path_datasets)
    if f.endswith('.nx') and isfile(join(path_datasets, f))
)
print(network_files)

['run_1_google_news_5w1h_graph_hin.nx', 'run_6_40er_5w1h_graph_hin.nx', 'run_4_bbc_5w1h_graph_hin.nx', 'run_8_gold_standard_5w1h_graph_hin.nx', 'run_5_bbc_5w1h_graph_hin.nx', 'run_9_google_news_5w1h_graph_hin.nx', 'run_5_gold_standard_5w1h_graph_hin.nx', 'run_2_bbc_5w1h_graph_hin.nx', 'run_9_news_cluster_5w1h_graph_hin.nx', 'run_7_40er_5w1h_graph_hin.nx', 'run_9_gold_standard_5w1h_graph_hin.nx', 'run_8_google_news_5w1h_graph_hin.nx', 'run_10_bbc_5w1h_graph_hin.nx', 'run_8_news_cluster_5w1h_graph_hin.nx', 'run_2_news_cluster_5w1h_graph_hin.nx', 'run_8_40er_5w1h_graph_hin.nx', 'run_6_bbc_5w1h_graph_hin.nx', 'run_4_google_news_5w1h_graph_hin.nx', 'run_2_google_news_5w1h_graph_hin.nx', 'run_7_gold_standard_5w1h_graph_hin.nx', 'run_4_gold_standard_5w1h_graph_hin.nx', 'run_5_40er_5w1h_graph_hin.nx', 'run_3_gold_standard_5w1h_graph_hin.nx', 'run_4_40er_5w1h_graph_hin.nx', 'run_5_google_news_5w1h_graph_hin.nx', 'run_10_news_cluster_5w1h_graph_hin.nx', 'run_10_40er_5w1h_graph_hin.nx', 'run_9_40

In [None]:
import pickle

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm

# One tuple per graph file:
# (file name, 'f1_macro', f1 value, 'acc', accuracy value, y_true, y_pred)
experimental_results = []

for network_file in tqdm(network_files):

    print('Networkfile', network_file)

    # nx.read_gpickle was removed in NetworkX 3.0. Gpickle files are ordinary
    # pickles, so the standard library loads them on any NetworkX version.
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(path_datasets + network_file, 'rb') as graph_file:
        G = pickle.load(graph_file)

    G, label_codes = LP(G)

    # Invert the mapping once instead of searching it for every test node.
    code_to_label = {code: name for name, code in label_codes.items()}

    y_true = []
    y_pred = []
    for node in G.nodes():
        attrs = G.nodes[node]
        if 'test' in attrs:
            y_true.append(attrs['label'])
            # Predicted label = label whose code is the arg-max of 'f'
            # (None when no label carries that code).
            y_pred.append(code_to_label.get(int(np.argmax(attrs['f']))))

    f1_macro = f1_score(y_true, y_pred, average='macro')
    acc = accuracy_score(y_true, y_pred)

    print('--->', network_file, 'f1_macro', f1_macro, 'acc', acc)
    experimental_results.append((network_file, 'f1_macro', f1_macro, 'acc', acc, y_true, y_pred))

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Networkfile run_1_google_news_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_1_google_news_5w1h_graph_hin.nx f1_macro 0.7985375968569246 acc 0.84
Networkfile run_6_40er_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_6_40er_5w1h_graph_hin.nx f1_macro 0.7508547008547009 acc 0.75
Networkfile run_4_bbc_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_4_bbc_5w1h_graph_hin.nx f1_macro 0.4757027286439051 acc 0.4772727272727273
Networkfile run_8_gold_standard_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_8_gold_standard_5w1h_graph_hin.nx f1_macro 0.5323863636363636 acc 0.6973684210526315
Networkfile run_5_bbc_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_5_bbc_5w1h_graph_hin.nx f1_macro 0.4622348178137652 acc 0.45454545454545453
Networkfile run_9_google_news_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_9_google_news_5w1h_graph_hin.nx f1_macro 0.6717687074829932 acc 0.72
Networkfile run_5_gold_standard_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_5_gold_standard_5w1h_graph_hin.nx f1_macro 0.5732056070291364 acc 0.6842105263157895
Networkfile run_2_bbc_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_2_bbc_5w1h_graph_hin.nx f1_macro 0.31730769230769235 acc 0.36363636363636365
Networkfile run_9_news_cluster_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_9_news_cluster_5w1h_graph_hin.nx f1_macro 0.4957435820662775 acc 0.6209150326797386
Networkfile run_7_40er_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_7_40er_5w1h_graph_hin.nx f1_macro 0.5446775446775446 acc 0.75
Networkfile run_9_gold_standard_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_9_gold_standard_5w1h_graph_hin.nx f1_macro 0.6544507235683706 acc 0.7763157894736842
Networkfile run_8_google_news_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_8_google_news_5w1h_graph_hin.nx f1_macro 0.7880333951762523 acc 0.84
Networkfile run_10_bbc_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_10_bbc_5w1h_graph_hin.nx f1_macro 0.4718954248366013 acc 0.5
Networkfile run_8_news_cluster_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_8_news_cluster_5w1h_graph_hin.nx f1_macro 0.48978257610811 acc 0.6045751633986928
Networkfile run_2_news_cluster_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_2_news_cluster_5w1h_graph_hin.nx f1_macro 0.5017262030852452 acc 0.6274509803921569
Networkfile run_8_40er_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_8_40er_5w1h_graph_hin.nx f1_macro 0.8204555129842485 acc 0.84375
Networkfile run_6_bbc_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_6_bbc_5w1h_graph_hin.nx f1_macro 0.380491875274484 acc 0.4318181818181818
Networkfile run_4_google_news_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_4_google_news_5w1h_graph_hin.nx f1_macro 0.7056122448979593 acc 0.68
Networkfile run_2_google_news_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_2_google_news_5w1h_graph_hin.nx f1_macro 0.7676252319109462 acc 0.8
Networkfile run_7_gold_standard_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_7_gold_standard_5w1h_graph_hin.nx f1_macro 0.596592146959794 acc 0.7236842105263158
Networkfile run_4_gold_standard_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_4_gold_standard_5w1h_graph_hin.nx f1_macro 0.5342354224934317 acc 0.631578947368421
Networkfile run_5_40er_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_5_40er_5w1h_graph_hin.nx f1_macro 0.6394576905095839 acc 0.65625
Networkfile run_3_gold_standard_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_3_gold_standard_5w1h_graph_hin.nx f1_macro 0.5590261862320686 acc 0.6973684210526315
Networkfile run_4_40er_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_4_40er_5w1h_graph_hin.nx f1_macro 0.6619047619047619 acc 0.78125
Networkfile run_5_google_news_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_5_google_news_5w1h_graph_hin.nx f1_macro 0.8053375196232339 acc 0.88
Networkfile run_10_news_cluster_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_10_news_cluster_5w1h_graph_hin.nx f1_macro 0.578716850138267 acc 0.673202614379085
Networkfile run_10_40er_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_10_40er_5w1h_graph_hin.nx f1_macro 0.6924369747899158 acc 0.75
Networkfile run_9_40er_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_9_40er_5w1h_graph_hin.nx f1_macro 0.7183150183150183 acc 0.75
Networkfile run_10_google_news_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_10_google_news_5w1h_graph_hin.nx f1_macro 0.7761650114591292 acc 0.8
Networkfile run_6_google_news_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_6_google_news_5w1h_graph_hin.nx f1_macro 0.5654661864745899 acc 0.64
Networkfile run_1_news_cluster_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_1_news_cluster_5w1h_graph_hin.nx f1_macro 0.5202008993596896 acc 0.6372549019607843
Networkfile run_3_news_cluster_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_3_news_cluster_5w1h_graph_hin.nx f1_macro 0.5580913393770697 acc 0.6633986928104575
Networkfile run_5_news_cluster_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_5_news_cluster_5w1h_graph_hin.nx f1_macro 0.5677486141874393 acc 0.6764705882352942
Networkfile run_4_news_cluster_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_4_news_cluster_5w1h_graph_hin.nx f1_macro 0.5224553390776974 acc 0.6274509803921569
Networkfile run_7_bbc_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_7_bbc_5w1h_graph_hin.nx f1_macro 0.5365663322185061 acc 0.5681818181818182
Networkfile run_1_gold_standard_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_1_gold_standard_5w1h_graph_hin.nx f1_macro 0.6444673775943127 acc 0.75
Networkfile run_7_google_news_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_7_google_news_5w1h_graph_hin.nx f1_macro 0.5544217687074829 acc 0.6
Networkfile run_10_gold_standard_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_10_gold_standard_5w1h_graph_hin.nx f1_macro 0.6003035469787389 acc 0.7236842105263158
Networkfile run_6_gold_standard_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_6_gold_standard_5w1h_graph_hin.nx f1_macro 0.6501068648127472 acc 0.7631578947368421
Networkfile run_3_bbc_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_3_bbc_5w1h_graph_hin.nx f1_macro 0.4371490818549642 acc 0.4772727272727273
Networkfile run_1_bbc_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_1_bbc_5w1h_graph_hin.nx f1_macro 0.48097123464770525 acc 0.4772727272727273
Networkfile run_2_gold_standard_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_2_gold_standard_5w1h_graph_hin.nx f1_macro 0.6306455476927614 acc 0.7236842105263158
Networkfile run_3_google_news_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_3_google_news_5w1h_graph_hin.nx f1_macro 0.6890109890109891 acc 0.8
Networkfile run_9_bbc_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_9_bbc_5w1h_graph_hin.nx f1_macro 0.36185117525270977 acc 0.38636363636363635
Networkfile run_2_40er_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_2_40er_5w1h_graph_hin.nx f1_macro 0.48926237161531283 acc 0.6875
Networkfile run_8_bbc_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_8_bbc_5w1h_graph_hin.nx f1_macro 0.44988344988344997 acc 0.45454545454545453
Networkfile run_6_news_cluster_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_6_news_cluster_5w1h_graph_hin.nx f1_macro 0.5497998693479325 acc 0.6535947712418301
Networkfile run_1_40er_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_1_40er_5w1h_graph_hin.nx f1_macro 0.47509578544061304 acc 0.65625
Networkfile run_7_news_cluster_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_7_news_cluster_5w1h_graph_hin.nx f1_macro 0.5084017981876038 acc 0.6241830065359477
Networkfile run_3_40er_5w1h_graph_hin.nx


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


---> run_3_40er_5w1h_graph_hin.nx f1_macro 0.7407407407407408 acc 0.875



In [None]:
# Tabulate the collected result tuples, one row per graph file. No column
# names are supplied, so pandas labels the columns 0..6 in tuple order.
df_results = pd.DataFrame(experimental_results)
df_results

Unnamed: 0,0,1,2,3,4,5,6
0,run_1_google_news_5w1h_graph_hin.nx,f1_macro,0.798538,acc,0.84,"[Equifax breach, IPhone-X Green-Line, Star War...","[North Korea Missile, IPhone-X Green-Line, Sta..."
1,run_6_40er_5w1h_graph_hin.nx,f1_macro,0.750855,acc,0.75,"[business, world, world, business, business, t...","[business, world, world, business, business, b..."
2,run_4_bbc_5w1h_graph_hin.nx,f1_macro,0.475703,acc,0.477273,"[business, politics, sport, business, politics...","[business, politics, sport, entertainment, tec..."
3,run_8_gold_standard_5w1h_graph_hin.nx,f1_macro,0.532386,acc,0.697368,"[toberone-gate, clinton blames comey, cubs win...","[toberone-gate, clinton blames comey, clinton ..."
4,run_5_bbc_5w1h_graph_hin.nx,f1_macro,0.462235,acc,0.454545,"[business, politics, sport, business, politics...","[business, tech, sport, tech, tech, tech, poli..."
5,run_9_google_news_5w1h_graph_hin.nx,f1_macro,0.671769,acc,0.72,"[IPhone-X Green-Line, Dieselgate, IPhone-X Gre...","[IPhone-X Green-Line, Star Wars: Battlefront I..."
6,run_5_gold_standard_5w1h_graph_hin.nx,f1_macro,0.573206,acc,0.684211,"[toberone-gate, clinton blames comey, china we...","[toberone-gate, clinton blames comey, toberone..."
7,run_2_bbc_5w1h_graph_hin.nx,f1_macro,0.317308,acc,0.363636,"[business, politics, politics, sport, business...","[entertainment, tech, politics, politics, tech..."
8,run_9_news_cluster_5w1h_graph_hin.nx,f1_macro,0.495744,acc,0.620915,"[harambe, The_New_Infinity_War_Trailer, elon_m...","[harambe, thousands_flee_fireworks_explosion_a..."
9,run_7_40er_5w1h_graph_hin.nx,f1_macro,0.544678,acc,0.75,"[business, business, world, business, business...","[business, business, world, business, business..."


In [None]:
# Persist the results table. The legacy '.xls' writer (xlwt) was removed in
# pandas 2.0, so an '.xls' target raises there; write the modern '.xlsx'
# format instead (needs an installed xlsx engine such as openpyxl).
df_results.to_excel('LP_results_r1.xlsx')