# Goals

Using mc_dropout.py we can generate a lot of outputs. Here we try to use them to actually predict edges.

In [125]:
# Imports
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm, trange
from matplotlib import pyplot as plt
from pathlib import Path
from tabulate import tabulate
from run import Runner
from collections import Counter
import networkx as nx

from mytorch.utils.goodies import FancyDict

# 0. Prep

Pull the graph, and get a notion of what's stored where

In [130]:
args = {'name' : 'testrun',
'dataset' : 'RLF/lf',
'model' : 'compgcn',
'score_func' : 'conve',
'opn' : 'corr',
'use_wandb' : False, 
'batch_size' : 128,
'gamma' : 40.0,
'gpu' : '-1',
'max_epochs' : 1,
'l2' : 0.0,
'lr' : 0.001,
'lbl_smooth' : 0.1,
'num_workers' : 10,
'seed' : 41504,
'restore' : False,
'bias' : False   ,
'num_bases' : -1,
'init_dim' : 100,
'gcn_dim' : 200 ,
'embed_dim' : None,
'gcn_layer' : 1     ,
'dropout' : 0.05    ,
'hid_drop' : 0.15    ,
'hid_drop2' : 0.15    ,
'feat_drop' : 0.15    ,
'k_w' : 10     ,
'k_h' : 20     ,
'num_filt' : 200,
'ker_sz' : 7     ,
'log_dir' : './log/',
'config_dir' : './config/',
'trim': False,
'trim_ratio': 0.00005,
'use_fasttext': False
}
args = FancyDict(args)

model = Runner(args)
# # Now load the saved model
model.load_model('./checkpoints/compgcn-conve-rlffam')


2023-08-12 16:44:04,272 - [INFO] - {}
2023-08-12 16:44:04,272 - [INFO] - {}
{}




In [216]:
graph_nx = pd.read_csv("data/RLF/all_triplets.txt", sep='\t')
graph_nx.columns = ["source", "type", "target"]
graph_nx['source'] = graph_nx['source'].str.lower()
graph_nx['target'] = graph_nx['target'].str.lower()
graph_nx['type'] = graph_nx['type'].str.lower()

#MultiDiGraph = Directed graph with multiple edges between two nodes 
Graphtype = nx.MultiDiGraph()
G = nx.from_pandas_edgelist(graph_nx, edge_attr='type', create_using=Graphtype)

nodes_degree = [(node, G.degree[node]) for node in G.nodes]
nodes_degree_dict = {node: degree for node, degree in nodes_degree}
degree_sorted = sorted(nodes_degree, key=lambda x: x[1], reverse=True)
count = Counter(item[1] for item in degree_sorted)

# DF Truc
nodes_degree_df = pd.DataFrame(nodes_degree, columns=['sub_name', 'sub_degree'])
# nodes_degree_df['sub_name'] = nodes_degree_df['sub_name'].str.lower()


In [173]:
# First load the graph dataframe
pathdir = Path('./mc_dropout') / 'compgcn-conve-rlffam'
graph = pd.read_pickle(pathdir / 'graph.pickle')

# TODO: this should already be present in next iterations donc c'est bon, delete it apres
graph['index'] = graph.index

graph

Unnamed: 0,sub,rel,train_objs,valid_objs,test_objs,all_obj,index
0,0,0,{1},-1,-1,[1],0
1,1,0,{0},-1,-1,[0],1
2,1,15,{17572},-1,-1,[17572],2
3,2,0,{5074},-1,-1,[5074],3
4,2,1,"{1857, 65, 20450, 3, 4932, 11239, 4809, 9228, ...","{17477, 2629, 2630}",{13476},"[65, 1857, 3, 4932, 20450, 2629, 11239, 17477,...",4
...,...,...,...,...,...,...,...
38807,26386,0,-1,-1,{18261},[18261],38807
38808,26387,0,-1,-1,{7933},[7933],38808
38809,26388,34,-1,-1,{878},[878],38809
38810,26389,0,-1,-1,{26084},[26084],38810


In [174]:
graph['sub_name'] = graph['sub'].map(model.id2ent)
graph['rel_name'] = graph['rel'].map(model.id2rel)
graph

Unnamed: 0,sub,rel,train_objs,valid_objs,test_objs,all_obj,index,sub_name,rel_name
0,0,0,{1},-1,-1,[1],0,investigation,syn
1,1,0,{0},-1,-1,[0],1,étude i.2,syn
2,1,15,{17572},-1,-1,[17572],2,étude i.2,v0
3,2,0,{5074},-1,-1,[5074],3,jupe i,syn
4,2,1,"{1857, 65, 20450, 3, 4932, 11239, 4809, 9228, ...","{17477, 2629, 2630}",{13476},"[65, 1857, 3, 4932, 20450, 2629, 11239, 17477,...",4,jupe i,antimagn
...,...,...,...,...,...,...,...,...,...
38807,26386,0,-1,-1,{18261},[18261],38807,rond de jambe i,syn
38808,26387,0,-1,-1,{7933},[7933],38808,inviter ii.b,syn
38809,26388,34,-1,-1,{878},[878],38809,tête-à-queue iii,oper1
38810,26389,0,-1,-1,{26084},[26084],38810,scène ii,syn


In [175]:
# Add node degrees here as well
graph = graph.merge(nodes_degree_df, how='left', on='sub_name')

assert graph['sub_degree'].isna().sum() == 0, "There are some nodes with no degree"

In [210]:
get_path('abus', 'abuser ii')

(['v0', 'métaphore'], [])

In [218]:
# Create an index of what sub, rel combination is stored where
fnames = sorted([int(fname.stem) for fname in pathdir.rglob('*.torch')])
def get_fname(ind: int, return_ind: bool = True, return_in_file_index: bool = False):
    prev = fnames[0]
    for i in range(len(fnames)):
        _from = fnames[i-1] if i>0 else 0
        _to = fnames[i]

        if _from <= ind < _to:
            # The file is here
            matching_fname = f"{_to}.torch" if not return_ind else _to
            if return_in_file_index:
                return matching_fname, ind-_from
            else:
                return matching_fname
        elif ind < _from:
            raise ValueError
        elif _to <= ind:
            continue

def get_index_for_one(sub: int, rel: int, graph: pd.DataFrame = graph) -> int:
    row = graph[(graph['sub'] == sub) & (graph['rel'] == rel)]
    return row.index.item()

def get_prediction_for_one(sub: int, rel: int):
    fname, gap = get_fname(get_index_for_one(sub, int), False)
    fname = pathdir / fname
    obj = torch.load(fname)
    
def _get_path_(sub: str, obj: str):
    global G

    # Try to get sp
    try:
        sp = nx.shortest_path(G, sub, obj)
    except nx.NetworkXNoPath:
        return []

    pathGraph = nx.path_graph(sp)

    real_path = []
    for ea in pathGraph.edges():
        edge = G.get_edge_data(ea[0], ea[1])
        if len(edge) == 2:
            raise IOError(f"Found multiple edges. Copy paste all of this to Priyansh stp")
        val = list(edge.values())[0]['type']
        real_path.append(val)
    return real_path


def get_path(sub: str, obj: str):
    right_rel = _get_path_(sub=sub, obj=obj)
    opposite_rel = _get_path_(sub=obj, obj=sub)
    return right_rel, opposite_rel

def get_density(sub: str) -> int:
    global nodes_degree_dict
    return nodes_degree_dict[sub]

        
# fnames
print(tabulate([(x, get_fname(x, False, True)) for x in range(20)], headers=["Index", "Filename"]))

  Index  Filename
-------  ---------------
      0  ('5.torch', 0)
      1  ('5.torch', 1)
      2  ('5.torch', 2)
      3  ('5.torch', 3)
      4  ('5.torch', 4)
      5  ('10.torch', 0)
      6  ('10.torch', 1)
      7  ('10.torch', 2)
      8  ('10.torch', 3)
      9  ('10.torch', 4)
     10  ('15.torch', 0)
     11  ('15.torch', 1)
     12  ('15.torch', 2)
     13  ('15.torch', 3)
     14  ('15.torch', 4)
     15  ('18.torch', 0)
     16  ('18.torch', 1)
     17  ('18.torch', 2)
     18  ('21.torch', 0)
     19  ('21.torch', 1)


Ok so now, we have the graph. Here the subject and relation columns appear in an index.
Corresponding to every prediction, we have stored 
- a (100 X num_ent) matrix of predictions with dropout
- a (num_ent) vector of predictions without dropout

E.g. sub 2 and rel 0 appear on index 3
Then we will
0. Find the index based on the dataframe.
1. Find the torch file which contains this particular index 
2. Navigate to that index and fetch the matrices
3. Compute the MC Dropout result based on some params
4. Compute the normal result
5. Mask out paths from both directions 
6. et voila

## The Functions which calculate the actual predictions

In [70]:
def top_k(preds, k=10):
    # Get the top k candidates for each prediction and make a mask like that (boolean)
    top_k = preds.argsort(dim=1, descending=True)[:,:k]
    res = torch.zeros_like(preds, dtype=torch.bool)
    for i, candidates in enumerate(top_k):
        res[i, candidates] = True
        
    return res

def score_threshold(preds, threshold = 0.1):
    return preds > threshold

def avg(pred_masked):
    per_entity_freq = pred_masked.mean(dim=0)
    # print top ten candidates
    scores, indices = per_entity_freq.sort(descending=True)[:10]
    print("Top scoring candidates by this method: \n\n(conf): node")
    for i in range(10):
        print(f"{scores[i]:.10f}: {indices[i].item()}")
        
    return per_entity_freq

# Ability 0: Compute Result for __one__ Sub + Rel combination

In [36]:
sub = 2
rel = 39

row = graph[(graph['sub'] == sub) & (graph['rel'] == rel)]
all_objs = row['all_obj'].to_list()
row.index.item().__class__

int

# Ability 1: Compute Result for a given list of subject, relation

In [49]:
graph[graph['sub'] == 1]

Unnamed: 0,sub,rel,train_objs,valid_objs,test_objs,all_obj,index
0,0,0,{1},-1,-1,[1],0
1,1,0,{0},-1,-1,[0],1
2,1,15,{17572},-1,-1,[17572],2
3,2,0,{5074},-1,-1,[5074],3
4,2,1,"{1857, 65, 20450, 3, 4932, 11239, 4809, 9228, ...","{17477, 2629, 2630}",{13476},"[65, 1857, 3, 4932, 20450, 2629, 11239, 17477,...",4
...,...,...,...,...,...,...,...
38807,26386,0,-1,-1,{18261},[18261],38807
38808,26387,0,-1,-1,{7933},[7933],38808
38809,26388,34,-1,-1,{878},[878],38809
38810,26389,0,-1,-1,{26084},[26084],38810


In [243]:
def aggregate(raw, selected, model: Runner, row: pd.Series, conf_threshold=0.5, max_items: int = 10,):
    # Selected is a boolean mat of (num_ent, num_samples)
    selected = selected.float()
    per_entity_freq = selected.mean(dim=0)
    avg = raw.mean(dim=0)
    
    # print top ten candidates
    conf, indices = per_entity_freq.sort(descending=True)
    conf = conf[:max_items+len(row['all_obj'])]
    indices = indices[:max_items+len(row['all_obj'])]
    # print("Top scoring candidates by this method: \n\n(conf): (agg. score): node")
    print("Top scoring candidates by this method: \n\n")
    headers = ['Sub ID','Rel ID','Obj ID','Sub Nm','Rel Nm','Obj Nm','Conf','Score','Sub Deg','Obj Deg', 'Short. Dist.','Right Path', 'Left Path'] 
    table_rows = []
    for conf, obj_id in zip(conf, indices):

        # Filter out the ones which are in org graph
        if obj_id in row['all_obj']:
            continue

        # Filter out the ones which have low confidence
        if conf < conf_threshold:
            continue
        
        # confidence score is _conf
        # object ID is _obj_id

        sub_id = row['sub'].item()
        rel_id = row['rel'].item()
        obj_id = obj_id.item()

        # Names
        sub_name = model.id2ent[sub_id]
        rel_name = model.id2rel[rel_id]
        obj_name = model.id2ent[obj_id]

        # Degrees
        sub_degree = row['sub_degree'].item()
        obj_degree = get_density(obj_name)

        # Graph data stuff
        right_path, left_path = get_path(sub_name, obj_name)
        right_path = '->'.join(right_path)
        left_path = '<-'.join(left_path[::-1])
        shortest_dist = min(len(right_path), len(left_path))

        # Score Stuff
        conf = round(conf.item(), 4)
        avg_score = round(avg[obj_id].item(), 8)

        table_rows.append([sub_id, rel_id, obj_id, sub_name, rel_name, obj_name, conf, avg_score, sub_degree, obj_degree, shortest_dist, right_path, left_path])

    print(tabulate(table_rows, headers=headers))
    return table_rows
            
        
        
    #     # print(f"{conf[i]:.4f}: {avg[indices[i]].item():.10f}: {model.id2ent[indices[i].item()]}")
    #     print(f"{conf:.4f}: {avg[_obj_id].item():.10f}: {_ind}")
                            
        
    # return per_entity_freq

# Stuff we want to get

- Name of nodes and relations
- Density of predicted object
- Shortest distance between subject, object
- Relations in the shortest distance
- Whether this appears in the vanilla prediction
- Confidence
- Aggregated Score

### Parameters 
- dropout: 0.05
- hid_drop: 0.15
- hid_drop2: 0.15
- feat_drop: 0.15
- top_k = 10
- confidence threshold = 0.5

In [None]:
pt = torch.load('mc_dropout/compgcn-conve-rlffam/5.torch')
pt.keys(), pt['vanilla'].shape, pt['mc_dropout'].shape

In [244]:
threshold_top_k = 10
threshold_confidence = 0.5
max_values_for_one_sub_rel = 10

In [245]:
all_predictions = [] # Store table rows here

In [246]:
mcpr = pt['mc_dropout'][1]
# Take top-k
topk = top_k(mcpr, k=threshold_top_k)

# Now to aggregate

In [247]:
row = graph.loc[1]
_ = aggregate(mcpr, topk, model=model, row=row, conf_threshold=0.5, max_items=10)

Top scoring candidates by this method: 


  Sub ID    Rel ID    Obj ID  Sub Nm     Rel Nm    Obj Nm         Conf     Score    Sub Deg    Obj Deg    Short. Dist.  Right Path     Left Path
--------  --------  --------  ---------  --------  -----------  ------  --------  ---------  ---------  --------------  -------------  -----------
       1         0     25101  étude i.2  syn       étudier i.4       1  0.059065          9          2               2  v0->extension  s0
       1         0     17572  étude i.2  syn       étudier i.1       1  0.110398          9          6               2  v0             s0


G1: Graph: RLF
E1:  Embeddngs: G1 + CompGCN

E1:
    - link predict algo 1
    - link predict algo 2
    
G1.1: G1 + (E1+ LPA1)
G1.2: G1 + (E1+ LPA2)

--- G1.1, G1.2 can be noisy, and have n'importe quoi triples. As long as performance is better @ NLP; ca marche.


Task(NLP; Graph):
1.    Task(__; G1)
2.    Task(__; G1.1)
3.    Task(__; G1.2)
    
If 2/3 >> 1: 
    - so we created a bigger graph; and this bigger graph is better for NLP truc
    