## Name: TTT for link prediction
### Date: 01/04/2024
### Status: Pending
### Idea: 
1. Fit a link prediction method (let's say embeddings) for starters
2. Improve the local embeddings using TTT approach based on "neighborhood" or "similar triples"

### Results:
Pending

In [1]:
from prime_adj.pam_creation import create_pam_matrices
from prime_adj.utils import get_sparsity
from prime_adj.data_loading import load_data
import numpy as np
import pandas as pd
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearnex import patch_sklearn
patch_sklearn()


import tqdm

project_to_path = {
    "codex-s": "/home/kbougatiotis/GIT/Prime_Adj/data/codex-s/",
    "codex-l": "/home/kbougatiotis/GIT/Prime_Adj/data/codex-l/",
    "WN18RR": "/home/kbougatiotis/GIT/Prime_Adj/data/WN18RR/",
    "FB15k-237": "../data/FB15k-237/",
    "YAGO3-10-DR": "../data/YAGO3-10-DR/",
    "YAGO3-10": "../data/YAGO3-10",
    "NELL-995": "../data/NELL-995",
    "Simpathic": "/home/kbougatiotis/GIT/PAM_Biomedical/Simpathic/data/simpathic/stratified_folds_big_with_train/split_0/"
    #"hetionet": "./data/Hetionet/hetionet-v1.0-edges.tsv",
    #"ogbl-wikikg2": "path",
}
project_name = 'codex-s'
add_inverse_edges="YES__INV"
spacing_strategy = 'step_100000'
max_order = 3

df_train_orig, df_train, df_eval, df_test, already_seen_triples_ = load_data(project_to_path[project_name], project_name, add_inverse_edges=add_inverse_edges)
(pam_1hop_lossless_orig,
    _,
    _,
    rel2id_orig,
    _) = create_pam_matrices(df_train_orig, max_order=1, use_log=False, eliminate_diagonal=False)

(pam_1hop_lossless,
    pam_powers,
    node2id,
    rel2id,
    broke_cause_of_sparsity) = create_pam_matrices(df_train, 
                                                   max_order=max_order, 
                                                   use_log=True, 
                                                   eliminate_diagonal=True, 
                                                   spacing_strategy=spacing_strategy
                                                   )
for p_i, p in enumerate(pam_powers):
    print(f"P@{p_i+1} Sparsity : {get_sparsity(p):.2f} %")

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Will add the inverse train edges as well..
Total: 67603 triples in train + eval!)
In train: 65776
In valid: 1827
In test: 1828
P@1 Sparsity : 98.55 %
P@2 Sparsity : 47.66 %
P@3 Sparsity : 4.05 %


In [2]:
import graphblas as gb
from prime_adj.utils import get_sparsity

def get_ppr_damping_coeff(num_step:int, num_matrix_order:int, a:float=0.85):
    b = (1-a)
    return (num_step - num_matrix_order + 1) * a ** (num_step - num_matrix_order) * b ** (num_matrix_order)

def get_powers_of_adjacency(PPR_W, max_order:int=3):
    A_gb = gb.io.from_scipy_sparse(PPR_W)

    # Generate the PAM^k matrices
    pam_powers = [PPR_W]
    pam_power_gb = [A_gb]
    for ii in range(1, max_order):
        print(f"Hop {ii}: {get_sparsity(pam_powers[-1])} %")
        cur_previous_power = pam_power_gb[-1].dup()
        updated_power_gb = cur_previous_power.mxm(A_gb).new()

        updated_power = gb.io.to_scipy_sparse(updated_power_gb)

        pam_powers.append(updated_power)
        pam_power_gb.append(updated_power_gb)
    print(f"Hop {ii}: {get_sparsity(pam_powers[-1])} %")
    return pam_powers


def create_ppr_w(pam, add_inverse_edge):
    A_with_ones = pam.copy()
    A_with_ones.data = np.ones(len(A_with_ones.data))
    eps = 1e-4
    tmp_degree = scipy.sparse.diags(1 / (A_with_ones.sum(axis=1) + eps))
    PPR_W = scipy.sparse.eye(A_with_ones.shape[0]) + tmp_degree @ A_with_ones
    # if add_inverse_edges:
    #     rows, cols = PPR_W.nonzero()
    #     PPR_W[cols, rows] = PPR_W[rows, cols]
    #     PPR_W.eliminate_zeros()
    return PPR_W

def get_ppr(pam, num_step:int=5, add_inverse_edge:bool=False,  a:float=0.85, fill_diagonal_zeros:bool=True, fill_original_zeros:bool=False):
    print(f"Creating PPR matrix")
    PPR_W = create_ppr_w(pam, add_inverse_edge)
    PPR_powers = get_powers_of_adjacency(PPR_W, max_order=num_step)
    power_coeffs = [get_ppr_damping_coeff(num_step, i, a) for i in range(num_step)]
    aggr = power_coeffs[0] * PPR_powers[0]
    for coeff, power in zip(power_coeffs[1:],PPR_powers[1:]):
        aggr += coeff*power
    if fill_diagonal_zeros:
        aggr.setdiag(0)
    if fill_original_zeros:
        rows, cols = PPR_W.nonzero()
        aggr[rows, cols] = 0
    aggr.eliminate_zeros()
    return aggr

ppr = get_ppr(pam_powers[0], num_step=3)
get_sparsity(ppr)

Creating PPR matrix
Hop 1: 98.4968901341888 %
Hop 2: 47.66230715012922 %
Hop 2: 4.047901505285278 %


4.097065713741522

In [15]:
from scipy.sparse.linalg import svds


# def exp_A(pam_powers, order_of_ppr=3, weights="uniform"):
#     aggr = pam_powers[0]

#     if isinstance(weights, str):
#         if weights == "uniform":
#             w = np.ones(order_of_ppr)
#         elif weights == "power_decay":
#             w = np.array([1 / np.sqrt(power) for power in range(1, order_of_ppr + 1)])
#         elif weights == "power_incay":
#             w = np.array([np.sqrt(power) for power in range(1, order_of_ppr + 1)])
#     else:
#         w = weights

#     aggr.data = np.ones(len(aggr.data)) * w[0]
#     c = 1
#     print(w)
#     while c < order_of_ppr and c <= len(pam_powers):
#         cur_power = pam_powers[c]
#         cur_power.data = np.ones(len(cur_power.data)) * w[c]
#         aggr = aggr + cur_power
#         c += 1
#     return aggr



from scipy.sparse.linalg import svds
# to_use = exp_A(pam_powers, order_of_ppr=max_order, weights='power_incay')
u, s, vh = svds(ppr, k=100)
# recon = u @ np.diag(s) @ vh
# error = ((ppr - recon)**2).sum()
# print(f"Errpr: {error:.2f} Per cell: {error / (ppr.shape[0]*ppr.shape[1]):.2f}")

In [6]:
df_train['head_mapped'] = df_train['head'].map(node2id)#.astype(int)
df_train['tail_mapped'] = df_train['tail'].map(node2id)#.astype(int)
df_train['rel_mapped'] = df_train['rel'].map(rel2id)
df_train = df_train.dropna()
df_train['head_mapped'] = df_train['head_mapped'].astype(int)
df_train['tail_mapped'] = df_train['tail_mapped'].astype(int)

df_eval['head_mapped'] = df_eval['head'].map(node2id)#.astype(int)
df_eval['tail_mapped'] = df_eval['tail'].map(node2id)#.astype(int)
df_eval['rel_mapped'] = df_eval['rel'].map(rel2id)
df_eval = df_eval.dropna()
df_eval['head_mapped'] = df_eval['head_mapped'].astype(int)
df_eval['tail_mapped'] = df_eval['tail_mapped'].astype(int)

df_test['head_mapped'] = df_test['head'].map(node2id)#.astype(int)
df_test['rel_mapped'] = df_test['rel'].map(rel2id)#.astype(int)

df_test['tail_mapped'] = df_test['tail'].map(node2id)#.astype(int)
df_test = df_test.dropna()
df_test['head_mapped'] = df_test['head_mapped'].astype(int)
df_test['tail_mapped'] = df_test['tail_mapped'].astype(int)

In [None]:
# ppr = u @ vh
# argsorted = np.fliplr(np.argsort(ppr, axis=1))

In [8]:
df_test_mapped = df_test.copy()
df_test_mapped["rel"] = df_test["rel"].map(rel2id)
df_test_mapped["head"] = df_test["head"].map(node2id)
df_test_mapped["tail"] = df_test["tail"].map(node2id)
df_test_mapped.dropna(inplace=True)

node2uniqe = {node_index:unq_index for unq_index, node_index in enumerate(df_test_mapped["head"].unique().astype(int))}

In [12]:
argsorted = []
for head_id in tqdm.tqdm_notebook(node2uniqe.keys()):
    try:
        ppr.nnz
        cur_sort = np.argsort(ppr[[head_id], :].toarray().ravel())[::-1]
    except:
        cur_sort = np.argsort(ppr[head_id].ravel())[::-1]
    argsorted.append(cur_sort)
argsorted = np.array(argsorted)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for head_id in tqdm.tqdm_notebook(node2uniqe.keys()):


  0%|          | 0/1045 [00:00<?, ?it/s]

In [19]:
pam_1hop_lossless_orig

<2034x2034 sparse array of type '<class 'numpy.int64'>'
	with 32611 stored elements in Compressed Sparse Row format>

In [159]:
from scipy.sparse.linalg import svds as sp_svds
u_lossless, s_lossless, vh_lossless = sp_svds(np.random.rand(100,100), k=10, which='LM')
s_lossless

array([ 4.83138103,  4.94985556,  5.00027938,  5.09473394,  5.20881341,
        5.27151821,  5.40173932,  5.49074566,  5.68810948, 49.95493098])

array([  261077.13205047,   267638.977796  ,   273114.4676951 ,
         339503.09730677,   340592.46022521,   343739.38725787,
         375431.38315829,   413215.76656574,   483089.28053417,
       21040297.18205072])

In [None]:
ppr_lossless = u_lossless @ np.diag(s_lossless) @ vh_lossless
# ppr_lossless = np.clip(ppr_lossless, 0, pam_1hop_lossless_orig.max())
# possible_labels = np.sort(np.unique([0] + pam_1hop_lossless_orig.data.tolist()))
# inds = np.digitize(ppr_lossless, possible_labels) - 1
# ppr_lossless = possible_labels[inds]
argsorted_lossless = np.fliplr(np.argsort(ppr_lossless, axis=1))

In [143]:
np.linalg.norm(ppr_lossless - pam_1hop_lossless_orig) / np.prod(pam_1hop_lossless_orig.shape)

0.004755627649672863

In [144]:
import tqdm
from collections import defaultdict
def get_filtering_cache(df_train, df_eval, df_test):
    cache_triples = defaultdict(list)
    all_triples = pd.concat((df_train, df_eval, df_test))
    print(all_triples.columns)
    for triple in tqdm.tqdm_notebook(all_triples.to_records()):
        # Adding h,r ->t
        cache_triples[(triple[1], triple[2])].append(triple[3])
        # Addubg r, t -> h
        cache_triples[(triple[2], triple[3])].append(triple[1])
    return cache_triples

id2node = {v:k for k,v in node2id.items()}
id2rel_orig = {v:k for k,v in rel2id_orig.items()}


cache_triples = get_filtering_cache(df_train_orig, df_eval, df_eval)

Index(['head', 'rel', 'tail', 'rel_mapped', 'head_mapped', 'tail_mapped'], dtype='object')


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for triple in tqdm.tqdm_notebook(all_triples.to_records()):


  0%|          | 0/36542 [00:00<?, ?it/s]

In [145]:
# res = []
# for i_row, row in tqdm.tqdm_notebook(df_test_mapped.iterrows(), total=len(df_test_mapped)):
#     unq_index = node2uniqe[int(row['head'])]
#     sim_scores = np.abs(ppr_lossless[unq_index] - row['rel_mapped'])
#     already_seen_tails = cache_triples[(row['head'], row['rel'])]
#     if len(already_seen_tails) > 0:
#         sim_scores[np.array([node2id[tail] for tail in already_seen_tails])] =  max(sim_scores) + 1 
#     tail_indices = np.argsort(sim_scores)
#     index_of_tail = int(row['tail'])
#     rank = tail_indices.tolist().index(index_of_tail) + 1
#     cur_res = {
#         "predicted": tail_indices,
#         "probas": sim_scores[tail_indices],
#         "rank": rank,
#         **row,
#     }
#     res.append(cur_res) 
# df_res_ppr = pd.DataFrame(res)

# pr_results = {}
# pr_results["MRR"] = (1 / df_res_ppr["rank"]).mean()
# print(f"MRR:{pr_results['MRR']:.4f}")

# for k in [1, 3, 10,100,1000,10000,20000] + [0.1, 0.2, 0.5]:
#     if isinstance(k, int):
#         k_int = k
#     else:
#         k_int = int(k*argsorted.shape[1])
#     pr_results[f"h@{k}"] = (df_res_ppr["rank"] <= k_int).sum() / df_res_ppr.shape[0]
#     print(f"Hits@{k}: {pr_results[f'h@{k}']:.4f}")

In [149]:
res = []
for i_row, row in tqdm.tqdm_notebook(df_test_mapped.iterrows(), total=len(df_test_mapped)):
    unq_index = node2uniqe[int(row['head'])]
    tail_indices = argsorted_lossless[unq_index]
    index_of_tail = int(row['tail'])
    rank = tail_indices.tolist().index(index_of_tail) + 1
    cur_res = {
        "predicted": tail_indices,
        "probas": argsorted_lossless[unq_index][tail_indices],
        "rank": rank,
        **row,
    }
    res.append(cur_res) 
df_res_ppr = pd.DataFrame(res)

pr_results = {}
pr_results["MRR"] = (1 / df_res_ppr["rank"]).mean()
print(f"MRR:{pr_results['MRR']:.4f}")

for k in [1, 3, 10,100,1000,10000,20000] + [0.1, 0.2, 0.5]:
    if isinstance(k, int):
        k_int = k
    else:
        k_int = int(k*argsorted.shape[1])
    pr_results[f"h@{k}"] = (df_res_ppr["rank"] <= k_int).sum() / df_res_ppr.shape[0]
    print(f"Hits@{k}: {pr_results[f'h@{k}']:.4f}")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i_row, row in tqdm.tqdm_notebook(df_test_mapped.iterrows(), total=len(df_test_mapped)):


  0%|          | 0/1828 [00:00<?, ?it/s]

MRR:0.0070
Hits@1: 0.0005
Hits@3: 0.0038
Hits@10: 0.0137
Hits@100: 0.1012
Hits@1000: 0.4136
Hits@10000: 1.0000
Hits@20000: 1.0000
Hits@0.1: 0.1887
Hits@0.2: 0.3206
Hits@0.5: 0.4152


In [None]:
## Implementing a two step approach.

# Given a query, fetch similar queries + 

In [266]:
import torch
import torch.nn as nn
import torch.optim as optim
import dgl
import numpy as np
from sklearn.metrics import roc_auc_score

# Sample WN18RR data (replace with actual dataset loading)
def load_wn18rr_sample(num_entities=100, num_relations=10, num_edges=500):
    src = np.random.randint(0, num_entities, num_edges)
    dst = np.random.randint(0, num_entities, num_edges)
    rel = np.random.randint(0, num_relations, num_edges)
    graph = dgl.graph((src, dst))
    graph.edata['rel'] = torch.tensor(rel)

    train_mask = torch.rand(num_edges) < 0.8
    val_mask = (torch.rand(num_edges) < 0.1) & ~train_mask
    test_mask = ~(train_mask | val_mask)

    train_graph = dgl.edge_subgraph(graph, train_mask, relabel_nodes=False)
    val_graph = dgl.edge_subgraph(graph, val_mask, relabel_nodes=False)
    test_graph = dgl.edge_subgraph(graph, test_mask, relabel_nodes=False)

    return train_graph, val_graph, test_graph, num_entities, num_relations, {i:i for i in np.arange(num_entities)}, {i:i for i in np.arange(num_relations)}


# Sample WN18RR data (replace with actual dataset loading)
def load_wn18rr_true(name_of_project):
    df_train_orig, df_train, df_eval, df_test, already_seen_triples_ = load_data(project_to_path[project_name], project_name, add_inverse_edges=add_inverse_edges)
    df_train['type']= 'train'
    df_eval['type'] = 'eval'
    df_test['type'] = 'test'
    
    df_all = pd.concat([df_train, df_eval, df_test])
    unq_nodes = np.concatenate((df_all['head'].unique(), df_all['tail'].unique()))
    unq_rels = df_all['rel'].unique()
    node2id_graph = {v:index for index, v in enumerate(unq_nodes)}
    rel2id_graph = {r:index for index, r in enumerate(unq_rels)}
    
    df_all['head_mapped'] = df_all['head'].map(node2id_graph)
    df_all['tail_mapped'] = df_all['tail'].map(node2id_graph)
    df_all['rel_mapped'] = df_all['rel'].map(rel2id_graph)
    df_all.dropna(inplace=True)
    
    graph = dgl.graph((df_all['head_mapped'].values.astype(int), df_all['tail_mapped'].values.astype(int)))
    graph.edata['rel'] = torch.tensor(df_all['rel_mapped'].values)
    print(df_all['head_mapped'].unique().shape[0], df_all['tail_mapped'].unique().shape[0], graph)
    return 
    train_mask = torch.tensor((df_all['type'] == 'train').values.astype(bool))
    val_mask = torch.tensor((df_all['type'] == 'eval').values.astype(bool))
    test_mask = torch.tensor((df_all['type'] == 'test').values.astype(bool))

    train_graph = dgl.edge_subgraph(graph, train_mask, relabel_nodes=False)
    val_graph = dgl.edge_subgraph(graph, val_mask, relabel_nodes=False)
    test_graph = dgl.edge_subgraph(graph, test_mask, relabel_nodes=False)

    return train_graph, val_graph, test_graph, len(node2id), len(rel2id), node2id_graph, rel2id_graph

class TransE(nn.Module):
    def __init__(self, num_entities, num_relations, embedding_dim, margin=1.0, p_norm=1):
        super(TransE, self).__init__()
        self.entity_embeddings = nn.Embedding(num_entities, embedding_dim)
        self.relation_embeddings = nn.Embedding(num_relations, embedding_dim)
        self.margin = margin
        self.p_norm = p_norm

        nn.init.xavier_uniform_(self.entity_embeddings.weight.data)
        nn.init.xavier_uniform_(self.relation_embeddings.weight.data)

    def forward(self, h, r, t):
        print(h,h.dtype)
        print(r,r.dtype)
        print(t,t.dtype)
        h_emb = self.entity_embeddings(h)
        r_emb = self.relation_embeddings(r)
        t_emb = self.entity_embeddings(t)
        return h_emb, r_emb, t_emb

    def compute_loss(self, h_emb, r_emb, t_emb, neg_h_emb, neg_t_emb):
        pos_score = torch.norm(h_emb + r_emb - t_emb, p=self.p_norm, dim=-1)
        # print(neg_h_emb.shape, r_emb.shape, t_emb.shape)
        # print(r_emb.reshape(r_emb.shape[0], 1, r_emb.shape[1]).repeat(1,10,1).shape, t_emb.reshape(t_emb.shape[0], 1, t_emb.shape[1]).repeat(1,10,1).shape)
        expanded_r_emb =  r_emb.reshape(r_emb.shape[0], 1, r_emb.shape[1]).repeat(1,10,1)
        expanded_h_emb = h_emb.reshape(h_emb.shape[0], 1, h_emb.shape[1]).repeat(1,10,1)
        expanded_t_emb = t_emb.reshape(t_emb.shape[0], 1, t_emb.shape[1]).repeat(1,10,1)
        neg_score_h = torch.norm(neg_h_emb + expanded_r_emb -expanded_t_emb, p=self.p_norm, dim=-1)
        neg_score_t = torch.norm(expanded_h_emb + expanded_r_emb - neg_t_emb, p=self.p_norm, dim=-1)

        loss_h = torch.relu(self.margin + pos_score - neg_score_h.mean(1)).mean()
        loss_t = torch.relu(self.margin + pos_score - neg_score_t.mean(1)).mean()
        return loss_h + loss_t

    def get_embeddings(self, h, r, t):
        h_emb = self.entity_embeddings(h) if h is not None else None
        r_emb = self.relation_embeddings(r) if r is not None else None
        t_emb = self.entity_embeddings(t) if t is not None else None
        return h_emb, r_emb, t_emb

def generate_negative_samples(graph, batch_size):
    num_neg = 10 # number of negative samples per positive sample
    neg_head = torch.randint(0, graph.num_nodes(), (batch_size,num_neg))
    neg_tail = torch.randint(0, graph.num_nodes(), (batch_size,num_neg))
    return neg_head, neg_tail

def train(model, graph, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        h, t = graph.edges()
        r = graph.edata['rel']
        h_emb, r_emb, t_emb = model(h, r, t)
        neg_h, neg_t = generate_negative_samples(graph, h_emb.shape[0])
        print(h_emb.shape, r_emb.shape, neg_h.shape, neg_t.shape)
        neg_h_emb = model.entity_embeddings(neg_h)
        neg_t_emb = model.entity_embeddings(neg_t)

        loss = model.compute_loss(h_emb, r_emb, t_emb, neg_h_emb, neg_t_emb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch}, Loss: {loss.item()}")

# def extract_neighborhood(graph, node, k=2):
#     # bfs = dgl.sampling.bfs_nodes_generator(graph, node, k)
#     # neighborhood_nodes = torch.cat([nodes for nodes in bfs])
#     # subgraph = dgl.node_subgraph(graph, neighborhood_nodes)
#     print(node.item())
#     subgraph, _ = dgl.khop_in_subgraph(graph, nodes=node.item(), k=k, relabel_nodes=False)
#     return subgraph


def extract_neighborhood(graph, node, k=2):
    """Extracts the k-hop in-neighborhood for a node in a homogeneous graph."""
    visited = {node.item()}
    queue = [(node.item(), 0)]  # (node, distance)
    nodes_to_include = [node.item()]

    while queue:
        current_node, distance = queue.pop(0)
        if distance >= k:
            continue

        neighbors = graph.in_edges(current_node)[0].tolist() # get source nodes of in_edges
        for neighbor in neighbors:
            if neighbor not in visited:
                visited.add(neighbor)
                queue.append((neighbor, distance + 1))
                nodes_to_include.append(neighbor)

    subgraph = dgl.node_subgraph(graph, nodes_to_include, relabel_nodes=False, store_ids=True)
    return subgraph

def adapt_transE(model, query, neighborhood, learning_rate=0.01, num_steps=10):
    h_test, r_test, t_test = query
    h_test_emb, r_test_emb, t_test_emb = model.get_embeddings(h_test, r_test, t_test)

    if t_test is None:
        adapted_h = h_test_emb.clone().detach().requires_grad_(True)
        adapted_t = model.entity_embeddings.weight.clone().detach().requires_grad_(True)
    elif h_test is None:
        adapted_h = model.entity_embeddings.weight.clone().detach().requires_grad_(True)
        adapted_t = t_test_emb.clone().detach().requires_grad_(True)

    optimizer = optim.Adam([adapted_h, adapted_t], lr=learning_rate)

    for _ in range(num_steps):
        optimizer.zero_grad()
        loss = torch.tensor(0.0, requires_grad=True)
        for h_i, t_i, eid in zip(neighborhood.edges()[0], neighborhood.edges()[1], neighborhood.edata[dgl.EID]):
            print(h_i, h_i.shape, t_i, t_i.shape, eid, eid.shape)
            if eid > train_graph.edata['rel'].shape[0]:
                print('Wrong indexing...')
            else:
                r_i = train_graph.edata['rel'][eid]
                h_i_emb, r_i_emb, t_i_emb = model.get_embeddings(h_i, r_i, t_i)
                if t_test is None:
                    loss += torch.relu(model.margin + torch.norm(adapted_h + r_i_emb - t_i_emb,p=model.p_norm) - torch.norm(h_i_emb + r_i_emb - t_i_emb, p=model.p_norm))
                    print('added loss', loss)
                elif h_test is None:
                    loss += torch.relu(model.margin + torch.norm(h_i_emb + r_i_emb - adapted_t, p=model.p_norm) - torch.norm(h_i_emb + r_i_emb - t_i_emb, p=model.p_norm))
                    print('added loss', loss)
        loss.backward()
        optimizer.step()

    if t_test is None:
        scores = -torch.norm(adapted_h + r_test_emb - model.entity_embeddings.weight, dim=1, p=model.p_norm)
        return scores
    elif h_test is None:
        scores = -torch.norm(model.entity_embeddings.weight + r_test_emb - adapted_t, dim=1, p=model.p_norm)
        return scores

def evaluate(model, test_graph):
    model.eval()
    auc_scores = []
    with torch.no_grad():
        for h, t, eid in zip(test_graph.edges()[0], test_graph.edges()[1], test_graph.edata[dgl.EID]):
            if eid > test_graph.edata['rel'].shape[0]:
                print('Wrong indexing...')
            else:
                r = test_graph.edata['rel'][eid]
                neighborhood = extract_neighborhood(test_graph, h)
                scores = adapt_transE(model, (h, r, None), neighborhood)
                labels = torch.zeros(model.entity_embeddings.num_embeddings)
                labels[t] = 1.0
                auc = roc_auc_score(labels.numpy(), scores.numpy())
                auc_scores.append(auc)
    return np.mean(auc_scores)

# Main Execution
train_graph, val_graph, test_graph, num_entities, num_relations, node2id_graph, rel2id_graph = load_wn18rr_true('WN18RR')#load_wn18rr_sample()
model = TransE(num_entities, num_relations, embedding_dim=50)
optimizer = optim.Adam(model.parameters(), lr=0.001)

train(model, train_graph, optimizer, epochs=30)
auc = evaluate(model, test_graph)
print(f"Test AUC: {auc}")

Will add the inverse train edges as well..
Total: 67603 triples in train + eval!)
In train: 65776
In valid: 1827
In test: 1828
2034 2034 Graph(num_nodes=4068, num_edges=69431,
      ndata_schemes={}
      edata_schemes={'rel': Scheme(shape=(), dtype=torch.int64)})


TypeError: cannot unpack non-iterable NoneType object

In [None]:
df_all = pd.concat([df_train, df_eval, df_test])
graph2 = dgl.graph((df_all['head_mapped'].values.astype(int), df_all['tail_mapped'].values.astype(int)))
graph2.edata['rel'] = torch.tensor(df_all['rel_mapped'].values)
graph2

In [267]:
df_all = pd.concat([df_train, df_eval, df_test])
df_all['rel_mapped_int'] = df_all['rel'].map(rel2id_graph) + 1
graph2 = dgl.graph((df_all['head_mapped'].values.astype(int), df_all['tail_mapped'].values.astype(int)))

graph2.edata['rel'] = torch.tensor(df_all['rel_mapped_int'].values.astype(int))


train_mask = torch.zeros(len(df_all), dtype=int)
train_mask[:len(df_train)] = 1
val_mask = torch.zeros(len(df_all), dtype=int)
val_mask[len(df_train):len(df_train) + len(df_eval)] = 1
test_mask = torch.zeros(len(df_all), dtype=int)
test_mask[len(df_train) + len(df_eval):] = 1

train_graph = dgl.edge_subgraph(graph2, train_mask.bool(), relabel_nodes=False)
val_graph = dgl.edge_subgraph(graph2, val_mask.bool(), relabel_nodes=False)
test_graph = dgl.edge_subgraph(graph2, test_mask.bool(), relabel_nodes=False)



In [270]:
model = TransE(graph2.num_nodes(), len(rel2id_graph), embedding_dim=50)
optimizer = optim.Adam(model.parameters(), lr=0.001)
train(model, train_graph, optimizer, epochs=13)
auc = evaluate(model, test_graph)
print(f"Test AUC: {auc}")

tensor([1771, 1823, 1755,  ..., 1904,  377,  145]) torch.int64
tensor([ 1,  2,  3,  ..., 57, 46, 44]) torch.int64
tensor([ 533,  135, 1663,  ..., 1020, 1963, 1587]) torch.int64


IndexError: index out of range in self

In [None]:
from sklearn.base import BaseEstimator
from scipy.special import softmax

class NEQBoost(BaseEstimator):
    
    def __init__(self, strategy = 'min_residual_selection'):
        self.ohe = OneHotEncoder()
        self.sc = StandardScaler()
        self.strategy = strategy
        self.W = []
    
    def fit(self, X, y):
        
        X_sc = self.sc.fit_transform(X)
        y_ohe = self.ohe.fit_transform(y.reshape(-1,1)).toarray()
        wrong = np.arange(X_sc.shape[0])
        while sum(wrong) > 0:
            X_leftover, y_leftover = X_sc[wrong], y_ohe[wrong]
            W_leftover, _, _, _ = np.linalg.lstsq(X_leftover, y_leftover, rcond=None)
            preds_ohe = X_leftover@W_leftover
            wrong = preds_ohe.argmax(axis=1) != y_leftover.argmax(axis=1)    
            self.W.append(W_leftover)
        self.num_neqs = len(self.W)
        # print(f"Rounds of boosting: {self.num_neqs}")
        return self
    
    def predict(self, X):
        probas = self.predict_proba(X)
        return probas.argmax(axis=1)
    
    def predict_proba(self, X):
        X_sc = self.sc.transform(X)
        y_probas_all = []
        dists = []
        for W in self.W:
            y_logit = X_sc @ W
            dist = np.linalg.norm(y_logit, axis=1)
            y_proba = softmax(y_logit, axis=1)
            y_probas_all.append(y_proba)
            dists.append(dist)
        dists = np.vstack(dists).T
        dists = softmax(dists, axis=1)
        y_probas_all = np.array(y_probas_all).reshape(self.num_neqs, X.shape[0], -1)
        #print(y_probas_all.shape)
        if self.strategy == 'weighted_residual_voting':
            y_probas = np.einsum('mic, im-> ic', y_probas_all, dists)
        elif self.strategy == 'min_residual_selection':
            neq_to_use = dists.argmin(axis=1)
            y_probas = y_probas_all[neq_to_use, np.arange(X.shape[0]), :]
        else:
            raise NotImplementedError(f"Can't understand {self.strategy}")
        return y_probas

clf = NEQBoost(strategy='min_residual_selection')  
clf.fit(X,y)
y_pred = clf.predict(X)
print(f"Full data fit acc: {accuracy_score(y, y_pred):.4f}")

NameError: name 'OneHotEncoder' is not defined

In [58]:
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score

def fit_subsamples(X_leftovers, labels_leftovers, max_to_check='sqrt'):
    if max_to_check == 'sqrt':
        max_num_clusters =  int(np.round(np.sqrt(X_leftovers.shape[0]),0)) + 1
    elif max_to_check == 'full':
        max_num_clusters =  X_leftovers.shape[0] + 1
    elif isinstance(max_to_check, int):
        max_num_clusters = max_to_check
    else:
        raise NotImplementedError()
        
    res = []
    cluster_dict = {}
    for k in range(1, max_num_clusters):
        cl = KMeans(n_clusters=k, n_init='auto')
        cl.fit(X_leftovers)
        cluster_dict[k] = {'cl':cl, 'W': []}
        rr = 0
        acc = 0
        for unq_label in np.unique(cl.labels_):
            current_cluster = np.where(cl.labels_ == unq_label)[0]
            X_cur, y_cur_ohe = X_leftovers[current_cluster], labels_leftovers[current_cluster]
            W_ohe_cur, residual_sums, _, _= np.linalg.lstsq(X_cur, y_cur_ohe, rcond=None)
            cluster_dict[k]['W'].append(W_ohe_cur)
            cur_acc = accuracy_score(y_cur_ohe.argmax(axis=1), (X_cur @ W_ohe_cur).argmax(axis=1))
            rr += residual_sums.sum()
            acc += cur_acc
            
        #print(rr)
        res.append((int(k), cl.inertia_, rr, acc/k))
        #break
    res_df = pd.DataFrame(res, columns=['k', 'inertia', 'ss', 'acc'])
    wanted_k = res_df.sort_values(['acc', 'k'], ascending=[False,True]).iloc[0]['k']
    return cluster_dict[wanted_k]

In [60]:
from sklearn.base import BaseEstimator
from scipy.special import softmax
from sklearn.neighbors import BallTree
from sklearn.neighbors import KNeighborsClassifier

class NEQ_Local(BaseEstimator):
    
    def __init__(self, strategy = 'selection', k="sqrt"):
        self.ohe = OneHotEncoder()
        self.sc = StandardScaler()
        self.k = k
        if isinstance(self.k, int):
            self.num_neigh = self.k
        self.strategy = strategy
        self.W = None
        self.subsamples_dict = {}
        self.neighbor_tree = None
    
    def fit(self, X, y):
        
        X_sc = self.sc.fit_transform(X)
        
        if self.k == 'auto':
            accs = []
            for k in range(1, int(np.sqrt(X.shape[0]))):
                kn = KNeighborsClassifier(n_neighbors=k,)
                kn.fit(X_sc, y)
                accs.append(kn.score(X_sc, y))
            self.num_neigh = np.argmax(accs) + 1
                
        elif self.k == 'sqrt':
            self.num_neigh = int(np.sqrt(X.shape[0]))
        
        self.num_neigh = self.num_neigh + 1 if self.num_neigh % 2 == 0 else self.num_neigh
            
        
        self.neighbor_tree = BallTree(X_sc)
        
        
        y_ohe = self.ohe.fit_transform(y.reshape(-1,1)).toarray()
        
        self.W, _, _, _= np.linalg.lstsq(X_sc, y_ohe, rcond=None)
        preds_ohe = X_sc@self.W
        correct = (preds_ohe.argmax(axis=1) == y).astype(int)
        self.neq_train_labels = correct
        X_leftovers = X_sc[~correct.astype(bool)]
        labels_leftovers = y_ohe[~correct.astype(bool)]
        self.subsamples_dict = fit_subsamples(X_leftovers, labels_leftovers)
        return self
    
    def predict(self, X):
        probas = self.predict_proba(X)
        return probas.argmax(axis=1)
    
    def predict_proba(self, X):
        X_sc = self.sc.transform(X)
        dist, indices = self.neighbor_tree.query(X_sc, k=self.num_neigh)
        # num_test X 1
        neq_percentage_correct = self.neq_train_labels[indices].mean(axis=1)
        y_neq = X_sc @ self.W
        
        if self.strategy == 'selection':
            keep_neq = (neq_percentage_correct > 0.5)
            if (~keep_neq).sum():
                X_leftovers = X_sc[~keep_neq]
                local_neq_to_use = self.subsamples_dict['cl'].predict(X_leftovers)
                y_locals = []
                for sample_index, neq_index in enumerate(local_neq_to_use):
                    cur_X = X_leftovers[sample_index,:]
                    cur_W = self.subsamples_dict['W'][neq_index]
                    y_locals.append(cur_X @ cur_W)
                y_locals = np.vstack(y_locals)
                y_neq[~keep_neq] = y_locals
            y_probas = softmax(y_neq, axis=1)
        elif self.strategy == 'weighted_voting':
            keep_neq = (neq_percentage_correct > 0.5)
            if (~keep_neq).sum():
                X_leftovers = X_sc[~keep_neq]
                perc_nec = neq_percentage_correct[~keep_neq]
                local_neq_to_use = self.subsamples_dict['cl'].predict(X_leftovers)
                y_locals = []
                for sample_index, neq_index in enumerate(local_neq_to_use):
                    cur_X = X_leftovers[sample_index,:]
                    cur_W = self.subsamples_dict['W'][neq_index]
                    y_locals.append(cur_X @ cur_W)
                y_locals = np.vstack(y_locals)
                y_neq[~keep_neq] = perc_nec.reshape(-1,1) * softmax(y_neq[~keep_neq], axis=1) + (1-perc_nec).reshape(-1,1) * softmax(y_locals, axis=1)
            y_probas = softmax(y_neq, axis=1)
        else:
            raise NotImplementedError(f"Can't understand {self.strategy}")
    
        return y_probas

clf = NEQ_Local(k=3, strategy='weighted_voting')  
clf.fit(X,y)
y_pred = clf.predict(X)
print(f"Full data fit acc: {accuracy_score(y, y_pred):.4f}")

Full data fit acc: 0.9772


In [26]:
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)


clf = DecisionTreeClassifier(random_state=42, max_depth=None)
# y_pred = cross_val_predict(clf, X, y, cv=cv)

y_pred_all = []
y_true_all = []
for train, test in cv.split(X,y):
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]
    clf = DecisionTreeClassifier(random_state=42, max_depth=3)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred_all.extend(y_pred.tolist())
    y_true_all.extend(y_test.tolist())

print(classification_report(y_true_all, y_pred_all))
print(confusion_matrix(y_true_all, y_pred_all))

              precision    recall  f1-score   support

           0       0.84      0.85      0.85       100
           1       0.85      0.84      0.84       100

    accuracy                           0.84       200
   macro avg       0.85      0.84      0.84       200
weighted avg       0.85      0.84      0.84       200

[[85 15]
 [16 84]]


In [39]:
import pandas as pd
res = pd.read_csv("./results/neq_boost_results.csv")
# Step 2: Sort each group by 'f1'
sorted_df = res.groupby('dataset').apply(lambda x: x.sort_values(by='f1', ascending=False)).reset_index(drop=True)

# Step 3: Assign ranks within each group
sorted_df['rank'] = sorted_df.groupby('dataset').cumcount() + 1

# Step 4: Calculate mean rank for each model across all datasets
mean_ranks = sorted_df.groupby('model')['rank'].mean().reset_index().sort_values(by='rank')

print(mean_ranks)
            

                      model      rank
4     NEQ_Local_selection_5  3.864865
3                       NEQ  3.905405
2                        DT  3.986486
6  NEQ_Local_selection_sqrt  4.297297
7   NEQ_Local_weighted_auto  4.310811
8   NEQ_Local_weighted_sqrt  4.554054
5  NEQ_Local_selection_auto  4.675676
1                Boost_Mean  7.486486
0                 Boost_Max  7.918919


  sorted_df = res.groupby('dataset').apply(lambda x: x.sort_values(by='f1', ascending=False)).reset_index(drop=True)


In [51]:
models = mean_ranks.model[:3].values
wins_score = np.zeros((len(models), len(models)))

metric_to_score = 'f1'
res_local = res[res['model'].isin(models)]
for classification_dataset in res_local['dataset'].unique():
    cur_df = res_local[res_local['dataset'] == classification_dataset]
    cur_df = cur_df.set_index('model')
    score_metric = cur_df[metric_to_score]
    for i, m1 in enumerate(models):
        for j, m2 in enumerate(models[i:]):
            if cur_df.loc[m1][metric_to_score] > cur_df.loc[m2][metric_to_score]:
                wins_score[i, j+i] += 1
            elif cur_df.loc[m1][metric_to_score] < cur_df.loc[m2][metric_to_score]:
                wins_score[j+i, i] += 1
            else:
                pass
order_of_models = wins_score.mean(axis=1).argsort()[::-1]
wins_score = wins_score[order_of_models, :][:, order_of_models]
print('WINS')
print(pd.DataFrame(wins_score, columns = np.array(models)[order_of_models], index=np.array(models)[order_of_models]))

WINS
                         DT  NEQ_Local_selection_5   NEQ
DT                      0.0                   40.0  37.0
NEQ_Local_selection_5  33.0                    0.0  41.0
NEQ                    36.0                   25.0   0.0
