In [1]:
from sklearn.manifold import TSNE
from src.main import *
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import pickle as pkl
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from matplotlib.lines import Line2D
from src.generate_graph import *
from src.link_prediction import *
import sys

In [None]:
# Import the submodule explicitly so this cell does not depend on a star import
# happening to expose the `stellargraph` name.
import stellargraph.random

# Seed handed to StellarGraph's random module.
seed = 21
stellargraph.random.set_seed(seed)

In [None]:
# Read the Polblogs dataset from the local data directory.
path_data = "data/"
polblogs = loadPolblog(path_data, verbose=True)
# The loader's result unpacks into the graph and an array of node labels.
g, labArray = polblogs

## Repairing with OT (EMD version)

In [None]:
# Repair the graph; the first returned value is used below as a weighted adjacency matrix.
emd_x_l, s, gamma, M = total_repair_emd(g)
# nx.from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is its replacement.
emd_g = nx.from_numpy_array(emd_x_l)

# Remove edges with small weights to have a density similar to the original graph
WEIGHT_THRESHOLD = 0.5
list_edge = [(u, v) for u, v, weight in emd_g.edges(data='weight') if weight < WEIGHT_THRESHOLD]
emd_g.remove_edges_from(list_edge)

# Rename the repaired graph's nodes using the second column of labArray.
lab = dict(zip(emd_g.nodes, labArray[:, 1]))
g_emd = nx.relabel_nodes(emd_g, lab)

In [None]:
# Print the density of the original graph, then of the repaired graph.
for graph in (g, g_emd):
    print(nx.density(graph))

## Link prediction

We are now ready to illustrate the impact of the repair on the node embeddings and on the link prediction task.
To do so, we use the StellarGraph library.

In [26]:
from stellargraph import StellarGraph
from stellargraph.data import EdgeSplitter
from sklearn.model_selection import train_test_split
import pandas as pd

In [32]:
# Create two graphs with stellar format (original and repaired one)
stellar_polblogs_emd = StellarGraph.from_networkx(g_emd)
stellar_polblogs_lap = StellarGraph.from_networkx(g_lap)
stellar_polblogs = StellarGraph.from_networkx(g)

In [13]:
# (node, value) pairs of the original graph; 'value' is the node attribute
# that is passed on below as the protected attribute.
node_list = list(g.nodes(data='value'))

# (position, node) pairs. enumerate() yields the same positions as looking each
# entry up with node_list.index(...), since node ids are unique within a graph,
# but without rescanning the list for every node.
lab_node_list = [(position, entry[0]) for position, entry in enumerate(node_list)]
lab_node_array = np.array(lab_node_list)


def Convert(tup):
    """Return a dict built from an iterable of (key, value) pairs."""
    return dict(tup)


tups = node_list
dictionary = {}
# Mapping node -> 'value' attribute.
protS = Convert(tups)

### Link prediction with OT-emd repaired graph

In [14]:
# Per-trial scores: ROC AUC, DI, consistency and representation bias.
auc, di, cons, rep_bias = [], [], [], []
# Note that we use 10 trials for the paper
trials = 5


def _relabel_from_original(edge_examples, edge_labels):
    """Overwrite edge_labels in place: 1 if the pair is an edge of the original graph g, else 0."""
    for idx, pair in enumerate(edge_examples):
        edge_labels[idx] = int(g.has_edge(pair[0], pair[1]))


# The outer loop variable has its own name so the inner loops over edge
# examples can no longer shadow it.
for trial in range(trials):

    # Define an edge splitter on the corrected graph:
    edge_splitter_test = EdgeSplitter(stellar_polblogs_emd)
    graph_test, examples_test, labels_test = edge_splitter_test.train_test_split(
        p=0.3, method="global", keep_connected=True
    )
    # Do the same process to compute a training subset from within the test graph
    edge_splitter_train = EdgeSplitter(graph_test, stellar_polblogs_emd)
    graph_train, examples, labels = edge_splitter_train.train_test_split(
        p=0.3, method="global", keep_connected=True
    )
    (
        examples_train,
        examples_model_selection,
        labels_train,
        labels_model_selection,
    ) = train_test_split(examples, labels, train_size=0.75, test_size=0.25)

    # Clear labels by removing edges created by the repairing: every example is
    # relabelled according to whether it is an edge of the original graph.
    _relabel_from_original(examples_test, labels_test)
    _relabel_from_original(examples_train, labels_train)
    _relabel_from_original(examples_model_selection, labels_model_selection)

    # Compute absolute difference for the protected attribute
    abs_diff_train = abs_diff(examples_train, protS)
    abs_diff_model_selection = abs_diff(examples_model_selection, protS)
    abs_diff_test = abs_diff(examples_test, protS)

    # Node2vec on the graph train
    embedding_train, vec_train, s_train = node2vec_embedding(graph_train, "Train Graph", protS)

    # Choose operator for concatenating the embeddings
    binary_operators = [operator_hadamard]
    results = [
        run_link_prediction(
            op,
            examples_train,
            labels_train,
            embedding_train,
            examples_model_selection,
            labels_model_selection,
            abs_diff_model_selection,
        )
        for op in binary_operators
    ]
    # Keep the operator with the highest model-selection score.
    best_result = max(results, key=lambda result: result["score"])

    auc_protS = representation_bias(vec_train, s_train)
    rep_bias.append(auc_protS)

    test_score, test_score_bias, test_score_consistency = evaluate_link_prediction_model(
        best_result["classifier"],
        examples_test,
        labels_test,
        embedding_train,
        best_result["binary_operator"],
        abs_diff_test
    )
    print(f"ROC AUC score on test set using '{best_result['binary_operator'].__name__}': {test_score}")
    print(f"DI score on test set using '{best_result['binary_operator'].__name__}': {test_score_bias}")
    print(f"Consistency score on test set using '{best_result['binary_operator'].__name__}': {test_score_consistency}")

    auc.append(test_score)
    di.append(test_score_bias)
    cons.append(test_score_consistency)

print("Done !")

# Report mean (std) of each metric. The trial count is taken from the number of
# collected scores instead of being hard-coded, so the message matches the run.
for metric_name, metric_scores in (
    ("AUC", auc),
    ("DI", di),
    ("Consistency", cons),
    ("Representation Bias", rep_bias),
):
    metric_values = np.asarray(metric_scores)
    print("Average %s over %d trials: %8.2f (%8.2f) " % (metric_name,
                                                        len(metric_values),
                                                        metric_values.mean(),
                                                        metric_values.std()))

** Sampled 3470 positive and 3470 negative edges. **
** Sampled 2429 positive and 2429 negative edges. **
Number of random walks for 'Train Graph': 12240
ROC AUC score on test set using 'operator_hadamard': 0.6710029556396165
DI score on test set using 'operator_hadamard': 0.45708446866485014
Consistency score on test set using 'operator_hadamard': 0.9024888656012575
** Sampled 3470 positive and 3470 negative edges. **
** Sampled 2429 positive and 2429 negative edges. **
Number of random walks for 'Train Graph': 12240
ROC AUC score on test set using 'operator_hadamard': 0.6531271399825775
DI score on test set using 'operator_hadamard': 0.5647249190938511
Consistency score on test set using 'operator_hadamard': 0.9185747969609641
** Sampled 3470 positive and 3470 negative edges. **
** Sampled 2429 positive and 2429 negative edges. **
Number of random walks for 'Train Graph': 12240
ROC AUC score on test set using 'operator_hadamard': 0.6620515765602953
DI score on test set using 'operato