In [1]:
import numpy as np
import pandas as pd
import torch_geometric
import networkx as nx
from torch_geometric.data import Data, Dataset
from torch_geometric.datasets.planetoid import Planetoid
from torch_geometric.transforms.to_undirected import ToUndirected
import torch

In [2]:
G = nx.read_edgelist('/home/jrm28/fairness/subgraph_sketching-original/dataset/ego-facebook/raw/facebook/1684.edges', nodetype=int)

In [3]:
edge_array = np.array([(u, v) for u, v in G.edges])
edge_array.sort()

In [4]:
edge_array

array([[2849, 3021],
       [2849, 3335],
       [2849, 3299],
       ...,
       [2691, 2792],
       [3268, 3407],
       [2788, 3271]])

In [5]:
features = np.loadtxt('/home/jrm28/fairness/subgraph_sketching-original/dataset/ego-facebook/raw/facebook/1684.feat')

# sort features by node id
sorted_idx = features[:, 0].argsort()
features = features[sorted_idx, :]


# Map node to index
node_mapper = np.arange(features.shape[0])
node_mapper = dict(zip(features[:, 0].astype(int), node_mapper))

for edge in edge_array:
    edge[0] = node_mapper[edge[0]]
    edge[1] = node_mapper[edge[1]]
    
edge_array

array([[204, 376],
       [204, 690],
       [204, 654],
       ...,
       [ 46, 147],
       [623, 762],
       [143, 626]])

In [6]:
transform = ToUndirected()

# feature 147 will be the sensitive attribute
x = torch.tensor(features[:, 1:], dtype=torch.float32)
y = x[:, 147]

# Removing columns 147 and 148 (sensitive attributes)
x = torch.cat((x[:, :147], x[:, 149:]), dim=1)

facebook = Data(x=x, y=y, edge_index=torch.tensor(edge_array.T))
facebook = transform(facebook)

In [7]:
facebook

Data(x=[792, 317], edge_index=[2, 28048], y=[792])

In [7]:
torch.save(facebook, '/Users/joaopedromattos/Documents/fairness/subgraph_sketching-original/dataset/facebook/processed/facebook_1684.pt')

In [11]:
torch.load('/home/jrm28/fairness/subgraph_sketching-original/dataset/gplus/processed/gplus_100637660947564674695.pt')

Data(x=[308, 112], edge_index=[2, 6364], y=[308])

## EDA

In [8]:
torch.vstack((y[facebook.edge_index[0]], y[facebook.edge_index[1]])).sum(axis=0)  

tensor([0., 0., 1.,  ..., 0., 1., 0.])

In [9]:
(torch.vstack((y[facebook.edge_index[0]], y[facebook.edge_index[1]])).sum(axis=0) == 0).sum() + (torch.vstack((y[facebook.edge_index[0]], y[facebook.edge_index[1]])).sum(axis=0) == 2).sum(), (torch.vstack((y[facebook.edge_index[0]], y[facebook.edge_index[1]])).sum(axis=0) == 1).sum()

(tensor(15978), tensor(12070))

In [15]:
all_possible_males = ((y == 0).sum()**2)/2
all_possible_females = ((y == 1).sum()**2)/2
all_possible_male_female = ((y == 0).sum() * (y == 1).sum()) / 2

all_possible_males = ((y == 0).sum()**2)/2
all_possible_females = ((y == 1).sum()**2)/2
all_possible_male_female = ((y == 0).sum() * (y == 1).sum()) / 2

print("Num edges in each group - Num nodes in the group")
print("MM", (y[facebook.edge_index].sum(0) == 0).sum(), '-', (y == 0).sum())
print("MF", (y[facebook.edge_index].sum(0) == 1).sum())
print("FF", (y[facebook.edge_index].sum(0) == 2).sum(), '-', (y == 1).sum())

print("Prob. of an edge in each group")
print("MM", (y[facebook.edge_index].sum(0) == 0).sum() / all_possible_males)
print("MF", (y[facebook.edge_index].sum(0) == 1).sum() / all_possible_male_female)
print("FF", (y[facebook.edge_index].sum(0) == 2).sum() / all_possible_females)

Num edges in each group - Num nodes in the group
MM tensor(11372) - tensor(517)
MF tensor(12070)
FF tensor(4606) - tensor(275)
Prob. of an edge in each group
MM tensor(0.0851)
MF tensor(0.1698)
FF tensor(0.1218)


In [16]:
(y[facebook.edge_index].sum(0) == 1).sum() / ((y[facebook.edge_index].sum(0) == 2).sum() + (y[facebook.edge_index].sum(0) == 0).sum())

tensor(0.7554)

### BUDDY

In [16]:
dataset, splits, directed, eval_metric = torch.load("/home/jrm28/fairness/subgraph_sketching-original/dataset/splits/gplus.pt")

for set in ['train', 'valid', 'test']:
    print(f"(BUDDY Sampling) Num pos-pairs in {set} set by group")
    print("MM", (dataset.y[splits[set].edge_label_index[:, splits[set].edge_label.bool()]].sum(0) == 0).sum())
    print("FF", (dataset.y[splits[set].edge_label_index[:, splits[set].edge_label.bool()]].sum(0) == 2).sum())
    print("MF", (dataset.y[splits[set].edge_label_index[:, splits[set].edge_label.bool()]].sum(0) == 1).sum())
    print(f"(BUDDY Sampling) Num neg-pairs {set} by group")
    print("MM", (dataset.y[splits[set].edge_label_index[:, ~splits[set].edge_label.bool()]].sum(0) == 0).sum())
    print("FF", (dataset.y[splits[set].edge_label_index[:, ~splits[set].edge_label.bool()]].sum(0) == 2).sum())
    print("MF", (dataset.y[splits[set].edge_label_index[:, ~splits[set].edge_label.bool()]].sum(0) == 1).sum())
    print("------------")

(BUDDY Sampling) Num pos-pairs in train set by group
MM tensor(79820)
FF tensor(1825)
MF tensor(24592)
(BUDDY Sampling) Num neg-pairs train by group
MM tensor(81714)
FF tensor(1574)
MF tensor(22949)
------------
(BUDDY Sampling) Num pos-pairs in valid set by group
MM tensor(11374)
FF tensor(261)
MF tensor(3541)
(BUDDY Sampling) Num neg-pairs valid by group
MM tensor(11671)
FF tensor(223)
MF tensor(3282)
------------
(BUDDY Sampling) Num pos-pairs in test set by group
MM tensor(22843)
FF tensor(522)
MF tensor(6988)
(BUDDY Sampling) Num neg-pairs test by group
MM tensor(23297)
FF tensor(466)
MF tensor(6590)
------------


### NCNC Splits

In [14]:
data, splits = torch.load("/home/jrm28/fairness/NeuralCommonNeighbor/dataset/splits/gplus.pt")
for set in ['train', 'valid', 'test']:
    print(f"(NCNC Sampling) Num pos-pairs in {set} set by group")
    print("MM", (data.y[splits[set]['edge']].sum(1) == 0).sum())
    print("FF", (data.y[splits[set]['edge']].sum(1) == 2).sum())
    print("MF", (data.y[splits[set]['edge']].sum(1) == 1).sum())
    print(f"(NCNC Sampling) Num neg-pairs in {set} set by group")
    print("MM", (data.y[splits[set]['edge_neg']].sum(1) == 0).sum())
    print("FF", (data.y[splits[set]['edge_neg']].sum(1) == 2).sum())
    print("MF", (data.y[splits[set]['edge_neg']].sum(1) == 1).sum())
    print("------------")

(NCNC Sampling) Num pos-pairs in train set by group
MM tensor(79842)
FF tensor(1813)
MF tensor(24582)
(NCNC Sampling) Num neg-pairs in train set by group
MM tensor(139847)
FF tensor(2850)
MF tensor(39423)
------------
(NCNC Sampling) Num pos-pairs in valid set by group
MM tensor(11392)
FF tensor(251)
MF tensor(3533)
(NCNC Sampling) Num neg-pairs in valid set by group
MM tensor(23297)
FF tensor(440)
MF tensor(6616)
------------
(NCNC Sampling) Num pos-pairs in test set by group
MM tensor(22803)
FF tensor(544)
MF tensor(7006)
(NCNC Sampling) Num neg-pairs in test set by group
MM tensor(23228)
FF tensor(433)
MF tensor(6692)
------------


#### Full-training

In [20]:
dataset, splits, directed, eval_metric = torch.load('full_training_gplus.pt')

for set in ['train', 'valid', 'test']:
    print(f"(Full Training Sampling) Num pos-pairs in {set} set by group")
    print("MM", (dataset.y[splits[set].edge_label_index[:, splits[set].edge_label.bool()]].sum(0) == 0).sum())
    print("FF", (dataset.y[splits[set].edge_label_index[:, splits[set].edge_label.bool()]].sum(0) == 2).sum())
    print("MF", (dataset.y[splits[set].edge_label_index[:, splits[set].edge_label.bool()]].sum(0) == 1).sum())
    print(f"(Full Training Sampling) Num neg-pairs {set} by group")
    print("MM", (dataset.y[splits[set].edge_label_index[:, ~splits[set].edge_label.bool()]].sum(0) == 0).sum())
    print("FF", (dataset.y[splits[set].edge_label_index[:, ~splits[set].edge_label.bool()]].sum(0) == 2).sum())
    print("MF", (dataset.y[splits[set].edge_label_index[:, ~splits[set].edge_label.bool()]].sum(0) == 1).sum())
    print("------------")

(Full Training Sampling) Num pos-pairs in train set by group
MM tensor(193812)
FF tensor(4462)
MF tensor(59730)
(Full Training Sampling) Num neg-pairs train by group
MM tensor(1000515)
FF tensor(19714)
MF tensor(281355)
------------
(Full Training Sampling) Num pos-pairs in valid set by group
MM tensor(5741)
FF tensor(122)
MF tensor(1725)
(Full Training Sampling) Num neg-pairs valid by group
MM tensor(994774)
FF tensor(19592)
MF tensor(279630)
------------
(Full Training Sampling) Num pos-pairs in test set by group
MM tensor(11390)
FF tensor(255)
MF tensor(3531)
(Full Training Sampling) Num neg-pairs test by group
MM tensor(983384)
FF tensor(19337)
MF tensor(276099)
------------


In [14]:
data, splits = torch.load("/home/jrm28/fairness/graphair/fairgraph/method/checkpoint/out/AUGMENTED_facebook_10000_epochs_2024-03-13_12-12-50/splits.pt", map_location=torch.device('cpu'))

In [18]:
data.edge_index.nonzero().shape

torch.Size([177522, 2])