Creates cora_s.npy, cora_X.npy, cora_y.npy, cora_coassoc.npy and labels_cora.npy from the Cora dataset, using feature w_1177 as the sensitive attribute.

In [1]:
import os
import networkx as nx
import pandas as pd

In [2]:
import numpy as np

In [3]:
# Read the tab-separated, header-less citation file into a two-column frame
# ("target", "source"), then tag every edge with the constant label "cites".
edge_columns = ["target", "source"]
edgelist = pd.read_csv("cora.cites", sep="\t", header=None, names=edge_columns)
edgelist = edgelist.assign(label="cites")

In [4]:
# Build a graph from the edge table. The endpoint columns are spelled out
# here; they are the same "source"/"target" columns networkx reads by default.
Gnx = nx.from_pandas_edgelist(
    edgelist,
    source="source",
    target="target",
    edge_attr="label",
)
# Attach the same attribute to every node: label -> "paper".
nx.set_node_attributes(Gnx, values="paper", name="label")

In [5]:
# Spot-check: show the attribute dict stored on node 1103985.
Gnx.nodes[1103985]

{'label': 'paper'}

In [4]:
# Column names for cora.content: 1433 word features w_0..w_1432 followed by
# the "subject" column.
# NOTE(review): the displayed index holds paper ids, so the file presumably has
# one extra leading column that pandas turns into the index - confirm.
feature_names = [f"w_{ii}" for ii in range(1433)]
column_names = [*feature_names, "subject"]
node_data = pd.read_csv(
    "cora.content",
    sep="\t",
    header=None,
    names=column_names,
)

In [7]:
# Show the row(s) whose index label equals 35.
node_data.loc[node_data.index == 35]

Unnamed: 0,w_0,w_1,w_2,w_3,w_4,w_5,w_6,w_7,w_8,w_9,...,w_1424,w_1425,w_1426,w_1427,w_1428,w_1429,w_1430,w_1431,w_1432,subject
35,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Genetic_Algorithms


In [8]:
# List the distinct values of the "subject" column (still strings at this point).
set(node_data["subject"])

{'Case_Based',
 'Genetic_Algorithms',
 'Neural_Networks',
 'Probabilistic_Methods',
 'Reinforcement_Learning',
 'Rule_Learning',
 'Theory'}

In [5]:
from sklearn.preprocessing import LabelEncoder

# Replace the subject strings with integer codes, stored explicitly as int64.
# This overwrites node_data['subject'] in place, so cells above that displayed
# the string labels show stale values after this runs.
le = LabelEncoder()
encoded_subjects = le.fit_transform(node_data['subject'])
node_data['subject'] = encoded_subjects.astype('int64')
print(node_data['subject'])

31336      2
1061127    5
1106406    4
13195      4
37879      3
          ..
1128975    1
1128977    1
1128978    1
117328     0
24043      2
Name: subject, Length: 2708, dtype: int64


In [6]:
# Distinct index labels of node_data, in order of first appearance; their
# positions are used below as row/column positions of the matrix.
nodes = node_data.index.unique()
nodes

Index([  31336, 1061127, 1106406,   13195,   37879, 1126012, 1107140, 1102850,
         31349, 1106418,
       ...
        626531, 1131180, 1130454, 1131184, 1128974, 1128975, 1128977, 1128978,
        117328,   24043],
      dtype='int64', length=2708)

In [7]:
# Map each node id to its row/column position in the matrix.
node_to_index = {node: index for index, node in enumerate(nodes)}

# The matrix is symmetric
coassoc_matrix = np.zeros((len(nodes), len(nodes)))

# Translate both endpoint columns to matrix positions in one vectorized pass
# instead of looping over rows with iterrows(); ids that are not keys of
# node_to_index come back as NaN.
source_pos = edgelist["source"].map(node_to_index)
target_pos = edgelist["target"].map(node_to_index)

# Keep only the edges whose two endpoints are both known nodes.
both_known = source_pos.notna() & target_pos.notna()
i = source_pos[both_known].to_numpy(dtype=np.intp)
j = target_pos[both_known].to_numpy(dtype=np.intp)

# Mark every kept edge in both directions.
coassoc_matrix[i, j] = 1
coassoc_matrix[j, i] = 1

coassoc_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [8]:
# Write the integer subject codes of every row to cora_y.npy and display them.
subject_array = node_data['subject'].to_numpy()
np.save('cora_y.npy', subject_array)
subject_array

array([2, 5, 4, ..., 1, 0, 2], dtype=int64)

In [10]:
# Write the current coassoc_matrix to cora_coassoc.npy.
np.save('cora_coassoc.npy', coassoc_matrix)

In [11]:
# Feature matrix: every column except the target ('subject') and the
# sensitive attribute ('w_1177'). Saved to cora_X.npy and displayed.
node_data_X = node_data.drop(['subject', 'w_1177'], axis=1)
features_matrix = node_data_X.to_numpy()
np.save('cora_X.npy', features_matrix)
features_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [12]:
from sklearn.preprocessing import LabelEncoder

# Run the subject column through a fresh LabelEncoder and save the result to
# labels_cora.npy. The column was already integer-encoded by an earlier cell,
# so this is a second encoding pass over those codes.
label_encoder = LabelEncoder()
y_ravel = np.ravel(node_data['subject'].to_numpy())
y_labels = label_encoder.fit_transform(y_ravel)
np.save('labels_cora.npy', y_labels)

In [13]:
# Save the sensitive attribute (column 'w_1177') to cora_s.npy and display it.
sensitive_attr = node_data['w_1177'].to_numpy()
np.save('cora_s.npy', sensitive_attr)
sensitive_attr

array([0, 1, 1, ..., 1, 0, 0], dtype=int64)

In [10]:
# List the distinct values of the "subject" column (integer codes after encoding).
set(node_data["subject"])

{0, 1, 2, 3, 4, 5, 6}

In [11]:
# Display the "subject" column of node_data.
node_data["subject"]

31336      2
1061127    5
1106406    4
13195      4
37879      3
          ..
1128975    1
1128977    1
1128978    1
117328     0
24043      2
Name: subject, Length: 2708, dtype: int64

In [12]:
# Scan every feature column for one that splits the data into two reasonably
# large groups: print the 0/1 counts when both values occur at least
# MIN_GROUP_SIZE times.
MIN_GROUP_SIZE = 500

# Index.drop iterates the same column labels without copying the whole frame.
for column in node_data.columns.drop("subject"):
    try:
        # .loc with a list of labels raises KeyError when 0 or 1 never occurs.
        counts = node_data[column].value_counts().loc[[0, 1]]
    except KeyError:
        # Only the expected "value missing" case is skipped; any other error
        # now surfaces instead of being hidden by a bare except.
        print(f"Skipping {column}")
        continue
    if (counts >= MIN_GROUP_SIZE).all():
        print(f"{column}:\n{counts}")

w_19:
w_19
0    2148
1     560
Name: count, dtype: int64
Skipping w_444
w_507:
w_507
0    2032
1     676
Name: count, dtype: int64
w_1177:
w_1177
0    1625
1    1083
Name: count, dtype: int64
w_1209:
w_1209
0    2124
1     584
Name: count, dtype: int64
w_1263:
w_1263
0    1728
1     980
Name: count, dtype: int64


In [13]:
# Sample 500 rows where "w_1177" is 0
zeros_sample = node_data[node_data['w_1177'] == 0].sample(n=500)

# Sample 500 rows where "w_1177" is 1
ones_sample = node_data[node_data['w_1177'] == 1].sample(n=500)

# Concatenate the two samples
sampled_data = pd.concat([zeros_sample, ones_sample])

sampled_data['w_1177']

158172     0
1130780    0
1108570    0
153598     0
310530     0
          ..
14062      1
595056     1
37879      1
976284     1
592986     1
Name: w_1177, Length: 1000, dtype: int64

In [14]:
# Rebind `nodes` to the distinct index labels of the subsample; this replaces
# the value computed earlier from the full node_data.
nodes = sampled_data.index.unique()
nodes

Index([ 158172, 1130780, 1108570,  153598,  310530, 1106849,    1688,  157401,
       1132948, 1128881,
       ...
        321861,   25794, 1125944,  919885,      40,   14062,  595056,   37879,
        976284,  592986],
      dtype='int64', length=1000)

In [15]:
# Rebuild the id -> position lookup and the matrix for the current `nodes`;
# both names are rebound, replacing the values computed earlier.
node_to_index = {node: index for index, node in enumerate(nodes)}

# The matrix is symmetric
coassoc_matrix = np.zeros((len(nodes), len(nodes)))

# Translate both endpoint columns to matrix positions in one vectorized pass
# instead of looping over rows with iterrows(); ids that are not keys of
# node_to_index come back as NaN.
source_pos = edgelist["source"].map(node_to_index)
target_pos = edgelist["target"].map(node_to_index)

# Keep only the edges whose two endpoints are both in `nodes`.
both_known = source_pos.notna() & target_pos.notna()
i = source_pos[both_known].to_numpy(dtype=np.intp)
j = target_pos[both_known].to_numpy(dtype=np.intp)

# Mark every kept edge in both directions.
coassoc_matrix[i, j] = 1
coassoc_matrix[j, i] = 1

coassoc_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
# Number of matrix entries equal to 1. Each kept edge sets both [i, j] and
# [j, i], so an edge between two distinct nodes contributes two entries.
np.count_nonzero(coassoc_matrix == 1)

1426

In [17]:
# Number of matrix entries equal to 0 (node pairs with no recorded edge).
np.count_nonzero(coassoc_matrix == 0)

998574

In [22]:
# Write the subject codes of the subsample to cora_y.npy (the same file name
# used earlier for the full data, so that file is overwritten) and display them.
subject_array = sampled_data['subject'].to_numpy()
np.save('cora_y.npy', subject_array)
subject_array

array([3, 3, 6, 4, 0, 2, 1, 2, 2, 5, 1, 2, 1, 0, 5, 2, 6, 0, 1, 2, 1, 3,
       0, 1, 3, 4, 2, 4, 2, 6, 2, 2, 3, 2, 5, 0, 2, 2, 2, 4, 6, 2, 0, 5,
       1, 2, 3, 2, 4, 5, 3, 4, 6, 3, 2, 6, 3, 3, 3, 0, 3, 0, 4, 5, 1, 3,
       2, 4, 0, 2, 0, 1, 6, 1, 1, 2, 2, 3, 5, 6, 3, 3, 2, 6, 1, 3, 2, 0,
       3, 1, 3, 0, 4, 2, 3, 0, 2, 2, 2, 2, 0, 6, 6, 3, 3, 2, 2, 1, 3, 1,
       6, 1, 2, 2, 2, 6, 2, 3, 1, 1, 1, 0, 3, 3, 2, 0, 3, 2, 2, 2, 2, 2,
       2, 0, 1, 4, 3, 1, 5, 2, 2, 1, 3, 1, 3, 2, 3, 5, 2, 2, 2, 0, 1, 1,
       3, 6, 6, 3, 6, 3, 2, 2, 2, 2, 3, 1, 0, 1, 4, 2, 6, 1, 2, 2, 6, 4,
       2, 0, 4, 1, 2, 3, 5, 2, 3, 2, 3, 3, 2, 1, 1, 2, 2, 3, 6, 6, 6, 3,
       6, 0, 3, 2, 2, 0, 2, 6, 2, 3, 2, 2, 5, 1, 2, 2, 1, 2, 1, 6, 2, 2,
       3, 3, 2, 1, 0, 1, 1, 2, 0, 3, 2, 2, 3, 2, 6, 2, 4, 6, 6, 4, 2, 2,
       3, 0, 2, 6, 1, 6, 3, 3, 2, 2, 0, 1, 2, 1, 4, 3, 1, 4, 2, 0, 2, 2,
       2, 0, 1, 2, 2, 2, 1, 3, 3, 2, 2, 3, 1, 4, 4, 1, 2, 4, 0, 0, 2, 2,
       6, 3, 4, 0, 2, 3, 4, 6, 2, 0, 4, 2, 5, 2, 4,

In [23]:
# Write the current coassoc_matrix to cora_coassoc.npy, overwriting any file
# of that name saved earlier.
np.save('cora_coassoc.npy', coassoc_matrix)

In [24]:
# Feature matrix of the subsample: every column except the target ('subject')
# and the sensitive attribute ('w_1177'). Saved to cora_X.npy and displayed.
sampled_data_X = sampled_data.drop(['subject', 'w_1177'], axis=1)
sampled_features = sampled_data_X.to_numpy()
np.save('cora_X.npy', sampled_features)
sampled_features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [18]:
# Number of distinct subject codes present in the subsample.
sampled_data['subject'].nunique()

7

In [19]:
from sklearn.preprocessing import LabelEncoder

# Run the subsample's subject codes through a fresh LabelEncoder; the result
# is kept in y_labels.
label_encoder = LabelEncoder()
y_ravel = np.ravel(sampled_data['subject'].to_numpy())
y_labels = label_encoder.fit_transform(y_ravel)

In [20]:
# Display the encoded labels.
y_labels

array([3, 3, 6, 4, 0, 2, 1, 2, 2, 5, 1, 2, 1, 0, 5, 2, 6, 0, 1, 2, 1, 3,
       0, 1, 3, 4, 2, 4, 2, 6, 2, 2, 3, 2, 5, 0, 2, 2, 2, 4, 6, 2, 0, 5,
       1, 2, 3, 2, 4, 5, 3, 4, 6, 3, 2, 6, 3, 3, 3, 0, 3, 0, 4, 5, 1, 3,
       2, 4, 0, 2, 0, 1, 6, 1, 1, 2, 2, 3, 5, 6, 3, 3, 2, 6, 1, 3, 2, 0,
       3, 1, 3, 0, 4, 2, 3, 0, 2, 2, 2, 2, 0, 6, 6, 3, 3, 2, 2, 1, 3, 1,
       6, 1, 2, 2, 2, 6, 2, 3, 1, 1, 1, 0, 3, 3, 2, 0, 3, 2, 2, 2, 2, 2,
       2, 0, 1, 4, 3, 1, 5, 2, 2, 1, 3, 1, 3, 2, 3, 5, 2, 2, 2, 0, 1, 1,
       3, 6, 6, 3, 6, 3, 2, 2, 2, 2, 3, 1, 0, 1, 4, 2, 6, 1, 2, 2, 6, 4,
       2, 0, 4, 1, 2, 3, 5, 2, 3, 2, 3, 3, 2, 1, 1, 2, 2, 3, 6, 6, 6, 3,
       6, 0, 3, 2, 2, 0, 2, 6, 2, 3, 2, 2, 5, 1, 2, 2, 1, 2, 1, 6, 2, 2,
       3, 3, 2, 1, 0, 1, 1, 2, 0, 3, 2, 2, 3, 2, 6, 2, 4, 6, 6, 4, 2, 2,
       3, 0, 2, 6, 1, 6, 3, 3, 2, 2, 0, 1, 2, 1, 4, 3, 1, 4, 2, 0, 2, 2,
       2, 0, 1, 2, 2, 2, 1, 3, 3, 2, 2, 3, 1, 4, 4, 1, 2, 4, 0, 0, 2, 2,
       6, 3, 4, 0, 2, 3, 4, 6, 2, 0, 4, 2, 5, 2, 4,

In [21]:
# Write the current y_labels to labels_cora.npy, overwriting any file of that
# name saved earlier.
np.save('labels_cora.npy', y_labels)

In [25]:
# Save the subsample's sensitive attribute (column 'w_1177') to cora_s.npy,
# then display the saved values.
sampled_s = sampled_data['w_1177'].to_numpy()
np.save('cora_s.npy', sampled_s)
sampled_s

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,