#### This notebook demonstrates the use of InFoRM algorithms to mitigate bias for spectral clustering
InFoRM includes 3 algorithms, namely debiasing the input graph, debiasing the mining model and debiasing the mining result. We will show how to run all 3 algorithms for spectral clustering in this notebook.

### Get vanilla clustering membership matrix first

In [1]:
# load necessary packages
import pickle
import load_graph
import utils

import networkx as nx

from scipy.sparse.csgraph import laplacian
from scipy.sparse.linalg import eigsh

In [2]:
def vanilla(name, v0):
    """Compute the vanilla spectral embedding of a dataset and cache it on disk.

    The eigenvectors are stored under ``udict[name]['eigenvectors']`` in
    ``result/sc/vanilla.pickle``. Entries already cached for other datasets
    are kept; an existing entry for ``name`` is overwritten.

    Args:
        name: dataset identifier. ``'ppi'`` is read with
            ``load_graph.read_mat`` (using its ``'graph'`` entry); any other
            name is read with ``load_graph.read_graph``.
        v0: mapping from dataset name to the starting vector handed to
            ``eigsh``.

    Returns:
        None. The result is only written to the pickle file.
    """
    cache_path = 'result/sc/vanilla.pickle'

    # Reuse previously cached results. Fall back to an empty cache only when
    # the file is missing or unreadable; a bare `except:` would also hide
    # KeyboardInterrupt, SystemExit and genuine programming errors.
    try:
        with open(cache_path, 'rb') as f:
            udict = pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError):
        udict = dict()

    if name == 'ppi':
        data = load_graph.read_mat(name)
        graph = data['graph']
    else:
        graph = load_graph.read_graph(name)

    # restrict the graph to its largest connected component
    lcc = max(nx.connected_components(graph), key=len)
    # NOTE(review): nx.to_scipy_sparse_matrix is not available in NetworkX >= 3.0;
    # this notebook needs an older NetworkX release -- confirm the pinned version.
    adj = nx.to_scipy_sparse_matrix(graph, nodelist=lcc, dtype='float', format='csc')

    # negate the graph Laplacian in place before the eigen-decomposition
    lap = laplacian(adj)
    lap *= -1
    # 10 eigenpairs in shift-invert mode around sigma=1.0; only the
    # eigenvectors are kept
    _, u = eigsh(lap, which='LM', k=10, sigma=1.0, v0=v0[name])
    udict[name] = dict()
    udict[name]['eigenvectors'] = u

    with open(cache_path, 'wb') as f:
        pickle.dump(udict, f, protocol=pickle.HIGHEST_PROTOCOL)

In [3]:
# load starting vector for decomposition
# (context manager so the file handle is closed as soon as the data is read)
with open('data/v0.pickle', 'rb') as f:
    v0 = pickle.load(f)

In [4]:
# get vanilla clustering membership for the 'ppi' dataset; the call returns
# nothing and writes the eigenvectors to result/sc/vanilla.pickle
vanilla(name='ppi', v0=v0)

### Let's debias the input graph

In [5]:
# load debias model
from method.debias_graph import DebiasGraph

In [6]:
def debias_input_graph(name, v0, alpha=0.0, lr=0.0, metric=None):
    """Debias the input graph, then compute its spectral embedding.

    Args:
        name: dataset identifier. ``'ppi'`` is read with
            ``load_graph.read_mat`` (using its ``'graph'`` entry); any other
            name is read with ``load_graph.read_graph``.
        v0: mapping from dataset name to the starting vector used both by the
            debiasing routine and by ``eigsh``.
        alpha: forwarded to ``DebiasGraph.spectral_clustering``.
        lr: forwarded to ``DebiasGraph.spectral_clustering`` as ``lr``.
        metric: forwarded to ``utils.get_similarity_matrix``.

    Returns:
        The eigenvectors returned by ``eigsh`` for the negated Laplacian of
        the debiased adjacency matrix.
    """
    # read the raw graph for the requested dataset
    graph = load_graph.read_mat(name)['graph'] if name == 'ppi' else load_graph.read_graph(name)

    # keep only the largest connected component
    largest_component = max(nx.connected_components(graph), key=len)
    original_adj = nx.to_scipy_sparse_matrix(
        graph, nodelist=largest_component, dtype='float', format='csc'
    )

    # node-to-node similarity under the chosen metric
    similarity = utils.get_similarity_matrix(original_adj, metric=metric)

    # learn a debiased adjacency matrix
    debiaser = DebiasGraph()
    start_vector = v0[name]
    debiased_adj = debiaser.spectral_clustering(
        original_adj,
        similarity,
        alpha,
        ncluster=10,
        v0=start_vector,
        maxiter=100,
        lr=lr,
        tol=1e-6,
    )

    # spectral embedding of the debiased graph (negated Laplacian)
    neg_laplacian = laplacian(debiased_adj)
    neg_laplacian *= -1
    eigenvectors = eigsh(neg_laplacian, which='LM', k=10, sigma=1.0, v0=start_vector)[1]

    print('dataset: {}\tmetric: {} similarity'.format(name, metric))
    print('Finished!')

    return eigenvectors

In [7]:
# reload the starting vectors; the context manager closes the file handle
with open('data/v0.pickle', 'rb') as f:
    v0 = pickle.load(f)

# debias the input graph once per similarity measure (jaccard index first,
# then cosine similarity) and store each result in its own pickle file
for metric in ('jaccard', 'cosine'):
    result = dict()
    result['ppi'] = debias_input_graph(name='ppi', v0=v0, alpha=1e7, lr=0.05, metric=metric)
    with open('result/sc/graph/{}.pickle'.format(metric), 'wb') as f:
        pickle.dump(result, f, protocol=pickle.HIGHEST_PROTOCOL)

dataset: ppi	metric: jaccard similarity
Finished!
dataset: ppi	metric: cosine similarity
Finished!


### Let's debias the mining model

In [8]:
# load debias model
from method.debias_model import DebiasModel

In [9]:
def debias_mining_model(name, v0, alpha=0.0, metric=None):
    """Run the debiased spectral clustering model on a dataset.

    Args:
        name: dataset identifier. ``'ppi'`` is read with
            ``load_graph.read_mat`` (using its ``'graph'`` entry); any other
            name is read with ``load_graph.read_graph``.
        v0: mapping from dataset name to the starting vector passed to the
            model.
        alpha: forwarded to ``DebiasModel.spectral_clustering``.
        metric: forwarded to ``utils.get_similarity_matrix``.

    Returns:
        Whatever ``DebiasModel.spectral_clustering`` returns for this dataset.
    """
    # read the raw graph for the requested dataset
    graph = load_graph.read_mat(name)['graph'] if name == 'ppi' else load_graph.read_graph(name)

    # keep only the largest connected component
    largest_component = max(nx.connected_components(graph), key=len)
    adjacency = nx.to_scipy_sparse_matrix(
        graph, nodelist=largest_component, dtype='float', format='csc'
    )

    # node-to-node similarity under the chosen metric
    similarity = utils.get_similarity_matrix(adjacency, metric=metric)

    # debiased spectral clustering
    model = DebiasModel()
    embedding = model.spectral_clustering(
        adjacency, similarity, alpha, ncluster=10, v0=v0[name]
    )

    print('dataset: {}\t metric: {} similarity'.format(name, metric))
    print('Finished!')

    return embedding

In [10]:
alpha = 0.5

# reload the starting vectors; the context manager closes the file handle
with open('data/v0.pickle', 'rb') as f:
    v0 = pickle.load(f)

# debias the mining model once per similarity measure (jaccard index first,
# then cosine similarity) and store each result in its own pickle file
for metric in ('jaccard', 'cosine'):
    result = dict()
    result['ppi'] = debias_mining_model(name='ppi', v0=v0, alpha=alpha, metric=metric)
    with open('result/sc/model/{}.pickle'.format(metric), 'wb') as f:
        pickle.dump(result, f, protocol=pickle.HIGHEST_PROTOCOL)

dataset: ppi	 metric: jaccard similarity
Finished!
dataset: ppi	 metric: cosine similarity
Finished!


### Let's debias the mining result

In [11]:
# load debias model
from method.debias_result import DebiasResult

In [12]:
def debias_mining_result(name, vanilla, alpha=0.0, metric=None):
    """Debias an already computed spectral clustering result.

    Args:
        name: dataset identifier. ``'ppi'`` is read with
            ``load_graph.read_mat`` (using its ``'graph'`` entry); any other
            name is read with ``load_graph.read_graph``.
        vanilla: mapping holding the vanilla result; the eigenvectors are read
            from ``vanilla[name]['eigenvectors']``.
        alpha: forwarded to ``DebiasResult.fit``.
        metric: forwarded to ``utils.get_similarity_matrix``.

    Returns:
        Whatever ``DebiasResult.fit`` returns for the vanilla eigenvectors.
    """
    # start from the vanilla eigenvectors
    vanilla_vectors = vanilla[name]['eigenvectors']

    # read the raw graph for the requested dataset
    graph = load_graph.read_mat(name)['graph'] if name == 'ppi' else load_graph.read_graph(name)

    # keep only the largest connected component
    largest_component = max(nx.connected_components(graph), key=len)
    adjacency = nx.to_scipy_sparse_matrix(
        graph, nodelist=largest_component, dtype='float', format='csc'
    )

    # node-to-node similarity under the chosen metric
    similarity = utils.get_similarity_matrix(adjacency, metric=metric)

    # post-process the vanilla result
    debiaser = DebiasResult()
    debiased_vectors = debiaser.fit(vanilla_vectors, similarity, alpha)

    print('dataset: {}\tmetric: {} similarity'.format(name, metric))
    print('Finished!')

    return debiased_vectors

In [13]:
alpha = 0.5

# Load the cached vanilla result under its own name. Binding it to `vanilla`
# would replace the vanilla() function defined earlier in the notebook and
# make the earlier cell that calls it fail when re-run.
with open('result/sc/vanilla.pickle', 'rb') as f:
    vanilla_result = pickle.load(f)

# debias the mining result once per similarity measure (jaccard index first,
# then cosine similarity) and store each result in its own pickle file
for metric in ('jaccard', 'cosine'):
    result = dict()
    result['ppi'] = debias_mining_result(name='ppi', vanilla=vanilla_result, alpha=alpha, metric=metric)
    with open('result/sc/result/{}.pickle'.format(metric), 'wb') as f:
        pickle.dump(result, f, protocol=pickle.HIGHEST_PROTOCOL)

dataset: ppi	metric: jaccard similarity
Finished!
dataset: ppi	metric: cosine similarity
Finished!


### Now, let's see how much bias we removed and how good the debiased results are

In [14]:
# load evaluation functions
from evaluate.sc import *

In [15]:
# Evaluate every debiasing strategy on 'ppi' under both similarity metrics:
# task='graph' (debias the input graph), task='model' (debias the mining
# model) and task='result' (debias the mining result).
evaluate(name='ppi', metric='jaccard', task='graph')
evaluate(name='ppi', metric='cosine', task='graph')
evaluate(name='ppi', metric='jaccard', task='model')
evaluate(name='ppi', metric='cosine', task='model')
evaluate(name='ppi', metric='jaccard', task='result')
evaluate(name='ppi', metric='cosine', task='result')

{'dataset': 'ppi', 'metric': 'jaccard similarity', 'task': 'debias the input graph', 'diff': 1.034504879751878, 'nmi': 0.8682420701208994, 'bias': 0.19534454221269826}
{'dataset': 'ppi', 'metric': 'cosine similarity', 'task': 'debias the input graph', 'diff': 0.9334052752109784, 'nmi': 0.9137902163453485, 'bias': 0.24145807349987436}
{'dataset': 'ppi', 'metric': 'jaccard similarity', 'task': 'debias the mining model', 'diff': 0.9943213464407561, 'nmi': 0.6968137058881874, 'bias': 0.6702471197888372}
{'dataset': 'ppi', 'metric': 'cosine similarity', 'task': 'debias the mining model', 'diff': 0.8966720571837244, 'nmi': 0.6175175840740242, 'bias': 0.752200615626692}
{'dataset': 'ppi', 'metric': 'jaccard similarity', 'task': 'debias the mining result', 'diff': 0.2418338751897768, 'nmi': 0.8524465277247899, 'bias': 0.7750581491773365}
{'dataset': 'ppi', 'metric': 'cosine similarity', 'task': 'debias the mining result', 'diff': 0.34270967780629136, 'nmi': 0.7205238894920537, 'bias': 0.873662