Importing libraries and function for chunks for 10-folds

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import random
import operator
import pandas as pd
import scipy.io as sci
%matplotlib inline
def chunk(xs, n):
    ys = list(xs)
    random.seed(0)
    random.shuffle(ys)
    size = len(ys) // n
    leftovers= ys[size*n:]
    for c in xrange(n):
        if leftovers:
           extra= [ leftovers.pop() ] 
        else:
           extra= []
        yield ys[c*size:(c+1)*size] + extra

In [2]:
def common_neighbours(sub):
    mat = np.array((nx.to_numpy_matrix(sub) != 0) * 1).dot(np.array((nx.to_numpy_matrix(sub) != 0) * 1))
    edgesWithScore = {}
    edges = nx.non_edges(sub)
    for e in edges:
        edgesWithScore[e] = mat[e[0]][e[1]]
    return edgesWithScore 

def salton_index(sub):
    mat = np.array((nx.to_numpy_matrix(sub) != 0) * 1).dot(np.array((nx.to_numpy_matrix(sub) != 0) * 1))
    edgesWithScore = {}
    edges = nx.non_edges(sub)
    for e in edges:
        j = nx.degree(sub, e[0])
        k = nx.degree(sub, e[1])
        if j != 0 and k != 0:
            edgesWithScore[e] = float(mat[e[0]][e[1]])/float(np.sqrt(j * k))
        else:
            edgesWithScore[e] = 0
    return edgesWithScore

def jaccard_index(sub):
    mat = np.array((nx.to_numpy_matrix(sub) != 0) * 1).dot(np.array((nx.to_numpy_matrix(sub) != 0) * 1))
    edgesWithScore = {}
    edges = nx.non_edges(sub)
    for e in edges:
        if nx.degree(sub, e[0]) != 0 or nx.degree(sub, e[1]) != 0:
            edgesWithScore[e] = float(mat[e[0]][e[1]])/float(len(set(sub[e[0]])|set(sub[e[1]])))
        else:
            edgesWithScore[e] = 0
    return edgesWithScore

def sorensen_index(sub):
    mat = np.array((nx.to_numpy_matrix(sub) != 0) * 1).dot(np.array((nx.to_numpy_matrix(sub) != 0) * 1))
    edgesWithScore = {}
    edges = nx.non_edges(sub)
    for e in edges:
        j = nx.degree(sub, e[0])
        k = nx.degree(sub, e[1])
        if j != 0 or k != 0:
            edgesWithScore[e] = 2*float(mat[e[0]][e[1]])/float(j + k)
        else:
            edgesWithScore[e] = 0
    return edgesWithScore

def hub_promoted_index(sub):
    mat = np.array((nx.to_numpy_matrix(sub) != 0) * 1).dot(np.array((nx.to_numpy_matrix(sub) != 0) * 1))
    edgesWithScore = {}
    edges = nx.non_edges(sub)
    for e in edges:
        j = nx.degree(sub, e[0])
        k = nx.degree(sub, e[1])
        if j != 0 and k != 0:
            edgesWithScore[e] = float(mat[e[0]][e[1]])/float(min(j, k))
        else:
            edgesWithScore[e] = 0
    return edgesWithScore

def hub_depressed_index(sub):
    mat = np.array((nx.to_numpy_matrix(sub) != 0) * 1).dot(np.array((nx.to_numpy_matrix(sub) != 0) * 1))
    edgesWithScore = {}
    edges = nx.non_edges(sub)
    for e in edges:
        j = nx.degree(sub, e[0])
        k = nx.degree(sub, e[1])
        if j != 0 or k != 0:
            edgesWithScore[e] = float(mat[e[0]][e[1]])/float(max(j, k))
        else:
            edgesWithScore[e] = 0
    return edgesWithScore

def LHN1_index(sub):
    mat = np.array((nx.to_numpy_matrix(sub) != 0) * 1).dot(np.array((nx.to_numpy_matrix(sub) != 0) * 1))
    edgesWithScore = {}
    edges = nx.non_edges(sub)
    for e in edges:
        j = nx.degree(sub, e[0])
        k = nx.degree(sub, e[1])
        if j != 0 and k != 0:
            edgesWithScore[e] = float(mat[e[0]][e[1]])/float(j * k)
        else:
            edgesWithScore[e] = 0
    return edgesWithScore

def preferential_attachment_index(sub):
    edgesWithScore = {}
    edges = nx.non_edges(sub)
    for e in edges:
        edgesWithScore[e] = nx.degree(sub, e[0]) * nx.degree(sub, e[1])
    return edgesWithScore

def adamic_adar_index(sub):
    edgesWithScore = {}
    edges = nx.non_edges(sub)
    for e in edges:
        edgesWithScore[e] = np.sum(1/np.log(nx.degree(sub, sorted(nx.common_neighbors(sub, e[0], e[1]))).values()))
    return edgesWithScore

def resource_allocation_index(sub):
    edgesWithScore = {}
    edges = nx.non_edges(sub)
    for e in edges:
        edgesWithScore[e] = np.sum(1/np.array(
                nx.degree(sub, sorted(nx.common_neighbors(sub, e[0], e[1]))).values()).astype(float))
    return edgesWithScore

def katz_index(sub):
    mat = np.array((nx.to_numpy_matrix(sub) != 0) * 1.)
    ide = np.identity(len(mat))
    beta = (1/float(max(np.linalg.eigh(mat)[0])))/2
    sim = np.linalg.inv(ide - beta*mat) - ide
    edgesWithScore = {}
    edges = nx.non_edges(sub)
    for e in edges:
        edgesWithScore[e] = sim[e[0]][e[1]]
    return edgesWithScore

def lhn2_index(sub, phi = 0.1):
    mat = np.array((nx.to_numpy_matrix(sub) != 0) * 1.)
    ide = np.identity(len(mat))
    dma = np.diagflat(mat.sum(axis = 1))
    edgesWithScore = {}
    edges = nx.non_edges(sub)
    for e in edges:
        edgesWithScore[e] = 0
    if np.linalg.det(dma) != 0:
        lambd = float(max(np.linalg.eigh(mat)[0]))
        sim = (2 * sub.number_of_edges() * lambd * np.linalg.inv(dma)).dot(np.linalg.inv(
        ide - (phi/lambd) * mat)).dot(np.linalg.inv(dma))
        edges = nx.non_edges(sub)
        for e in edges:
            edgesWithScore[e] = sim[e[0]][e[1]]
    return edgesWithScore

def act_index(sub):
    mat = np.array((nx.to_numpy_matrix(sub) != 0) * 1.)
    dma = np.diagflat(mat.sum(axis = 1))
    sim = np.linalg.pinv(dma - mat)
    edgesWithScore = {}
    edges = nx.non_edges(sub)
    for e in edges:
        if sim[e[0]][e[0]] + sim[e[1]][e[1]] - 2*sim[e[0]][e[1]] != 0:
            edgesWithScore[e] = 1/float(sim[e[0]][e[0]] + sim[e[1]][e[1]] - 2*sim[e[0]][e[1]])
        else:
            edgesWithScore[e] = 0
    return edgesWithScore

def cbl_index(sub):
    mat = np.array((nx.to_numpy_matrix(sub) != 0) * 1.)
    dma = np.diagflat(mat.sum(axis = 1))
    sim = np.linalg.pinv(dma - mat)
    edgesWithScore = {}
    edges = nx.non_edges(sub)
    for e in edges:
        if sim[e[0]][e[0]] * sim[e[1]][e[1]] != 0:
            edgesWithScore[e] = float(sim[e[0]][e[1]])/np.sqrt(sim[e[0]][e[0]] * sim[e[1]][e[1]])
        else:
            edgesWithScore[e] = 0
    return edgesWithScore

def rwr_index(sub, c = 0.5):
    mat = np.array((nx.to_numpy_matrix(sub) != 0) * 1.)
    dma = np.diagflat(mat.sum(axis = 1))
    sim = np.linalg.pinv(dma - mat)
    ide = np.identity(len(mat))
    pmt = np.identity(len(mat))
    es = []
    for a in sub.nodes():
        e = np.zeros(len(mat))
        e[a] = 1
        es.append(e)
        for b in sub.nodes():
            if not nx.has_path(sub, source = a, target = b):
                pmt[a][b] = 0
            else:
                if nx.degree(sub, a) != 0:
                    pmt[a][b] = 1/float(nx.degree(sub, a))
                else:
                    pmt[a][b] = 0
    es = np.array(es)
    qs = []
    for a in sub.nodes():
        if np.linalg.det(ide - c*np.transpose(pmt)) != 0:
            q = ((1 - c)*np.linalg.inv(ide - c*np.transpose(pmt))).dot(es[a])
        else:
            q = es[a]
        qs.append(q)
    qs = np.array(qs)
    edgesWithScore = {}
    edges = nx.non_edges(sub)
    for e in edges:
        edgesWithScore[e] = qs[e[0]][e[1]] + qs[e[1]][e[0]]
    return edgesWithScore

def matrix_forest_index(sub, alpha = 1):
    mat = np.array((nx.to_numpy_matrix(sub) != 0) * 1.)
    dma = np.diagflat(mat.sum(axis = 1))
    ide = np.identity(len(mat))
    sim = np.linalg.inv(ide + alpha*(dma - mat))
    edgesWithScore = {}
    edges = nx.non_edges(sub)
    for e in edges:
        edgesWithScore[e] = sim[e[0]][e[1]]
    return edgesWithScore

def calculate_aucs(G, nfolds = 10):
    df = pd.DataFrame()
    random.seed(0)
    folds = [i for i in chunk(G.edges(), nfolds)]
    aucs = []
    for i in xrange(nfolds):
        print 'fold', i+1
        sub = G.copy()
        for c in folds[i]:
            sub.remove_edge(*c)
        complement = nx.non_edges(sub)
        edgesWithScore = [common_neighbours(sub), 
                           salton_index(sub),
                           jaccard_index(sub),
                           sorensen_index(sub),
                           hub_promoted_index(sub),
                           hub_depressed_index(sub),
                           LHN1_index(sub),
                           preferential_attachment_index(sub),
                           adamic_adar_index(sub),
                           resource_allocation_index(sub),
                           katz_index(sub),
                           lhn2_index(sub),
                           act_index(sub),
                           cbl_index(sub),
                           rwr_index(sub),
                           matrix_forest_index(sub)]
        auc = []
        for indn in xrange(len(edgesWithScore)):
            highScore = 0
            sameScore = 0
            allScore = 0
            ne_ar = []
            e_ar = []
            for ne in nx.non_edges(G):
                ne_ar.append(edgesWithScore[indn][ne])
            ne_ar = np.array(ne_ar)
            for e in folds[i]:
                highScore += np.sum(ne_ar < edgesWithScore[indn][e])
                sameScore += np.sum(ne_ar == edgesWithScore[indn][e])
                allScore += len(ne_ar)
            #
            #        if edgesWithScore[indn][ne] < edgesWithScore[indn][e]:
            #            highScore += 1
            #        elif edgesWithScore[indn][ne] == edgesWithScore[indn][e]:
            #            sameScore += 1
            #        allScore += 1
            auc.append(float(highScore + 0.5*sameScore)/float(allScore))
        aucs.append(auc)
    #print aucs
    auc_means = np.mean(np.array(aucs), axis = 0)
    return {'CN' : auc_means[0],
            'SaI' : auc_means[1],
            'JI' : auc_means[2],
            'SoI' : auc_means[3],
            'HPI' : auc_means[4],
            'HDI' : auc_means[5],
            'LHN1' : auc_means[6],
            'PAI' : auc_means[7],
            'AAI' : auc_means[8],
            'RAI' : auc_means[9],
            'KI' : auc_means[10],
            'LHN2' : auc_means[11],
            'ACT' : auc_means[12],
            'CBL' : auc_means[13],
            'RWR' : auc_means[14],
            'MFI' : auc_means[15]}

In [3]:
df = pd.DataFrame(columns = ['CN', 'SaI', 'JI', 'SoI', 
                             'HPI', 'HDI', 'LHN1', 'PAI', 
                             'AAI', 'RAI', 'KI', 'LHN2', 
                             'ACT', 'CBL', 'RWR', 'MFI'])
pd.set_option('precision',4)

In [4]:
newman_netscience = nx.read_gml('./netws/newman/netscience/netscience.gml')
fixed_newman_netscience = nx.Graph(nx.convert_node_labels_to_integers(newman_netscience, first_label = 0))
newman_netscience = None
df.loc['newman netscience'] = calculate_aucs(fixed_newman_netscience)
newman_power = nx.read_gml('./netws/newman/power/power.gml')
fixed_newman_power = nx.Graph(nx.convert_node_labels_to_integers(newman_power, first_label = 0))
newman_power = None
df.loc['newman power'] = calculate_aucs(fixed_newman_power)
newman_polblogs = nx.read_gml('./netws/newman/polblogs/polblogs.gml')
fixed_newman_polblogs = nx.Graph(nx.convert_node_labels_to_integers(newman_polblogs, first_label = 0))
newman_polblogs = None
df.loc['newman polblogs'] = calculate_aucs(fixed_newman_polblogs)
newman_hep_th = nx.read_gml('./netws/newman/hep-th/hep-th.gml')
fixed_newman_hep_th = nx.Graph(nx.convert_node_labels_to_integers(newman_hep_th, first_label = 0))
newman_hep_th = None
df.loc['newman hep_th'] = calculate_aucs(fixed_newman_hep_th)

fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
fold 10
fold 1


MemoryError: 

In [None]:
stanford_wiki_vote = nx.read_adjlist('./netws/stanford/wiki-Vote.txt')
fixed_stanford_wiki_vote = nx.Graph(nx.convert_node_labels_to_integers(stanford_wiki_vote, first_label = 0))
stanford_wiki_vote = None
df.loc['stanford wiki-vote'] = calculate_aucs(fixed_stanford_wiki_vote)

In [13]:
newman_adjnoun = nx.read_gml('./netws/newman/adjnoun/adjnoun.gml')
df.loc['newman adjnoun'] = calculate_aucs(newman_adjnoun)
print '1'
newman_celegansneural = nx.read_gml('./netws/newman/celegansneural/celegansneural.gml')
fixed_newman_celegansneural = nx.Graph(nx.convert_node_labels_to_integers(newman_celegansneural, first_label = 0))
df.loc['newman celegansneural'] = calculate_aucs(fixed_newman_celegansneural)
print '2'
newman_dolphins = nx.read_gml('./netws/newman/dolphins/dolphins.gml')
fixed_newman_dolphins = nx.Graph(nx.convert_node_labels_to_integers(newman_dolphins, first_label = 0))
df.loc['newman dolphins'] = calculate_aucs(fixed_newman_dolphins)
print '3'
newman_football = nx.read_gml('./netws/newman/football/football.gml')
fixed_newman_football = nx.Graph(nx.convert_node_labels_to_integers(newman_football, first_label = 0))
df.loc['newman football'] = calculate_aucs(fixed_newman_football)
print '4'
newman_lesmis = nx.read_gml('./netws/newman/lesmis/lesmis.gml')
fixed_newman_lesmis = nx.Graph(nx.convert_node_labels_to_integers(newman_lesmis, first_label = 0))
df.loc['newman lesmis'] = calculate_aucs(fixed_newman_lesmis)
print '5'
newman_polbooks = nx.read_gml('./netws/newman/polbooks/polbooks.gml')
fixed_newman_polbooks = nx.Graph(nx.convert_node_labels_to_integers(newman_polbooks, first_label = 0))
print '6'
df.loc['newman polbooks'] = calculate_aucs(fixed_newman_polbooks)
print '7'
df.loc['karate club'] = calculate_aucs(nx.karate_club_graph())
pajek_us_air = nx.read_pajek('./netws/pajekds/USAir97.net')
fixed_pajek_us_air = nx.Graph(nx.convert_node_labels_to_integers(pajek_us_air, first_label = 0))
df.loc['pajek_us_air'] = calculate_aucs(fixed_pajek_us_air)

fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
fold 10
1
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
fold 10
2
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
fold 10
3
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
fold 10
4
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
fold 10
5
6
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
fold 10
7
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
fold 10
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
fold 10


In [5]:
df

Unnamed: 0,CN,SaI,JI,SoI,HPI,HDI,LHN1,PAI,AAI,RAI,KI,LHN2,ACT,CBL,RWR,MFI
newman netscience,0.938,0.938,0.938,0.938,0.938,0.938,0.938,0.676,0.939,0.939,0.94,0.5,0.581,0.442,0.5,0.941


In [11]:
df

Unnamed: 0,CN,SaI,JI,SoI,HPI,HDI,LHN1,PAI,AAI,RAI,KI,LHN2,ACT,CBL,RWR,MFI
newman adjnoun,0.662,0.606,0.603,0.603,0.618,0.603,0.566,0.744,0.662,0.659,0.714,0.516,0.742,0.583,0.737,0.667
newman celegansneural,0.844,0.797,0.79,0.79,0.805,0.779,0.725,0.75,0.861,0.866,0.852,0.547,0.738,0.848,0.725,0.865
newman dolphins,0.779,0.774,0.779,0.779,0.763,0.781,0.762,0.619,0.781,0.781,0.799,0.662,0.76,0.785,0.647,0.804
newman football,0.846,0.858,0.858,0.858,0.856,0.857,0.859,0.27,0.846,0.846,0.857,0.877,0.588,0.885,0.272,0.878
newman lesmis,0.911,0.882,0.88,0.88,0.847,0.878,0.82,0.776,0.918,0.919,0.884,0.568,0.859,0.812,0.794,0.867
newman polbooks,0.887,0.884,0.875,0.875,0.894,0.863,0.848,0.653,0.897,0.9,0.891,0.862,0.729,0.891,0.618,0.899
karate club,0.7,0.636,0.607,0.607,0.712,0.593,0.6,0.712,0.726,0.733,0.755,0.612,0.666,0.736,0.62,0.749
pajek_us_air,0.935,0.909,0.898,0.898,0.87,0.892,0.768,0.887,0.946,0.952,0.92,0.5,0.892,0.905,0.832,0.913


In [6]:
print df[['CN', 'SaI', 'JI', 'SoI', 'HPI', 'HDI', 'LHN1', 'PAI', 'AAI', 'RAI',]].to_latex()

\begin{tabular}{lrrrrrrrrrr}
\toprule
{} &     CN &    SaI &     JI &    SoI &    HPI &    HDI &   LHN1 &    PAI &    AAI &    RAI \\
\midrule
newman adjnoun        &  0.662 &  0.606 &  0.603 &  0.603 &  0.618 &  0.603 &  0.566 &  0.744 &  0.662 &  0.659 \\
newman celegansneural &  0.844 &  0.797 &  0.790 &  0.790 &  0.805 &  0.779 &  0.725 &  0.750 &  0.861 &  0.866 \\
newman dolphins       &  0.779 &  0.774 &  0.779 &  0.779 &  0.763 &  0.781 &  0.762 &  0.619 &  0.781 &  0.781 \\
newman football       &  0.846 &  0.858 &  0.858 &  0.858 &  0.856 &  0.857 &  0.859 &  0.270 &  0.846 &  0.846 \\
newman lesmis         &  0.911 &  0.882 &  0.880 &  0.880 &  0.847 &  0.878 &  0.820 &  0.776 &  0.918 &  0.919 \\
newman polbooks       &  0.887 &  0.884 &  0.875 &  0.875 &  0.894 &  0.863 &  0.848 &  0.653 &  0.897 &  0.900 \\
karate club           &  0.700 &  0.636 &  0.607 &  0.607 &  0.712 &  0.593 &  0.600 &  0.712 &  0.726 &  0.733 \\
pajek\_us\_air          &  0.935 &  0.909 &  0.898 &

In [55]:
predict_nodes(nx.karate_club_graph())

{'AAI': 0.72559523809523818,
 'ACT': 0.66597900029577051,
 'CBL': 0.73873669032830525,
 'CN': 0.69990757172434193,
 'HDI': 0.59264086069210298,
 'HPI': 0.71227632357290749,
 'JI': 0.60682860100561964,
 'KI': 0.7551907719609583,
 'LHN1': 0.59992236024844714,
 'LHN2': 0.61252033422064478,
 'MFI': 0.74868012422360253,
 'PAI': 0.71162562851227462,
 'RAI': 0.73339618456078082,
 'RWR': 0.61805124223602481,
 'SaI': 0.63614315291333923,
 'SoI': 0.60682860100561964}

[array([ 0.72269319,  0.66199355,  0.65669865,  0.65669865,  0.67068988,
        0.65076321,  0.61040588,  0.76450627,  0.72112902,  0.7151976 ,
        0.75476983,  0.5       ,  0.76479943,  0.6558252 ,  0.75819736,
        0.7018248 ]), array([ 0.61427114,  0.5779879 ,  0.58148571,  0.58148571,  0.59116994,
        0.58374262,  0.56602065,  0.69834105,  0.6190741 ,  0.6210238 ,
        0.67066579,  0.5       ,  0.6698084 ,  0.53732134,  0.67139065,
        0.63542466]), array([ 0.71159939,  0.64825732,  0.64928337,  0.64928337,  0.65425701,
        0.65368475,  0.59705919,  0.78886243,  0.71878175,  0.71791232,
        0.75357913,  0.5       ,  0.79358106,  0.595238  ,  0.78362174,
        0.7009875 ]), array([ 0.65769659,  0.59391277,  0.58682278,  0.58682278,  0.61510443,
        0.58250172,  0.55636051,  0.77485714,  0.65418874,  0.65103228,
        0.7202897 ,  0.55324421,  0.74952513,  0.58374061,  0.75096481,
        0.67039673]), array([ 0.67339255,  0.61074522,  0.6044845 ,  

{'AAI': 0.66163687093789125,
 'ACT': 0.7421744714800701,
 'CBL': 0.58867712586434096,
 'CN': 0.66214340406400662,
 'HDI': 0.60282738633075761,
 'HPI': 0.61840145370111688,
 'JI': 0.60344266306234151,
 'KI': 0.71380368743418066,
 'LHN1': 0.56567697364432867,
 'LHN2': 0.51574740886543879,
 'MFI': 0.66749749917435941,
 'PAI': 0.74384260488981924,
 'RAI': 0.65907147609237449,
 'RWR': 0.73765957524114723,
 'SaI': 0.60584940296672207,
 'SoI': 0.60344266306234151}