In [1]:
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import random
import operator
import pandas as pd
import scipy.io as sci
%matplotlib inline
def chunk(xs, n):
    ys = list(xs)
    random.shuffle(ys)
    size = len(ys) // n
    leftovers= ys[size*n:]
    for c in xrange(n):
        if leftovers:
           extra= [ leftovers.pop() ] 
        else:
           extra= []
        yield ys[c*size:(c+1)*size] + extra

In [2]:
def common_neighbours(sub):
    i = sub
    mat = np.array((nx.to_numpy_matrix(i) != 0) * 1).dot(np.array((nx.to_numpy_matrix(i) != 0) * 1))
    edgesWithScore = {}
    edges = nx.non_edges(i)
    for e in edges:
        edgesWithScore[e] = mat[e[0]][e[1]]
    return edgesWithScore 

def salton_index(sub):
    i = sub
    mat = np.array((nx.to_numpy_matrix(i) != 0) * 1).dot(np.array((nx.to_numpy_matrix(i) != 0) * 1))
    edgesWithScore = {}
    edges = nx.non_edges(i)
    for e in edges:
        j = nx.degree(i, e[0])
        k = nx.degree(i, e[1])
        if j != 0 and k != 0:
            edgesWithScore[e] = float(mat[e[0]][e[1]])/float(np.sqrt(j * k))
        else:
            edgesWithScore[e] = 0
    return edgesWithScore

def jaccard_index(sub):
    i = sub
    mat = np.array((nx.to_numpy_matrix(i) != 0) * 1).dot(np.array((nx.to_numpy_matrix(i) != 0) * 1))
    edgesWithScore = {}
    edges = nx.non_edges(i)
    for e in edges:
        if nx.degree(i, e[0]) != 0 or nx.degree(i, e[1]) != 0:
            edgesWithScore[e] = float(mat[e[0]][e[1]])/float(len(set(i[e[0]])|set(i[e[1]])))
        else:
            edgesWithScore[e] = 0
    return edgesWithScore

def sorensen_index(sub):
    i = sub
    mat = np.array((nx.to_numpy_matrix(i) != 0) * 1).dot(np.array((nx.to_numpy_matrix(i) != 0) * 1))
    edgesWithScore = {}
    edges = nx.non_edges(i)
    for e in edges:
        j = nx.degree(i, e[0])
        k = nx.degree(i, e[1])
        if j != 0 or k != 0:
            edgesWithScore[e] = 2*float(mat[e[0]][e[1]])/float(j + k)
        else:
            edgesWithScore[e] = 0
    return edgesWithScore

def hub_promoted_index(sub):
    i = sub
    mat = np.array((nx.to_numpy_matrix(i) != 0) * 1).dot(np.array((nx.to_numpy_matrix(i) != 0) * 1))
    edgesWithScore = {}
    edges = nx.non_edges(i)
    for e in edges:
        j = nx.degree(i, e[0])
        k = nx.degree(i, e[1])
        if j != 0 and k != 0:
            edgesWithScore[e] = float(mat[e[0]][e[1]])/float(min(j, k))
        else:
            edgesWithScore[e] = 0
    return edgesWithScore

def hub_depressed_index(sub):
    i = sub
    mat = np.array((nx.to_numpy_matrix(i) != 0) * 1).dot(np.array((nx.to_numpy_matrix(i) != 0) * 1))
    edgesWithScore = {}
    edges = nx.non_edges(i)
    for e in edges:
        j = nx.degree(i, e[0])
        k = nx.degree(i, e[1])
        if j != 0 or k != 0:
            edgesWithScore[e] = float(mat[e[0]][e[1]])/float(max(j, k))
        else:
            edgesWithScore[e] = 0
    return edgesWithScore

def LHN1_index(sub):
    i = sub
    mat = np.array((nx.to_numpy_matrix(i) != 0) * 1).dot(np.array((nx.to_numpy_matrix(i) != 0) * 1))
    edgesWithScore = {}
    edges = nx.non_edges(i)
    for e in edges:
        j = nx.degree(i, e[0])
        k = nx.degree(i, e[1])
        if j != 0 and k != 0:
            edgesWithScore[e] = float(mat[e[0]][e[1]])/float(j * k)
        else:
            edgesWithScore[e] = 0
    return edgesWithScore

def preferential_attachment_index(sub):
    i = sub
    edgesWithScore = {}
    edges = nx.non_edges(i)
    for e in edges:
        edgesWithScore[e] = nx.degree(i, e[0]) * nx.degree(i, e[1])
    return edgesWithScore

def adamic_adar_index(sub):
    i = sub
    edgesWithScore = {}
    edges = nx.non_edges(i)
    for e in edges:
        edgesWithScore[e] = np.sum(1/np.log(nx.degree(i, sorted(nx.common_neighbors(i, e[0], e[1]))).values()))
    return edgesWithScore

def resource_allocation_index(sub):
    i = sub
    edgesWithScore = {}
    edges = nx.non_edges(i)
    for e in edges:
        edgesWithScore[e] = np.sum(1/np.array(
                nx.degree(i, sorted(nx.common_neighbors(i, e[0], e[1]))).values()).astype(float))
    return edgesWithScore

def katz_index(sub):
    i = sub
    mat = np.array((nx.to_numpy_matrix(i) != 0) * 1.)
    ide = np.identity(len(mat))
    beta = (1/float(max(np.linalg.eig(mat)[0])))/2
    sim = np.linalg.inv(ide - beta*mat) - ide
    edgesWithScore = {}
    edges = nx.non_edges(i)
    for e in edges:
        edgesWithScore[e] = sim[e[0]][e[1]]
    return edgesWithScore

def lhn2_index(sub, phi = 0.1):
    i = sub
    mat = np.array((nx.to_numpy_matrix(i) != 0) * 1.)
    ide = np.identity(len(mat))
    dma = np.diagflat(mat.sum(axis = 1))
    edgesWithScore = {}
    edges = nx.non_edges(i)
    for e in edges:
        edgesWithScore[e] = 0
    if np.linalg.det(dma) != 0:
        lambd = float(max(np.linalg.eig(mat)[0]))
        sim = (2 * i.number_of_edges() * lambd * np.linalg.inv(dma)).dot(np.linalg.inv(
        ide - (phi/lambd) * mat)).dot(np.linalg.inv(dma))
        edges = nx.non_edges(i)
        for e in edges:
            edgesWithScore[e] = sim[e[0]][e[1]]
    return edgesWithScore

def act_index(sub):
    i = sub
    mat = np.array((nx.to_numpy_matrix(i) != 0) * 1.)
    dma = np.diagflat(mat.sum(axis = 1))
    sim = np.linalg.pinv(dma - mat)
    edgesWithScore = {}
    edges = nx.non_edges(i)
    for e in edges:
        if sim[e[0]][e[0]] + sim[e[1]][e[1]] - 2*sim[e[0]][e[1]] != 0:
            edgesWithScore[e] = 1/float(sim[e[0]][e[0]] + sim[e[1]][e[1]] - 2*sim[e[0]][e[1]])
        else:
            edgesWithScore[e] = 0
    return edgesWithScore

def cbl_index(sub):
    i = sub
    mat = np.array((nx.to_numpy_matrix(i) != 0) * 1.)
    dma = np.diagflat(mat.sum(axis = 1))
    sim = np.linalg.pinv(dma - mat)
    edgesWithScore = {}
    edges = nx.non_edges(i)
    for e in edges:
        if sim[e[0]][e[0]] * sim[e[1]][e[1]] != 0:
            edgesWithScore[e] = float(sim[e[0]][e[1]])/np.sqrt(sim[e[0]][e[0]] * sim[e[1]][e[1]])
        else:
            edgesWithScore[e] = 0
    return edgesWithScore

def rwr_index(sub, c = 0.5):
    i = sub
    mat = np.array((nx.to_numpy_matrix(i) != 0) * 1.)
    dma = np.diagflat(mat.sum(axis = 1))
    sim = np.linalg.pinv(dma - mat)
    ide = np.identity(len(mat))
    pmt = np.identity(len(mat))
    es = []
    for a in i.nodes():
        e = np.zeros(len(mat))
        e[a] = 1
        es.append(e)
        for b in i.nodes():
            try:
                nx.shortest_path_length(i, source = a, target = b)
            except:
                pmt[a][b] = 0
            else:
                if nx.degree(i, a) != 0:
                    pmt[a][b] = 1/float(nx.degree(i, a))
                else:
                    pmt[a][b] = 0
    es = np.array(es)
    qs = []
    for a in i.nodes():
        if np.linalg.det(ide - c*np.transpose(pmt)) != 0:
            q = ((1 - c)*np.linalg.inv(ide - c*np.transpose(pmt))).dot(es[a])
        else:
            q = es[a]
        qs.append(q)
    qs = np.array(qs)
    edgesWithScore = {}
    edges = nx.non_edges(i)
    for e in edges:
        edgesWithScore[e] = qs[e[0]][e[1]] + qs[e[1]][e[0]]
    return edgesWithScore

def matrix_forest_index(sub, alpha = 1):
    i = sub
    mat = np.array((nx.to_numpy_matrix(i) != 0) * 1.)
    dma = np.diagflat(mat.sum(axis = 1))
    ide = np.identity(len(mat))
    sim = np.linalg.inv(ide + alpha*(dma - mat))
    edgesWithScore = {}
    edges = nx.non_edges(i)
    for e in edges:
        edgesWithScore[e] = sim[e[0]][e[1]]
    return edgesWithScore

def predict_nodes(G, nfolds = 10):
    df = pd.DataFrame()
    random.seed(0)
    folds = [i for i in chunk(G.edges(), nfolds)]
    subs = []
    for i in xrange(nfolds):
        graph = G.copy()
        for c in folds[i]:
            graph.remove_edge(*c)
        subs.append(graph.copy())
    es = nx.non_edges(subs[0])
    edges = []
    y = {}
    for e in es:
        edges.append(e)
        check = 0
        if e in folds[0]:
            check = 1
        y[e] = check
    df['Edges'] = edges
    df['y'] = df['Edges'].map(y.get)
    df['CN'] = df['Edges'].map(common_neighbours(subs[0]).get)
    df['SaI'] = df['Edges'].map(salton_index(subs[0]).get)
    df['JI'] = df['Edges'].map(jaccard_index(subs[0]).get)
    df['SoI'] = df['Edges'].map(sorensen_index(subs[0]).get)
    df['HPI'] = df['Edges'].map(hub_promoted_index(subs[0]).get)
    df['HDI'] = df['Edges'].map(hub_depressed_index(subs[0]).get)
    df['LHN1'] = df['Edges'].map(LHN1_index(subs[0]).get)
    df['PAI'] = df['Edges'].map(preferential_attachment_index(subs[0]).get)
    df['AAI'] = df['Edges'].map(adamic_adar_index(subs[0]).get)
    df['RAI'] = df['Edges'].map(resource_allocation_index(subs[0]).get)
    df['KzI'] = df['Edges'].map(katz_index(subs[0]).get)
    df['LHN2'] = df['Edges'].map(lhn2_index(subs[0]).get)
    df['ACT'] = df['Edges'].map(act_index(subs[0]).get)
    df['CBL'] = df['Edges'].map(cbl_index(subs[0]).get)
    df['RWR'] = df['Edges'].map(rwr_index(subs[0]).get)
    df['MFI'] = df['Edges'].map(matrix_forest_index(subs[0]).get)
    return df

In [3]:
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import ensemble
from sklearn import svm
from sklearn import neighbors
from sklearn import tree
from sklearn import lda
from sklearn import qda
from sklearn import preprocessing
from sklearn import metrics

In [9]:
from sklearn import cross_validation
def run_10_fold_on_G(G, nfolds = 10):
    random.seed(0)
    folds = [i for i in chunk(G.edges(), nfolds)]
    frames = []
    for i in xrange(nfolds):
        graph = G.copy()
        for c in folds[i]:
            graph.remove_edge(*c)
        sub = graph.copy()
        df = pd.DataFrame()
        es = nx.non_edges(sub)
        edges = []
        y = {}
        cn = common_neighbours(sub)
        sai = salton_index(sub)
        ji = jaccard_index(sub)
        soi = sorensen_index(sub)
        hpi = hub_promoted_index(sub)
        hdi = hub_depressed_index(sub)
        lhn1 = LHN1_index(sub)
        pai = preferential_attachment_index(sub)
        aai = adamic_adar_index(sub)
        rai = resource_allocation_index(sub)
        kzi = katz_index(sub)
        lhn2 = lhn2_index(sub)
        act = act_index(sub)
        cbl = cbl_index(sub)
        rwr = rwr_index(sub)
        mfi = matrix_forest_index(sub)
        cn_a = {}
        sai_a = {}
        ji_a = {}
        soi_a = {}
        hpi_a = {}
        hdi_a = {}
        lhn1_a = {}
        pai_a = {}
        aai_a = {}
        rai_a = {}
        kzi_a = {}
        lhn2_a = {}
        act_a = {}
        cbl_a = {}
        rwr_a = {}
        mfi_a = {}
        cn_b = {}
        sai_b = {}
        ji_b = {}
        soi_b = {}
        hpi_b = {}
        hdi_b = {}
        lhn1_b = {}
        pai_b = {}
        aai_b = {}
        rai_b = {}
        kzi_b = {}
        lhn2_b = {}
        act_b = {}
        cbl_b = {}
        rwr_b = {}
        mfi_b = {}
        for e in es:
            edges.append(e)
            if e in folds[i]:
                y[e] = 1
            else:
                y[e] = 0
        for e in edges:
            if sub.degree(e[0]) != 0:
                ed_ne = []
                for j in sub.neighbors(e[0]):
                    this_edge = ()
                    if j > e[1]:
                        this_edge = (e[1], j)
                    elif j < e[1]:
                        this_edge = (j, e[1])
                    if this_edge in edges:
                        ed_ne.append(this_edge)
                cn_a[e] = 1./float(sub.degree(e[0])) * np.sum([cn[i] for i in ed_ne])
                sai_a[e] = 1./float(sub.degree(e[0])) * np.sum([sai[i] for i in ed_ne])
                ji_a[e] = 1./float(sub.degree(e[0])) * np.sum([ji[i] for i in ed_ne])
                soi_a[e] = 1./float(sub.degree(e[0])) * np.sum([soi[i] for i in ed_ne])
                hpi_a[e] = 1./float(sub.degree(e[0])) * np.sum([hpi[i] for i in ed_ne])
                hdi_a[e] = 1./float(sub.degree(e[0])) * np.sum([hdi[i] for i in ed_ne])
                lhn1_a[e] = 1./float(sub.degree(e[0])) * np.sum([lhn1[i] for i in ed_ne])
                pai_a[e] = 1./float(sub.degree(e[0])) * np.sum([pai[i] for i in ed_ne])
                aai_a[e] = 1./float(sub.degree(e[0])) * np.sum([aai[i] for i in ed_ne])
                rai_a[e] = 1./float(sub.degree(e[0])) * np.sum([rai[i] for i in ed_ne])
                kzi_a[e] = 1./float(sub.degree(e[0])) * np.sum([kzi[i] for i in ed_ne])
                lhn2_a[e] = 1./float(sub.degree(e[0])) * np.sum([lhn2[i] for i in ed_ne])
                act_a[e] = 1./float(sub.degree(e[0])) * np.sum([act[i] for i in ed_ne])
                cbl_a[e] = 1./float(sub.degree(e[0])) * np.sum([cbl[i] for i in ed_ne])
                rwr_a[e] = 1./float(sub.degree(e[0])) * np.sum([rwr[i] for i in ed_ne])
                mfi_a[e] = 1./float(sub.degree(e[0])) * np.sum([mfi[i] for i in ed_ne])
            else:
                cn_a[e] = 0
                sai_a[e] = 0
                ji_a[e] = 0
                soi_a[e] = 0
                hpi_a[e] = 0
                hdi_a[e] = 0
                lhn1_a[e] = 0
                pai_a[e] = 0
                aai_a[e] = 0
                rai_a[e] = 0
                kzi_a[e] = 0
                lhn2_a[e] = 0
                act_a[e] = 0
                cbl_a[e] = 0
                rwr_a[e] = 0
                mfi_a[e] = 0
            if sub.degree(e[1]) != 0:
                ed_ne = []
                for j in sub.neighbors(e[1]):
                    this_edge = ()
                    if j > e[0]:
                        this_edge = (e[0], j)
                    elif j < e[0]:
                        this_edge = (j, e[0])
                    if this_edge in edges:
                        ed_ne.append(this_edge)
                cn_b[e] = 1./float(sub.degree(e[1])) * np.sum([cn[i] for i in ed_ne])
                sai_b[e] = 1./float(sub.degree(e[1])) * np.sum([sai[i] for i in ed_ne])
                ji_b[e] = 1./float(sub.degree(e[1])) * np.sum([ji[i] for i in ed_ne])
                soi_b[e] = 1./float(sub.degree(e[1])) * np.sum([soi[i] for i in ed_ne])
                hpi_b[e] = 1./float(sub.degree(e[1])) * np.sum([hpi[i] for i in ed_ne])
                hdi_b[e] = 1./float(sub.degree(e[1])) * np.sum([hdi[i] for i in ed_ne])
                lhn1_b[e] = 1./float(sub.degree(e[1])) * np.sum([lhn1[i] for i in ed_ne])
                pai_b[e] = 1./float(sub.degree(e[1])) * np.sum([pai[i] for i in ed_ne])
                aai_b[e] = 1./float(sub.degree(e[1])) * np.sum([aai[i] for i in ed_ne])
                rai_b[e] = 1./float(sub.degree(e[1])) * np.sum([rai[i] for i in ed_ne])
                kzi_b[e] = 1./float(sub.degree(e[1])) * np.sum([kzi[i] for i in ed_ne])
                lhn2_b[e] = 1./float(sub.degree(e[1])) * np.sum([lhn2[i] for i in ed_ne])
                act_b[e] = 1./float(sub.degree(e[1])) * np.sum([act[i] for i in ed_ne])
                cbl_b[e] = 1./float(sub.degree(e[1])) * np.sum([cbl[i] for i in ed_ne])
                rwr_b[e] = 1./float(sub.degree(e[1])) * np.sum([rwr[i] for i in ed_ne])
                mfi_b[e] = 1./float(sub.degree(e[1])) * np.sum([mfi[i] for i in ed_ne])
            else:
                cn_b[e] = 0
                sai_b[e] = 0
                ji_b[e] = 0
                soi_b[e] = 0
                hpi_b[e] = 0
                hdi_b[e] = 0
                lhn1_b[e] = 0
                pai_b[e] = 0
                aai_b[e] = 0
                rai_b[e] = 0
                kzi_b[e] = 0
                lhn2_b[e] = 0
                act_b[e] = 0
                cbl_b[e] = 0
                rwr_b[e] = 0
                mfi_b[e] = 0
        df['Edges'] = edges
        df['y'] = df['Edges'].map(y.get)
        df['CN'] = df['Edges'].map(cn.get)
        df['SaI'] = df['Edges'].map(sai.get)
        df['JI'] = df['Edges'].map(ji.get)
        df['SoI'] = df['Edges'].map(soi.get)
        df['HPI'] = df['Edges'].map(hpi.get)
        df['HDI'] = df['Edges'].map(hdi.get)
        df['LHN1'] = df['Edges'].map(lhn1.get)
        df['PAI'] = df['Edges'].map(pai.get)
        df['AAI'] = df['Edges'].map(aai.get)
        df['RAI'] = df['Edges'].map(rai.get)
        df['KzI'] = df['Edges'].map(kzi.get)
        df['LHN2'] = df['Edges'].map(lhn2.get)
        df['ACT'] = df['Edges'].map(act.get)
        df['CBL'] = df['Edges'].map(cbl.get)
        df['RWR'] = df['Edges'].map(rwr.get)
        df['MFI'] = df['Edges'].map(mfi.get)
        df['CN_A'] = df['Edges'].map(cn_a.get)
        df['SaI_A'] = df['Edges'].map(sai_a.get)
        df['JI_A'] = df['Edges'].map(ji_a.get)
        df['SoI_A'] = df['Edges'].map(soi_a.get)
        df['HPI_A'] = df['Edges'].map(hpi_a.get)
        df['HDI_A'] = df['Edges'].map(hdi_a.get)
        df['LHN1_A'] = df['Edges'].map(lhn1_a.get)
        df['PAI_A'] = df['Edges'].map(pai_a.get)
        df['AAI_A'] = df['Edges'].map(aai_a.get)
        df['RAI_A'] = df['Edges'].map(rai_a.get)
        df['KzI_A'] = df['Edges'].map(kzi_a.get)
        df['LHN2_A'] = df['Edges'].map(lhn2_a.get)
        df['ACT_A'] = df['Edges'].map(act_a.get)
        df['CBL_A'] = df['Edges'].map(cbl_a.get)
        df['RWR_A'] = df['Edges'].map(rwr_a.get)
        df['MFI_A'] = df['Edges'].map(mfi_a.get)
        df['CN_B'] = df['Edges'].map(cn_b.get)
        df['SaI_B'] = df['Edges'].map(sai_b.get)
        df['JI_B'] = df['Edges'].map(ji_b.get)
        df['SoI_B'] = df['Edges'].map(soi_b.get)
        df['HPI_B'] = df['Edges'].map(hpi_b.get)
        df['HDI_B'] = df['Edges'].map(hdi_b.get)
        df['LHN1_B'] = df['Edges'].map(lhn1_b.get)
        df['PAI_B'] = df['Edges'].map(pai_b.get)
        df['AAI_B'] = df['Edges'].map(aai_b.get)
        df['RAI_B'] = df['Edges'].map(rai_b.get)
        df['KzI_B'] = df['Edges'].map(kzi_b.get)
        df['LHN2_B'] = df['Edges'].map(lhn2_b.get)
        df['ACT_B'] = df['Edges'].map(act_b.get)
        df['CBL_B'] = df['Edges'].map(cbl_b.get)
        df['RWR_B'] = df['Edges'].map(rwr_b.get)
        df['MFI_B'] = df['Edges'].map(mfi_b.get)
        df = df.fillna(0)
        frames.append(df)
    return frames, folds

def run_indices_on_compl(sub, fold):
    df = pd.DataFrame()
    es = nx.non_edges(sub)
    edges = []
    y = {}
    cn = common_neighbours(sub)
    sai = salton_index(sub)
    ji = jaccard_index(sub)
    soi = sorensen_index(sub)
    hpi = hub_promoted_index(sub)
    hdi = hub_depressed_index(sub)
    lhn1 = LHN1_index(sub)
    pai = preferential_attachment_index(sub)
    aai = adamic_adar_index(sub)
    rai = resource_allocation_index(sub)
    kzi = katz_index(sub)
    lhn2 = lhn2_index(sub)
    act = act_index(sub)
    cbl = cbl_index(sub)
    rwr = rwr_index(sub)
    mfi = matrix_forest_index(sub)
    cn_a = {}
    sai_a = {}
    ji_a = {}
    soi_a = {}
    hpi_a = {}
    hdi_a = {}
    lhn1_a = {}
    pai_a = {}
    aai_a = {}
    rai_a = {}
    kzi_a = {}
    lhn2_a = {}
    act_a = {}
    cbl_a = {}
    rwr_a = {}
    mfi_a = {}
    cn_b = {}
    sai_b = {}
    ji_b = {}
    soi_b = {}
    hpi_b = {}
    hdi_b = {}
    lhn1_b = {}
    pai_b = {}
    aai_b = {}
    rai_b = {}
    kzi_b = {}
    lhn2_b = {}
    act_b = {}
    cbl_b = {}
    rwr_b = {}
    mfi_b = {}
    for e in es:
        edges.append(e)
        if e in fold:
            y[e] = 1
        else:
            y[e] = 0
    for e in edges:
        if sub.degree(e[0]) != 0:
            ed_ne = []
            for j in sub.neighbors(e[0]):
                this_edge = ()
                if j > e[1]:
                    this_edge = (e[1], j)
                elif j < e[1]:
                    this_edge = (j, e[1])
                if this_edge in edges:
                    ed_ne.append(this_edge)
            cn_a[e] = 1./float(sub.degree(e[0])) * np.sum([cn[i] for i in ed_ne])
            sai_a[e] = 1./float(sub.degree(e[0])) * np.sum([sai[i] for i in ed_ne])
            ji_a[e] = 1./float(sub.degree(e[0])) * np.sum([ji[i] for i in ed_ne])
            soi_a[e] = 1./float(sub.degree(e[0])) * np.sum([soi[i] for i in ed_ne])
            hpi_a[e] = 1./float(sub.degree(e[0])) * np.sum([hpi[i] for i in ed_ne])
            hdi_a[e] = 1./float(sub.degree(e[0])) * np.sum([hdi[i] for i in ed_ne])
            lhn1_a[e] = 1./float(sub.degree(e[0])) * np.sum([lhn1[i] for i in ed_ne])
            pai_a[e] = 1./float(sub.degree(e[0])) * np.sum([pai[i] for i in ed_ne])
            aai_a[e] = 1./float(sub.degree(e[0])) * np.sum([aai[i] for i in ed_ne])
            rai_a[e] = 1./float(sub.degree(e[0])) * np.sum([rai[i] for i in ed_ne])
            kzi_a[e] = 1./float(sub.degree(e[0])) * np.sum([kzi[i] for i in ed_ne])
            lhn2_a[e] = 1./float(sub.degree(e[0])) * np.sum([lhn2[i] for i in ed_ne])
            act_a[e] = 1./float(sub.degree(e[0])) * np.sum([act[i] for i in ed_ne])
            cbl_a[e] = 1./float(sub.degree(e[0])) * np.sum([cbl[i] for i in ed_ne])
            rwr_a[e] = 1./float(sub.degree(e[0])) * np.sum([rwr[i] for i in ed_ne])
            mfi_a[e] = 1./float(sub.degree(e[0])) * np.sum([mfi[i] for i in ed_ne])
        else:
            cn_a[e] = 0
            sai_a[e] = 0
            ji_a[e] = 0
            soi_a[e] = 0
            hpi_a[e] = 0
            hdi_a[e] = 0
            lhn1_a[e] = 0
            pai_a[e] = 0
            aai_a[e] = 0
            rai_a[e] = 0
            kzi_a[e] = 0
            lhn2_a[e] = 0
            act_a[e] = 0
            cbl_a[e] = 0
            rwr_a[e] = 0
            mfi_a[e] = 0
        if sub.degree(e[1]) != 0:
            ed_ne = []
            for j in sub.neighbors(e[1]):
                this_edge = ()
                if j > e[0]:
                    this_edge = (e[0], j)
                elif j < e[0]:
                    this_edge = (j, e[0])
                if this_edge in edges:
                    ed_ne.append(this_edge)
            cn_b[e] = 1./float(sub.degree(e[1])) * np.sum([cn[i] for i in ed_ne])
            sai_b[e] = 1./float(sub.degree(e[1])) * np.sum([sai[i] for i in ed_ne])
            ji_b[e] = 1./float(sub.degree(e[1])) * np.sum([ji[i] for i in ed_ne])
            soi_b[e] = 1./float(sub.degree(e[1])) * np.sum([soi[i] for i in ed_ne])
            hpi_b[e] = 1./float(sub.degree(e[1])) * np.sum([hpi[i] for i in ed_ne])
            hdi_b[e] = 1./float(sub.degree(e[1])) * np.sum([hdi[i] for i in ed_ne])
            lhn1_b[e] = 1./float(sub.degree(e[1])) * np.sum([lhn1[i] for i in ed_ne])
            pai_b[e] = 1./float(sub.degree(e[1])) * np.sum([pai[i] for i in ed_ne])
            aai_b[e] = 1./float(sub.degree(e[1])) * np.sum([aai[i] for i in ed_ne])
            rai_b[e] = 1./float(sub.degree(e[1])) * np.sum([rai[i] for i in ed_ne])
            kzi_b[e] = 1./float(sub.degree(e[1])) * np.sum([kzi[i] for i in ed_ne])
            lhn2_b[e] = 1./float(sub.degree(e[1])) * np.sum([lhn2[i] for i in ed_ne])
            act_b[e] = 1./float(sub.degree(e[1])) * np.sum([act[i] for i in ed_ne])
            cbl_b[e] = 1./float(sub.degree(e[1])) * np.sum([cbl[i] for i in ed_ne])
            rwr_b[e] = 1./float(sub.degree(e[1])) * np.sum([rwr[i] for i in ed_ne])
            mfi_b[e] = 1./float(sub.degree(e[1])) * np.sum([mfi[i] for i in ed_ne])
        else:
            cn_b[e] = 0
            sai_b[e] = 0
            ji_b[e] = 0
            soi_b[e] = 0
            hpi_b[e] = 0
            hdi_b[e] = 0
            lhn1_b[e] = 0
            pai_b[e] = 0
            aai_b[e] = 0
            rai_b[e] = 0
            kzi_b[e] = 0
            lhn2_b[e] = 0
            act_b[e] = 0
            cbl_b[e] = 0
            rwr_b[e] = 0
            mfi_b[e] = 0
    df['Edges'] = edges
    df['y'] = df['Edges'].map(y.get)
    df['CN'] = df['Edges'].map(cn.get)
    df['SaI'] = df['Edges'].map(sai.get)
    df['JI'] = df['Edges'].map(ji.get)
    df['SoI'] = df['Edges'].map(soi.get)
    df['HPI'] = df['Edges'].map(hpi.get)
    df['HDI'] = df['Edges'].map(hdi.get)
    df['LHN1'] = df['Edges'].map(lhn1.get)
    df['PAI'] = df['Edges'].map(pai.get)
    df['AAI'] = df['Edges'].map(aai.get)
    df['RAI'] = df['Edges'].map(rai.get)
    df['KzI'] = df['Edges'].map(kzi.get)
    df['LHN2'] = df['Edges'].map(lhn2.get)
    df['ACT'] = df['Edges'].map(act.get)
    df['CBL'] = df['Edges'].map(cbl.get)
    df['RWR'] = df['Edges'].map(rwr.get)
    df['MFI'] = df['Edges'].map(mfi.get)
    df['CN_A'] = df['Edges'].map(cn_a.get)
    df['SaI_A'] = df['Edges'].map(sai_a.get)
    df['JI_A'] = df['Edges'].map(ji_a.get)
    df['SoI_A'] = df['Edges'].map(soi_a.get)
    df['HPI_A'] = df['Edges'].map(hpi_a.get)
    df['HDI_A'] = df['Edges'].map(hdi_a.get)
    df['LHN1_A'] = df['Edges'].map(lhn1_a.get)
    df['PAI_A'] = df['Edges'].map(pai_a.get)
    df['AAI_A'] = df['Edges'].map(aai_a.get)
    df['RAI_A'] = df['Edges'].map(rai_a.get)
    df['KzI_A'] = df['Edges'].map(kzi_a.get)
    df['LHN2_A'] = df['Edges'].map(lhn2_a.get)
    df['ACT_A'] = df['Edges'].map(act_a.get)
    df['CBL_A'] = df['Edges'].map(cbl_a.get)
    df['RWR_A'] = df['Edges'].map(rwr_a.get)
    df['MFI_A'] = df['Edges'].map(mfi_a.get)
    df['CN_B'] = df['Edges'].map(cn_b.get)
    df['SaI_B'] = df['Edges'].map(sai_b.get)
    df['JI_B'] = df['Edges'].map(ji_b.get)
    df['SoI_B'] = df['Edges'].map(soi_b.get)
    df['HPI_B'] = df['Edges'].map(hpi_b.get)
    df['HDI_B'] = df['Edges'].map(hdi_b.get)
    df['LHN1_B'] = df['Edges'].map(lhn1_b.get)
    df['PAI_B'] = df['Edges'].map(pai_b.get)
    df['AAI_B'] = df['Edges'].map(aai_b.get)
    df['RAI_B'] = df['Edges'].map(rai_b.get)
    df['KzI_B'] = df['Edges'].map(kzi_b.get)
    df['LHN2_B'] = df['Edges'].map(lhn2_b.get)
    df['ACT_B'] = df['Edges'].map(act_b.get)
    df['CBL_B'] = df['Edges'].map(cbl_b.get)
    df['RWR_B'] = df['Edges'].map(rwr_b.get)
    df['MFI_B'] = df['Edges'].map(mfi_b.get)
    df = df.fillna(0)
    return df

def get_folds_test(G, alg = 'rf', nfolds = 10):
    f1s = []
    precisions = []
    recalls = []
    rocaucs = []
    accuracies = []
    avg_prcs = []
    random.seed(0)
    frames, folds = run_10_fold_on_G(G)
    edges = []
    for e in nx.non_edges(G):
        edges.append(e)
    zeros = [i for i in chunk(edges, len(edges)/len(folds[0]))]
    zeros = zeros[:10]
    #for j in xrange(nfolds):
    #    test_dataset = frames[j]
    #    train_dataset = pd.concat(frames[:j] + frames[(j+1):])
    #    y_train = np.array(train_dataset['y'])
    #    X_train = np.nan_to_num(
    #        np.array(train_dataset[list(set(train_dataset.columns) - set(['y', 'Edges']))]).astype('float32'))
    #    X_train = preprocessing.normalize(X_train, norm = 'l2')
    #    rf = ensemble.RandomForestClassifier(class_weight = 'auto')
    #    rf.fit(X_train, y_train)
    #    X_test = np.nan_to_num(
    #        np.array(test_dataset[list(set(test_dataset.columns) - set(['y', 'Edges']))]).astype('float32'))
    #    X_test = preprocessing.normalize(X_test, norm = 'l2')
    #    y_test = np.array(test_dataset['y'])
    #    print 'cr on test for fold', j+1
    #    y_pred = rf.predict(X_test)
    #    print classification_report(y_test, y_pred)
    #    f1s.append(metrics.f1_score(y_test, y_pred))
    #    precisions.append(metrics.precision_score(y_test, y_pred))
    #    recalls.append(metrics.recall_score(y_test, y_pred))
    #    rocaucs.append(metrics.roc_auc_score(y_test, y_pred))
    #    accuracies.append(metrics.accuracy_score(y_test, y_pred))
    #    avg_prcs.append(metrics.average_precision_score(y_test, y_pred))
    train_dataset = pd.concat([(i[i['y'] == 1]) for i in frames])
    zero_set = pd.DataFrame()
    for i in xrange(len(zeros)):
        #print zeros[i]
        #zed = []
        #for e in zeros[i]:
        #    zed.append(frames[i][frames[i]['Edges'] == e])
        #z = pd.concat(zed)
        z = frames[i][[(j in zeros[i]) for j in frames[i]['Edges']]]
        zero_set = pd.concat([zero_set, z])
    #print zero_set
    #print train_dataset
    train_dataset = pd.concat([train_dataset, zero_set])
    y = np.array(train_dataset['y'])
    cv = cross_validation.StratifiedKFold(y, 10)
    X = np.nan_to_num(
        np.array(train_dataset[list(set(train_dataset.columns) - set(['y', 'Edges']))]).astype('float32'))
    #X = preprocessing.normalize(X, norm = 'l2')
    scores = []
    rf = ensemble.RandomForestClassifier(class_weight = 'auto')
    scores = cross_validation.cross_val_score(rf, X, y = y, cv = cv, n_jobs = -1)
    print 'Random Forest:'
    print "Accuracy: {:.3f} (+/- {:.3f})\n".format(scores.mean(), scores.std() * 2)
    #rf = svm.SVC(kernel = 'poly', class_weight = 'auto')
    #scores = cross_validation.cross_val_score(rf, X, y = y, cv = cv, n_jobs = -1)
    #print 'SVM with plynomial kernel:'
    #print "Accuracy: {:.3f} (+/- {:.3f})\n".format(scores.mean(), scores.std() * 2)
    #rf = svm.SVC(class_weight = 'auto')
    #scores = cross_validation.cross_val_score(rf, X, y = y, cv = cv, n_jobs = -1)
    #print 'SVM:'
    #print "Accuracy: {:.3f} (+/- {:.3f})\n".format(scores.mean(), scores.std() * 2)
    rf = neighbors.KNeighborsClassifier()
    scores = cross_validation.cross_val_score(rf, X, y = y, cv = cv, n_jobs = -1)
    print 'KNN:'
    print "Accuracy: {:.3f} (+/- {:.3f})\n".format(scores.mean(), scores.std() * 2)
    rf = tree.DecisionTreeClassifier(class_weight = 'auto')
    scores = cross_validation.cross_val_score(rf, X, y = y, cv = cv, n_jobs = -1)
    print 'Decision tree:'
    print "Accuracy: {:.3f} (+/- {:.3f})\n".format(scores.mean(), scores.std() * 2)
    #print 'Scores:', scores
    #print "Accuracy: {:.3f} (+/- {:.3f})\n".format(scores.mean(), scores.std() * 2)
    #print 'mean infos:'
    #print 'mean F1:', np.mean(f1s)
    #print 'mean precision:', np.mean(precisions)
    #print 'mean recall:', np.mean(recalls)
    #print 'mean roc_auc:', np.mean(rocaucs)
    #print 'mean accuracy:', np.mean(accuracies)
    #print 'mean avg_precision:', np.mean(avg_prcs)

In [10]:
def run_folds_alg(G, nfolds = 10):
    f1s = []
    precisions = []
    recalls = []
    rocaucs = []
    accuracies = []
    avg_prcs = []
    random.seed(0)
    fs = [i for i in chunk(G.edges(), nfolds)]
    for i in xrange(nfolds):
        sub = G.copy()
        for e in fs[i]:
            sub.remove_edge(*e)
        frames, folds = run_10_fold_on_G(sub)
        edges = []
        for e in nx.non_edges(sub):
            edges.append(e)
        zeros = [k for k in chunk(edges, len(edges)/len(folds[0]))]
        zeros = zeros[:10]
        train_dataset = pd.concat([(k[k['y'] == 1]) for k in frames])
        zero_set = pd.DataFrame()
        for k in xrange(len(zeros)):
            z = frames[k][[(j in zeros[k]) for j in frames[k]['Edges']]]
            zero_set = pd.concat([zero_set, z])
        train_dataset = pd.concat([train_dataset, zero_set])
        y = np.array(train_dataset['y'])
        cv = cross_validation.StratifiedKFold(y, 10)
        X = np.nan_to_num(
            np.array(train_dataset[list(set(train_dataset.columns) - set(['y', 'Edges']))]).astype('float32'))
        rf = ensemble.RandomForestClassifier(class_weight = 'auto')
        rf.fit(X, y)
        df = run_indices_on_compl(sub, fs[i])
        X_test = np.nan_to_num(np.array(df[list(set(df.columns) - set(['y', 'Edges']))]).astype('float32'))
        y_test = np.array(df['y'])
        y_pred = rf.predict(X_test)
        #print y_test
        #print y_pred
        f1s.append(metrics.f1_score(y_test, y_pred))
        precisions.append(metrics.precision_score(y_test, y_pred))
        recalls.append(metrics.recall_score(y_test, y_pred))
        rocaucs.append(metrics.roc_auc_score(y_test, y_pred))
        accuracies.append(metrics.accuracy_score(y_test, y_pred))
        avg_prcs.append(metrics.average_precision_score(y_test, y_pred))
        #print 'Accuracy:', metrics.accuracy_score(y_test, rf.predict(X_test))
        #scores = cross_validation.cross_val_score(rf, X, y = y, cv = cv, n_jobs = -1)
    print 'infos:'
    print 'F1: {:.3f} (+/- {:.3f})'.format(np.mean(f1s), np.std(f1s))
    print 'Precision: {:.3f} (+/- {:.3f})'.format(np.mean(precisions), np.std(precisions))
    print 'Recall: {:.3f} (+/- {:.3f})'.format(np.mean(recalls), np.std(recalls))
    print 'ROC_AUC: {:.3f} (+/- {:.3f})'.format(np.mean(rocaucs), np.std(rocaucs))
    print 'Accuracy: {:.3f} (+/- {:.3f})'.format(np.mean(accuracies), np.std(accuracies))
    print 'Avg_precision: {:.3f} (+/- {:.3f})'.format(np.mean(avg_prcs), np.std(avg_prcs))

In [11]:
run_folds_alg(nx.karate_club_graph())



infos:
F1: 0.090 (+/- 0.020)
Precision: 0.048 (+/- 0.011)
Recall: 0.691 (+/- 0.153)
ROC_AUC: 0.732 (+/- 0.070)
Accuracy: 0.771 (+/- 0.057)
Avg_precision: 0.372 (+/- 0.078)


In [12]:
newman_adjnoun = nx.read_gml('./netws/newman/adjnoun/adjnoun.gml')
run_folds_alg(newman_adjnoun)



infos:
F1: 0.027 (+/- 0.003)
Precision: 0.014 (+/- 0.002)
Recall: 0.583 (+/- 0.064)
ROC_AUC: 0.641 (+/- 0.031)
Accuracy: 0.698 (+/- 0.021)
Avg_precision: 0.300 (+/- 0.032)


In [None]:
pajek_us_air = nx.read_pajek('./netws/pajekds/USAir97.net')
fixed_pajek_us_air = nx.Graph(nx.convert_node_labels_to_integers(pajek_us_air, first_label = 0))
run_folds_alg(fixed_pajek_us_air)