Let's try what Dima's proposing, but with local idices:

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import random
import operator
import pandas as pd
import scipy.io as sci
%matplotlib inline
def chunk(xs, n):
    ys = list(xs)
    random.shuffle(ys)
    size = len(ys) // n
    leftovers= ys[size*n:]
    for c in xrange(n):
        if leftovers:
           extra= [ leftovers.pop() ] 
        else:
           extra= []
        yield ys[c*size:(c+1)*size] + extra

In [26]:
def common_neighbours(sub, e):
    return float(len(sorted(nx.common_neighbors(sub, e[0], e[1]))))

def salton_index(sub, e):
    j = nx.degree(sub, e[0])
    k = nx.degree(sub, e[1])
    if j != 0 and k != 0:
        return float(len(sorted(nx.common_neighbors(sub, e[0], e[1]))))/float(np.sqrt(j * k))
    else:
        return 0

def jaccard_index(sub, e):
    j = nx.degree(sub, e[0])
    k = nx.degree(sub, e[1])
    if j != 0 or k != 0:
        return float(len(sorted(nx.common_neighbors(sub, e[0], e[1]))))/float(len(set(sub[e[0]])|set(sub[e[1]])))
    else:
        return 0

def sorensen_index(sub, e):
    j = nx.degree(sub, e[0])
    k = nx.degree(sub, e[1])
    if j != 0 or k != 0:
        return float(len(sorted(nx.common_neighbors(sub, e[0], e[1]))))/float(j + k)
    else:
        return 0

def hub_promoted_index(sub, e):
    j = nx.degree(sub, e[0])
    k = nx.degree(sub, e[1])
    if j != 0 and k != 0:
        return float(len(sorted(nx.common_neighbors(sub, e[0], e[1]))))/float(min(j, k))
    else:
        return 0

def hub_depressed_index(sub, e):
    j = nx.degree(sub, e[0])
    k = nx.degree(sub, e[1])
    if j != 0 or k != 0:
        return float(len(sorted(nx.common_neighbors(sub, e[0], e[1]))))/float(max(j, k))
    else:
        return 0

def LHN1_index(sub, e):
    j = nx.degree(sub, e[0])
    k = nx.degree(sub, e[1])
    if j != 0 and k != 0:
        return float(len(sorted(nx.common_neighbors(sub, e[0], e[1]))))/float(j * k)
    else:
        return 0

def preferential_attachment_index(sub, e):
    return nx.degree(sub, e[0]) * nx.degree(sub, e[1])

def adamic_adar_index(sub, e):
    return np.sum(1/np.log(nx.degree(sub, sorted(nx.common_neighbors(sub, e[0], e[1]))).values()))

def resource_allocation_index(sub, e):
    return np.sum(1/np.array(
                nx.degree(sub, sorted(nx.common_neighbors(sub, e[0], e[1]))).values()).astype(float))

def predict_nodes(G, nfolds = 10):
    df = pd.DataFrame()
    es = nx.non_edges(G)
    cn = {}
    sai = {}
    ji = {}
    soi = {}
    hpi = {}
    hdi = {}
    lhn1 = {}
    pai = {}
    aai = {}
    rai = {}
    edges = []
    y = {}
    for e in es:
        edges.append(e)
        y[e] = 0
        cn[e] = common_neighbours(G, e)
        sai[e] = salton_index(G, e)
        ji[e] = jaccard_index(G, e)
        soi[e] = sorensen_index(G, e)
        hpi[e] = hub_promoted_index(G, e)
        hdi[e] = hub_depressed_index(G, e)
        lhn1[e] = LHN1_index(G, e)
        pai[e] = preferential_attachment_index(G, e)
        aai[e] = adamic_adar_index(G, e)
        rai[e] = resource_allocation_index(G, e)
    eds = G.edges()
    for e in eds:
        s = G.copy()
        s.remove_edge(*e)
        edges.append(e)
        y[e] = 1
        cn[e] = common_neighbours(s, e)
        sai[e] = salton_index(s, e)
        ji[e] = jaccard_index(s, e)
        soi[e] = sorensen_index(s, e)
        hpi[e] = hub_promoted_index(s, e)
        hdi[e] = hub_depressed_index(s, e)
        lhn1[e] = LHN1_index(s, e)
        pai[e] = preferential_attachment_index(s, e)
        aai[e] = adamic_adar_index(s, e)
        rai[e] = resource_allocation_index(s, e)
        s = 0
    df['Edges'] = edges
    df['y'] = df['Edges'].map(y.get)
    df['CN'] = df['Edges'].map(cn.get)
    df['SaI'] = df['Edges'].map(sai.get)
    df['JI'] = df['Edges'].map(ji.get)
    df['SoI'] = df['Edges'].map(soi.get)
    df['HPI'] = df['Edges'].map(hpi.get)
    df['HDI'] = df['Edges'].map(hdi.get)
    df['LHN1'] = df['Edges'].map(lhn1.get)
    df['PAI'] = df['Edges'].map(pai.get)
    df['AAI'] = df['Edges'].map(aai.get)
    df['RAI'] = df['Edges'].map(rai.get)
    df.fillna(0)
    return df

In [27]:
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import ensemble
from sklearn import svm
from sklearn import neighbors
from sklearn import tree
from sklearn import lda
from sklearn import qda
from sklearn import preprocessing
from sklearn import metrics
from sklearn import cross_validation

In [37]:
def run_indices_on_compl(sub, fold):
    df = pd.DataFrame()
    es = nx.non_edges(sub)
    cn = {}
    sai = {}
    ji = {}
    soi = {}
    hpi = {}
    hdi = {}
    lhn1 = {}
    pai = {}
    aai = {}
    rai = {}
    edges = []
    y = {}
    for e in es:
        edges.append(e)
        if e in fold:
            y[e] = 1
        else:
            y[e] = 0
        cn[e] = common_neighbours(sub, e)
        sai[e] = salton_index(sub, e)
        ji[e] = jaccard_index(sub, e)
        soi[e] = sorensen_index(sub, e)
        hpi[e] = hub_promoted_index(sub, e)
        hdi[e] = hub_depressed_index(sub, e)
        lhn1[e] = LHN1_index(sub, e)
        pai[e] = preferential_attachment_index(sub, e)
        aai[e] = adamic_adar_index(sub, e)
        rai[e] = resource_allocation_index(sub, e)
    df['Edges'] = edges
    df['y'] = df['Edges'].map(y.get)
    df['CN'] = df['Edges'].map(cn.get)
    df['SaI'] = df['Edges'].map(sai.get)
    df['JI'] = df['Edges'].map(ji.get)
    df['SoI'] = df['Edges'].map(soi.get)
    df['HPI'] = df['Edges'].map(hpi.get)
    df['HDI'] = df['Edges'].map(hdi.get)
    df['LHN1'] = df['Edges'].map(lhn1.get)
    df['PAI'] = df['Edges'].map(pai.get)
    df['AAI'] = df['Edges'].map(aai.get)
    df['RAI'] = df['Edges'].map(rai.get)
    df.fillna(0)
    return df

def run_folds_alg(G, nfolds = 10):
    f1s = []
    precisions = []
    recalls = []
    rocaucs = []
    accuracies = []
    avg_prcs = []
    random.seed(0)
    fs = [i for i in chunk(G.edges(), nfolds)]
    for i in xrange(nfolds):
        sub = G.copy()
        for e in fs[i]:
            sub.remove_edge(*e)
        train_dataset = predict_nodes(sub)
        y = np.array(train_dataset['y'])
        cv = cross_validation.StratifiedKFold(y, 10)
        X = np.nan_to_num(
            np.array(train_dataset[list(set(train_dataset.columns) - set(['y', 'Edges']))]).astype('float32'))
        rf = ensemble.RandomForestClassifier(class_weight = 'auto')
        rf.fit(X, y)
        df = run_indices_on_compl(sub, fs[i])
        X_test = np.nan_to_num(np.array(df[list(set(df.columns) - set(['y', 'Edges']))]).astype('float32'))
        y_test = np.array(df['y'])
        y_pred = rf.predict(X_test)
        #print y_test
        #print y_pred
        f1s.append(metrics.f1_score(y_test, y_pred))
        precisions.append(metrics.precision_score(y_test, y_pred))
        recalls.append(metrics.recall_score(y_test, y_pred))
        rocaucs.append(metrics.roc_auc_score(y_test, y_pred))
        accuracies.append(metrics.accuracy_score(y_test, y_pred))
        avg_prcs.append(metrics.average_precision_score(y_test, y_pred))
        #print 'Accuracy:', metrics.accuracy_score(y_test, rf.predict(X_test))
        #scores = cross_validation.cross_val_score(rf, X, y = y, cv = cv, n_jobs = -1)
    print 'infos:'
    print 'F1: {:.3f} (+/- {:.3f})'.format(np.mean(f1s), np.std(f1s))
    print 'Precision: {:.3f} (+/- {:.3f})'.format(np.mean(precisions), np.std(precisions))
    print 'Recall: {:.3f} (+/- {:.3f})'.format(np.mean(recalls), np.std(recalls))
    print 'ROC_AUC: {:.3f} (+/- {:.3f})'.format(np.mean(rocaucs), np.std(rocaucs))
    print 'Accuracy: {:.3f} (+/- {:.3f})'.format(np.mean(accuracies), np.std(accuracies))
    print 'Avg_precision: {:.3f} (+/- {:.3f})'.format(np.mean(avg_prcs), np.std(avg_prcs))

In [38]:
run_folds_alg(nx.karate_club_graph())

infos:
F1: 0.028 (+/- 0.048)
Precision: 0.020 (+/- 0.034)
Recall: 0.050 (+/- 0.083)
ROC_AUC: 0.506 (+/- 0.042)
Accuracy: 0.947 (+/- 0.023)
Avg_precision: 0.042 (+/- 0.057)


In [39]:
newman_adjnoun = nx.read_gml('./netws/newman/adjnoun/adjnoun.gml')
run_folds_alg(newman_adjnoun)

infos:
F1: 0.018 (+/- 0.009)
Precision: 0.010 (+/- 0.005)
Recall: 0.089 (+/- 0.045)
ROC_AUC: 0.512 (+/- 0.022)
Accuracy: 0.928 (+/- 0.012)
Avg_precision: 0.053 (+/- 0.025)


In [40]:
pajek_us_air = nx.read_pajek('./netws/pajekds/USAir97.net')
fixed_pajek_us_air = nx.Graph(nx.convert_node_labels_to_integers(pajek_us_air, first_label = 0))
run_folds_alg(fixed_pajek_us_air)

infos:
F1: 0.028 (+/- 0.015)
Precision: 0.035 (+/- 0.028)
Recall: 0.032 (+/- 0.009)
ROC_AUC: 0.511 (+/- 0.005)
Accuracy: 0.987 (+/- 0.011)
Avg_precision: 0.036 (+/- 0.015)
