# Link Prediction in the Greek Web



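First we extract the text of every host from its zip archive under `dataset/hosts`, then remove punctuation, stopwords and numeric tokens, and detone and stem the remaining words.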

In [1]:
import stemmer as st
def process_word(word):
    """Strip Greek tone marks from *word* and return its stem.

    Only upper-case accented vowels are mapped to their plain upper-case
    form, so the text is expected to be upper-cased already. Every other
    character is passed through unchanged.

    word -- the (upper-cased) token to normalise
    Returns the result of ``st.stem`` on the detoned token.
    """
    # Written as escapes on purpose: the table used to map accented alpha to
    # the look-alike LATIN 'A', which split one stem into two vocabulary
    # entries (e.g. a Latin-A and a Greek-alpha spelling of the same stem).
    # Text cached in cache/processed_text.pickle before this fix still
    # contains the Latin 'A' and has to be regenerated.
    tones = {
        u'\u0386': u'\u0391',  # alpha with tonos      -> alpha
        u'\u0388': u'\u0395',  # epsilon with tonos    -> epsilon
        u'\u038a': u'\u0399',  # iota with tonos       -> iota
        u'\u03aa': u'\u0399',  # iota with dialytika   -> iota
        u'\u038e': u'\u03a5',  # upsilon with tonos    -> upsilon
        u'\u03ab': u'\u03a5',  # upsilon with dialytika -> upsilon
        u'\u038c': u'\u039f',  # omicron with tonos    -> omicron
        u'\u0389': u'\u0397',  # eta with tonos        -> eta
        u'\u038f': u'\u03a9',  # omega with tonos      -> omega
    }
    detoned = ''.join(tones.get(letter, letter) for letter in word)
    return st.stem(detoned)
    

In [2]:
import string
def process_text(data):
    """Clean every document in *data* in place and return the same dict.

    For each value of *data* (one text per domain) the function removes
    ASCII punctuation, drops stopwords listed in
    ``dataset/greekstopwords.txt`` (compared upper-cased), drops tokens
    matched by the numeric-token pattern, and replaces each remaining token
    with ``process_word(token)``. Tokens are re-joined with single spaces.

    data -- dict mapping a domain to its (upper-cased) text
    Returns *data* itself, with every value replaced by the cleaned text.
    """
    # Imported locally: the notebook only imports ``re`` in a later cell, so
    # calling this function on a fresh kernel used to raise NameError.
    import re

    with open('dataset/greekstopwords.txt', 'r') as fp:
        # A set makes the per-token stopword test O(1) instead of a list scan.
        stopwords = set(line.strip().decode('utf-8').upper() for line in fp)

    # Loop invariants, built once rather than once per document.
    punctuation = set(string.punctuation)
    # NOTE(review): the leading '$' of the first alternative looks like it was
    # meant to be '^'; the pattern is kept unchanged to preserve behaviour.
    numeric_token = re.compile(r"$\d+\W+|\b\d+\b|\W+\d+$")

    for domain in list(data):
        text = data[domain]
        # remove punctuation character by character
        doc = ''.join(ch for ch in text if ch not in punctuation)
        # remove stopwords and numeric tokens
        tokens = [w for w in doc.split()
                  if w not in stopwords and not numeric_token.match(w)]
        data[domain] = ' '.join(process_word(w) for w in tokens)
    return data


In [3]:
import os
import zipfile
import nltk
import pickle

# Reuse the cached, already processed corpus when it exists; otherwise build
# it from the per-host zip archives and cache the result for the next run.
if os.path.isfile('cache/processed_text.pickle'):
    with open('cache/processed_text.pickle', 'rb') as pfile:
        # NOTE: unpickling can execute arbitrary code; only load a cache file
        # that this notebook produced itself.
        text_data = pickle.load(pfile)
    print("loaded from pickle")
else:
    filenames = os.listdir('dataset/hosts')
    raw_text = {}
    for zipfilename in filenames:
        with zipfile.ZipFile('dataset/hosts/'+zipfilename) as z:
            # Collect the pieces and join once instead of growing a string
            # with += for every line of every file.
            chunks = []
            for filename in z.namelist():
                # Directory entries inside an archive are the names ending in
                # '/'. The old os.path.isdir() test looked at the local
                # filesystem, not at the archive.
                if filename.endswith('/'):
                    continue
                with z.open(filename) as f:
                    for line in f:
                        chunks.append(line.decode("utf-8").upper())
                        chunks.append(u" ")
        # Key by the archive name without its 4-character extension.
        raw_text[zipfilename[:-4]] = u"".join(chunks)
    text_data = process_text(raw_text)
    with open('cache/processed_text.pickle', 'wb') as pfile:
        pickle.dump(text_data, pfile)

loaded from pickle


In [4]:
# Position of every domain in text_data's key order.
domain_number = {domain: position
                 for position, domain in enumerate(text_data.keys())}

In [5]:
import re
# Preview: the first 20 processed tokens of one site, minus any token the
# numeric-token pattern matches, printed one per line.
doc = text_data['news247.gr'].split()[:20]
doc = [token for token in doc
       if re.match(r"$\d+\W+|\b\d+\b|\W+\d+$", token) is None]
for w in doc:
    print(w)

HTTPNEWS247GREIDISEISPASOK
ΠΕΜΠΤ
ΟΚΤΩΒΡ
ΠΟΛΙΤ
ΚΟΙΝΩΝ
ΟΙΚΟΝΟΜ
ΚΟΣΜ
ΓΝΩΜ
ΠΑΡΑΣΚΗΝΙ
ΠΡΩΤΟΣΕΛΙΔ
ΡΟΗ
ΠΕΡΙΣΣ
ΥΓΕΙ
LIFE
GUIDE
ΤΑΞΙΔ
VIRAL
ΨΥΧΑΓΩΓ
ΦΑΓΗΤ
CELEBRITIES


In [6]:
from __future__ import division
import networkx as nx
import pprint
import random
import numpy as np
from scipy import sparse
from sklearn.model_selection import train_test_split


In [7]:
random.seed(1)

# Read the directed edge list: the first two whitespace-separated fields of
# every line are taken as (source, destination).
edges = []
with open('dataset/edgelist.txt', 'r') as fp:
    for line in fp:
        fields = line.split()
        edges.append((fields[0], fields[1]))
print(len(edges))

# Every domain that appears at either end of an edge.
nodes = {node for edge_pair in edges for node in edge_pair}
print(len(nodes))

2683
2041


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

def compute_text_similarity(data):
    vec = TfidfVectorizer(max_df=0.90, min_df=5, max_features=500,lowercase=False ,
                          analyzer = 'word')
    #vec = TfidfVectorizer(max_df=1, max_features=1000, lowercase=False)
    x = vec.fit_transform(data)
    for word in vec.get_feature_names():
        print word,
    return x
    

In [9]:
# TF-IDF matrix built from text_data.values(); the lookups through
# domain_number rely on values() iterating in the same order as the keys()
# call that numbered the domains.
X_text = compute_text_similarity(text_data.values())
print X_text.shape
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of X_text.
cosine = cosine_similarity(X_text)

ABOUT ALL AND ARE AT ATHENS AΔΕΙ AΛΛ AΝΘΡΩΠ AΡΘΡ AΤΟΜ BLOG BY COMMENT COMMENTS CONTACT COOKIES COPYRIGHT DE DESIGN EMAIL FACEBOOK FM FOR FROM GOOGLE GREECE GREEK HOME HOT IN IS IT JULY LED LIFESTYLE LIKE LIVE MAY MEDIA MORE NEW NEWS NEWSLETTER NO OCTOBER OF ON ONLINE OR OUT POST POSTED POWERED RADIO READ RESERVED RIGHTS RSS SEARCH SEPTEMBER SHARE SITE THE THIS TO TOP TWEET TWITTER US VIDEO VIEW WEB WITH YOU YOUR ΑΓ ΑΓΟΡ ΑΓΟΡA ΑΓΩΝ ΑΘΗΝ ΑΘΛΗΤ ΑΘΛΗΤΙΣΜ ΑΚΟΛΟΥΘ ΑΚΟΜ ΑΛΛ ΑΛΛA ΑΝAΠΤΥΞ ΑΝΑΖΗΤΗΣ ΑΝΑΚΟΙΝΩΣ ΑΝΘΡΩΠ ΑΠ ΑΠAΝΤΗΣ ΑΠΟ ΑΠΟΣΤΟΛ ΑΠΟΤΕΛ ΑΠΟΤΕΛΕΣΜ ΑΠΟΦAΣ ΑΠΟΦΑΣ ΑΠΟΨ ΑΠΡΙΛ ΑΠΡΙΛΙ ΑΡΘΡ ΑΡΙΘΜ ΑΡΧ ΑΡΧΕΙ ΑΣΦAΛΕΙ ΑΤΤ ΑΥΓ ΑΥΓΟΥΣΤ ΑΥΤ ΑΥΤA ΑΥΤΟΚΙΝΗΤ ΑΦ ΒAΣ ΒΑΘΜΟΛΟΓ ΒΙΒΛ ΒΙΒΛΙ ΒΙΝΤΕ ΒΟΛ ΒΡ ΒΡΙΣΚ ΓΕΝ ΓΕΡΜΑΝ ΓΕΩΡΓΙ ΓΙAΝΝ ΓΙΑΤ ΓΙΝ ΓΙΩΡΓ ΓΝΩΣΤ ΓΟΝ ΓΡAΦ ΓΡΑΦΕΙ ΓΡΗΓΟΡ ΓΥΝΑΙΚ ΔΕ ΔΕΔΟΜΕΝ ΔΕΚ ΔΕΚΕΜΒΡ ΔΗΛΩΣ ΔΗΜ ΔΗΜΗΤΡ ΔΗΜΙΟΥΡΓ ΔΗΜΟΣ ΔΗΜΟΣΙ ΔΗΜΟΣΙΕΥΘ ΔΗΜΟΣΙΕΥΣ ΔΗΜΟΤ ΔΗΜΟΦΙΛ ΔΙAΒΑΣ ΔΙAΡΚΕΙ ΔΙAΦΟΡ ΔΙΑΒAΣ ΔΙΑΒAΣΤ ΔΙΑΓΩΝΙΣΜ ΔΙΑΤΡΟΦ ΔΙΑΦΗΜΙΣ ΔΙΑΧΕΙΡΙΣ ΔΙΕΘΝ ΔΙΕΥΘΥΝΣ ΔΙΚΑΙ ΔΙΚΑΙΩΜ ΔΙΚΤΥ ΔΙΝ ΔΙΟΙΚΗΣ ΔΡAΣ ΔΡΑΣΤΗΡΙΟΤ

In [10]:
# Spot check: similarity of two sites, looked up through domain_number.
print cosine[domain_number['news247.gr'], domain_number['contra.gr']]

0.478839219864


In [11]:
"""
topic extraction!
http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-topics-extraction-with-nmf-lda-py"""

'\ntopic extraction!\nhttp://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-topics-extraction-with-nmf-lda-py'

In [12]:
def text_similarity(src, dst):
    """Return the entry of the ``cosine`` matrix for the pair (src, dst).

    Both domains are translated to matrix indices through ``domain_number``;
    a domain missing from that mapping raises KeyError.
    """
    row = domain_number[src]
    column = domain_number[dst]
    return cosine[row, column]
    

In [13]:
# Pairs listed in not_existing_edges.txt, stored as dict keys (value 0) so
# that membership tests on a (source, destination) tuple are cheap.
non_existent_edges = {}
with open('dataset/not_existing_edges.txt', 'r') as fp:
    for line in fp:
        fields = line.split()
        non_existent_edges[(fields[0], fields[1])] = 0

In [14]:
#X_train, y_train= train_test_split(edges, test_size=0.15, random_state=1)

In [15]:
import networkx as nx
import pprint

"""
#G = nx.read_edgelist('dataset/edgelist.txt', delimiter='\t', create_using=nx.DiGraph())
G = nx.DiGraph()
G.add_edges_from(X_train)
G.add_nodes_from(nodes)"""
# Build the directed link graph straight from the tab-separated edge list and
# print its node and edge counts.
G = nx.read_edgelist('dataset/edgelist.txt', delimiter='\t', create_using=nx.DiGraph())
print(len(G.nodes()))
print(len(G.edges()))

2041
2683


In [16]:
# Candidate pairs to score: every ordered pair that is not an edge of G and
# is not one of the pairs listed in non_existent_edges. The generator
# returned by nx.non_edges is consumed directly (no throwaway list of ~4M
# tuples) and ``in`` replaces the deprecated dict.has_key.
non_edges = [edge for edge in nx.non_edges(G)
             if edge not in non_existent_edges]
# 4160957 = 2041 * 2040 - 2683, i.e. the number of non-edges before the
# pairs in non_existent_edges are taken out.

print(len(non_edges))

4140197


In [17]:
# Sum of the in-degrees of all nodes; G.in_degree() is used as a dict here.
sum_in_degree = sum(G.in_degree().values())
print sum_in_degree

2683


In [18]:
def neighborhood(src, dst):
    """Return degree-based neighbourhood features for the pair (src, dst).

    Keys of the returned dict:
      out_src      -- out-degree of src divided by its total degree
      in_dst       -- in-degree of dst divided by its total degree
      n_common_in  -- number of nodes linking to both src and dst, divided
                      by in-degree(src) + in-degree(dst)
      n_common_out -- number of nodes both src and dst link to, divided by
                      out-degree(src) + out-degree(dst)

    Any ratio whose denominator is zero is reported as 0; previously only
    the last two were guarded and an isolated node raised ZeroDivisionError.
    """
    def ratio(numerator, denominator):
        # float() keeps this a true division even when
        # ``from __future__ import division`` is not in effect.
        if not denominator:
            return 0
        return numerator / float(denominator)

    # Sources of the edges entering each node / targets of the edges leaving.
    in_src = set(a for a, _ in G.in_edges(src))
    in_dst = set(a for a, _ in G.in_edges(dst))
    out_src = set(a for _, a in G.out_edges(src))
    out_dst = set(a for _, a in G.out_edges(dst))

    return {
        'out_src': ratio(G.out_degree(src), G.degree(src)),
        'in_dst': ratio(G.in_degree(dst), G.degree(dst)),
        'n_common_in': ratio(len(in_src & in_dst),
                             G.in_degree(src) + G.in_degree(dst)),
        'n_common_out': ratio(len(out_src & out_dst),
                              G.out_degree(src) + G.out_degree(dst)),
    }

In [19]:
# PageRank of every node; indexed by node name in feature_extraction.
pagerank = nx.pagerank(G)

In [20]:
# Node-level centrality scores; betweeness, closeness and eigenvector are
# later indexed by node name in feature_extraction.
betweeness = nx.betweenness_centrality(G)
closeness = nx.closeness_centrality(G)
eigenvector = nx.eigenvector_centrality(G)
degree = nx.degree_centrality(G)
# Raw in- and out-degree of the nodes.
in_degree = G.in_degree()
out_degree = G.out_degree()

In [21]:
import math
def adamic_adar(src, dst):
    """Return an Adamic-Adar style score for the pair (src, dst).

    The score is the sum of 1 / log(G.degree(w)) over every node w returned
    by ``G.neighbors`` for both src and dst (for a DiGraph these are the
    nodes each of them links to). Returns 0 when there is no such node.

    A common neighbour of degree 1 would make log() zero; it is skipped
    instead of raising ZeroDivisionError. The former ``score`` accumulation
    loop was removed: its result was never returned.
    """
    common = set(G.neighbors(src)).intersection(G.neighbors(dst))
    degrees = (G.degree(w) for w in common)
    return sum(1 / math.log(d) for d in degrees if d > 1)
    

In [22]:
# Spot check of the score on one pair of sites.
adamic_adar('news247.gr', 'contra.gr')



3.0020540579927957

In [23]:
# Adjacency matrix of G; not referenced again in the visible cells.
matrix = nx.adjacency_matrix(G)

In [24]:
import igraph as ig
import louvain_igraph as louvain
# Copy the networkx graph into an igraph graph for community detection.
# NOTE(review): ig.Graph() is created without a directed flag, so edge
# direction is presumably not carried over — confirm that is intended.
G_new = ig.Graph()
G_new.add_vertices(G.nodes())
G_new.add_edges(G.edges())

In [25]:
# Run the louvain optimiser on the igraph copy, using the
# SignificanceVertexPartition partition class.
opt = louvain.Optimiser()
partition = opt.find_partition(graph=G_new,partition_class=louvain.SignificanceVertexPartition)

In [26]:
# Materialise the communities; each one is a sequence of igraph vertex ids.
partition = list(partition)

# The same communities with every vertex id replaced by its 'name' attribute.
partition_names = [[G_new.vs[vertex_id]['name'] for vertex_id in community]
                   for community in partition]

# partitions[node] collects the members of every community that contains
# node (node itself included).
partitions = dict((node, []) for node in G.nodes())
for community in partition_names:
    for member in community:
        partitions[member].extend(community)

In [27]:
# Print each community as a list of domain names.
for com in partition_names:
    print com

[u'politikanet.gr', u'katafylli.gr', u'skopelosonline.gr', u'inital.gr', u'arcadiasports.gr', u'bookone1.gr', u'aoristies.gr', u'oraiokastro24.gr', u'siriosfm.gr', u'sdeeth.gr', u'mypatra.gr', u'radioamfilochia.gr', u'ekriti.gr', u'paok-athens.gr', u'sarakis.gr', u'venceremos33.blogspot.gr', u'sportcyclades.gr', u'omospondiadaniolipton.gr', u'pireasnow.gr', u'hysteria.gr', u'radiomanos.gr', u'katanalotis.gr', u'nexusmanagementconsultants.gr', u'misitimi.gr', u'oxafies.com', u'erassitexnikopodosfairo.blogspot.gr', u'pkteam.gr', u'betforthewinners.gr', u'eplski.gr', u'oragiaspor-dramas.gr', u'epskastorias.gr', u'freddonews.gr', u'frontpages.gr']
[u'tourist-guides.gr', u'mpass.gr', u'theacropolismuseum.gr', u'neolaia.gr', u'alkal.gr', u'gbroofgarden.gr', u'cna.org.cy', u'studenthouse.gr', u'grandebretagne.gr']
[u'fiat.gr', u'fiatprofessional.gr']
[u'left.gr', u'candianews.gr', u'djchriskaltsas.gr', u'parallaximag.gr', u'epohi.gr', u'tiniakos.gr', u'iskra.gr', u'fisy.gr', u'seisaxthia.word

In [28]:
def partition_check(src, dst):
    """Count the neighbours of src that are listed in dst's community.

    Iterates over ``G.neighbors(src)`` and counts those found in
    ``partitions[dst]``. Returns 0 when src has no neighbours.
    (An earlier variant simply returned 1/0 for "src is in dst's
    community"; the count is used instead.)
    """
    return sum(1 for neigh in G.neighbors(src) if neigh in partitions[dst])
    
# Spot check on one pair of sites.
partition_check('news247.gr', 'ladylike.gr')


9

In [29]:
from scipy import stats
# Summary statistics of the node-level scores.
print "pagerank", stats.describe(pagerank.values())
print "closeness", stats.describe(closeness.values())
print "betweeness", stats.describe(betweeness.values())
print "eigenvector", stats.describe(eigenvector.values())
# NOTE(review): cosine is a 2-D matrix, so describe() reports array-valued
# statistics here (see the printed output) rather than one summary over all
# pairwise similarities — confirm that is what was wanted.
print "text similarity", stats.describe(cosine)

pagerank DescribeResult(nobs=2041, minmax=(0.00029761511829001677, 0.019530823306958298), mean=0.00048995590396864249, variance=5.1484022288357726e-07, skewness=16.593681826353492, kurtosis=379.97825697924446)
closeness DescribeResult(nobs=2041, minmax=(0.0, 0.043706197338373103), mean=0.0011057427491154147, variance=1.0548295767353478e-05, skewness=6.840389761601526, kurtosis=62.844402531343476)
betweeness DescribeResult(nobs=2041, minmax=(0.0, 0.0013256810816528672), mean=5.1837164079498881e-06, variance=3.5261533531813181e-09, skewness=16.489608369782463, kurtosis=294.5096041003185)
eigenvector DescribeResult(nobs=2041, minmax=(0.0, 0.37735422231142191), mean=0.0017948109226848491, variance=0.00048697315309188523, skewness=13.386178390197925, kurtosis=183.2693369063938)
text similarity DescribeResult(nobs=2041, minmax=(array([ 0.,  0.,  0., ...,  0.,  0.,  0.]), array([ 1.,  1.,  1., ...,  1.,  1.,  1.])), mean=array([ 0.14081705,  0.11499577,  0.19841769, ...,  0.18956786,
        

In [30]:
def feature_extraction(edge):
    """Return the 11-element feature list for a (source, destination) pair.

    Order of the features:
      0  text similarity of the two sites
      1  pagerank of src          4  pagerank of dst
      2  eigenvector of src       3  eigenvector of dst
      5  betweenness of src       6  betweenness of dst
      8  closeness of src         7  closeness of dst
      9  adamic_adar(src, dst)
      10 partition_check(src, dst)
    """
    src, dst = edge
    return [
        text_similarity(src, dst),
        pagerank[src],
        eigenvector[src],
        eigenvector[dst],
        pagerank[dst],
        betweeness[src],
        betweeness[dst],
        closeness[dst],
        closeness[src],
        adamic_adar(src, dst),
        partition_check(src, dst),
    ]

"""
def feature_extraction(edge):
    src, dst = edge
    f_vector = []
    f_vector.append(text_similarity(src, dst))
    #f_vector.append(pagerank[src])
    f_vector.append(pagerank[dst])
    f_vector.append(betweeness[src])
    f_vector.append(betweeness[dst])
    f_vector.append(closeness[dst])
    f_vector.append(closeness[src])
    f_vector.append(G.in_degree(dst)/G.degree(dst))
    f_vector.append(len(set(G.neighbors(src)).intersection(G.neighbors(dst))))
    if G.has_edge(dst,src):
        f_vector.append(1)
    else:
        f_vector.append(0)
    f_vector.append(adamic_adar(src, dst))
    f_vector.append(partition_check(src, dst))
    return f_vector
    
    def feature_extraction(edge):
    src, dst = edge
    f_vector = []
    f_vector.append(text_similarity(src, dst))
    #f_vector.append(pagerank[src])
    f_vector.append(eigenvector[dst])
    f_vector.append(pagerank[dst])
    f_vector.append(betweeness[src])
    f_vector.append(betweeness[dst])
    f_vector.append(closeness[dst])
    f_vector.append(closeness[src])
    union = len(set(G.neighbors(src)).union(G.neighbors(dst)))
    intersection = len(set(G.neighbors(src)).intersection(G.neighbors(dst)))
    f_vector.append(intersection)
    if union:
        f_vector.append(intersection/union)
    else:
        f_vector.append(0)
    f_vector.append(adamic_adar(src, dst))
    f_vector.append(partition_check(src, dst))
    if G.has_edge(dst,src):
        f_vector.append(1)
    else:
        f_vector.append(0)
    return f_vector"""

'\ndef feature_extraction(edge):\n    src, dst = edge\n    f_vector = []\n    f_vector.append(text_similarity(src, dst))\n    #f_vector.append(pagerank[src])\n    f_vector.append(pagerank[dst])\n    f_vector.append(betweeness[src])\n    f_vector.append(betweeness[dst])\n    f_vector.append(closeness[dst])\n    f_vector.append(closeness[src])\n    f_vector.append(G.in_degree(dst)/G.degree(dst))\n    f_vector.append(len(set(G.neighbors(src)).intersection(G.neighbors(dst))))\n    if G.has_edge(dst,src):\n        f_vector.append(1)\n    else:\n        f_vector.append(0)\n    f_vector.append(adamic_adar(src, dst))\n    f_vector.append(partition_check(src, dst))\n    return f_vector\n    \n    def feature_extraction(edge):\n    src, dst = edge\n    f_vector = []\n    f_vector.append(text_similarity(src, dst))\n    #f_vector.append(pagerank[src])\n    f_vector.append(eigenvector[dst])\n    f_vector.append(pagerank[dst])\n    f_vector.append(betweeness[src])\n    f_vector.append(betweeness[dst

In [31]:
# Spot check: the feature list of one pair of sites.
vector = feature_extraction(('news247.gr','contra.gr'))
print vector

[0.47883921986362765, 0.004009557602747434, 0.2705070374673354, 0.310522072601534, 0.0016805343926659456, 0.00022283606919962717, 2.1252247833905517e-05, 0.005404411764705882, 0.006550802139037433, 3.0020540579927957, 9]




In [32]:
# Training set: label 1 for every pair in edges, label 0 for every pair in
# non_existent_edges, in that order.
# NOTE(review): features of the positive pairs are computed on G, which
# already contains those edges. partition_check(src, dst), for instance,
# should then be at least 1, because dst is a neighbour of src and is listed
# in its own community — possible label leakage, confirm.
X_train = [feature_extraction(edge) for edge in edges]
y_train = [1] * len(X_train)
X_train.extend(feature_extraction(edge) for edge in non_existent_edges)
y_train.extend([0] * (len(X_train) - len(y_train)))




In [34]:
# Feature list of every candidate pair, in the order of non_edges so that a
# row index can be mapped back to its pair.
X_predict = [feature_extraction(candidate) for candidate in non_edges]



In [35]:
# numpy versions of the training features, training labels and candidate
# features.
X_train_n = np.array(X_train)
y_train_n = np.array(y_train)
X_predict_n = np.array(X_predict)


In [33]:
# Mean and variance of every feature column of the training matrix, each
# printed on one line. axis=0 aggregates over the samples; the former axis=1
# call summarised each sample across its features and then printed the first
# 11 samples. describe() is also computed once instead of once per printed
# value. This cell needs X_train_n, so it must run after the cell that
# builds it.
feature_summary = stats.describe(X_train_n, axis=0)
print(' '.join(str(value) for value in feature_summary.mean))
print(' '.join(str(value) for value in feature_summary.variance))

NameError: name 'X_train_n' is not defined

In [36]:
# First training row, before normalisation.
X_train_n[0]

array([  1.81467172e-01,   2.97615118e-04,   0.00000000e+00,
         9.38850476e-06,   1.67151672e-02,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   4.90196078e-04,
         0.00000000e+00,   1.00000000e+00])

In [37]:
# Normalise both feature matrices with sklearn's preprocessing.normalize.
# NOTE(review): judging by the first training row before and after (every
# entry shrinks by the same factor, 1.0 -> 0.9838), this rescales each sample
# rather than each feature column — confirm that this is the intended
# "feature scaling".
from sklearn import preprocessing 
X_train_scaled = preprocessing.normalize(X_train_n)  
X_predict_scaled = preprocessing.normalize(X_predict_n) 
"""from sklearn import preprocessing
X_train_scaled = preprocessing.scale(X_train_n, axis=1)
print X_train_scaled[0]
X_predict_scaled = preprocessing.scale(X_predict_n)"""

'from sklearn import preprocessing\nX_train_scaled = preprocessing.scale(X_train_n, axis=1)\nprint X_train_scaled[0]\nX_predict_scaled = preprocessing.scale(X_predict_n)'

In [38]:
# First training row after normalisation.
print X_train_scaled[0]

[  1.78526938e-01   2.92792991e-04   0.00000000e+00   9.23638692e-06
   1.64443387e-02   0.00000000e+00   0.00000000e+00   0.00000000e+00
   4.82253646e-04   0.00000000e+00   9.83797437e-01]


In [39]:
"""PCA didnt work
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
X_train_n = pca.fit_transform(X_train)
X_predict_n = pca.fit_transform(X_predict)"""


'PCA didnt work\nfrom sklearn.decomposition import PCA\npca = PCA(n_components=10)\nX_train_n = pca.fit_transform(X_train)\nX_predict_n = pca.fit_transform(X_predict)'

In [40]:
# Shapes of the training features, training labels and candidate features.
print X_train_n.shape
print y_train_n.shape
print X_predict_n.shape

(23443, 11)
(23443,)
(4140197, 11)


In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
# Three candidate models are instantiated; only the SVM is fitted here, on
# the normalised matrices.
reg = LogisticRegression()
sgd = SGDClassifier(loss="log", penalty="l2")
svm = SVC(probability=True)
svm.fit(X_train_scaled, y_train_n)
pred_svm = svm.predict_proba(X_predict_scaled)

# Second column of predict_proba for every candidate pair.
probs_svm = [row[1] for row in pred_svm]

# Candidate indices ordered by descending probability (ascending sort,
# reversed), cut to the top 453.
indices = sorted(range(len(probs_svm)), key=probs_svm.__getitem__)[::-1][:453]
print([probs_svm[k] for k in indices])

# Map the selected indices back to their pairs and write them out,
# tab-separated, one pair per line.
predicted_edges_svm = [non_edges[k] for k in indices]
with open('predicted_edges_svm.txt', 'w') as fp:
    for site_1, site_2 in predicted_edges_svm:
        fp.write(site_1+'\t'+site_2+'\n')


[0.98056969926241, 0.98020435516337578, 0.98016587281685386, 0.98014731551031475, 0.98012464533571286, 0.98009791589583395, 0.98000626140736058, 0.97989124764051438, 0.97988775256125926, 0.97976111042416103, 0.97974459586573093, 0.9796332558959161, 0.97961015606202262, 0.97959448738098598, 0.97951872525784778, 0.97951130610877712, 0.97948679787599624, 0.9794058118895721, 0.97935802928289872, 0.97927408978402464, 0.97925361415138279, 0.97922643354936767, 0.97919506944999524, 0.97917163718238776, 0.97909832206091285, 0.97902813292175872, 0.97897347150366176, 0.97897054850480469, 0.97897053579541982, 0.97897048542446397, 0.97896994055025699, 0.97896964142822185, 0.97896900185754865, 0.97896890135538461, 0.97896875718943488, 0.97896875718943488, 0.97896865439897596, 0.97896836565521161, 0.97896829153441056, 0.97896814481994288, 0.9789679393323848, 0.97896791300590558, 0.97896773159726114, 0.97896773159726114, 0.9789676586456364, 0.9789676586456364, 0.97896763581847845, 0.97896763581847845,

In [50]:
from sklearn.neural_network import MLPClassifier
# Earlier, explicitly configured variant kept for reference:
#mlp = MLPClassifier(activation = 'logistic',solver='lbfgs', alpha=1e-4, hidden_layer_sizes=(100,100), random_state=1)
# Current model: library defaults. No random_state is passed, so results are
# presumably not reproducible between runs.
mlp = MLPClassifier()

In [51]:
# Fit the MLP on the normalised training matrix.
mlp.fit(X_train_scaled, y_train_n)  

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [52]:
# Class probabilities of every candidate pair under the MLP.
pred_mlp = mlp.predict_proba(X_predict_scaled)

In [53]:
# Second column of predict_proba for every candidate pair.
probs_mlp = [row[1] for row in pred_mlp]

# Candidate indices ordered by descending probability (ascending sort,
# reversed), cut to the top 453.
indices = sorted(range(len(probs_mlp)), key=probs_mlp.__getitem__)[::-1][:453]
print([probs_mlp[k] for k in indices])

[0.98753841354307892, 0.98753789764469391, 0.98753648718217879, 0.98753511505086344, 0.98753258506772679, 0.98753256810053569, 0.98753149950947916, 0.98753149950947916, 0.98753149950947916, 0.98753149950947916, 0.98753148877827757, 0.98753148877827757, 0.98753148877827757, 0.98753148877827757, 0.98753148877827757, 0.98753148877827757, 0.98753148877827757, 0.98753148087459952, 0.98753147668114238, 0.98753145505713213, 0.98753143297408941, 0.98753143135592869, 0.98753142506995029, 0.98753142442447817, 0.98753142442447817, 0.98753142442447817, 0.98753136643407435, 0.98753130007932455, 0.98753126697317184, 0.98753115034050354, 0.98753110897513263, 0.98753100413230877, 0.98753095216911468, 0.98753088559505242, 0.98753087985970267, 0.98753084385988232, 0.98753075330336693, 0.98753072816526655, 0.98753071305139994, 0.98753066344429097, 0.98753053545976643, 0.98753039684520438, 0.98752988612805681, 0.98752981377390947, 0.98752964501272023, 0.98752925702318417, 0.98752907207423235, 0.9875290210

In [54]:
# First training row as a plain Python list.
print X_train[0]

[0.1814671715064545, 0.00029761511829001677, 0.0, 9.388504758428038e-06, 0.01671516723216645, 0.0, 0.0, 0.0, 0.0004901960784313725, 0, 1]


In [55]:
# Map the currently selected indices back to their pairs. `indices` is
# whatever the most recently executed ranking cell left behind, so this has
# to run right after the MLP ranking.
predicted_edges_mlp = [non_edges[k] for k in indices]

# One tab-separated pair per line.
with open('predicted_edges_mlp.txt', 'w') as fp:
    for site_1, site_2 in predicted_edges_mlp:
        fp.write(site_1+'\t'+site_2+'\n')

In [48]:
import xgboost
# XGBoost classifier with default settings. It is fitted on the
# un-normalised matrices (X_train_n / X_predict_n), whereas the SVM and MLP
# cells use the *_scaled ones.
xgb = xgboost.XGBClassifier()
xgb.fit(X_train_n, y_train_n)
pred_xgb = xgb.predict_proba(X_predict_n)



In [85]:
# Second column of predict_proba for every candidate pair.
probs_xgb = [row[1] for row in pred_xgb]

# Candidate indices ordered by descending probability (ascending sort,
# reversed), cut to the top 453, then mapped back to their pairs.
indices = sorted(range(len(probs_xgb)), key=probs_xgb.__getitem__)[::-1][:453]
predicted_edges_xgb = [non_edges[k] for k in indices]
print([probs_xgb[k] for k in indices])

# One tab-separated pair per line.
with open('predicted_edges_xgb.txt', 'w') as fp:
    for site_1, site_2 in predicted_edges_xgb:
        fp.write(site_1+'\t'+site_2+'\n')

[0.99908197, 0.99901986, 0.99883097, 0.99878412, 0.99877673, 0.99875224, 0.99863005, 0.99853814, 0.99850428, 0.99844295, 0.99842936, 0.99841297, 0.99839753, 0.99837506, 0.99827886, 0.99827886, 0.99825913, 0.99824393, 0.9982298, 0.99820161, 0.99819034, 0.99816811, 0.99815458, 0.99815458, 0.99815458, 0.99815458, 0.99815458, 0.99815458, 0.99814904, 0.99814701, 0.99810898, 0.9980768, 0.9980768, 0.99807632, 0.99807394, 0.99804282, 0.99798143, 0.99795282, 0.99793303, 0.99792516, 0.99791902, 0.99791795, 0.99791032, 0.99790287, 0.99790287, 0.99789399, 0.99789399, 0.99786925, 0.99786925, 0.9978689, 0.99786359, 0.99786347, 0.99783891, 0.99782324, 0.99779797, 0.99776077, 0.99774987, 0.99774951, 0.99774051, 0.99773139, 0.99773139, 0.99773139, 0.99773139, 0.9977144, 0.99771035, 0.99766123, 0.99765849, 0.99765414, 0.99759895, 0.99759442, 0.99759442, 0.99757665, 0.99756491, 0.99756491, 0.99756491, 0.99756491, 0.99754333, 0.99753821, 0.99753344, 0.99753112, 0.99753016, 0.99752027, 0.99751163, 0.997488

In [91]:
from sklearn.ensemble import VotingClassifier
# Soft-voting ensemble over the MLP, SVM and XGBoost estimators.
# NOTE(review): the ensemble is fitted and queried on the un-normalised
# matrices, whereas mlp and svm were fitted individually on X_train_scaled —
# confirm that this difference is intended.
eclf1 = VotingClassifier(estimators=[('mlp', mlp), ('svm', svm), ('xgb', xgb)], voting='soft')
eclf1 = eclf1.fit(X_train_n, y_train_n)
voting_pred = eclf1.predict_proba(X_predict_n)

In [92]:
# Second column of predict_proba for every candidate pair.
probs_vote = [row[1] for row in voting_pred]

# Candidate indices ordered by descending probability (ascending sort,
# reversed), cut to the top 453, then mapped back to their pairs.
indices = sorted(range(len(probs_vote)), key=probs_vote.__getitem__)[::-1][:453]
predicted_edges_vote = [non_edges[k] for k in indices]
print([probs_vote[k] for k in indices])

# One tab-separated pair per line.
with open('predicted_edges_voting.txt', 'w') as fp:
    for site_1, site_2 in predicted_edges_vote:
        fp.write(site_1+'\t'+site_2+'\n')

[0.99902387336740972, 0.99885484287791149, 0.99882022673005932, 0.99847524495883766, 0.99840146098216354, 0.99824515987067264, 0.99823185275608806, 0.99822233105172309, 0.99821732530310003, 0.99819125255277896, 0.99812748218053537, 0.99807400265985047, 0.99806759101188414, 0.99796497042900079, 0.99781376796019305, 0.99777734045247524, 0.9976087499763201, 0.99754087915773992, 0.99748141886536812, 0.9974742697149378, 0.9973908135710351, 0.99713600823149984, 0.99711876151905077, 0.99708943335118916, 0.99708021168228489, 0.99703558023550609, 0.99690248552984073, 0.99688266589333507, 0.99686802470507818, 0.99685811187996387, 0.9968334899288549, 0.99681003340090513, 0.99677636993035079, 0.99674508182046428, 0.99672704834894521, 0.99672638582436468, 0.99670725634937485, 0.99669720793647121, 0.9966824822963477, 0.99663223633848441, 0.99660482849173693, 0.99660308766376149, 0.99658437938631028, 0.9965660848036908, 0.99656194067355852, 0.99654630558667556, 0.9965457980683583, 0.99652507997717776