# Link Prediction in the Greek Web



First we extract the texts from the folders...

In [2]:
import stemmer as st
def process_word(word):
    """detone and stem word
    """
    new_word = []
    tones = { u'Ά' : u'A' , u'Έ': u'Ε', u'Ί': u'Ι', u'Ϊ': u'Ι',
             u'Ύ': u'Υ', u'Ϋ': u'Υ', u'Ό' : u'Ο', u'Ή': u'Η', u'Ώ': u'Ω'}
    for letter in word:
        try:
            new_word.append(tones[letter])
        except KeyError:
            new_word.append(letter)
    detoned = ''.join(l for l in new_word)
    return st.stem(detoned)
    

In [3]:
import string
def process_text(data):
    with open('dataset/greekstopwords.txt', 'r') as fp:
        stopwords = []
        for line in fp:
            stopwords.append(line.strip().decode('utf-8').upper())
    for domain in data.keys():
        text = data[domain]
        # remove punctuation
        punctuation = set(string.punctuation)    
        doc = ''.join([w for w in text if w not in punctuation])
        # remove stopwords
        doc = [w for w in doc.split() if w not in stopwords]
        doc = ' '.join(process_word(w) for w in doc)
        data[domain] = doc
    return data        


In [4]:
import os
import zipfile
import nltk
import pickle

if os.path.isfile('cache/processed_text.pickle'):
    with open('cache/processed_text.pickle', 'rb') as pfile:
        text_data = pickle.load(pfile)
        print "loaded from pickle"
else:
    filenames = os.listdir('dataset/hosts')
    raw_text = {}
    for zipfilename in filenames:
        with zipfile.ZipFile('dataset/hosts/'+zipfilename) as z:
            text = ""
            for filename in z.namelist():
                if not os.path.isdir(filename):
                    with z.open(filename) as f:
                        for line in f:
                            text += line.decode("utf-8").upper()
                            text += " "
            raw_text[zipfilename[:-4]] = text
    text_data = process_text(raw_text)
    with open('cache/processed_text.pickle', 'wb') as pfile:
        pickle.dump(text_data, pfile)

loaded from pickle


In [5]:
domain_number = {}
for i, domain in enumerate(text_data.keys()):
    domain_number[domain] = i

In [6]:
for word in text_data['news247.gr'].split()[:20]:
    print word
    #print process_word(word.upper())
    #print st.stem(word.upper())

HTTPNEWS247GREIDISEISPASOK
ΠΕΜΠΤ
13
ΟΚΤΩΒΡ
2016
ΠΟΛΙΤ
ΚΟΙΝΩΝ
ΟΙΚΟΝΟΜ
ΚΟΣΜ
ΓΝΩΜ
ΠΑΡΑΣΚΗΝΙ
ΠΡΩΤΟΣΕΛΙΔ
ΡΟΗ
ΠΕΡΙΣΣ
ΥΓΕΙ
LIFE
GUIDE
ΤΑΞΙΔ
VIRAL
ΨΥΧΑΓΩΓ


In [7]:
from __future__ import division
import networkx as nx
import pprint
import random
import numpy as np
from scipy import sparse
from sklearn.model_selection import train_test_split


In [8]:
random.seed(1)
edges = []
with open('dataset/edgelist.txt', 'r')as fp:
    for line in fp:
        edges.append((line.split()[0], line.split()[1]))
print len(edges)
nodes = set([node for _tuple in edges for node in _tuple])
print len(nodes)

2683
2041


In [134]:
from sklearn.feature_extraction.text import TfidfVectorizer

def compute_text_similarity(data):
    vec = TfidfVectorizer(max_df=0.98, min_df=2, max_features=1000)
    x = vec.fit_transform(data)
    return x
    

In [135]:
X_text = compute_text_similarity(text_data)
print X_text.shape
from sklearn.metrics.pairwise import cosine_similarity

cosine = cosine_similarity(X_text)

(2041, 55)


In [136]:
print cosine[domain_number['news247.gr'], domain_number['contra.gr']]

0.13700047218


In [None]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
n_topics = 10
n_top_words = 20
nmf = NMF(n_components=n_topics, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)

In [137]:
def text_similarity(src, dst):
    return cosine[domain_number[src], domain_number[dst]]  
    

In [138]:
non_existent_edges = {}
with open('dataset/not_existing_edges.txt', 'r')as fp:
    for line in fp:
        non_existent_edges[((line.split()[0], line.split()[1]))] = 0 

In [139]:
X_train, y_train= train_test_split(edges, test_size=0.15, random_state=1)

In [140]:
import networkx as nx
import pprint

"""
#G = nx.read_edgelist('dataset/edgelist.txt', delimiter='\t', create_using=nx.DiGraph())
G = nx.DiGraph()
G.add_edges_from(X_train)
G.add_nodes_from(nodes)"""
G = nx.read_edgelist('dataset/edgelist.txt', delimiter='\t', create_using=nx.DiGraph())
print(len(G.nodes()))
print(len(G.edges()))

2041
2683


In [141]:
non_edges = [edge for edge in list(nx.non_edges(G)) if not non_existent_edges.has_key(edge)]
#4160957

print len(non_edges)

4140197


In [142]:
sum_in_degree = sum(G.in_degree().values())
print sum_in_degree

2683


In [143]:
def neighborhood(src, dst):
    neigh = {}
    neigh['out_src'] = G.out_degree(src)/G.degree(src)
    neigh['in_dst'] = G.in_degree(dst)/G.degree(dst)
    # maybe remove the next two features
    #neigh['in_src'] = G.in_degree(src)/G.degree(src)
    #neigh['out_dst'] = G.out_degree(dst)/G.degree(dst)    
    src_neigh = list(nx.all_neighbors(G, src))
    dst_neigh = list(nx.all_neighbors(G, dst))
    #neigh['common_neighbors'] = len(set(src_neigh).intersection(dst_neigh))/(G.degree(src)+G.degree(dst))
    out_dst = [a for _, a in G.out_edges(dst)]
    in_dst = [a for a, _ in G.in_edges(dst)]
    out_src = [a for _, a in G.out_edges(src)]    
    in_src = [a for a, _ in G.in_edges(src)]
    try:
        neigh['n_common_in'] = len(set(in_src).intersection(in_dst))/(G.in_degree(src)+G.in_degree(dst))
    except ZeroDivisionError:
        neigh['n_common_in'] = 0
    try:
        neigh['n_common_out'] = len(set(out_src).intersection(out_dst))/(G.out_degree(src)+G.out_degree(dst))
    except ZeroDivisionError:
        neigh['n_common_out'] = 0
    return neigh

In [144]:
pagerank = nx.pagerank(G)

In [145]:
betweeness = nx.betweenness_centrality(G)
closeness = nx.closeness_centrality(G)

In [146]:
#jaccard = nx.jaccard_coefficient(G,'news247.gr')

In [147]:
def feature_extraction_old(edge):
    src, dst = edge
    f_vector = []
    f_vector.append(text_similarity(src, dst)*2)
    f_vector.append(pagerank[src])
    #f_vector.append(pagerank[dst])
    edge_hood = neighborhood(src, dst)
    # add features: out degree of src, in degree of dst, # of common in with out
    #f_vector.extend([len(edge_hood['out_src']), len(edge_hood['in_dst']), edge_hood['n_common_in'], edge_hood['n_common_out']])
    f_vector.extend(edge_hood.values())    
    return f_vector  
        

In [168]:
def feature_extraction(edge):
    src, dst = edge
    f_vector = []
    f_vector.append(text_similarity(src, dst)*2)
    f_vector.append(pagerank[src])
    f_vector.append(pagerank[dst])
    f_vector.append(betweeness[src])
    f_vector.append(betweeness[dst])
    f_vector.append(G.in_degree(dst)/G.degree(dst))
    f_vector.append(len(set(G.neighbors(src)).intersection(G.neighbors(dst))))
    if G.has_edge(dst,src):
        f_vector.append(1)
    else:
        f_vector.append(0)
    return f_vector

"""
def feature_extraction(edge):
    src, dst = edge
    f_vector = []
    f_vector.append(text_similarity(src, dst))
    f_vector.append(pagerank[src])
    f_vector.append(pagerank[dst])
    f_vector.append(betweeness[src])
    f_vector.append(betweeness[dst])
    f_vector.append(closeness[src])
    f_vector.append(closeness[dst])
    f_vector.append(G.out_degree(src)/G.degree(src))
    f_vector.append(G.in_degree(dst)/G.degree(dst))
    try:
        f_vector.append(len(set(G.neighbors(src)).intersection(G.neighbors(dst)))/len(set(G.neighbors(src)).union(G.neighbors(dst))))
    except ZeroDivisionError:
        f_vector.append(0)
    return f_vector"""

'\ndef feature_extraction(edge):\n    src, dst = edge\n    f_vector = []\n    f_vector.append(text_similarity(src, dst))\n    f_vector.append(pagerank[src])\n    f_vector.append(pagerank[dst])\n    f_vector.append(betweeness[src])\n    f_vector.append(betweeness[dst])\n    f_vector.append(closeness[src])\n    f_vector.append(closeness[dst])\n    f_vector.append(G.out_degree(src)/G.degree(src))\n    f_vector.append(G.in_degree(dst)/G.degree(dst))\n    try:\n        f_vector.append(len(set(G.neighbors(src)).intersection(G.neighbors(dst)))/len(set(G.neighbors(src)).union(G.neighbors(dst))))\n    except ZeroDivisionError:\n        f_vector.append(0)\n    return f_vector'

In [169]:
print len(set(G.neighbors('news247.gr')).intersection(G.neighbors('contra.gr')))

8


In [170]:
vector = feature_extraction(('news247.gr','contra.gr'))
print vector

[0.2740009443607388, 0.004009557602747434, 0.0016805343926659456, 0.00022283606919962717, 2.1252247833905517e-05, 0.5, 8, 0]


In [171]:
X_train = []
y_train = []
for edge in edges:
    X_train.append(feature_extraction(edge))
    y_train.append(1)
for edge in non_existent_edges:
    X_train.append(feature_extraction(edge))
    y_train.append(0)


In [172]:
X_predict = []
for edge in non_edges:
    X_predict.append(feature_extraction(edge))

In [173]:
X_train_n = np.array(X_train)
y_train_n = np.array(y_train)
X_predict_n = np.array(X_predict)

In [174]:
print X_train_n.shape
print y_train_n.shape
print X_predict_n.shape

(23443, 8)
(23443,)
(4140197, 8)


In [86]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
reg = LogisticRegression()
sgd = SGDClassifier(loss="log", penalty="l2")
svm = SVC(probability=True)
#reg.fit(X_train_n, y_train_n)
#sgd.fit(X_train_n, y_train_n)

In [175]:
import xgboost
xgb = xgboost.XGBClassifier()
xgb.fit(X_train_n, y_train_n)
pred_xgb = xgb.predict_proba(X_predict_n)

In [30]:
#svm.fit(X_train_n, y_train_n)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [51]:
#pred = reg.predict_proba(X_predict_n)

In [None]:
#pred_sgd = sgd.predict_proba(X_predict_n)

In [31]:
# pred_svm = svm.predict_proba(X_predict_n)

In [176]:
probs = []
print pred_xgb[0]
for t in pred_xgb:
    probs.append(t[1])


[ 0.75522542  0.24477461]


In [177]:
indices = sorted(range(len(probs)), key=lambda k: probs[k])[::-1][:453]

In [178]:
print [probs[indices[i]] for i in range(len(indices))]

[0.99969029, 0.99959809, 0.99950826, 0.99949062, 0.99945682, 0.999376, 0.99936479, 0.99930727, 0.99929535, 0.99929535, 0.99908078, 0.99898082, 0.9989453, 0.99885046, 0.99870515, 0.99869305, 0.99856025, 0.99840122, 0.99806637, 0.99794275, 0.99793541, 0.99770987, 0.99770987, 0.99770987, 0.99740368, 0.99737823, 0.99731219, 0.99681658, 0.99600369, 0.99597186, 0.99580634, 0.9950965, 0.99473119, 0.99453211, 0.99450797, 0.9943198, 0.9938764, 0.99385214, 0.99352843, 0.99293453, 0.99286276, 0.99188691, 0.99080396, 0.99012429, 0.9897092, 0.98962456, 0.98910147, 0.98904991, 0.98884773, 0.98855674, 0.98847836, 0.98820132, 0.98766464, 0.98685604, 0.98669541, 0.98665988, 0.98641169, 0.98606747, 0.98599237, 0.98537999, 0.98463494, 0.98459321, 0.98417526, 0.9840135, 0.98377478, 0.98371947, 0.98364568, 0.9833551, 0.98264104, 0.98165786, 0.9811033, 0.98089361, 0.9808895, 0.98072368, 0.98047674, 0.98018152, 0.98004663, 0.97991496, 0.97991496, 0.9797799, 0.97960359, 0.97960359, 0.97940367, 0.97930497, 0.9

In [179]:
predicted_edges = []
for i in range(len(indices)):
    predicted_edges.append(non_edges[indices[i]])

In [180]:
"""predicted_edges = []
counter = 0
node_counter = { i : 0 for i in G.nodes()}
for i in range(len(indices)):
    if counter >452:
        break
    if node_counter[non_edges[indices[i]][0]]< 2:
        predicted_edges.append(non_edges[indices[i]])
        counter += 1
        node_counter[non_edges[indices[i]][0]]+=1"""

'predicted_edges = []\ncounter = 0\nnode_counter = { i : 0 for i in G.nodes()}\nfor i in range(len(indices)):\n    if counter >452:\n        break\n    if node_counter[non_edges[indices[i]][0]]< 2:\n        predicted_edges.append(non_edges[indices[i]])\n        counter += 1\n        node_counter[non_edges[indices[i]][0]]+=1'

In [181]:
print predicted_edges[-5:]

[(u'provocateur.gr', u'olapaok.gr'), (u'alexpolisonline.com', u'frontpages.gr'), (u'alexpolisonline.com', u'google.gr'), (u'stoxasmos-politikh.blogspot.gr', u'imerodromos.gr'), (u'provocateur.gr', u'frontpages.gr')]


In [182]:

with open('predicted_edges_xgb.txt', 'w') as fp:
    for site_1, site_2 in predicted_edges:
        fp.write(site_1+'\t'+site_2+'\n')