# Link Prediction in the Greek Web



First we extract the texts from the folders...

In [154]:
import stemmer as st
def process_word(word):
    """detone and stem word
    """
    new_word = []
    tones = { u'Ά' : u'A' , u'Έ': u'Ε', u'Ί': u'Ι', u'Ϊ': u'Ι',
             u'Ύ': u'Υ', u'Ϋ': u'Υ', u'Ό' : u'Ο', u'Ή': u'Η', u'Ώ': u'Ω'}
    for letter in word:
        try:
            new_word.append(tones[letter])
        except KeyError:
            new_word.append(letter)
    detoned = ''.join(l for l in new_word)
    return st.stem(detoned)
    

In [156]:
import string
def process_text(data):
    with open('dataset/greekstopwords.txt', 'r') as fp:
        stopwords = []
        for line in fp:
            stopwords.append(line.strip().decode('utf-8').upper())
    for domain in data.keys():
        text = data[domain]
        # remove punctuation
        punctuation = set(string.punctuation)    
        doc = ''.join([w for w in text if w not in punctuation])
        # remove stopwords
        doc = [w for w in doc.split() if w not in stopwords]
        doc = ' '.join(process_word(w) for w in doc)
        data[domain] = doc
    return data        


In [None]:
import os
import zipfile
import nltk
import pickle

if os.path.isfile('cache/processed_text.pickle'):
    with open('cache/processed_text.pickle', 'rb') as pfile:
        text_data = pickle.load(pfile)
        print "loaded from pickle"
else:
    filenames = os.listdir('dataset/hosts')
    raw_text = {}
    for zipfilename in filenames:
        with zipfile.ZipFile('dataset/hosts/'+zipfilename) as z:
            text = ""
            for filename in z.namelist():
                if not os.path.isdir(filename):
                    with z.open(filename) as f:
                        for line in f:
                            text += line.decode("utf-8").upper()
                            text += " "
            raw_text[zipfilename[:-4]] = text
    text_data = process_text(raw_text)
    with open('cache/processed_text.pickle', 'wb') as pfile:
        pickle.dump(text_data, pfile)

In [5]:
domain_number = {}
for i, domain in enumerate(text_data.keys()):
    domain_number[domain] = i

In [153]:
import stemmer as st
for word in text_data['news247.gr'].split()[:20]:
    print process_word(word.upper())
    #print st.stem(word.upper())

HTTPNEWS247GREIDISEISPASOK
ΠΕΜΠΤ
13
ΟΚΤΩΒΡ
2016
ΠΟΛΙΤ
ΚΟΙΝΩΝ
ΟΙΚΟΝΟΜ
ΚΟΣΜ
ΓΝΩΜ
ΠΑΡΑΣΚΗΝΙ
ΠΡΩΤΟΣΕΛΙΔ
ΡΟΗ
ΠΕΡΙΣΣ
ΥΓΕΙ
LIFE
GUIDE
ΤΑΞΙΔ
VIRAL
ΨΥΧΑΓΩΓ


In [6]:
from __future__ import division
import networkx as nx
import pprint
import random
import numpy as np
from scipy import sparse
from sklearn.model_selection import train_test_split


In [7]:
random.seed(1)

edges = []
with open('dataset/edgelist.txt', 'r')as fp:
    for line in fp:
        edges.append((line.split()[0], line.split()[1]))
print len(edges)
nodes = set([node for _tuple in edges for node in _tuple])
print len(nodes)

2683
2041


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

def compute_text_similarity(data):
    vec = TfidfVectorizer(max_features=500)
    x = vec.fit_transform(data)
    return x
    

In [39]:
X_text = compute_text_similarity(text_data)
print X_text.shape
from sklearn.metrics.pairwise import cosine_similarity

X_text = cosine_similarity(X_text)

(2041, 500)


In [40]:
print X_text[domain_number['news247.gr'], domain_number['newsit.gr']]

0.0169301810461


In [41]:
def text_similarity(src, dst):
    return X_text[domain_number[src], domain_number[dst]]  
    

In [42]:
non_existent_edges = {}
with open('dataset/not_existing_edges.txt', 'r')as fp:
    for line in fp:
        non_existent_edges[((line.split()[0], line.split()[1]))] = 0 

In [11]:
X_train, y_train= train_test_split(edges, test_size=0.15, random_state=1)

In [43]:
import networkx as nx
import pprint

"""
#G = nx.read_edgelist('dataset/edgelist.txt', delimiter='\t', create_using=nx.DiGraph())
G = nx.DiGraph()
G.add_edges_from(X_train)
G.add_nodes_from(nodes)"""
G = nx.read_edgelist('dataset/edgelist.txt', delimiter='\t', create_using=nx.DiGraph())
print(len(G.nodes()))
print(len(G.edges()))

2041
2683


In [44]:
non_edges = [edge for edge in list(nx.non_edges(G)) if not non_existent_edges.has_key(edge)]
#4160957

print len(non_edges)

4140197


In [45]:
sum_in_degree = sum(G.in_degree().values())
print sum_in_degree

2683


In [46]:
def neighborhood(src, dst):
    neigh = {}
    neigh['out_src'] = G.out_degree(src)/sum_in_degree
    neigh['in_dst'] = G.in_degree(dst)/sum_in_degree
    # maybe remove the next two features
    neigh['in_src'] = G.in_degree(src)/sum_in_degree
    neigh['out_dst'] = G.out_degree(dst)/sum_in_degree    
    src_neigh = list(nx.all_neighbors(G, src))
    dst_neigh = list(nx.all_neighbors(G, dst))
    neigh['common_neighbors'] = len(set(src_neigh).intersection(dst_neigh))/sum_in_degree
    out_dst = [a for _, a in G.out_edges(dst)]
    in_dst = [a for a, _ in G.in_edges(dst)]
    out_src = [a for _, a in G.out_edges(src)]    
    in_src = [a for a, _ in G.in_edges(src)]
    neigh['n_common_in'] = len(set(in_src).intersection(in_dst))/sum_in_degree
    neigh['n_common_out'] = len(set(out_src).intersection(out_dst))/sum_in_degree
    return neigh

In [47]:
pagerank = nx.pagerank(G)

In [48]:
def feature_extraction(edge):
    src, dst = edge
    f_vector = []
    f_vector.append(text_similarity(src, dst))
    f_vector.append(pagerank[src])
    #f_vector.append(pagerank[dst])
    edge_hood = neighborhood(src, dst)
    # add features: out degree of src, in degree of dst, # of common in with out
    #f_vector.extend([len(edge_hood['out_src']), len(edge_hood['in_dst']), edge_hood['n_common_in'], edge_hood['n_common_out']])
    f_vector.extend(edge_hood.values())
    
    return f_vector
        
        

In [49]:
vector = feature_extraction(('efimerides.eu','cancer-society.gr'))
print vector

[0.0, 0.00029761511829001677, 0.0003339730672479524, 0.0, 0.0, 0.0, 0.02981736861721953, 0.0007454342154304882, 0.0, 0.0]


In [50]:
X_train = []
y_train = []
for edge in edges:
    X_train.append(feature_extraction(edge))
    y_train.append(1)
for edge in non_existent_edges:
    X_train.append(feature_extraction(edge))
    y_train.append(0)


In [51]:
X_predict = []
for edge in non_edges:
    X_predict.append(feature_extraction(edge))

In [52]:
X_train_n = np.array(X_train)
y_train_n = np.array(y_train)
X_predict_n = np.array(X_predict)

In [53]:
print X_train_n.shape
print y_train_n.shape
print X_predict_n.shape

(23443, 10)
(23443,)
(4140197, 10)


In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
reg = LogisticRegression()
sgd = SGDClassifier(loss="log", penalty="l2")
svm = SVC(probability=True)
reg.fit(X_train_n, y_train_n)
sgd.fit(X_train_n, y_train_n)
svm.fit(X_train_n, y_train_n)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [64]:
pred = reg.predict_proba(X_predict_n)

In [65]:
pred_sgd = sgd.predict_proba(X_predict_n)

In [66]:
pred_svm = svm.predict_proba(X_predict_n)

In [78]:
probs = []
for t in pred_svm:
    probs.append(t[1])


In [79]:
indices = sorted(range(len(probs)), key=lambda k: probs[k])[-453:]

In [80]:
predicted_edges = []
for i in range(len(indices)-1, -1, -1):
    index = indices[i]
    predicted_edges.append(non_edges[index])

In [81]:
print predicted_edges[:5]

[(u'agorazoome.gr', u'frontpages.gr'), (u'agorazoome.gr', u'google.gr'), (u'epirus-ellas.gr', u'frontpages.gr'), (u'epirus-ellas.gr', u'google.gr'), (u'portareto.gr', u'frontpages.gr')]


In [None]:
"""
new_edges = []
counter = 0
while (counter<453):
    if predicted_edges[counter][0]!='efimerides.eu' and predicted_edges[counter][0]!='pak-elta.gr':
        new_edges.append(predicted_edges[counter])
        counter+=1
        print counter"""

1


In [82]:

with open('predicted_edges_svm.txt', 'w') as fp:
    for site_1, site_2 in predicted_edges:
        fp.write(site_1+'\t'+site_2+'\n')