In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import itertools
import csv
import math

import igraph

import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words("english")
#nltk.download('punkt') # for tokenization

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing

# Import Data

In [0]:
# Read the test pairs. Each row's first CSV field is split on single spaces;
# the first two tokens are later used as the source and target node IDs.
# newline="" is the way the csv module expects its input file to be opened.
with open("testing_set.txt", "r", newline="") as f:
    reader = csv.reader(f)
    test = list(reader)
# csv.reader yields [] for a blank line (e.g. a trailing empty line); skip
# those rows instead of failing with IndexError on element[0].
test = [element[0].split(" ") for element in test if element]

In [0]:
# Read the training pairs. Each row's first CSV field is split on single
# spaces; the tokens are later used as source ID, target ID and label ("1"
# marks a pair that becomes an edge of the graph).
# newline="" is the way the csv module expects its input file to be opened.
with open("training_set.txt", "r", newline="") as f:
    reader = csv.reader(f)
    train = list(reader)
# csv.reader yields [] for a blank line (e.g. a trailing empty line); skip
# those rows instead of failing with IndexError on element[0].
train = [element[0].split(" ") for element in train if element]

In [0]:
# Read the per-node metadata: one CSV row per node, kept as a list of strings.
# newline="" lets the csv module do its own line-ending handling, which is
# needed for quoted fields that contain embedded newlines to parse correctly.
with open("node_information.csv", "r", newline="") as f:
    reader = csv.reader(f)
    node_info = list(reader)

In [0]:
# Split node_info into six parallel lists, one per CSV column (columns 0..5).
# Position k of every list refers to the same row node_info[k].
ID, year, title, authors, journal, abstract = (
    [record[column] for record in node_info] for column in range(6)
)

# Preprocessing

### Textual Preprocessing

In [0]:
# Compute one TF-IDF matrix per text field (title / authors / journal / abstract).
# Row k of every matrix corresponds to node_info[k].

# min_df=1 keeps every term that occurs in at least one document, i.e. nothing
# is filtered out. The previous integer value 0 had the same effect but is
# rejected by the parameter validation of recent scikit-learn releases.
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words="english")

# fit_transform re-fits the vectorizer on each call, so every field gets its own
# vocabulary; after this cell `vectorizer` holds the vocabulary of the abstracts.
TFIDF_title = vectorizer.fit_transform(title)
TFIDF_author = vectorizer.fit_transform(authors)
TFIDF_journal = vectorizer.fit_transform(journal)
TFIDF_abstract = vectorizer.fit_transform(abstract)

### Graph Preprocessing

In [0]:
                                       # -----Build graph ---- #

# Start from an empty directed graph.
g = igraph.Graph(directed=True)

# One vertex per node ID; igraph uses the given strings as vertex names, so
# vertex k of the graph corresponds to ID[k].
nodes = ID
g.add_vertices(nodes)

# Only training pairs labelled "1" become edges (source -> target); the
# endpoints are referenced by vertex name.
edges = [tuple(row[:2]) for row in train if row[2] == "1"]
g.add_edges(edges)

In [0]:
                           # ----- Generate features in the graph ---- #

# Neighbourhood of every vertex as a set: entry k holds the neighbours of
# vertex k. mode="ALL" asks igraph for neighbours regardless of edge direction.
adjlist = list(map(set, g.get_adjlist(mode="ALL")))

# Degree of every vertex, indexed like adjlist.
degrees = g.degree()

# PageRank score of every vertex, indexed like adjlist.
page_rank = g.pagerank()

# Create Features

### Train set

In [0]:
                                        # ---- Textual Features ---- #
# ---- Textual features: one value is appended per training pair ---- #
(
    overlapping_words_in_title,     # number of overlapping words in title
    temp_distance,                  # temporal distance between the papers
    common_authors,                 # number of common authors
    common_words_in_journal,        # number of common words in journal
    overlapping_words_in_abstract,  # number of overlapping words in abstract
    sim_title,                      # cosine similarity of title
    sim_author,                     # cosine similarity of authors
    sim_journal,                    # cosine similarity of journal
    sim_abstract,                   # cosine similarity of abstract
) = ([] for _ in range(9))          # nine independent empty lists

# ---- Graphical features: one value is appended per training pair ---- #
(
    common_neighbors,   # common neighbors
    pref_attach,        # preferential attachment
    jaccard_sim,        # Jaccard similarity coefficient
    adam_adar_sim,      # Adamic Adar similarity
    page_rank_source,   # page rank
    page_rank_target,   # page rank
) = ([] for _ in range(6))          # six independent empty lists
# shortest_paths = []   (feature currently disabled)

In [0]:
def jaccard_similarity(source, target, graph):
    """Jaccard coefficient of the neighbour sets of two vertices.

    Args:
        source: vertex handed to ``graph.neighbors``.
        target: vertex handed to ``graph.neighbors``.
        graph: object exposing ``neighbors(vertex)`` returning an iterable.

    Returns:
        Size of the intersection of both neighbour sets divided by the size
        of their union, as a float; 0.0 when neither vertex has a neighbour.
    """
    neighbours_s = set(graph.neighbors(source))
    neighbours_t = set(graph.neighbors(target))
    union_size = len(neighbours_s | neighbours_t)
    if not union_size:
        # Both neighbourhoods are empty: avoid dividing by zero.
        return 0.0
    shared_size = len(neighbours_s & neighbours_t)
    return shared_size / float(union_size)

        #-----------------#
    
def adam_adar_similarity(source, target, graph):
    s_neighbors = set(graph.neighbors(source))
    t_neighbors = set(graph.neighbors(target))
    sim = 0.0
    for i in s_neighbors.intersection(t_neighbors):
        if math.log(len(graph.neighbors(i))) == 0:
            sim += 0.0
        else:
            sim += float(1 / math.log(len(graph.neighbors(i))))
    return sim

In [0]:
# create features for train set
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()
count = 0

# Position of every node ID in ID / node_info, built once so each pair costs
# two dictionary lookups instead of linear scans over ID and node_info.
# setdefault keeps the first occurrence of an ID, which is the position
# ID.index() would return.
id_to_index = {}
for position, node_id in enumerate(ID):
    id_to_index.setdefault(node_id, position)

for i in range(len(train)):
    source = train[i][0]
    target = train[i][1]
    index_source = id_to_index[source]
    index_target = id_to_index[target]

    # ID[k] is node_info[k][0], so the same position selects the metadata row.
    source_info = node_info[index_source]
    target_info = node_info[index_target]

    # preprocessing title: lowercase, split on spaces, drop stopwords and the
    # empty tokens produced by empty fields or repeated spaces, then stem
    source_title = [stemmer.stem(token) for token in source_info[2].lower().split(" ")
                    if token and token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_info[2].lower().split(" ")
                    if token and token not in stpwds]

    # preprocessing author: comma-separated names; strip surrounding whitespace
    # and drop empty names so two papers without listed authors share nothing
    source_author = [name.strip() for name in source_info[3].split(",") if name.strip()]
    target_author = [name.strip() for name in target_info[3].split(",") if name.strip()]

    # preprocessing journal: whitespace-separated words (calling set() on the
    # raw string would compare individual characters, not words)
    source_journal = source_info[4].lower().split()
    target_journal = target_info[4].lower().split()

    # preprocessing abstract: same pipeline as the title
    source_abstract = [stemmer.stem(token) for token in source_info[5].lower().split(" ")
                       if token and token not in stpwds]
    target_abstract = [stemmer.stem(token) for token in target_info[5].lower().split(" ")
                       if token and token not in stpwds]

    # Generate textual features
    # 1
    overlapping_words_in_title.append(len(set(source_title).intersection(set(target_title))))
    # 2
    temp_distance.append(int(source_info[1]) - int(target_info[1]))
    # 3
    common_authors.append(len(set(source_author).intersection(set(target_author))))
    # 4
    common_words_in_journal.append(len(set(source_journal).intersection(set(target_journal))))
    # 5
    overlapping_words_in_abstract.append(len(set(source_abstract).intersection(set(target_abstract))))
    # 6-9: cosine_similarity of two single rows returns a 1x1 array; store the
    # scalar so every feature list holds plain numbers
    sim_title.append(float(cosine_similarity(TFIDF_title[index_source], TFIDF_title[index_target])[0, 0]))
    # 7
    sim_author.append(float(cosine_similarity(TFIDF_author[index_source], TFIDF_author[index_target])[0, 0]))
    # 8
    sim_journal.append(float(cosine_similarity(TFIDF_journal[index_source], TFIDF_journal[index_target])[0, 0]))
    # 9
    sim_abstract.append(float(cosine_similarity(TFIDF_abstract[index_source], TFIDF_abstract[index_target])[0, 0]))

    # generate graphical features
    # 10
    common_neighbors.append(len(adjlist[index_source].intersection(adjlist[index_target])))
    # 11
    pref_attach.append(int(degrees[index_source] * degrees[index_target]))
    # 12
    jaccard_sim.append(jaccard_similarity(index_source, index_target, g))
    # 13
    adam_adar_sim.append(adam_adar_similarity(index_source, index_target, g))
    # 14: PageRank of the source vertex (previously the target's score was stored here)
    page_rank_source.append(page_rank[index_source])
    # 15: PageRank of the target vertex (previously the source's score was stored here)
    page_rank_target.append(page_rank[index_target])
    # 16
    #shortest_paths_t.append(len(g.shortest_paths_dijkstra(source=index_source, target=index_target, weights=None, mode=1)))

    # progress report after every 5000 processed pairs
    count += 1
    if count % 5000 == 0:
        print(count,'training examples processed')

In [0]:
# merge all the features and save
# Stack the 15 feature lists into a (number of pairs, 15) float matrix, one
# column per feature in the order listed below.
# Every list is first converted to a flat float vector: entries may be plain
# numbers or 1x1 arrays (what cosine_similarity returns for two single rows),
# and mixing both in one nested list is ragged input that NumPy refuses to
# turn into an array.
train_features = np.column_stack(
    [np.asarray(feature, dtype=np.float64).reshape(-1) for feature in
     [overlapping_words_in_title, temp_distance, common_authors, common_words_in_journal,
      overlapping_words_in_abstract, sim_title, sim_author, sim_journal, sim_abstract,
      common_neighbors, pref_attach, jaccard_sim, adam_adar_sim, page_rank_source,
      page_rank_target]])


# save
np.savetxt('train_features.txt', train_features)

### Test set

In [0]:
# Containers for the test-set features; one value is appended per test pair.

# ---- Textual features ---- #
(
    overlapping_words_in_title_t,     # number of overlapping words in title
    temp_distance_t,                  # temporal distance between the papers
    common_authors_t,                 # number of common authors
    common_words_in_journal_t,        # number of common words in journal
    overlapping_words_in_abstract_t,  # number of overlapping words in abstract
    sim_title_t,                      # cosine similarity of title
    sim_author_t,                     # cosine similarity of authors
    sim_journal_t,                    # cosine similarity of journal
    sim_abstract_t,                   # cosine similarity of abstract
) = ([] for _ in range(9))            # nine independent empty lists

# ---- Graphical features ---- #
(
    common_neighbors_t,   # common neighbors
    pref_attach_t,        # preferential attachment
    jaccard_sim_t,        # Jaccard similarity coefficient
    adam_adar_sim_t,      # Adamic Adar similarity
    page_rank_source_t,   # page rank
    page_rank_target_t,   # page rank
) = ([] for _ in range(6))            # six independent empty lists
# shortest_paths_t = []   (feature currently disabled)

In [0]:
# create features for test set (same feature definitions as for the train set)
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()
count = 0

# Position of every node ID in ID / node_info, built once so each pair costs
# two dictionary lookups instead of linear scans over ID and node_info.
# setdefault keeps the first occurrence of an ID, which is the position
# ID.index() would return.
id_to_index = {}
for position, node_id in enumerate(ID):
    id_to_index.setdefault(node_id, position)

for i in range(len(test)):
    source = test[i][0]
    target = test[i][1]
    index_source = id_to_index[source]
    index_target = id_to_index[target]

    # ID[k] is node_info[k][0], so the same position selects the metadata row.
    source_info = node_info[index_source]
    target_info = node_info[index_target]

    # preprocessing title: lowercase, split on spaces, drop stopwords and the
    # empty tokens produced by empty fields or repeated spaces, then stem
    source_title = [stemmer.stem(token) for token in source_info[2].lower().split(" ")
                    if token and token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_info[2].lower().split(" ")
                    if token and token not in stpwds]

    # preprocessing author: comma-separated names; strip surrounding whitespace
    # and drop empty names so two papers without listed authors share nothing
    source_author = [name.strip() for name in source_info[3].split(",") if name.strip()]
    target_author = [name.strip() for name in target_info[3].split(",") if name.strip()]

    # preprocessing journal: whitespace-separated words (calling set() on the
    # raw string would compare individual characters, not words)
    source_journal = source_info[4].lower().split()
    target_journal = target_info[4].lower().split()

    # preprocessing abstract: same pipeline as the title
    source_abstract = [stemmer.stem(token) for token in source_info[5].lower().split(" ")
                       if token and token not in stpwds]
    target_abstract = [stemmer.stem(token) for token in target_info[5].lower().split(" ")
                       if token and token not in stpwds]

    # Generate textual features
    # 1
    overlapping_words_in_title_t.append(len(set(source_title).intersection(set(target_title))))
    # 2
    temp_distance_t.append(int(source_info[1]) - int(target_info[1]))
    # 3
    common_authors_t.append(len(set(source_author).intersection(set(target_author))))
    # 4
    common_words_in_journal_t.append(len(set(source_journal).intersection(set(target_journal))))
    # 5
    overlapping_words_in_abstract_t.append(len(set(source_abstract).intersection(set(target_abstract))))
    # 6-9: cosine_similarity of two single rows returns a 1x1 array; store the
    # scalar so every feature list holds plain numbers
    sim_title_t.append(float(cosine_similarity(TFIDF_title[index_source], TFIDF_title[index_target])[0, 0]))
    # 7
    sim_author_t.append(float(cosine_similarity(TFIDF_author[index_source], TFIDF_author[index_target])[0, 0]))
    # 8
    sim_journal_t.append(float(cosine_similarity(TFIDF_journal[index_source], TFIDF_journal[index_target])[0, 0]))
    # 9
    sim_abstract_t.append(float(cosine_similarity(TFIDF_abstract[index_source], TFIDF_abstract[index_target])[0, 0]))

    # generate graphical features
    # 10
    common_neighbors_t.append(len(adjlist[index_source].intersection(adjlist[index_target])))
    # 11
    pref_attach_t.append(int(degrees[index_source] * degrees[index_target]))
    # 12
    jaccard_sim_t.append(jaccard_similarity(index_source, index_target, g))
    # 13
    adam_adar_sim_t.append(adam_adar_similarity(index_source, index_target, g))
    # 14: PageRank of the source vertex (previously the target's score was stored here)
    page_rank_source_t.append(page_rank[index_source])
    # 15: PageRank of the target vertex (previously the source's score was stored here)
    page_rank_target_t.append(page_rank[index_target])
    #
    #shortest_paths_t.append(len(g.shortest_paths_dijkstra(source=index_source, target=index_target, weights=None, mode=1)))

    # progress report after every 5000 processed pairs
    count += 1
    if count % 5000 == 0:
        print(count,'testing examples processed')

1 training examples processed
5001 training examples processed
10001 training examples processed
15001 training examples processed
20001 training examples processed
25001 training examples processed
30001 training examples processed


In [0]:
# merge all the features and save
# Stack the 15 feature lists into a (number of pairs, 15) float matrix, one
# column per feature in the order listed below.
# Every list is first converted to a flat float vector: entries may be plain
# numbers or 1x1 arrays (what cosine_similarity returns for two single rows),
# and mixing both in one nested list is ragged input that NumPy refuses to
# turn into an array.
test_features = np.column_stack(
    [np.asarray(feature, dtype=np.float64).reshape(-1) for feature in
     [overlapping_words_in_title_t, temp_distance_t, common_authors_t, common_words_in_journal_t,
      overlapping_words_in_abstract_t, sim_title_t, sim_author_t, sim_journal_t, sim_abstract_t,
      common_neighbors_t, pref_attach_t, jaccard_sim_t, adam_adar_sim_t, page_rank_source_t,
      page_rank_target_t]])

# save
np.savetxt('test_features.txt', test_features)