## Feature Engineering to train our models  

<b> End up in 15 Features: </b>

1) Number of overlapping words in title  
2) Temporal distance between the papers  
3) Number of common authors  
4) Number of overlapping words in journal  
5) Number of overlapping words in abstract  
6) Cosine similarity of abstract  
7) Cosine similarity of title  
8) Cosine similarity of author  
9) Cosine similarity of journal

<b> Graph Based Features</b>

10) Jaccard similarity coefficient  
11) Preferential attachment score  
12) Adamic Adar Index  
13) Common neighbours  
14) Same Cluster (community)  
15) Community resource allocation  

### Import the appropriate packages

In [1]:
import random
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
import nltk
import csv
import re
from time import time

In [2]:
#for networks

import networkx as nx
from networkx import *
#!pip install python-louvain
import community.community_louvain as community_louvain

In [3]:
nltk.download('punkt') # for tokenization
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Open the files

In [4]:
#Testing Set
with open("testing_set.txt", "r") as f:
    reader = csv.reader(f)
    testing_set  = list(reader)

testing_set = [element[0].split(" ") for element in testing_set]

In [5]:
#Training Set
with open("training_set.txt", "r") as f:
    reader = csv.reader(f)
    training_set  = list(reader)

training_set = [element[0].split(" ") for element in training_set]

In [6]:
#Node Set
with open("node_information.csv", "r") as f:
    reader = csv.reader(f)
    node_info  = list(reader)

IDs = [element[0] for element in node_info]

### Preprossesing for consine Similarity (Abstract , Title ,  Author & Journal)

In [7]:
# compute TFIDF vector of each paper Abstract
corpus = [element[5] for element in node_info] #get the abstract from node info file
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=50000) #we add the combination of two words as well, but since the output will be quite large we want to minimize to the 50.000 double of the inital
features_TFIDF_abstr = vectorizer.fit_transform(corpus)

#features_TFIDF_abstr.shape -> to check the shape

In [8]:
# compute TFIDF vector of each paper Title
corpus = [element[2] for element in node_info]  #get the title from node info file
vectorizer = TfidfVectorizer (ngram_range=(1, 2), stop_words="english", max_features=10000) #we add the combination of two words as well, but since the output will be quite large we want to minimize to the 10.0000 almostdouble of the inital
features_TFIDF_title = vectorizer.fit_transform(corpus)

In [9]:
# compute TFIDF vector of each author
corpus = [element[3] for element in node_info]
vectorizer = TfidfVectorizer (ngram_range=(1, 2), stop_words="english", max_features=20000) #we add the combination of two words as well, but since the output will be quite large we want to minimize to the 20.000 almost double of the inital
features_TFIDF_author = vectorizer.fit_transform(corpus)

In [10]:
# compute TFIDF vector of each journal
corpus = [element[4] for element in node_info]
vectorizer = TfidfVectorizer (ngram_range=(1, 2), stop_words="english") 
features_TFIDF_journal = vectorizer.fit_transform(corpus)

### Preprossesing for Features from Graphs


In [11]:
edges = [(element[0], element[1]) for element in training_set if element[2] == "1"]
# some nodes may not be connected to any other node
# hence the need to create the nodes of the graph from node_info.csv,
# not just from the edge list
nodes = IDs

In [12]:
# make the graph with the networkX library
gx = nx.Graph() 
gx.add_nodes_from(nodes)
gx.add_edges_from(edges)

In [13]:
# find communities

partition = community_louvain.best_partition(gx)

In [14]:
#assign communities at each node
for i in nodes:
    gx.nodes[i]["community"] = partition[i]

In [15]:
# function to check if they belong in the same community (if so, outpout 1)
def in_same_cluster(s, d, partition):
    if partition[s] ==  partition[d]:
        return 1
    else:
        return 0

# Initialise the process of Feature Engineering

In [16]:
# # for each training example we need to compute features  
# # in this baseline we will train the model on only 1% of the training set  

# to_keep = random.sample(range(len(training_set)), k=int(round(len(training_set)*0.01)))
# training_set_reduced = [training_set[i] for i in to_keep]

training_set_reduced= training_set

In [17]:
%%time
# Initilaise the required lists for our features

#1: Number of overlapping words in title
overlap_title = []
#2: Temporal distance between the papers
temp_diff = []
#3: Number of common authors
comm_auth = []
#4: Number of overlapping words in journal
overlap_journ = []
#5: Number of overlapping words in adstract
overlap_abstr = []
#6: Cosine similarity of abstract
cos_sim_abstr = []
#7: Cosine similarity title
cos_sim_title = []
#8: Cosine similarity author
cos_sim_author = []
#9: Cosine similarity journal
cos_sim_journal = []
#10: Jaccard similarity coefficient
jac_sim = []
#11: Preferential attachment score
pref_attac = []
#12: Adamic Adar Index
adam_index = []
#13: Common neighbors
com_neigh = []
#14: Same Cluster (community)
same_cluster=[]
#15: Community resource allocation
commun_ra = []
# Extra:  Common Neighbor and Centrality based Parameterized Algorithm(CCPA) - It was not running, maybe computational expensive 


counter = 0
for i in range(len(training_set_reduced)):
    source = training_set_reduced[i][0]
    target = training_set_reduced[i][1]
    
    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    source_info = [element for element in node_info if element[0]==source][0]
    target_info = [element for element in node_info if element[0]==target][0]
    
    ### Preprocessing of data
    #1 -> Title (source and targe) convert to lowercase, tokenize, remove stopwords and stemming
    source_title = source_info[2].lower().split(" ")
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]
    
    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]
    
   
    #2 -> Manipulate authors (source and targe) 
    source_auth = source_info[3]
    source_auth= re.sub(r'\([^\)]*(?=\.\w+$)', '', source_auth) #in some cases there were also the university denoted in parenthesis -> first close parentheses
    source_auth = re.sub(r'\([^()]*\)', '', source_auth)  #then remove it
    source_auth = source_auth.split(",") 
    
    target_auth = target_info[3]
    target_auth= re.sub(r'\([^\)]*(?=\.\w+$)', '', target_auth) #in some cases there were also the university denoted in parenthesis -> first close parentheses
    target_auth = re.sub(r'\([^()]*\)', '', target_auth)  #then remove it
    target_auth = target_auth.split(",") 
    
    
    #3-> Same process as Title for Abstract
    source_abstr = source_info[5].lower().split(" ")  
    source_abstr = [token for token in source_abstr if token not in stpwds] 
    source_abstr = [stemmer.stem(token) for token in source_abstr]  
    
    target_abstr = target_info[5].lower().split(" ")  # convert to lowercase and tokenize
    target_abstr = [token for token in target_abstr if token not in stpwds]  # remove stopwords
    target_abstr = [stemmer.stem(token) for token in target_abstr]  # perform stemming

    
    #4-> Call Journal(source and targe) 
    source_journal = source_info[4]
    target_journal = target_info[4]

    
    #Features extraction 
    #1 ->  Overlapping words in Title
    overlap_title.append(len(set(source_title).intersection(set(target_title))))
    #2 -> Difference of Publication Year
    temp_diff.append(int(source_info[1]) - int(target_info[1]))
    #3 -> Common Authors
    comm_auth.append(len(set(source_auth).intersection(set(target_auth))))
    #4 - Overlapping words in Journal
    overlap_journ.append(len(set(source_journal).intersection(set(target_journal))))
    #5 - Overlapping Abstract words
    overlap_abstr.append(len(set(source_abstr).intersection(set(target_abstr))))
    #6 - Cosine similarity of Abstract
    cos_sim_abstr.append(cosine_similarity(features_TFIDF_abstr[index_source], features_TFIDF_abstr[index_target]))
    #7 - Cosine similarity of Title
    cos_sim_title.append(cosine_similarity(features_TFIDF_title[index_source], features_TFIDF_title[index_target]))
    #8 - Cosine similarity of Author
    cos_sim_author.append(cosine_similarity(features_TFIDF_author[index_source], features_TFIDF_author[index_target]))
    #9 - Cosine similarity of Journal
    cos_sim_journal.append(cosine_similarity(features_TFIDF_journal[index_source], features_TFIDF_journal[index_target]))
    
    # From Graphs
    #10 Jaccard similarity coeffieent
    pred = nx.jaccard_coefficient(gx,[(str(source), str(target))]) #string since only this way will be able to handle 
    for u, v ,p in pred:
        jac_sim.append(p)
        
    #11: Preferential attachment score
    pref = nx.preferential_attachment(gx,[(str(source), str(target))]) #string since only this way will be able to handle
    for u, v ,p in pref:
        pref_attac.append(p)
    
    #12: Adamic Adar Index
    adam = nx.adamic_adar_index(gx,[(str(source), str(target))]) #string since only this way will be able to handle
    for u, v ,p in adam:
        adam_index.append(p)
        
    #13: common neighbors
    com_neigh.append(len(sorted(nx.common_neighbors(gx, str(source), str(target)))))
    
    #14: same cluster
    same_cluster.append(in_same_cluster(str(source), str(target), partition))
    
    #15: Community resource allocation
    cra = nx.ra_index_soundarajan_hopcroft(gx,[(str(source), str(target))]) #same as 8
    for u, v ,p in cra:
        commun_ra.append(p)
    
    counter += 1
    if counter % 10000 == True:
        print (counter, "training examples processsed")
        
print ("End of Process")

1 training examples processsed
10001 training examples processsed
20001 training examples processsed
30001 training examples processsed
40001 training examples processsed
50001 training examples processsed
60001 training examples processsed
70001 training examples processsed
80001 training examples processsed
90001 training examples processsed
100001 training examples processsed
110001 training examples processsed
120001 training examples processsed
130001 training examples processsed
140001 training examples processsed
150001 training examples processsed
160001 training examples processsed
170001 training examples processsed
180001 training examples processsed
190001 training examples processsed
200001 training examples processsed
210001 training examples processsed
220001 training examples processsed
230001 training examples processsed
240001 training examples processsed
250001 training examples processsed
260001 training examples processsed
270001 training examples processsed
280001

### Insert 15 new features in file

In [18]:
# convert list of lists into array
# documents as rows, unique words as columns (i.e., example as rows, features as columns)
training_features = np.array([overlap_title, temp_diff, comm_auth, overlap_journ, overlap_abstr, cos_sim_abstr, cos_sim_title, cos_sim_author, cos_sim_journal, jac_sim, pref_attac, adam_index, com_neigh, same_cluster, commun_ra]).T

  training_features = np.array([overlap_title, temp_diff, comm_auth, overlap_journ, overlap_abstr, cos_sim_abstr, cos_sim_title, cos_sim_author, cos_sim_journal, jac_sim, pref_attac, adam_index, com_neigh, same_cluster, commun_ra]).T


In [19]:
#scale them before save
training_features = preprocessing.scale(training_features)

In [20]:
training_features_scaled = zip(range(len(training_set)), training_features)
# header = ['overlap_title', 'temp_diff', 'comm_auth', 'overlap_journ', 'overlap_abstr', 'cos_sim_abstr', 'cos_sim_title', 'cos_sim_author', 'cos_sim_journal', 'jac_sim', 'pref_attac', 'adam_index', 'com_neigh', 'same_cluster', 'commun_ra']
with open("training_features_scaled.csv","w") as tr_feat:
    csv_out = csv.writer(tr_feat)
#     csv_out.writerow(header)
    for row in training_features:
        csv_out.writerow(row)

#### Add also labels in a file 

In [21]:
# convert labels into integers then into column array
labels = [int(element[2]) for element in training_set_reduced]
labels = list(labels)
labels_array = np.array(labels)

In [22]:
from numpy import savetxt
# save to csv file
savetxt('labels.csv', labels_array, delimiter=',')

## Same process for test

In [23]:
%%time
# test -> same process as train
# we need to compute the features for the testing set

overlap_title_test = []
temp_diff_test = []
comm_auth_test = []
overlap_journ_test = []
overlap_abstr_test = []
cos_sim_abstr_test = []
cos_sim_title_test = []
cos_sim_author_test = []
cos_sim_journal_test = []
jac_sim_test = []
pref_attac_test = []
adam_index_test = []
com_neigh_test = []
same_cluster_test=[]
commun_ra_test = []

counter = 0

for i in range(len(testing_set)):
    source = testing_set[i][0]
    target = testing_set[i][1]
    
    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    source_info = [element for element in node_info if element[0]==source][0]
    target_info = [element for element in node_info if element[0]==target][0]
    
   ### Preprocessing of data
    #1 -> Title (source and targe) convert to lowercase, tokenize, remove stopwords and stemming
    source_title = source_info[2].lower().split(" ")
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]
    
    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]
   
    #2 -> Manipulate authors (source and targe) 
    source_auth = source_info[3]
    source_auth= re.sub(r'\([^\)]*(?=\.\w+$)', '', source_auth) #in some cases there were also the university denoted in parenthesis -> first close parentheses
    source_auth = re.sub(r'\([^()]*\)', '', source_auth)  #then remove it
    source_auth = source_auth.split(",") 
    
    target_auth = target_info[3]
    target_auth= re.sub(r'\([^\)]*(?=\.\w+$)', '', target_auth) #in some cases there were also the university denoted in parenthesis -> first close parentheses
    target_auth = re.sub(r'\([^()]*\)', '', target_auth)  #then remove it
    target_auth = target_auth.split(",") 
    
    #3-> Same process as Title for Abstract
    source_abstr = source_info[5].lower().split(" ")  
    source_abstr = [token for token in source_abstr if token not in stpwds] 
    source_abstr = [stemmer.stem(token) for token in source_abstr]  
    
    target_abstr = target_info[5].lower().split(" ")  # convert to lowercase and tokenize
    target_abstr = [token for token in target_abstr if token not in stpwds]  # remove stopwords
    target_abstr = [stemmer.stem(token) for token in target_abstr]  # perform stemming

    #4-> Call Journal(source and targe) 
    source_journal = source_info[4]
    target_journal = target_info[4]

    
    #Features extraction 
    overlap_title_test.append(len(set(source_title).intersection(set(target_title))))
    temp_diff_test.append(int(source_info[1]) - int(target_info[1]))
    comm_auth_test.append(len(set(source_auth).intersection(set(target_auth))))
    overlap_journ_test.append(len(set(source_journal).intersection(set(target_journal))))
    overlap_abstr_test.append(len(set(source_abstr).intersection(set(target_abstr))))
    cos_sim_abstr_test.append(cosine_similarity(features_TFIDF_abstr[index_source], features_TFIDF_abstr[index_target]))
    cos_sim_title_test.append(cosine_similarity(features_TFIDF_title[index_source], features_TFIDF_title[index_target]))
    cos_sim_author_test.append(cosine_similarity(features_TFIDF_author[index_source], features_TFIDF_author[index_target]))
    cos_sim_journal_test.append(cosine_similarity(features_TFIDF_journal[index_source], features_TFIDF_journal[index_target]))
   
    
    #Graph Features
    #1
    pred = nx.jaccard_coefficient(gx,[(str(source), str(target))]) #string since only this way will be able to handle 
    for u, v ,p in pred:
        jac_sim_test.append(p)
        
    #2
    pref = nx.preferential_attachment(gx,[(str(source), str(target))]) #string since only this way will be able to handle
    for u, v ,p in pref:
        pref_attac_test.append(p)
    
    #3
    adam = nx.adamic_adar_index(gx,[(str(source), str(target))]) #string since only this way will be able to handle
    for u, v ,p in adam:
        adam_index_test.append(p)
        
    #4
    com_neigh_test.append(len(sorted(nx.common_neighbors(gx, str(source), str(target)))))
    
    #5
    same_cluster_test.append(in_same_cluster(str(source), str(target), partition))
    
    #6
    cra_t = nx.ra_index_soundarajan_hopcroft(gx,[(str(source), str(target))]) #same as 8
    for u, v ,p in cra_t:
        commun_ra_test.append(p)
    
    counter += 1
    if counter % 10000 == True:
        print (counter, "testing examples processsed")
        
print ("End of Process")

1 testing examples processsed
10001 testing examples processsed
20001 testing examples processsed
30001 testing examples processsed
End of Process
Wall time: 7min 16s


In [24]:
# convert list of lists into array
# documents as rows, unique words as columns (i.e., example as rows, features as columns)
testing_features = np.array([overlap_title_test,temp_diff_test,comm_auth_test, overlap_journ_test,overlap_abstr_test,cos_sim_abstr_test, cos_sim_title_test, cos_sim_author_test, cos_sim_journal_test,jac_sim_test, pref_attac_test, adam_index_test,com_neigh_test, same_cluster_test,commun_ra_test]).T

  testing_features = np.array([overlap_title_test,temp_diff_test,comm_auth_test, overlap_journ_test,overlap_abstr_test,cos_sim_abstr_test, cos_sim_title_test, cos_sim_author_test, cos_sim_journal_test,jac_sim_test, pref_attac_test, adam_index_test,com_neigh_test, same_cluster_test,commun_ra_test]).T


In [25]:
# scale
testing_features = preprocessing.scale(testing_features)

In [26]:
testing_features_scaled = zip(range(len(testing_set)), testing_features)
header=['overlap_title_test','temp_diff_test','comm_auth_test', 'overlap_journ_test','overlap_abstr_test','cos_sim_abstr_test', 'cos_sim_title_test', 'cos_sim_author_test', 'cos_sim_journal_test','jac_sim_test', 'pref_attac_test', 'adam_index_test','com_neigh_test', 'same_cluster_test','commun_ra_test']
with open("testing_features_scaled.csv","w") as test_feat:
    csv_out = csv.writer(test_feat)
#     csv_out.writerow(header)
    for row in testing_features:
        csv_out.writerow(row)