In [0]:
# https://www.kaggle.com/c/ngsa-2018

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import itertools

from gensim import corpora, models, similarities
import networkx as nx
from networkx.algorithms.connectivity import local_edge_connectivity
from networkx.algorithms.connectivity import build_auxiliary_edge_connectivity
from networkx.algorithms.flow import build_residual_network
import igraph

import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words("english")
#nltk.download('punkt') # for tokenization

from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing

# Import data

In [0]:
nodes_header = ["id", "year", "title", "authors", "journal", "abstract"]
node_info = pd.read_csv('node_information.csv',names=nodes_header)
node_info.set_index("id", inplace=True)

In [0]:
node_info.head()

Unnamed: 0_level_0,year,title,authors,journal,abstract
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,2000,compactification geometry and duality,Paul S. Aspinwall,,these are notes based on lectures given at tas...
1002,2000,domain walls and massive gauged supergravity p...,"M. Cvetic, H. Lu, C.N. Pope",Class.Quant.Grav.,we point out that massive gauged supergravity ...
1003,2000,comment on metric fluctuations in brane worlds,"Y.S. Myung, Gungwon Kang",,recently ivanov and volovich hep-th 9912242 cl...
1004,2000,moving mirrors and thermodynamic paradoxes,Adam D. Helfer,Phys.Rev.,quantum fields responding to moving mirrors ha...
1005,2000,bundles of chiral blocks and boundary conditio...,"J. Fuchs, C. Schweigert",,proceedings of lie iii clausthal july 1999 var...


In [0]:
names = ["id1","id2","category"]
train = pd.read_csv('training_set.txt', names=names,delimiter=" ")
train["index"] = train["id1"].astype(str) + "|" + train["id2"].astype(str)
train.set_index("index", inplace=True)
train_size=615512

name = ["id1","id2"]
test = pd.read_csv('testing_set.txt',names=name,delimiter=" ")
#train["index"] = train["id1"].astype(str) + "|" + train["id2"].astype(str)
#train.set_index("index", inplace=True)

# Preprossing

In [0]:
# this part is to do some cleaning on titles and abstract

stemmer = nltk.porter.PorterStemmer()

def cleaning(text):
    # split and lower the text
    text= text.apply(lambda sentence:  "".join((char if char.isalpha() else " ") for char in sentence).lower().split() )
    # remove stopwords
    text = text.apply(lambda words : [word for word in words if word not in stopwords])
    # stemmer
    text = text.apply(lambda words : [stemmer.stem(word) for word in words])
    return text

In [0]:
                                     # -----clean title----- #
titles = cleaning(node_info['title'])
node_info['clean_titles'] = titles

                                     # -----clean abstract---- #
abstracts = cleaning(node_info['abstract'])
node_info['clean_abstracts'] = abstracts

                                     # -----split authors---- #
authors = node_info['authors'].astype(str).str.split(',')
node_info['split_authors'] = authors

# Generate Features

### overlapping words in title / overlapping words in abstract / number of common author

In [0]:
def get_overlap(id1,id2,text,split=True):
    text_1 = node_info.at[id1, text]
    text_2 = node_info.at[id2, text]
    count = len(set(text_1).intersection(set(text_2)))
    return count

                                # --- overlapping words in title --- #
print('start computing overlapping words in title')
train['overlapping_words_in_title'] = 0
for index, row in train.iterrows():
    id1, id2 = row['id1'], row['id2']
    row['overlapping_words_in_title'] = get_overlap(id1, id2, 'clean_titles')
    i += 1
    if int(i) % 100000 == 0:
        print(str(100.0*i/train_size)[:4]+"% of the task done", flush=True)

start computing overlapping words in title
536.% of the task done
552.% of the task done
568.% of the task done
584.% of the task done
601.% of the task done
617.% of the task done


In [0]:
                            # --- overlapping words in abstract --- #
print('start computing overlapping words in abstract')
train['overlapping_words_in_abstract'] = 0
for index, row in train.iterrows():
    id1, id2 = row['id1'], row['id2']
    row['overlapping_words_in_abstract'] = get_overlap(int(id1),int(id2), 'clean_abstracts')
    i += 1
    if int(i) % 100000 == 0:
        print(str(100.0*i/train_size)[:4]+"% of the task done", flush=True)
    
                                 # --- common authors --- #
print('start computing common authors')
train['common_author'] = 0
for index, row in train.iterrows():
    id1, id2 = row['id1'], row['id2']
    row['common_author'] = get_overlap(int(id1),int(id2), 'split_authors')
    i += 1
    if int(i) % 100000 == 0:
        print(str(100.0*i/train_size)[:4]+"% of the task done", flush=True)

start computing overlapping words in abstract
633.% of the task done
649.% of the task done
666.% of the task done
682.% of the task done
698.% of the task done
714.% of the task done
start computing common authors
731.% of the task done
747.% of the task done
763.% of the task done
779.% of the task done
796.% of the task done
812.% of the task done


In [0]:
                                # --- overlapping words in title --- #
print('start computing overlapping words in title')
test['overlapping_words_in_title'] = 0
for index, row in test.iterrows():
    id1, id2 = row['id1'], row['id2']
    row['overlapping_words_in_title'] = get_overlap(int(id1),int(id2), 'clean_titles')
    i += 1
    if int(i) % 100000 == 0:
        print(str(100.0*i/train_size)[:4]+"% of the task done", flush=True)
    
                              # --- overlapping words in abstract --- #
print('start computing overlapping words in abstract')
test['overlapping_words_in_abstract'] = 0
for index, row in test.iterrows():
    id1, id2 = row['id1'], row['id2']
    row['overlapping_words_in_abstract'] = get_overlap(int(id1),int(id2), 'clean_abstracts')
    i += 1
    if int(i) % 100000 == 0:
        print(str(100.0*i/train_size)[:4]+"% of the task done", flush=True)
    
                                 # --- common authors --- #
print('start computing common authors')
test['common_author'] = 0
for index, row in test.iterrows():
    id1, id2 = row['id1'], row['id2']
    row['common_author'] = get_overlap(int(id1),int(id2), 'split_authors')
    i += 1
    if int(i) % 100000 == 0:
        print(str(100.0*i/train_size)[:4]+"% of the task done", flush=True)

start computing overlapping words in title
start computing overlapping words in abstract
828.% of the task done
start computing common authors


### temporal distance between two papers

In [0]:
print('start computing temporal distance between papers')
train['tem_distance'] = 0
for index, row in train.iterrows():
    id1, id2 = row['id1'], row['id2']
    row['tem_distance'] = int(node_info['year'][id1]) - int(node_info['year'][id2])
    i += 1
    if int(i) % 100000 == 0:
        print(str(100.0*i/train_size)[:4]+"% of the task done", flush=True)

start computing temporal distance between papers
844.% of the task done
861.% of the task done
877.% of the task done
893.% of the task done
909.% of the task done
926.% of the task done


In [0]:
print('start computing temporal distance between papers')
test['tem_distance'] = 0
for index, row in test.iterrows():
    id1, id2 = row['id1'], row['id2']
    row['tem_distance'] = int(node_info['year'][id1]) - int(node_info['year'][id2])
    i += 1
    if int(i) % 100000 == 0:
        print(str(100.0*i/train_size)[:4]+"% of the task done", flush=True)

start computing temporal distance between papers
942.% of the task done


###  feature TFIDF in abstract

In [0]:
vectorizer = TfidfVectorizer(stop_words="english")
# each row is a node in the order of node_info
features_TFIDF = vectorizer.fit_transform(node_info['abstract'])

#features_TFIDF.toarray()

### cosine similarity in abstract

In [0]:
# mark*: the result is quiet different between using "abstract" to compute tf-idf and using "clean_abstracts",
# "clean abstract have been removed stopwords and done stemming".
# below is used "clean_abstracts", but I am not sure if it would be better than using "abstract".

dictionary = corpora.Dictionary(abstracts)
tfidf = models.TfidfModel(dictionary=dictionary)

def get_tf_idf_encoding(index):
    abstract = node_info.at[index, "clean_abstracts"]
    #abstract = abstract.split(" ")
    abstract = dictionary.doc2bow(abstract)
    count = tfidf[[abstract]]
    return count[0]

def my_norm(f):
    ans = 0  
    for (k, v) in f:
        ans += v**2    
    return np.sqrt(ans)
    
def cosine_distance(id1, id2):
    f1 = get_tf_idf_encoding(id1)
    f2 = get_tf_idf_encoding(id2)
    denom = my_norm(f1) * my_norm(f2)
    f1 = dict(f1)
    f2 = dict(f2)
    
    ans = 0.0
    
    for k, v in f1.items():
        if k in f2.keys():
            ans += v * f2[k]
    return ans/denom

print('start compute the cosine similarity in abstract')
i = 0
for index, row in train.iterrows():
    id1, id2 = int(row["id1"]), int(row["id2"]) 
    train.set_value(index, "cosine_distance", cosine_distance(id1, id2))
    i += 1
    if int(i) % 100000 == 0:
        print(str(100.0*i/train_size)[:4]+"% of the task done", flush=True)
    
print('start compute the cosine similarity in abstract')
i = 0
for index, row in test.iterrows():
    id1, id2 = int(row["id1"]), int(row["id2"]) 
    test.set_value(index, "cosine_distance", cosine_distance(id1, id2))
    i += 1
    if int(i) % 100000 == 0:
        print(str(100.0*i/train_size)[:4]+"% of the task done", flush=True)

start compute the cosine similarity in abstract




16.2% of the task done
32.4% of the task done
48.7% of the task done
64.9% of the task done
81.2% of the task done
97.4% of the task done
start compute the cosine similarity in abstract




### edge connectivity

H = build_auxiliary_edge_connectivity(di_network_graph)
R = build_residual_network(H, 'capacity')
di_connectivity = dict.fromkeys(di_network_graph, dict())
for u, v in itertools.combinations(di_network_graph, 2):
    k = local_edge_connectivity(di_network_graph, u, v, auxiliary=H, residual=R)
    di_connectivity[u][v] = k

H = build_auxiliary_edge_connectivity(un_network_graph)
R = build_residual_network(H, 'capacity')
un_connectivity = dict.fromkeys(un_network_graph, dict())
for u, v in itertools.combinations(un_network_graph, 2):
    k = local_edge_connectivity(un_network_graph, u, v, auxiliary=H, residual=R)
    un_connectivity[u][v] = k
            
# edge connectivity
di_connectivities = [di_connectivity[vs[str(id1)], di_connectivity[str(id2)]] for id1, id2 in zip(id1, id2)]
un_connectivities = [un_connectivity[vs[str(id1)], un_connectivity[str(id2)]] for id1, id2 in zip(id1, id2)]

train['di_connectivities'] = di_connectivities
train['un_connectivities'] = un_connectivities

### in the same cluster or not

In [0]:
# no idea how to do it

### same journal

In [0]:
id1 = train['id1'].values
id2 = train['id2'].values
journal_1 = np.array(node_info.loc[id1,'journal'])
journal_2 = np.array(node_info.loc[id2,'journal'])
na_journal = np.array((journal_1=="NaN") | (journal_2=="NaN")).astype(int)
same_journal = np.array((journal_1!="NaN") & (journal_2!="NaN") & (journal_1==journal_2)).astype(int)
diff_journal = np.array((journal_1!="NaN") & (journal_2!="NaN") & (journal_1!=journal_2)).astype(int)

train['na_journal'] = na_journal
train['same_journal'] = same_journal
train['diff_journal'] = diff_journal

In [0]:
id1_t = test['id1'].values
id2_t = test['id2'].values
journal_1_t = np.array(node_info.loc[id1_t,'journal'])
journal_2_t = np.array(node_info.loc[id2_t,'journal'])
na_journal_t = np.array((journal_1_t=="NaN") | (journal_2_t=="NaN")).astype(int)
same_journal_t = np.array((journal_1_t!="NaN") & (journal_2_t!="NaN") & (journal_1_t==journal_2_t)).astype(int)
diff_journal_t = np.array((journal_1_t!="NaN") & (journal_2_t!="NaN") & (journal_1_t!=journal_2_t)).astype(int)

test['na_journal'] = na_journal_t
test['same_journal'] = same_journal_t
test['diff_journal'] = diff_journal_t

### cosine similarity in titles