In [None]:
# https://www.kaggle.com/c/ngsa-2018

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import itertools

from gensim import corpora, models, similarities
import networkx as nx
from networkx.algorithms.connectivity import local_edge_connectivity
from networkx.algorithms.connectivity import build_auxiliary_edge_connectivity
from networkx.algorithms.flow import build_residual_network
import igraph

import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words("english")
#nltk.download('punkt') # for tokenization

from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing

# Import data

In [2]:
nodes_header = ["id", "year", "title", "authors", "journal", "abstract"]
node_info = pd.read_csv('node_information.csv',names=nodes_header)
node_info.set_index("id", inplace=True)

In [33]:
node_info.head()

Unnamed: 0_level_0,year,title,authors,journal,abstract,clean_titles,clean_abstracts,split_authors
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1001,2000,compactification geometry and duality,Paul S. Aspinwall,,these are notes based on lectures given at tas...,"[compactif, geometri, dualiti]","[note, base, lectur, given, tasi, review, geom...",[Paul S. Aspinwall]
1002,2000,domain walls and massive gauged supergravity p...,"M. Cvetic, H. Lu, C.N. Pope",Class.Quant.Grav.,we point out that massive gauged supergravity ...,"[domain, wall, massiv, gaug, supergrav, potenti]","[point, massiv, gaug, supergrav, potenti, exam...","[M. Cvetic, H. Lu, C.N. Pope]"
1003,2000,comment on metric fluctuations in brane worlds,"Y.S. Myung, Gungwon Kang",,recently ivanov and volovich hep-th 9912242 cl...,"[comment, metric, fluctuat, brane, world]","[recent, ivanov, volovich, hep, th, claim, per...","[Y.S. Myung, Gungwon Kang]"
1004,2000,moving mirrors and thermodynamic paradoxes,Adam D. Helfer,Phys.Rev.,quantum fields responding to moving mirrors ha...,"[move, mirror, thermodynam, paradox]","[quantum, field, respond, move, mirror, predic...",[Adam D. Helfer]
1005,2000,bundles of chiral blocks and boundary conditio...,"J. Fuchs, C. Schweigert",,proceedings of lie iii clausthal july 1999 var...,"[bundl, chiral, block, boundari, condit, cft]","[proceed, lie, iii, clausthal, juli, variou, a...","[J. Fuchs, C. Schweigert]"


In [4]:
names = ["id1","id2","category"]
train = pd.read_csv('training_set.txt', names=names,delimiter=" ")
train["index"] = train["id1"].astype(str) + "|" + train["id2"].astype(str)
train.set_index("index", inplace=True)
train_size=615512

name = ["id1","id2"]
test = pd.read_csv('testing_set.txt',names=name,delimiter=" ")

# Preprossing

In [5]:
# this part is to do some cleaning on titles and abstract

stemmer = nltk.porter.PorterStemmer()

def cleaning(text):
    # split and lower the text
    text= text.apply(lambda sentence:  "".join((char if char.isalpha() else " ") for char in sentence).lower().split() )
    # remove stopwords
    text = text.apply(lambda words : [word for word in words if word not in stopwords])
    # stemmer
    text = text.apply(lambda words : [stemmer.stem(word) for word in words])
    return text

In [6]:
                                     # -----clean title----- #
titles = cleaning(node_info['title'])
node_info['clean_titles'] = titles

                                     # -----clean abstract---- #
abstracts = cleaning(node_info['abstract'])
node_info['clean_abstracts'] = abstracts

                                     # -----split authors---- #
authors = node_info['authors'].astype(str).str.split(',')
node_info['split_authors'] = authors

# Generate Features

## citation graph features

### shortest path in citation graph

In [7]:
# create graph of citations

shortest_path=[]
id1 = train['id1'].values
id2 = train['id2'].values

# create empty directed graph
g = igraph.Graph(directed=True)

# create the nodes of the graph from node_info
nodes = node_info.index.values
str_vec = np.vectorize(str)
nodes = str_vec(nodes)

# add vertices
g.add_vertices(nodes)

# create and add edges
edges = [(str(id1[i]), str(id2[i])) for i in range(len(id1)) if train['category'][i] == 1]
g.add_edges(edges)

vs = dict(zip(nodes, range(len(nodes))))

#di_network_graph = nx.DiGraph() 
#di_network_graph.add_nodes_from(nodes)
#di_network_graph.add_edges_from(edges)

#un_network_graph = nx.Graph() 
#un_network_graph.add_nodes_from(nodes)
#un_network_graph.add_edges_from(edges)

In [8]:
i = 0
print('start computing shortest path')
shortest_path = []
id1 = train['id1'].values
id2 = train['id2'].values
for idx in range(len(id1)):
    shortest_path.append(g.shortest_paths(source=str(id1[idx]), target=str(id2[idx]))[0][0])
    i += 1
    if int(i) % 1000 == 0:
        print(str(100.0*i/train_size)[:4]+"% of the task done", flush=True)
    
# categorize shortest path into different categories
shortest_dist_3 = [1 if d <= 3 else 0 for d in shortest_path]
shortest_dist_5 = [1 if d <= 5 and d > 3 else 0 for d in shortest_path]
shortest_dist_10 = [1 if d <= 10 and d > 5 else 0 for d in shortest_path]
shortest_dist_inf = [1 if d > 10 else 0 for d in shortest_path]

train['shortest_path_3'] = shortest_dist_3
train['shortest_path_5'] = shortest_dist_5
train['shortest_path_10'] = shortest_dist_10
train['shortest_path_inf'] = shortest_dist_inf

start computing shortest path
0.16% of the task done
0.32% of the task done
0.48% of the task done
0.64% of the task done
0.81% of the task done
0.97% of the task done
1.13% of the task done
1.29% of the task done
1.46% of the task done
1.62% of the task done
1.78% of the task done
1.94% of the task done
2.11% of the task done
2.27% of the task done
2.43% of the task done
2.59% of the task done
2.76% of the task done
2.92% of the task done
3.08% of the task done
3.24% of the task done
3.41% of the task done
3.57% of the task done
3.73% of the task done
3.89% of the task done
4.06% of the task done
4.22% of the task done
4.38% of the task done
4.54% of the task done
4.71% of the task done
4.87% of the task done
5.03% of the task done
5.19% of the task done
5.36% of the task done
5.52% of the task done
5.68% of the task done
5.84% of the task done
6.01% of the task done
6.17% of the task done
6.33% of the task done
6.49% of the task done
6.66% of the task done
6.82% of the task done
6.98

57.8% of the task done
58.0% of the task done
58.1% of the task done
58.3% of the task done
58.4% of the task done
58.6% of the task done
58.8% of the task done
58.9% of the task done
59.1% of the task done
59.3% of the task done
59.4% of the task done
59.6% of the task done
59.7% of the task done
59.9% of the task done
60.1% of the task done
60.2% of the task done
60.4% of the task done
60.5% of the task done
60.7% of the task done
60.9% of the task done
61.0% of the task done
61.2% of the task done
61.4% of the task done
61.5% of the task done
61.7% of the task done
61.8% of the task done
62.0% of the task done
62.2% of the task done
62.3% of the task done
62.5% of the task done
62.7% of the task done
62.8% of the task done
63.0% of the task done
63.1% of the task done
63.3% of the task done
63.5% of the task done
63.6% of the task done
63.8% of the task done
64.0% of the task done
64.1% of the task done
64.3% of the task done
64.4% of the task done
64.6% of the task done
64.8% of th

In [10]:
print('start computing shortest path in test set')
shortest_path_test = []
id1 = test['id1'].values
id2 = test['id2'].values
for idx in range(len(id1)):
    shortest_path_test.append(g.shortest_paths(source=str(id1[idx]), target=str(id2[idx]))[0][0])
    i += 1
    if int(i) % 1000 == 0:
        print(str(100.0*i/32648)[:4]+"% of the task done", flush=True)
    
# categorize shortest path into different categories
shortest_dist_3_test = [1 if d <= 3 else 0 for d in shortest_path_test]
shortest_dist_5_test = [1 if d <= 5 and d > 3 else 0 for d in shortest_path_test]
shortest_dist_10_test = [1 if d <= 10 and d > 5 else 0 for d in shortest_path_test]
shortest_dist_inf_test = [1 if d > 10 else 0 for d in shortest_path_test]

test['shortest_path_3'] = shortest_dist_3_test
test['shortest_path_5'] = shortest_dist_5_test
test['shortest_path_10'] = shortest_dist_10_test
test['shortest_path_inf'] = shortest_dist_inf_test

start computing shortest path in test set
1886% of the task done
1889% of the task done
1892% of the task done
1895% of the task done
1899% of the task done
1902% of the task done
1905% of the task done
1908% of the task done
1911% of the task done
1914% of the task done
1917% of the task done
1920% of the task done
1923% of the task done
1926% of the task done
1929% of the task done
1932% of the task done
1935% of the task done
1938% of the task done
1941% of the task done
1944% of the task done
1948% of the task done
1951% of the task done
1954% of the task done
1957% of the task done
1960% of the task done
1963% of the task done
1966% of the task done
1969% of the task done
1972% of the task done
1975% of the task done
1978% of the task done
1981% of the task done
1984% of the task done


### difference of in-degree / number of times id2 paper is citated

In [12]:
id1 = train['id1'].values
id2 = train['id2'].values
print('start computing in-degree and number of times id2 paper is citated')
degrees_vs = g.indegree()
degrees_id1 = [degrees_vs[vs[str(id1)]] for id1 in id1]
degrees_id2 = [degrees_vs[vs[str(id2)]] for id2 in id2]
diff_degrees = [a-b for a,b in zip(degrees_id1,degrees_id2)]

train['diff_indegree'] = diff_degrees
train['id2_cite'] = degrees_id2 # number of times id2 paper is cited

start computing in-degree and number of times id2 paper is citated


In [21]:
id1_t = test['id1'].values
id2_t = test['id2'].values
print('start computing in-degree and number of times id2 paper is citated')
degrees_vs = g.indegree()
degrees_id1_t = [degrees_vs[vs[str(id1_t)]] for id1_t in id1_t]
degrees_id2_t = [degrees_vs[vs[str(id2_t)]] for id2_t in id2_t]
diff_degrees_t = [a-b for a,b in zip(degrees_id1_t,degrees_id2_t)]

test['diff_indegree'] = diff_degrees_t
test['id2_cite'] = degrees_id2_t # number of times id2 paper is cited

start computing in-degree and number of times id2 paper is citated


### difference in between centrality

In [22]:
                             #---- directed between centrality difference ---- #
di_between_centrality = g.betweenness(directed=True)
di_b_centrality_id1 = np.array([di_between_centrality[vs[str(nodeid)]] for nodeid in id1])
di_b_centrality_id2 = np.array([di_between_centrality[vs[str(nodeid)]] for nodeid in id2])
diff_di_b_centrality = di_b_centrality_id2 - di_b_centrality_id1
train['diff_di_b_centrality'] = diff_di_b_centrality

                             #---- directed between centrality difference ---- #
un_between_centrality = g.betweenness(directed=False)
un_b_centrality_id1 = np.array([un_between_centrality[vs[str(nodeid)]] for nodeid in id1])
un_b_centrality_id2 = np.array([un_between_centrality[vs[str(nodeid)]] for nodeid in id2])
diff_un_b_centrality = un_b_centrality_id2 - un_b_centrality_id1
train['diff_un_b_centrality'] = diff_un_b_centrality

In [23]:
                             #---- directed between centrality difference ---- #
di_between_centrality = g.betweenness(directed=True)
di_b_centrality_id1_t = np.array([di_between_centrality[vs[str(nodeid)]] for nodeid in id1_t])
di_b_centrality_id2_t = np.array([di_between_centrality[vs[str(nodeid)]] for nodeid in id2_t])
diff_di_b_centrality_t = di_b_centrality_id2_t - di_b_centrality_id1_t
test['diff_di_b_centrality'] = diff_di_b_centrality_t

                             #---- directed between centrality difference ---- #
un_between_centrality = g.betweenness(directed=False)
un_b_centrality_id1_t = np.array([un_between_centrality[vs[str(nodeid)]] for nodeid in id1_t])
un_b_centrality_id2_t = np.array([un_between_centrality[vs[str(nodeid)]] for nodeid in id2_t])
diff_un_b_centrality_t = un_b_centrality_id2_t - un_b_centrality_id1_t
test['diff_un_b_centrality'] = diff_un_b_centrality_t

### in the same cluster or not

In [None]:
# no idea how to do it