# COMPUTE ALL FEATURES

In [1]:
import pickle
import os
import numpy as np
import pandas as pd
import networkx as nx
import sys
import time

from gensim.models.doc2vec import Doc2Vec
from gensim.models import KeyedVectors

import paths

## 1. Load the data

In [2]:
# Node pairs and targets are read from CSV with explicit column names.
pair_columns = ['node_1', 'node_2']

# Train split: candidate pairs and their labels (flattened to a 1-D array)
train_pairs = pd.read_csv(paths.TRAIN_PAIRS_PATH, names=pair_columns)
train_target = (
    pd.read_csv(paths.TRAIN_TARGET_PATH, names=['target'])
    .to_numpy()
    .ravel()
)

# Test split: same layout as the train split
test_pairs = pd.read_csv(paths.TEST_PAIRS_PATH, names=pair_columns)
test_target = (
    pd.read_csv(paths.TEST_TARGET_PATH, names=['target'])
    .to_numpy()
    .ravel()
)

# Pairs for the Kaggle challenge (no target file is read for these)
challenge_pairs = pd.read_csv(paths.CHALLENGE_PAIRS_PATH, names=pair_columns)

In [3]:
# Load the training, test and full graphs (in that order) from their
# comma-separated edge lists, with node labels parsed as ints.
train_graph, test_graph, full_graph = (
    nx.read_edgelist(edgelist_path, delimiter=',', nodetype=int)
    for edgelist_path in (
        paths.TRAIN_EDGELIST_PATH,
        paths.TEST_EDGELIST_PATH,
        paths.FULL_GRAPH_EDGELIST_PATH,
    )
)

In [4]:
# Read the abstract of each paper.
# Each line has the form "<node id>|--|<abstract text>".
abstracts = dict()
with open(paths.ABSTRACTS_PATH, 'r') as f:
    for line in f:
        # Split on the FIRST separator only: the abstract is free text, and an
        # abstract that happens to contain '|--|' would otherwise make the
        # two-name unpacking raise ValueError.
        node, abstract = line.split('|--|', 1)
        # The line is not stripped, so the stored abstract keeps its newline.
        abstracts[int(node)] = abstract

# Read the authors of each paper.
# Each line has the form "<node id>|--|<author>,<author>,...".
authors = dict()
with open(paths.PAPER_2_AUTHORS_ID_PATH, 'r') as f:
    for line in f:
        node, node_authors = line.rstrip('\n').split('|--|', 1)
        # Author entries are kept as strings, in file order.
        authors[int(node)] = node_authors.split(',')



In [5]:
# Doc2Vec model used for the abstract-similarity feature
doc2vec_model = Doc2Vec.load(paths.DOC2VEC_PATH)

# Node embeddings stored in word2vec text format, one file per graph
# (train, test, full), loaded in that order.
node2vec_train, node2vec_test, node2vec_full_graph = (
    KeyedVectors.load_word2vec_format(embedding_path)
    for embedding_path in (
        paths.NODE2VEC_TRAIN_PATH,
        paths.NODE2VEC_TEST_PATH,
        paths.NODE2VEC_FULL_GRAPH_PATH,
    )
)

## 2. Semantic, Attribute and Graph-based (paper citation network) Features

### 2.1. Semantic Features

####  Cosine similarity of abstract embeddings

In [7]:
def cosine_similarity_doc2vec(paper_1, paper_2):
    """
    Similarity between the Doc2Vec abstract embeddings of two papers/nodes.

    Both ids are looked up in the document vectors of the module-level
    ``doc2vec_model`` and the result of its ``similarity`` method is returned.
    """
    # Per the original notebook's note, the model was trained on all papers,
    # so no per-split model is selected here.
    document_vectors = doc2vec_model.docvecs
    similarity = document_vectors.similarity(paper_1, paper_2)
    return similarity

### 2.2. Attribute Features

#### Number of common authors

In [8]:
def common_authors(paper_1, paper_2):
    """
    Computes the number of common authors between two papers/nodes.

    Author lists are read from the module-level ``authors`` mapping
    (paper id -> list of author id strings).

    Parameters
    ----------
    paper_1, paper_2 : keys of ``authors``

    Returns
    -------
    int
        Size of the intersection of the two author sets.

    Raises
    ------
    KeyError
        If either paper id is missing from ``authors``.
    """
    shared = set(authors[paper_1]) & set(authors[paper_2])
    # A line with nothing after the separator is parsed as [''] by
    # ''.split(','); the empty string is not an author and must not make two
    # author-less papers look like they share one.
    shared.discard('')
    return len(shared)

### 2.3. Graph based features

#### Degree approaches

In [9]:
def abs_diff_degree(node_1, node_2, G):
    """
    Absolute gap between the degrees of two nodes of graph ``G``.

    The result is symmetric in ``node_1`` and ``node_2``.
    """
    degree_1 = G.degree(node_1)
    degree_2 = G.degree(node_2)
    return abs(degree_1 - degree_2)


def sum_degree(node_1, node_2, G):
    """
    Sum of the degrees of two nodes of graph ``G``.
    """
    degree_1 = G.degree(node_1)
    degree_2 = G.degree(node_2)
    return degree_1 + degree_2

#### Local based similarity measures

In [10]:
def jaccard_coefficient(node_1, node_2, G):
    """
    Jaccard coefficient of the pair (node_1, node_2) in graph ``G``.

    Delegates to ``nx.jaccard_coefficient`` for this single pair and returns
    the score, i.e. the third element of the ``(u, v, score)`` triple it yields.
    """
    pair = [(node_1, node_2)]
    # Exactly one pair is requested, so exactly one triple is unpacked.
    (scored_pair,) = nx.jaccard_coefficient(G, pair)
    return scored_pair[2]


def adamic_adar_index(node_1, node_2, G):
    """
    Adamic-Adar index of the pair (node_1, node_2) in graph ``G``.

    Delegates to ``nx.adamic_adar_index`` for this single pair and returns
    the score, i.e. the third element of the ``(u, v, score)`` triple it yields.
    """
    pair = [(node_1, node_2)]
    # Exactly one pair is requested, so exactly one triple is unpacked.
    (scored_pair,) = nx.adamic_adar_index(G, pair)
    return scored_pair[2]


def pref_attachment(node_1, node_2, G):
    """
    Preferential attachment score of the pair (node_1, node_2) in graph ``G``.

    Delegates to ``nx.preferential_attachment`` for this single pair and
    returns the score, i.e. the third element of the ``(u, v, score)`` triple
    it yields.
    """
    pair = [(node_1, node_2)]
    # Exactly one pair is requested, so exactly one triple is unpacked.
    (scored_pair,) = nx.preferential_attachment(G, pair)
    return scored_pair[2]


def salton_index(node_1, node_2, G):
    """
    Computes the Salton index (cosine similarity) of two nodes in a graph:

        |N(node_1) & N(node_2)| / sqrt(deg(node_1) * deg(node_2))

    Parameters
    ----------
    node_1, node_2 : nodes of ``G``
    G : graph exposing ``degree(node)`` and accepted by ``nx.common_neighbors``

    Returns
    -------
    0 when either node has degree 0 (the ratio is undefined), otherwise the
    number of common neighbours divided by the square root of the degree
    product.
    """
    # The denominator must combine the degrees of BOTH nodes; the previous
    # version multiplied deg(node_1) by itself, which made the score
    # asymmetric and wrong whenever the two degrees differ.
    degree_product = G.degree(node_1) * G.degree(node_2)
    if degree_product == 0:
        return 0
    # Count by iteration so this works whether common_neighbors returns a
    # generator or a collection.
    n_common = sum(1 for _ in nx.common_neighbors(G, node_1, node_2))
    return n_common / np.sqrt(degree_product)

#### Global based similarity measures

In [11]:
def shortest_path_length(node_1, node_2, G):
    """
    Length of the shortest path between two nodes of graph ``G``.

    Returns -1 as a sentinel when ``nx.shortest_path_length`` raises
    ``nx.NetworkXNoPath``; any other exception propagates.
    """
    try:
        return nx.shortest_path_length(G, node_1, node_2)
    except nx.NetworkXNoPath:
        # No path between the two nodes
        return -1


def diff_pagerank(node_1, node_2, pageranks):
    """
    Absolute difference between the PageRank scores of two nodes.

    ``pageranks`` is indexed by node (e.g. a node -> score mapping).
    """
    gap = pageranks[node_1] - pageranks[node_2]
    return np.abs(gap)


def diff_eigvec_centrality(node_1, node_2, eigvec_centrality):
    """
    Absolute difference between the eigenvector centralities of two nodes.

    ``eigvec_centrality`` is indexed by node (e.g. a node -> score mapping).
    """
    gap = eigvec_centrality[node_1] - eigvec_centrality[node_2]
    return np.abs(gap)


def diff_bet_centrality(node_1, node_2, bet_centrality):
    """
    Absolute difference between the betweenness centralities of two nodes.

    ``bet_centrality`` is indexed by node (e.g. a node -> score mapping).
    """
    gap = bet_centrality[node_1] - bet_centrality[node_2]
    return np.abs(gap)

### 2.4. Computing features

In [34]:
def compute_features(df, G, name):
    """
    Adds the stage-1 pair features to ``df`` IN PLACE, one column per feature.

    Columns are added in this order: n_common_authors, abstract_similarity,
    abs_diff_degree, sum_degree, jaccard_coeff, adamic_adar_index,
    pref_attachment, salton_index, shortest_path_length,
    diff_eigvec_centrality, diff_pagerank.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'node_1' and 'node_2'; it is modified in place.
    G : graph
        Graph handed to the graph-based feature functions and to
        ``nx.eigenvector_centrality`` / ``nx.pagerank``.
    name : str
        Label of the data set, used only in the progress messages.

    Returns
    -------
    None. Progress and timings are printed.
    """
    print(f"Computing features for the {name} set")
    print('-'*50)

    # Materialise the pairs once as native Python scalars. Row-wise
    # df.apply(axis=1) builds one Series per row and, as soon as a float
    # feature column exists, upcasts the whole row (node ids included) to
    # float; it also misbehaves on an empty frame. Iterating the two id
    # columns directly avoids both problems and is faster.
    pairs = list(zip(df['node_1'].tolist(), df['node_2'].tolist()))

    def _add_feature(column, feature, *extra, started=None):
        # Compute feature(node_1, node_2, *extra) for every pair, store it as
        # `column`, and report the time elapsed since `started`.
        if started is None:
            started = time.time()
        df[column] = [feature(u, v, *extra) for u, v in pairs]
        print(f"{column} - done [{round(time.time() - started,1)}s]")

    t0 = time.time()

    # Attribute and semantic features (no graph needed)
    _add_feature("n_common_authors", common_authors, started=t0)
    _add_feature("abstract_similarity", cosine_similarity_doc2vec)

    # Graph features computed pair by pair
    pairwise_graph_features = (
        ("abs_diff_degree", abs_diff_degree),
        ("sum_degree", sum_degree),
        ("jaccard_coeff", jaccard_coefficient),
        ("adamic_adar_index", adamic_adar_index),
        ("pref_attachment", pref_attachment),
        ("salton_index", salton_index),
        ("shortest_path_length", shortest_path_length),
    )
    for column, feature in pairwise_graph_features:
        _add_feature(column, feature, G)

    # Centrality features: the per-node scores are computed once for the
    # whole graph, and that computation is included in the reported timing.
    t1 = time.time()
    eigvec_centrality = nx.eigenvector_centrality(G)
    _add_feature(
        "diff_eigvec_centrality", diff_eigvec_centrality, eigvec_centrality,
        started=t1
    )

    t1 = time.time()
    pageranks = nx.pagerank(G)
    _add_feature("diff_pagerank", diff_pagerank, pageranks, started=t1)

    print(f"Total time : [{round(time.time() - t0,1)}s]")


We save the features computed so far in a `pd.HDFStore`. Computing all the features is time-consuming, so we store this first batch on disk and simply read it back before computing the remaining features.

In [3]:
if not os.path.isfile(paths.STORAGE_STAGE1_PATH):

    X_train = train_pairs.copy()
    X_test = test_pairs.copy()
    X_challenge = challenge_pairs.copy()

    # Compute everything BEFORE creating the storage file. The guard above
    # only checks that the file exists, so a store created up front and left
    # half-filled by a failing step would be mistaken for a complete cache on
    # the next run (and the reads in the else branch would then fail).
    # Train features
    compute_features(X_train, train_graph, 'train')
    # Test features
    compute_features(X_test, test_graph, 'test')
    # Challenge features
    compute_features(X_challenge, full_graph, 'challenge')

    # Create the storage and write the three tables; the context manager
    # closes the store even if a write raises.
    with pd.HDFStore(paths.STORAGE_STAGE1_PATH) as hdf:
        hdf.put('X_train', X_train, format='table', data_columns=True)
        hdf.put('X_test', X_test, format='table', data_columns=True)
        hdf.put('X_challenge', X_challenge, format='table', data_columns=True)
else:
    # Read the tables if the storage exists
    X_train = pd.read_hdf(paths.STORAGE_STAGE1_PATH, key='X_train', mode='r')
    X_test = pd.read_hdf(paths.STORAGE_STAGE1_PATH, key='X_test', mode='r')
    X_challenge = pd.read_hdf(paths.STORAGE_STAGE1_PATH, key='X_challenge', mode='r')

## 3. Features from the authors networks