# MODELISATION

In [1]:
import pickle
import os
import numpy as np
import pandas as pd
import networkx as nx
import sys

from gensim.models.doc2vec import Doc2Vec
from gensim.models import KeyedVectors

## 1. Load the data

In [98]:
# Paths where we will save the datasets
data_folder = "../data"

if 'google.colab' in sys.modules:
  data_folder = "/content/drive/MyDrive/Colab Notebooks/ALTeGraD/Projet/data"
  
datasets_folder = os.path.join(data_folder, "datasets")
models_folder = os.path.join(data_folder, "models")

train_edges_path = os.path.join(datasets_folder, 'train_graph_edgelist.txt')
train_pairs_path = os.path.join(datasets_folder, 'train_pairs.csv')
train_target_path = os.path.join(datasets_folder, 'train_target.csv')

test_edges_path = os.path.join(datasets_folder, 'test_graph_edgelist.txt')
test_pairs_path = os.path.join(datasets_folder, 'test_pairs.csv')
test_target_path = os.path.join(datasets_folder, 'test_target.csv')

node2vec_train_path = os.path.join(models_folder, 'node2vec_train_graph.model')
node2vec_test_path = os.path.join(models_folder, 'node2vec_test_graph.model')
node2vec_full_graph_path = os.path.join(models_folder, 'node2vec_full_graph.nodevectors')

doc2vec_path = os.path.join(models_folder, "doc2vec_dm_64.model")

In [77]:
# Read datasets
train_pairs = pd.read_csv(train_pairs_path, names=['node_1', 'node_2'])
train_target = pd.read_csv(train_target_path, names=['target']).to_numpy().ravel()

test_pairs = pd.read_csv(test_pairs_path, names=['node_1', 'node_2'])
test_target = pd.read_csv(test_target_path, names=['target']).to_numpy().ravel()

challenge_pairs = pd.read_csv("../data/initial_data/test.txt", names=['node_1', 'node_2'])

In [78]:
# Load the training graph
train_graph = nx.read_edgelist(train_edges_path, delimiter=',', nodetype=int)

# Load the test graph
test_graph = nx.read_edgelist(test_edges_path, delimiter=',', nodetype=int)

# Load the full graph
graph = nx.read_edgelist("../data/initial_data/edgelist.txt", delimiter=',', nodetype=int)

In [7]:
# Read the abstract of each paper
abstracts = dict()
with open('../data/initial_data/abstracts.txt', 'r') as f:
    for line in f:
        node, abstract = line.split('|--|')
        abstracts[int(node)] = abstract

# Read the authors to each paper
authors = dict()
with open('../data/authors_processed/authors_ids.txt', 'r') as f:
    for line in f:
        node, node_authors = line.rstrip('\n').split('|--|')
        authors[int(node)] = node_authors.split(',')


In [99]:
# Read the Doc2vec model
doc2vec_model = Doc2Vec.load(doc2vec_path)

# Read nodes embeddings
node2vec_train = KeyedVectors.load_word2vec_format(node2vec_train_path)
node2vec_test = KeyedVectors.load_word2vec_format(node2vec_test_path)
node2vec_full_graph = KeyedVectors.load_word2vec_format(node2vec_full_graph_path)

## 2. Function to compute the features

### 2.1. Semantic Features

####  Cosine similarity of abstract embeddings

In [8]:
def cosine_similarity_doc2vec(paper_1, paper_2):
    """
    Computes the cosine similarity between the abstract embeddings of two papers/nodes.
    """
    # The model was trained on all the papers 
    return doc2vec_model.docvecs.similarity(paper_1, paper_2)

### 2.2. Attribute Features

#### Number of common authors

In [9]:
def common_authors(paper_1, paper_2):
    """
    Computes the number of common author between two papers/nodes.
    """
    # The model was trained on all the papers 
    return len(set(authors[paper_1]) & set(authors[paper_2]))

### 2.2. Topological features

#### Difference in degree

In [21]:
def abs_diff_degree(node_1, node_2, G):
    """
    Computes the difference in degree of two nodes in a graph.
    """
    return abs(G.degree(node_1) - G.degree(node_2))

#### Sum of degree

In [22]:
def sum_degree(node_1, node_2, G):
    """
    Computes the difference in degree of two nodes in a graph.
    """
    return G.degree(node_1) + G.degree(node_2)

#### Jaccard coefficient

In [10]:
def jaccard_coefficient(node_1, node_2, G):
    """
    Computes the jaccard coefficient of two nodes in a graph.
    """
    _, _, coeff = list(nx.jaccard_coefficient(G, [(node_1, node_2)]))[0]
    return coeff

#### Adamic Adar Index

In [11]:
def adamic_adar_index(node_1, node_2, G):
    """
    Computes the adamic adar index of two nodes in a graph.
    """
    _, _, index = list(nx.adamic_adar_index(G, [(node_1, node_2)]))[0]
    return index

#### Shortest path length

In [12]:
def shortest_path_length(node_1, node_2, G):
    """
    Computes the shortest path length between two nodes in a graph.
    """
    try:
        length = nx.shortest_path_length(G, node_1, node_2)
    except nx.NetworkXNoPath:
        length = -1
    return length

### 2.3. Nodes embeddings

In [108]:
def get_embedding(u, node_vectors):
    """
    Get the embedding of a node from node2vec model.
    """
    return node_vectors[str(u)]

def link_examples_to_features(link_examples, node_vectors, binary_operator):
    return [
        binary_operator(get_embedding(src, node_vectors), get_embedding(dst, node_vectors))
        for src, dst in link_examples
    ]

def operator_hadamard(u, v):
    return u * v


def operator_l1(u, v):
    return np.abs(u - v)


def operator_l2(u, v):
    return (u - v) ** 2


def operator_avg(u, v):
    return (u + v) / 2.0

## 3. Computing features

In [14]:
X_train = train_pairs.copy()
X_test = test_pairs.copy()
X_challenge = challenge_pairs.copy()

In [83]:
def compute_features(df, G):
    df["n_common_authors"] = df.apply(
        lambda row: common_authors(row['node_1'], row['node_2']), axis=1
    )

    df["abstract_similarity"] = df.apply(
        lambda row: cosine_similarity_doc2vec(row['node_1'], row['node_2']), axis=1
    )

    df["jaccard_coeff"] = df.apply(
        lambda row: jaccard_coefficient(row['node_1'], row['node_2'], G), axis=1
    )

    df["adamic_adar_index"] = df.apply(
        lambda row: adamic_adar_index(row['node_1'], row['node_2'], G), axis=1
    )
    
    df["shortest_path_length"] = df.apply(
        lambda row: shortest_path_length(row['node_1'], row['node_2'], G), axis=1
    )

    df["abs_diff_degree"] = df.apply(
        lambda row: abs_diff_degree(row['node_1'], row['node_2'], G), axis=1
    )

    df["sum_degree"] = df.apply(
        lambda row: sum_degree(row['node_1'], row['node_2'], G), axis=1
    )    

In [84]:
compute_features(X_train, graph)
compute_features(X_test, graph)
compute_features(X_challenge, graph)

## 4. Model

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss, confusion_matrix
from sklearn.preprocessing import StandardScaler

from xgboost import XGBClassifier

In [27]:
X_train.head()

Unnamed: 0,node_1,node_2,n_common_authors,abstract_similarity,jaccard_coeff,adamic_adar_index,shortest_path_length,abs_diff_degree,sum_degree
0,23960,21336,0,0.079334,0.0,0.0,3,9,33
1,15506,41314,0,0.173078,0.0,0.0,4,16,30
2,2792,37899,2,0.403547,0.052632,0.180084,2,4,20
3,111658,111203,0,0.436967,0.0,0.0,3,0,8
4,64879,77131,0,0.386424,0.03125,0.264257,2,23,33


In [85]:
X_train.drop(columns=['node_1', 'node_2'], inplace=True)
X_test.drop(columns=['node_1', 'node_2'], inplace=True)
X_challenge.drop(columns=['node_1', 'node_2'], inplace=True)

In [29]:
X_train.head()

Unnamed: 0,n_common_authors,abstract_similarity,jaccard_coeff,adamic_adar_index,shortest_path_length,abs_diff_degree,sum_degree
0,0,0.079334,0.0,0.0,3,9,33
1,0,0.173078,0.0,0.0,4,16,30
2,2,0.403547,0.052632,0.180084,2,4,20
3,0,0.436967,0.0,0.0,3,0,8
4,0,0.386424,0.03125,0.264257,2,23,33


### Add the corresponding link embedding

In [129]:
X_train_node_emb = link_examples_to_features(
    train_pairs.to_numpy(),
    node2vec_train,
    binary_operator=operator_l1
)

X_train_c = np.concatenate(
    (X_train.to_numpy(), np.array(X_train_node_emb)),
    axis=1
)
print("Shape of X_train:", X_train_c.shape)

Shape of X_train: (349424, 135)


In [130]:
X_test_node_emb = link_examples_to_features(
    test_pairs.to_numpy(),
    node2vec_test,
    binary_operator=operator_l1
)

X_test_c = np.concatenate(
    (X_test.to_numpy(), np.array(X_test_node_emb)),
    axis=1
)
print("Shape of X_test:", X_test_c.shape)

Shape of X_test: (436782, 135)


In [131]:
X_challenge_node_emb = link_examples_to_features(
    challenge_pairs.to_numpy(),
    node2vec_full_graph,
    binary_operator=operator_l1
)

X_challenge_c = np.concatenate(
    (X_challenge.to_numpy(), np.array(X_challenge_node_emb)),
    axis=1
)
print("Shape of X_challenge:", X_challenge_c.shape)

Shape of X_challenge: (106692, 135)


In [132]:
X_new_c = np.concatenate((X_train_c, X_test_c))
y_new = np.concatenate((train_target, test_target))

In [133]:
scaler = StandardScaler()
X_new_sc = scaler.fit_transform(X_new_c)
X_challenge_sc = scaler.transform(X_challenge_c)

In [121]:
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train_c)
X_test_sc = scaler.transform(X_test_c)
X_challenge_sc = scaler.transform(X_challenge_sc)

In [134]:
lr_clf = LogisticRegression(max_iter=2000)
lr_clf.fit(X_new_sc, y_new)

LogisticRegression(max_iter=2000)

In [137]:
log_loss(train_target, lr_clf.predict_proba(X_train_sc)[:,1])

2.8904744499322637

In [135]:
log_loss(y_new, lr_clf.predict_proba(X_new_sc)[:,1])

0.13622079791101255

In [140]:
log_loss(test_target, lr_clf.predict_proba(X_test_sc)[:,1])

0.13233471812878958

In [139]:
import csv
y_pred = lr_clf.predict_proba(X_challenge_sc)
y_pred = y_pred[:,1]

# Write predictions to a file
predictions = zip(range(len(y_pred)), y_pred)
with open("submission.csv","w") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(['id','predicted'])
    for row in predictions:
        csv_out.writerow(row) 