# ANALYSIS OF THE PAPER CITATION NETWORK

## 1. Analysis of the test dataset

In [1]:
import csv
import networkx as nx
import pandas as pd
import numpy as np
import random
import pickle
import os

In [2]:
# Load the full citation graph; node labels are parsed as integers.
G = nx.read_edgelist(
    '../data/initial_data/edgelist.txt',
    delimiter=',',
    nodetype=int,
)

# Basic size statistics, reused by later cells.
nodes = list(G)
n = len(nodes)
m = G.number_of_edges()

for label, count in (('Number of nodes:', n), ('Number of edges:', m)):
    print(label, count)

Number of nodes: 138499
Number of edges: 1091955


In [3]:
# Read the candidate pairs to predict. Each pair is stored as a tuple with
# the smaller node id first, so it can be compared with other pair
# collections through set operations.
with open('../data/initial_data/test.txt', 'r') as f:
    node_pairs_test = [
        tuple(sorted((int(fields[0]), int(fields[1]))))
        for fields in (line.split(',') for line in f)
    ]

In [4]:
# Graph edges as tuples with the smaller node id first, i.e. the same
# canonical form used for the test pairs.
edges = [tuple(sorted(edge)) for edge in G.edges()]

In [5]:
# Number of test pairs that already appear as edges in the graph
len(set(edges).intersection(node_pairs_test))

0

We can see that none of the node pairs in the test set are edges in the graph. We therefore don't know whether there is a citation link between the two nodes of each test pair.

## 2. Building the non-edges pairs of nodes

The non-edges will serve as the negative-class instances of our training dataset.

In [8]:
non_edges_path = "../data/datasets/list_non_edges.pkl"


def check_shortest_path_length(G, a, b):
    """Tell whether nodes `a` and `b` are more than two hops apart in `G`.

    Parameters
    ----------
    G : networkx graph
    a, b : node identifiers

    Returns
    -------
    bool
        True when the shortest path between `a` and `b` is longer than 2 or
        when no path exists at all. False otherwise, including when either
        identifier is not a node of `G` (such a pair cannot be used as a
        training instance, and the shortest-path query would raise on it).
    """
    if a not in G or b not in G:
        return False
    try:
        return nx.shortest_path_length(G, a, b) > 2
    except nx.NetworkXNoPath:
        # Disconnected nodes are as far apart as it gets.
        return True


if not os.path.isfile(non_edges_path):
    random.seed(12)
    # Draw slightly more candidates than there are edges (m + 1000), since
    # the filtering steps below discard part of them.
    # NOTE(review): sampling with randint assumes node ids span 0..n-1 — confirm.
    non_edges = [(random.randint(0, n-1), random.randint(0, n-1)) for _ in range(m+1000)]

    # Canonical form: tuple with the smaller node id first
    non_edges = [tuple(sorted(pair)) for pair in non_edges]

    # Remove edges or node pairs in the test set (this also removes duplicates)
    non_edges = set(non_edges) - (set(node_pairs_test) | set(edges))

    # Keep only the pairs whose shortest path length is > 2 (or that are not
    # connected at all); pairs at distance <= 2, self-pairs included, are dropped.
    non_edges = [
        (a, b) for a, b in non_edges
        if check_shortest_path_length(G, a, b)
    ]

    # Save the non edges. The target folder may not exist yet on a fresh
    # checkout, so create it before writing.
    os.makedirs(os.path.dirname(non_edges_path), exist_ok=True)
    with open(non_edges_path, 'wb') as f:
        pickle.dump(non_edges, f)

else:
    # Read the non edges.
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that was produced by this notebook.
    with open(non_edges_path, 'rb') as f:
        non_edges = pickle.load(f)



## 3. Building the training and test datasets

In [10]:
from sklearn.model_selection import train_test_split

In [14]:
# Output folders for the two splits (created on first run)
train_folder = "../data/datasets/train_set"
test_folder = "../data/datasets/test_set"
for folder in (train_folder, test_folder):
    os.makedirs(folder, exist_ok=True)

# Files of the train split
train_edges_path, train_non_edges_path, train_path, train_y_path = (
    os.path.join(train_folder, filename)
    for filename in ('train_edges.csv', 'train_non_edges.csv', 'train.csv', 'train_y.csv')
)

# Files of the test split
test_edges_path, test_non_edges_path, test_path, test_y_path = (
    os.path.join(test_folder, filename)
    for filename in ('test_edges.csv', 'test_non_edges.csv', 'test.csv', 'test_y.csv')
)

In [15]:
# The cached split is only usable when all four CSV files are present.
# If any of them is missing (e.g. an interrupted previous run), rebuild
# everything: the split is deterministic (fixed random_state), so rewriting
# the files that already exist is harmless.
split_file_paths = (
    train_edges_path, test_edges_path,
    train_non_edges_path, test_non_edges_path,
)
if not all(os.path.isfile(path) for path in split_file_paths):
    # Positive links
    df_edges = pd.read_csv("../data/initial_data/edgelist.txt", names=['node_1', 'node_2'])
    # Negative links
    df_non_edges = pd.DataFrame(non_edges, columns=['node_1', 'node_2'])

    print("Number of pairs of node in the graph with edges", df_edges.shape[0])
    print("Number of pairs of node in the graph without edges", df_non_edges.shape[0])

    # Train/test split: 80% / 20%.
    # Positive and negative links are split separately because only the
    # positive training links are needed to build the graph and to
    # generate the features.
    X_train_pos, X_test_pos, y_train_pos, y_test_pos = train_test_split(
        df_edges, np.ones(len(df_edges)), test_size=0.2, random_state=12
    )
    X_train_neg, X_test_neg, y_train_neg, y_test_neg = train_test_split(
        df_non_edges, np.zeros(len(df_non_edges)), test_size=0.2, random_state=12
    )

    print('='*60)
    print("Number of pairs of node in the train data graph with edges", X_train_pos.shape[0])
    print("Number of pairs of node in the train data graph without edges", X_train_neg.shape[0])
    print('='*60)
    print("Number of pairs of node in the test data graph with edges", X_test_pos.shape[0])
    print("Number of pairs of node in the test data graph without edges", X_test_neg.shape[0])

    # Save without header and without index
    X_train_pos.to_csv(train_edges_path, header=False, index=False)
    X_test_pos.to_csv(test_edges_path, header=False, index=False)
    X_train_neg.to_csv(train_non_edges_path, header=False, index=False)
    X_test_neg.to_csv(test_non_edges_path, header=False, index=False)

# The in-memory list of non-edges is large and no longer needed once the
# CSV files exist.
del non_edges

Number of pairs of node in the graph with edges 1091955
Number of pairs of node in the graph without edges 1084146
Number of pairs of node in the train data graph with edges 873564
Number of pairs of node in the train data graph without edges 867316
Number of pairs of node in the test data graph with edges 218391
Number of pairs of node in the test data graph without edges 216830


In [22]:
# Graph of train set
train_graph = nx.read_edgelist(train_edges_path, delimiter=',', nodetype=int)
# Graph of test set
test_graph = nx.read_edgelist(test_edges_path, delimiter=',', nodetype=int)

# finding the unique nodes in the both train and test graphs
train_nodes_pos = set(train_graph.nodes())
test_nodes_pos = set(test_graph.nodes())

trY_teY = len(train_nodes_pos.intersection(test_nodes_pos))
trY_teN = len(train_nodes_pos - test_nodes_pos)
teY_trN = len(test_nodes_pos - train_nodes_pos)

print('# of common papers in train and test (edgelists):', trY_teY)
print('# of papers present in train but not in test (edgelists):', trY_teN)

print('# of papers present in test but not in train (edgelists):', teY_trN)
print(f"Percentage in test of papers not in train (edgelists) is \
    {teY_trN/len(test_nodes_pos)*100 :.2f}%")

# of common papers in train and test (edgelists): 100935
# of papers present in train but not in test (edgelists): 34561
# of papers present in test but not in train (edgelists): 3003
Percentage in test of papers not in train (edgelists) is     2.89%


We have a cold-start problem here because some nodes present in the test graph are not in the train graph.