# package import

In [1]:
import time
import networkx as nx
from pymongo import MongoClient
import matplotlib.pyplot as plt

# connect MongoDB

In [4]:
client = MongoClient('localhost', 27017)
db = client['graph']
citation = db['citation']

# Preprocessing

## Filter Data

In [5]:
##########
# FILTER #
##########
print("FILTER START")

filter_query = { "n_citation": {"$gt": 100}, "references": {"$exists": True}, "fos": {"$exists": True} }
filter_projection = { "_id": True, "fos": True }
filter_result = citation.find(filter_query, filter_projection)

filtered_nodes = [ doc["_id"] for doc in filter_result if len(doc["fos"]) != 0 ]

print("FILTER FINISH")

FILTER START
FILTER FINISH


## Generate Edges

In [None]:
################################
# Generate Edge List (BATCHED) #
################################
print("Generating Edge List START")

BATCH_SIZE = 20_000
num_filtered_nodes = len(filtered_nodes)
n_iter = num_filtered_nodes//BATCH_SIZE
edges_set = set()
idx = 1

for it in range(n_iter+1):
    if it < n_iter:
        batched_filtered_nodes = filtered_nodes[it*BATCH_SIZE:(it+1)*BATCH_SIZE]
    else:
        batched_filtered_nodes = filtered_nodes[it*BATCH_SIZE:]

    edge_result = citation.find({ "_id": {"$in": batched_filtered_nodes} }, { "references": 1 })
    for res_doc in edge_result:
        start = time.time()
        satisfied_references = citation.find({ "_id": {"$in": res_doc["references"]}, "n_citation": {"$gt": 100}, "references": {"$exists": True}, "fos": {"$exists": True} }, { "_id": 1 })

        for reference in satisfied_references:
            edges_set.add((res_doc["_id"], reference["_id"]))

        end = time.time()

        if (idx/num_filtered_nodes*100) % 10 < 0.01:
            print(f"{idx}/{num_filtered_nodes} [{idx/num_filtered_nodes*100:.2f}%] - time [{end-start:.5f} s/p]")

        idx += 1
edges_list = list(edges_set)

print("Generating Edge List FINISH")

In [None]:
######################
# Generate Edge List #
######################
print("Generating Edge List START")

# num_filtered_nodes = len(filtered_nodes)
# n_iter = num_filtered_nodes//BATCH_SIZE
# idx = 1
edges_set = set()

edge_result = citation.find({ "_id": {"$in": filtered_nodes} }, { "references": 1 })
for res_doc in edge_result:
    # start = time.time()
    satisfied_references = citation.find({ "_id": {"$in": res_doc["references"]}, "n_citation": {"$gt": 100}, "references": {"$exists": True}, "fos": {"$exists": True} }, { "_id": 1 })

    for reference in satisfied_references:
        edges_set.add((res_doc["_id"], reference["_id"]))

    # end = time.time()

    # if (idx/num_filtered_nodes*100) % 10 < 0.01:
    #     print(f"{idx}/{num_filtered_nodes} [{idx/num_filtered_nodes*100:.2f}%] - time [{end-start:.5f} s/p]")

    # idx += 1
edges_list = list(edges_set)

print("Generating Edge List FINISH")

In [6]:
######################
# Generate Edge List #
######################
print("Generating Edge List START")

edges_set = set()
edge_result = citation.find({ "_id": {"$in": filtered_nodes} }, { "references": 1 })
for res_doc in edge_result:
    satisfied_references = citation.find({ "_id": {"$in": res_doc["references"]}, "n_citation": {"$gt": 100}, "references": {"$exists": True}, "fos": {"$exists": True} }, { "_id": 1 })

    for reference in satisfied_references:
        edges_set.add((res_doc["_id"], reference["_id"]))

edges_list = list(edges_set)

print("Generating Edge List FINISH")

Generating Edge List START
Generating Edge List FINISH


# Generate Graph using networkx

In [10]:
citation_network = nx.DiGraph()
citation_network.add_edges_from(edges_list)

In [None]:
from collections import Counter

dc = nx.degree_centrality(citation_network)
dc_top5 = Counter(dc).most_common(5)
print(f"Degree Centrality Top 5: {dc_top5}")

cc = nx.closeness_centrality(citation_network)
cc_top5 = Counter(cc).most_common(5)
print(f"Closeness Centrality Top 5: {cc_top5}")

bc = nx.betweenness_centrality(citation_network)
bc_top5 = Counter(bc).most_common(5)
print(f"Degree Centrality Top 5: {bc_top5}")

Degree Centrality Top 5: [('599c7f08601a182cd28e5abd', 0.010477581439383007), ('53e9986eb7602d97020ab93b', 0.01026398887009531), ('53e9a95db7602d97032b5715', 0.006829189445063414), ('53e9b587b7602d97040c7931', 0.00680032558434886), ('573696026e3b12023e515eec', 0.006523232521489145)]
