In [1]:
from semanticscholar import SemanticScholar
import csv
import pandas as pd
import random
import json
import shutil
import numpy as np

import os
from neo4j import GraphDatabase


In [2]:
NEO4J_URI = "bolt://localhost:7687"  # Default for Neo4j Desktop
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "pass1234"  # Change this to your actual password
IMPORT_FOLDER = os.path.expanduser("~/Library/Application Support/Neo4j Desktop/Application/relate-data/dbmss/dbms-6f9440a9-c56d-4ebc-b244-90d92b771350/import/")

In [3]:
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

session = driver.session() 

1. Node Similarity Algorithm: For Paper Similarity Based on Topics 

In [19]:
# drop projected graph if exists

query = """CALL gds.graph.drop('paperTopicGraph', false)"""
result = session.run(query)

print([record for record in result])




[<Record graphName='paperTopicGraph' database='neo4j' databaseLocation='local' memoryUsage='' sizeInBytes=-1 nodeCount=5276 relationshipCount=22646 configuration={'relationshipProjection': {'About': {'aggregation': 'DEFAULT', 'orientation': 'UNDIRECTED', 'indexInverse': False, 'properties': {}, 'type': 'About'}}, 'readConcurrency': 4, 'relationshipProperties': {}, 'nodeProperties': {}, 'jobId': '583dd711-ca7c-44f3-acd9-2ad52c682843', 'nodeProjection': {'Paper': {'label': 'Paper', 'properties': {}}, 'Keyword': {'label': 'Keyword', 'properties': {}}}, 'logProgress': True, 'validateRelationships': False, 'sudo': False} density=0.0008136998803488209 creationTime=neo4j.time.DateTime(2025, 4, 8, 3, 18, 26, 937924000, tzinfo=<DstTzInfo 'Europe/Madrid' CEST+2:00:00 DST>) modificationTime=neo4j.time.DateTime(2025, 4, 8, 3, 18, 26, 937924000, tzinfo=<DstTzInfo 'Europe/Madrid' CEST+2:00:00 DST>) schema={'graphProperties': {}, 'nodes': {'Paper': {}, 'Keyword': {}}, 'relationships': {'About': {}}} 

In [20]:
# Check similarity (project the graph, run algorith, and retrun similariy)

query = """
        CALL gds.graph.project(
        'paperTopicGraph',
        ['Paper', 'Keyword'],
        {
            About: {
            type: 'About',
            orientation: 'UNDIRECTED'
            }
        }
        )
        """
result = session.run(query)

print(list(result))

[<Record nodeProjection={'Paper': {'label': 'Paper', 'properties': {}}, 'Keyword': {'label': 'Keyword', 'properties': {}}} relationshipProjection={'About': {'aggregation': 'DEFAULT', 'orientation': 'UNDIRECTED', 'indexInverse': False, 'properties': {}, 'type': 'About'}} graphName='paperTopicGraph' nodeCount=5276 relationshipCount=22646 projectMillis=28>] <neo4j._work.summary.ResultSummary object at 0x125aac670>


In [23]:
query = """

        CALL gds.nodeSimilarity.stream('paperTopicGraph')
        YIELD node1, node2, similarity
        RETURN 
        gds.util.asNode(node1).DOI AS Paper1_DOI,
        gds.util.asNode(node2).DOI AS Paper2_DOI,
        similarity
        ORDER BY similarity DESC
        Limit 10;

        """
result = session.run(query)

print(list(result))

[<Record Paper1_DOI='202310.3390/en17040925' Paper2_DOI='201810.1016/J.PSEP.2019.01.013' similarity=1.0>, <Record Paper1_DOI='202310.3390/en17040925' Paper2_DOI='202110.1007/s13201-022-01830-0' similarity=1.0>, <Record Paper1_DOI='202310.3390/en17040925' Paper2_DOI='201710.1016/J.PSEP.2019.01.013' similarity=1.0>, <Record Paper1_DOI='202310.3390/en17040925' Paper2_DOI='202110.1109/ACCESS.2022.3167058' similarity=1.0>, <Record Paper1_DOI='202310.3390/en17040925' Paper2_DOI='202210.1049/rpg2.12985' similarity=1.0>, <Record Paper1_DOI='202310.3390/en17040925' Paper2_DOI='202310.5753/jisa.2024.3805' similarity=1.0>, <Record Paper1_DOI='202310.3390/en17040925' Paper2_DOI='202310.1049/rpg2.12985' similarity=1.0>, <Record Paper1_DOI='202310.3390/en17040925' Paper2_DOI='202010.1007/s13201-022-01830-0' similarity=1.0>, <Record Paper1_DOI='202310.3390/en17040925' Paper2_DOI='202010.1109/ACCESS.2022.3167058' similarity=1.0>, <Record Paper1_DOI='202310.3390/en17040925' Paper2_DOI='202210.3390/en17

2. PageRank: Identifying the most influential papers in the network considering citations.

This will help determine which papers are most influential based on how many other papers cite them and how important those citing papers are.

In [34]:
# drop projected graph if exists

query = """CALL gds.graph.drop('citationGraph', false)"""
result = session.run(query)

print([record for record in result])



[<Record graphName='citationGraph' database='neo4j' databaseLocation='local' memoryUsage='' sizeInBytes=-1 nodeCount=5252 relationshipCount=3545 configuration={'relationshipProjection': {'CITED_BY': {'aggregation': 'DEFAULT', 'orientation': 'REVERSE', 'indexInverse': False, 'properties': {'decision': {'aggregation': 'DEFAULT', 'property': 'decision', 'defaultValue': 0.5}}, 'type': 'Cited_by'}}, 'readConcurrency': 4, 'relationshipProperties': {}, 'nodeProperties': {}, 'jobId': 'c0eafca0-7aa2-464a-8a79-903fc2411a90', 'nodeProjection': {'Paper': {'label': 'Paper', 'properties': {}}}, 'logProgress': True, 'validateRelationships': False, 'sudo': False} density=0.00012854331739372025 creationTime=neo4j.time.DateTime(2025, 4, 8, 4, 0, 51, 680450000, tzinfo=<DstTzInfo 'Europe/Madrid' CEST+2:00:00 DST>) modificationTime=neo4j.time.DateTime(2025, 4, 8, 4, 0, 51, 680450000, tzinfo=<DstTzInfo 'Europe/Madrid' CEST+2:00:00 DST>) schema={'graphProperties': {}, 'nodes': {'Paper': {}}, 'relationships':

In [35]:
# Project the graph first
query = """
CALL gds.graph.project(
  'citationGraph',
  'Paper',
  {
    Cited_by: {
      type: 'Cited_by',
      orientation: 'REVERSE'
    }
  }
)

"""

result = session.run(query)
print([record for record in result])


[<Record nodeProjection={'Paper': {'label': 'Paper', 'properties': {}}} relationshipProjection={'Cited_by': {'aggregation': 'DEFAULT', 'orientation': 'REVERSE', 'indexInverse': False, 'properties': {}, 'type': 'Cited_by'}} graphName='citationGraph' nodeCount=5252 relationshipCount=3545 projectMillis=16>]


In [39]:

# Run weighted PageRank
query = """
CALL gds.pageRank.stream('citationGraph')
YIELD nodeId, score
MATCH (p:Paper) WHERE id(p) = nodeId
RETURN p.DOI AS paper, score
ORDER BY score desc
LIMIT 10
"""

result = session.run(query)
print([record for record in result])



[<Record paper='10.3390/APP8081280' score=1.48875>, <Record paper='10.3390/a17090419' score=1.48875>, <Record paper='10.1016/J.PSEP.2019.01.013' score=1.4249999999999998>, <Record paper='10.1055/a-1885-1697' score=1.4249999999999998>, <Record paper='10.2139/ssrn.3935555' score=1.4249999999999998>, <Record paper='10.1371/journal.pone.0312395' score=1.4249999999999998>, <Record paper='10.4103/jmss.jmss_47_23' score=1.4249999999999998>, <Record paper='10.3390/ijerph21050521' score=1.4249999999999998>, <Record paper='10.1016/j.euroneuro.2020.03.016' score=1.4249999999999998>, <Record paper='10.1016/j.dt.2024.06.004' score=1.4249999999999998>]
