# Neo4j Data Import

## Paper Metadata Collection

In [None]:
# Title of the seed paper; later cells use it as the search query for arXiv and Semantic Scholar.
title = "Training Large Language Models to Reason in a Continuous Latent Space"

In [None]:
import sys
import os

# Resolve the parent of the current working directory (the project root, e.g. my_project).
# NOTE: this is based on os.getcwd(), not on the location of this file.
parent_dir = os.path.dirname(os.getcwd())

# Append the parent directory to sys.path so project-local packages can be imported.
sys.path.append(parent_dir)

In [None]:
from apis.arxiv_tool import ArxivKit
from apis.semanticscholar_tool import SemanticScholarKit

Arxiv Metadata

In [None]:
# Look the paper up on arXiv by its title.
# NOTE(review): max_cnt=3 presumably caps the number of returned entries - confirm in ArxivKit.
arxiv = ArxivKit()
arxiv_metadata = arxiv.retrieve_metadata_by_paper(query_term=title, max_cnt=3)

SemanticScholar Metadata

In [None]:
# Search Semantic Scholar with the paper title as the keyword query.
# NOTE(review): limit=3 presumably caps the number of hits - confirm in SemanticScholarKit.
ss = SemanticScholarKit()
ss_metadata = ss.search_paper_by_keywords(query=title, limit=3)

In [None]:
# Use the first search hit as the seed paper and keep its Semantic Scholar paper id.
# The commented-out line below is an earlier variant that indexed one level deeper.
# paper_ss_id = ss_metadata[0][0].get('paperId')
paper_ss_id = ss_metadata[0].get('paperId')
print(paper_ss_id)

Reference and Citedby data

In [None]:
# Fetch the papers referenced by the seed paper (requested with limit=100).
reference_metadata = ss.get_semanticscholar_references(paper_id=paper_ss_id, limit=100)
# Bare expression: shown as the cell output, i.e. the number of records returned.
len(reference_metadata)

In [None]:
# Fetch the papers that cite the seed paper (requested with limit=100).
citedby_metadata = ss.get_semanticscholar_citedby(paper_id=paper_ss_id, limit=100)
# Bare expression: shown as the cell output, i.e. the number of records returned.
len(citedby_metadata)

## Useful Functions

### Metadata Preprocess

In [None]:
def move_key_to_first(input_dict, key_to_move):
    """Return a new dict with ``key_to_move`` as its first key.

    The remaining keys keep their original relative order. When the key is
    absent the input dict itself (not a copy) is returned unchanged.
    """
    if key_to_move in input_dict:
        # Seed the result with the promoted key, then append everything else in order.
        reordered = {key_to_move: input_dict[key_to_move]}
        reordered.update(
            (k, v) for k, v in input_dict.items() if k != key_to_move
        )
        return reordered
    return input_dict

In [None]:
def filter_dict_keys(input_dict, keys_to_keep):
    """Return a new dict holding only ``keys_to_keep``, in the order given.

    Keys that are not present in ``input_dict`` are silently skipped.
    """
    selected = {}
    for key in keys_to_keep:
        if key in input_dict:
            selected[key] = input_dict[key]
    return selected

In [None]:
import copy 

def remove_dict_keys(input_dict, keys_to_delete):
    """Return a deep copy of ``input_dict`` without the keys in ``keys_to_delete``.

    The input is never modified; keys that do not exist are ignored.
    """
    pruned = copy.deepcopy(input_dict)
    for key in keys_to_delete:
        # pop with a default is a no-op for missing keys, so no KeyError can occur.
        pruned.pop(key, None)
    return pruned

In [None]:
def remove_kth_element(original_list, k):
    """Return a copy of ``original_list`` without its k-th element (k is 1-based).

    The input is left untouched. When ``k`` falls outside ``1..len(original_list)``
    a plain copy is returned.
    """
    out_of_range = k <= 0 or k > len(original_list)
    remaining = list(original_list)  # always work on a copy
    if not out_of_range:
        # k is 1-based, list indices are 0-based.
        remaining.pop(k - 1)
    return remaining

In [None]:
import json

def convert_dict_values_to_json(dict_data):
    """Serialize every dict-typed value of ``dict_data`` to a JSON string.

    Args:
        dict_data (dict): Input dictionary.
    Returns:
        dict: A new dictionary in which each value that was a dict is replaced
        by its JSON text (non-ASCII characters kept as-is); all other values
        are carried over unchanged.
    """
    return {
        key: json.dumps(value, ensure_ascii=False) if isinstance(value, dict) else value
        for key, value in dict_data.items()
    }

In [None]:
def rename_dict_key(input_dict, old_key, new_key):
    """Return a new dict in which ``old_key`` is called ``new_key``.

    Key order is preserved and the input dict is not modified. When
    ``old_key`` is absent the result is simply a shallow copy.
    """
    renamed = {}
    for key, value in input_dict.items():
        target = new_key if key == old_key else key
        renamed[target] = value
    return renamed

In [None]:
from collections import Counter

def count_and_sort_combinations(list_of_dicts, keys):
    """Count distinct value combinations for ``keys`` across a list of dicts.

    Args:
        list_of_dicts: List of dictionaries to inspect.
        keys: Keys whose values form the combination; a missing key counts as None.
    Returns:
        List of ``(value_tuple, count)`` pairs ordered by count, highest first.
    """
    tally = Counter()
    for record in list_of_dicts:
        # One tuple per record, values taken in the order of `keys`.
        tally[tuple(record.get(key) for key in keys)] += 1
    return tally.most_common()

### Json Import to Neo4j

To keep data types aligned with Neo4j:
- separate the data by node and relationship type
- first map the data types to Neo4j-compatible types, then import the data
- import nodes and relationships separately

In [None]:
import json
from neo4j import GraphDatabase  # pip install neo4j https://github.com/neo4j/neo4j-python-driver
# import jsonschema  # pip install jsonschema https://github.com/python-jsonschema/jsonschema
# from jsonschema import Draft7Validator

import os

# Connection settings for the target Neo4j instance.
# Each value can be overridden through an environment variable so credentials
# do not have to be edited into the notebook; the previous hard-coded values
# remain as defaults to keep existing setups working.
# NOTE(review): the default password is committed in plain text - rotate it and
# rely on NEO4J_PASSWORD instead.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")  # Neo4j Bolt URI
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")  # Neo4j user name
neo4j_password = os.environ.get("NEO4J_PASSWORD", "25216590")  # Neo4j password
database = os.environ.get("NEO4J_DATABASE", "paper-graph-v0-1")  # target database name

In [None]:
def is_neo4j_compatible(value):
    """Return True when ``value`` can be stored directly as a Neo4j property.

    Accepted values are:
      * ``None`` and the scalar types ``str``, ``int``, ``float`` and ``bool``;
      * a flat ``list`` whose elements are all strings, all booleans, or all
        numbers (``int``/``float``).

    Lists that contain ``None``, nested lists, dicts or a mix of element kinds
    are reported as incompatible: Neo4j only stores homogeneous arrays of
    simple values, so callers should serialize such lists (e.g. to JSON)
    instead of sending them as-is.

    Args:
        value: Any Python object.
    Returns:
        bool: Whether the value is safe to pass as a property value.
    """
    if value is None or isinstance(value, (str, int, float, bool)):
        return True
    if not isinstance(value, list):
        return False

    def element_kind(element):
        # bool must be tested before int because bool is a subclass of int.
        if isinstance(element, bool):
            return 'bool'
        if isinstance(element, (int, float)):
            return 'number'
        if isinstance(element, str):
            return 'str'
        return None  # None, list, dict, tuple, ... cannot live inside a property array

    kinds = {element_kind(element) for element in value}
    # An empty list is fine; otherwise exactly one storable kind must be present.
    return None not in kinds and len(kinds) <= 1

def _to_neo4j_properties(properties):
    """Return a property map in which every value is storable by Neo4j."""
    if not properties or not isinstance(properties, dict):
        return {}
    return {
        # Values Neo4j cannot store natively are kept as JSON text.
        key: value if is_neo4j_compatible(value) else json.dumps(value, ensure_ascii=False)
        for key, value in properties.items()
    }


def import_json_to_neo4j(processed_data, uri, username, password, database):
    """Upsert node and relationship records into a Neo4j database.

    Args:
        processed_data: Iterable of dicts. A node record has ``type == 'node'``
            plus ``id``, ``labels`` (list of str) and optional ``properties``.
            A relationship record has ``type == 'relationship'`` plus
            ``relationshipType``, ``startNodeId``, ``endNodeId`` and optional
            ``properties``. Records of any other type are ignored.
        uri: Bolt URI of the Neo4j server.
        username: Neo4j user name.
        password: Neo4j password.
        database: Name of the database to write to.

    Nodes are merged on ``(labels, id)``; relationships are merged between the
    two nodes whose ``id`` property equals ``startNodeId`` / ``endNodeId``
    (matched without a label; if either node is missing nothing is created).
    Properties are written on create and on match alike.
    """
    driver = GraphDatabase.driver(uri, auth=(username, password))
    try:
        with driver.session(database=database) as session:
            for item in processed_data:
                if item['type'] == 'node':
                    labels = ":".join(item['labels'])
                    # Properties travel as ONE map parameter: their keys can no longer
                    # clash with $id, and keys coming from external JSON never become
                    # part of the Cypher text.
                    cypher_query = (
                        f"MERGE (n:{labels} {{id: $id}}) "
                        "SET n += $props "
                        "RETURN n"
                    )
                    parameters = {
                        "id": item['id'],
                        "props": _to_neo4j_properties(item.get('properties')),
                    }
                    session.run(cypher_query, parameters)

                elif item['type'] == 'relationship':
                    rel_type = item['relationshipType']
                    cypher_query = (
                        "MATCH (a {id: $startId}), (b {id: $endId}) "
                        f"MERGE (a)-[r:{rel_type}]->(b) "
                        "SET r += $props "
                        "RETURN r"
                    )
                    parameters = {
                        "startId": item['startNodeId'],
                        "endId": item['endNodeId'],
                        "props": _to_neo4j_properties(item.get('properties')),
                    }
                    session.run(cypher_query, parameters)
    finally:
        # Release the connection pool even when a query fails midway.
        driver.close()

## Paper Metadata Import

### SemanticScholar Paper Metadata

In [None]:
import re 
from typing import List, Dict

def ss_papermeta_process(ss_metadata:List[Dict]):
    """Turn Semantic Scholar paper records into Neo4j node/relationship dicts.

    Args:
        ss_metadata: List of paper dicts as returned by Semantic Scholar.
            Records without a ``paperId`` are skipped. Input dicts are not
            modified (each record is shallow-copied first).

    Returns:
        list[dict]: Flat list, in processing order, of
          * one ``Paper`` node per record (``url``/``paperId`` renamed to
            ``ssUrl``/``ssPaperId``; ``arxivId``, ``DOI`` and ``id`` added);
          * for each of the first 10 authors that has an ``authorId``: an
            ``Author`` node and an ``Author -WRITES-> Paper`` relationship
            carrying ``authorOrder`` (1-based) and ``coauthors``;
          * a ``Journal`` node when the journal has a name, plus
            ``Paper -PRINTS_ON-> Journal`` unless the name contains "arxiv";
          * a ``Venue`` node when the venue has an id, plus
            ``Paper -RELEASES_IN-> Venue`` unless its name contains "arxiv".

    The paper's unique ``id`` is ``10.48550/arXiv.<arxiv id>`` when an arXiv id
    is known, otherwise the DOI, otherwise the Semantic Scholar paper id.
    """
    key_renames = {'url': 'ssUrl', 'paperId': 'ssPaperId'}
    ss_papermeta_json = []

    for raw_item in ss_metadata:
        # Shallow copy with the Semantic Scholar specific keys renamed.
        item = {key_renames.get(key, key): value for key, value in raw_item.items()}
        paper_id = item.get('ssPaperId')  # semantic scholar paper id
        if paper_id is None:
            continue

        # externalIds may be missing or explicitly null.
        external_ids = item.get('externalIds') or {}

        arxiv_id = external_ids.get('ArXiv')  # arxiv id
        if arxiv_id is not None:
            arxiv_no = arxiv_id.replace('10.48550/arXiv.', '')
            version_match = re.search(r'v\d+$', arxiv_no)
            arxiv_id = re.sub(r'v\d+$', '', arxiv_no)
            # generate arxiv related info
            item['version'] = version_match.group(0) if version_match else ""
            item['arxivUrl'] = f"https://arxiv.org/abs/{arxiv_no}"
            item['isOpenAccess'] = True  # plain bool (a trailing comma here would store a tuple)
            item['openAccessPdf'] = f"https://arxiv.org/pdf/{arxiv_no}"
        item['arxivId'] = arxiv_id

        doi = external_ids.get('DOI')  # doi
        if doi is None and arxiv_id is not None:
            # arXiv assigns DOIs of this form: https://info.arxiv.org/help/doi.html
            doi = f"10.48550/arXiv.{arxiv_id}"
        item['DOI'] = doi

        # unique id: arXiv DOI > DOI > Semantic Scholar id
        if arxiv_id is not None:
            node_id = f"10.48550/arXiv.{arxiv_id}"
        elif doi is not None:
            node_id = doi
        else:
            node_id = paper_id
        item['id'] = node_id

        authors = (item.get('authors') or [])[:10]  # only the first 10 authors are linked
        journal = item.get('journal') or {}
        venue = item.get('publicationVenue') or {}

        # process paper node
        ss_papermeta_json.append({
            "type": "node",
            "id": node_id,
            "labels": ["Paper"],
            "properties": item,
        })

        for idx, author in enumerate(authors):
            author_id = author.get('authorId')
            if author_id is None:
                continue

            # process author node
            ss_papermeta_json.append({
                "type": "node",
                "id": author_id,
                "labels": ["Author"],
                "properties": author,
            })

            # process author -> WRITES -> paper
            # coauthors = every listed author except the current one (idx is 0-based)
            coauthors = authors[:idx] + authors[idx + 1:]
            ss_papermeta_json.append({
                "type": "relationship",
                "relationshipType": "WRITES",
                "startNodeId": author_id,
                "endNodeId": node_id,
                "properties": {'authorOrder': idx + 1, 'coauthors': coauthors},
            })

        journal_name = journal.get('name')
        if journal_name is not None:
            # process journal node
            ss_papermeta_json.append({
                "type": "node",
                "id": journal_name,
                "labels": ["Journal"],
                "properties": {"name": journal_name},
            })

            # arXiv would become a hub node linked to a huge number of papers, so skip the edge
            if 'arxiv' not in journal_name.lower():
                # process paper -> PRINTS_ON -> journal
                ss_papermeta_json.append({
                    "type": "relationship",
                    "relationshipType": "PRINTS_ON",
                    "startNodeId": node_id,
                    "endNodeId": journal_name,
                    "properties": journal,
                })

        venue_id = venue.get('id')
        if venue_id is not None:
            # process venue node
            ss_papermeta_json.append({
                "type": "node",
                "id": venue_id,
                "labels": ["Venue"],
                "properties": venue,
            })

            # process paper -> RELEASES_IN -> venue
            # Same hub-node exclusion as for journals; a missing venue name counts as "not arXiv".
            venue_name = venue.get('name') or ''
            if 'arxiv' not in venue_name.lower():
                ss_papermeta_json.append({
                    "type": "relationship",
                    "relationshipType": "RELEASES_IN",
                    "startNodeId": node_id,
                    "endNodeId": venue_id,
                    "properties": {},
                })
    return ss_papermeta_json

In [None]:
# Convert the Semantic Scholar search hits into node/relationship records.
ss_papermeta_json = ss_papermeta_process(ss_metadata)
# Bare expression: shown as the cell output, i.e. the number of generated records.
len(ss_papermeta_json)

In [None]:
# Write the Semantic Scholar paper records to the configured Neo4j database.
import_json_to_neo4j(processed_data=ss_papermeta_json, uri=neo4j_uri, username=neo4j_user, password=neo4j_password, database=database)

### Arxiv Paper Metadata

In [None]:
import re

def arxiv_papermeta_process(arxiv_metadata: List[Dict]):
    """Turn arXiv entries into ``Paper`` node dicts for Neo4j import.

    Args:
        arxiv_metadata: List of arXiv entry dicts. The keys read are ``id``
            (entry URL ending in the versioned arXiv number), ``link``,
            ``updated``, ``published``, ``title``, ``summary``,
            ``arxiv_primary_category`` and ``arxiv_comment``. Entries without
            an ``id`` are skipped; other missing fields become ``None``.

    Returns:
        list[dict]: One node record per usable entry. The node id and the
        ``DOI`` property are both ``10.48550/arXiv.<arxiv id without version>``,
        the same form ``ss_papermeta_process`` uses, so both sources update
        the same node.
    """
    arxiv_papermeta_json = []

    for item in arxiv_metadata:
        entry_url = item.get('id')
        if not entry_url:
            continue

        # id: keep everything after "/abs/" so old-style ids such as
        # "hep-th/9901001v1" are not cut down to their last path segment.
        if '/abs/' in entry_url:
            arxiv_no = entry_url.split('/abs/', 1)[1]
        else:
            arxiv_no = entry_url.split('/')[-1]
        arxiv_id = re.sub(r'v\d+$', '', arxiv_no)
        node_id = f"10.48550/arXiv.{arxiv_id}"

        # version
        version_match = re.search(r'v\d+$', arxiv_no)
        version = version_match.group(0) if version_match else ""

        # time conversion: keep only the first 10 characters (the date part) of the timestamp
        updated = item.get('updated')
        published = item.get('published')
        updationDate = updated[0:10] if updated else None
        publicationDate = published[0:10] if published else None

        link = item.get('link')
        pdf_link = link.replace('/abs/', '/pdf/') if link else None

        # supplement information
        journal = {'name': 'ArXiv', 'volume': f'abs/{arxiv_no}'}
        publicationVenue = {'id': '1901e811-ee72-4b20-8f7e-de08cd395a10',
                    'name': 'arXiv.org',
                    'alternate_names': ['ArXiv'],
                    'issn': '2331-8422',
                    'url': 'https://arxiv.org'}

        arxiv_info = {'id': node_id, 'version': version, 'arxivUrl': link,
                    'updationDate': updationDate, 'publicationDate': publicationDate,
                    'title': item.get('title'), 'abstract': item.get('summary'),
                    'arxivCategory': item.get('arxiv_primary_category'), 'arxivComment': item.get('arxiv_comment'),
                    # DOI uses the arXiv DOI form (https://info.arxiv.org/help/doi.html), not the bare arXiv id
                    'arxivId': arxiv_id, 'DOI': node_id, 'isOpenAccess': True, 'openAccessPdf': pdf_link,
                    'journal': journal, 'publicationVenue': publicationVenue
                    }
        # process paper node
        paper_node = {
            "type": "node",
            "id": node_id,
            "labels": ["Paper"],
            "properties": arxiv_info
            }
        arxiv_papermeta_json.append(paper_node)
    return arxiv_papermeta_json

In [None]:
# Convert the arXiv entries into Paper node records and compare input vs. output counts.
arxiv_papermeta_json = arxiv_papermeta_process(arxiv_metadata)
print(len(arxiv_metadata), len(arxiv_papermeta_json))

In [None]:
# Write the arXiv paper records to the configured Neo4j database.
import_json_to_neo4j(processed_data=arxiv_papermeta_json, uri=neo4j_uri, username=neo4j_user, password=neo4j_password, database=database)

### SemanticScholar Reference Metadata

Reference data processing

In [None]:
def ss_refmeta_process(original_id, reference_metadata: List[Dict]):
    """Build import records for the papers referenced by ``original_id``.

    Args:
        original_id: Graph id of the citing (seed) paper.
        reference_metadata: List of Semantic Scholar reference records. Each
            record wraps the referenced paper under ``citedPaper`` and may
            carry ``isInfluential``, ``contexts``, ``intents`` and
            ``contextsWithIntent``. Records whose ``citedPaper`` is missing or
            has no ``paperId`` are skipped.

    Returns:
        list[dict]: The node/relationship records produced by
        ``ss_papermeta_process`` for all referenced papers, followed by one
        ``original -CITES-> referenced`` relationship per referenced paper.
    """
    ref_relationmeta_json = []  # store paper - CITES -> paper relationship
    ref_paperdata_json = []  # paper metadata, processed in one batch below

    for item in reference_metadata:
        citedPaper = item.get('citedPaper')
        if citedPaper is None or citedPaper.get('paperId') is None:
            continue

        # The external ids belong to the referenced paper, not to the wrapper
        # record; the value may also be explicitly null.
        external_ids = citedPaper.get('externalIds') or {}

        ref_arxiv_id = external_ids.get('ArXiv')  # arxiv id
        if ref_arxiv_id is not None:
            ref_arxiv_id = re.sub(r'v\d+$', '', ref_arxiv_id.replace('10.48550/arXiv.', ''))

        ref_doi = external_ids.get('DOI')  # doi

        # Unique id with the same precedence as ss_papermeta_process
        # (arXiv DOI > DOI > Semantic Scholar id) so that the relationship
        # end point matches the id of the node created for this paper.
        if ref_arxiv_id is not None:
            ref_id = f"10.48550/arXiv.{ref_arxiv_id}"
        elif ref_doi is not None:
            ref_id = ref_doi
        else:
            ref_id = citedPaper.get('paperId')

        # append paper meta
        ref_paperdata_json.append(citedPaper)

        # append relationship
        properties = filter_dict_keys(item, ['isInfluential', 'contexts', 'intents', 'contextsWithIntent'])
        paper_cites_relationship = {
            "type": "relationship",
            "relationshipType": "CITES",
            "startNodeId": original_id,
            "endNodeId": ref_id,
            "properties": properties}
        ref_relationmeta_json.append(paper_cites_relationship)

    ref_papermeta_json = ss_papermeta_process(ref_paperdata_json)
    # Nodes first, relationships last, so both end points exist when edges are imported in order.
    return ref_papermeta_json + ref_relationmeta_json

In [None]:
# Derive the graph id of the seed paper (first Semantic Scholar hit) with the
# same precedence ss_papermeta_process uses: arXiv DOI > DOI > Semantic Scholar id.
# The result is stored in the notebook-level name `id`, which later cells read.
item = ss_metadata[0]
paper_id = item.get('paperId')  # semantic scholar paper id

if paper_id is None:
    # Without this guard `id` would stay bound to the builtin function and be
    # passed on as the paper id.
    raise ValueError("The first Semantic Scholar result has no paperId; cannot derive the seed paper id.")

external_ids = item.get('externalIds') or {}  # may be missing or explicitly null

arxiv_id = external_ids.get('ArXiv')  # arxiv id
if arxiv_id is not None:
    arxiv_no = arxiv_id.replace('10.48550/arXiv.', '')
    arxiv_id = re.sub(r'v\d+$', '', arxiv_no)
    version_match = re.search(r'v\d+$', arxiv_no)
    # generate arxiv related info (note: this updates ss_metadata[0] in place)
    item['version'] = version_match.group(0) if version_match else ""
    item['arxivUrl'] = f"https://arxiv.org/abs/{arxiv_no}"
    item['isOpenAccess'] = True  # plain bool (a trailing comma here would store a tuple)
    item['openAccessPdf'] = f"https://arxiv.org/pdf/{arxiv_no}"

doi = external_ids.get('DOI')  # doi
if doi is None and arxiv_id is not None:
    doi = f"10.48550/arXiv.{arxiv_id}"  # assign 10.48550/arXiv. for arxiv id https://info.arxiv.org/help/doi.html

# for unique id
if arxiv_id is not None:
    id = f"10.48550/arXiv.{arxiv_id}"
elif doi is not None:
    id = doi
else:
    id = paper_id

# Build node/relationship records for everything the seed paper references.
ss_refmeta_json = ss_refmeta_process(id, reference_metadata)
len(ss_refmeta_json)

In [None]:
# Write the referenced papers and the CITES relationships to the configured Neo4j database.
import_json_to_neo4j(processed_data=ss_refmeta_json, uri=neo4j_uri, username=neo4j_user, password=neo4j_password, database=database)

In [None]:
# Inspect the generated CITES relationships for the reference records.
for item in ss_refmeta_json:
    if item['type'] == 'relationship' and item['relationshipType'] == 'CITES':
        print(item)

In [None]:
def ss_citedbymeta_process(original_id, citedby_metadata: List[Dict]):
    """Build import records for the papers that cite ``original_id``.

    Args:
        original_id: Graph id of the cited (seed) paper.
        citedby_metadata: List of Semantic Scholar citation records. Each
            record wraps the citing paper under ``citingPaper`` and may carry
            ``isInfluential``, ``contexts``, ``intents`` and
            ``contextsWithIntent``. Records whose ``citingPaper`` is missing
            or has no ``paperId`` are skipped.

    Returns:
        list[dict]: The node/relationship records produced by
        ``ss_papermeta_process`` for all citing papers, followed by one
        ``citing -CITES-> original`` relationship per citing paper.
    """
    citing_relationmeta_json = []  # store paper - CITES -> paper relationship
    citing_paperdata_json = []  # paper metadata, processed in one batch below

    for item in citedby_metadata:
        citingPaper = item.get('citingPaper')
        if citingPaper is None or citingPaper.get('paperId') is None:
            continue

        # The external ids belong to the citing paper, not to the wrapper
        # record; the value may also be explicitly null.
        external_ids = citingPaper.get('externalIds') or {}

        citing_arxiv_id = external_ids.get('ArXiv')  # arxiv id
        if citing_arxiv_id is not None:
            citing_arxiv_id = re.sub(r'v\d+$', '', citing_arxiv_id.replace('10.48550/arXiv.', ''))

        citing_doi = external_ids.get('DOI')  # doi

        # Unique id with the same precedence as ss_papermeta_process
        # (arXiv DOI > DOI > Semantic Scholar id) so that the relationship
        # start point matches the id of the node created for this paper.
        if citing_arxiv_id is not None:
            citing_id = f"10.48550/arXiv.{citing_arxiv_id}"
        elif citing_doi is not None:
            citing_id = citing_doi
        else:
            citing_id = citingPaper.get('paperId')

        # append paper meta
        citing_paperdata_json.append(citingPaper)

        # append relationship
        properties = filter_dict_keys(item, ['isInfluential', 'contexts', 'intents', 'contextsWithIntent'])
        paper_cites_relationship = {
            "type": "relationship",
            "relationshipType": "CITES",
            "startNodeId": citing_id,
            "endNodeId": original_id,
            "properties": properties}
        citing_relationmeta_json.append(paper_cites_relationship)

    citing_papermeta_json = ss_papermeta_process(citing_paperdata_json)
    # Nodes first, relationships last, so both end points exist when edges are imported in order.
    return citing_papermeta_json + citing_relationmeta_json

In [None]:
# Build node/relationship records for the papers citing the seed paper.
# `id` is the seed paper's graph id computed in an earlier cell (it shadows the builtin).
ss_citedby_json = ss_citedbymeta_process(id, citedby_metadata)

In [None]:
# Bare expression: shown as the cell output, i.e. the number of generated records.
len(ss_citedby_json)

In [None]:
# Inspect the generated CITES relationships for the cited-by records.
for item in ss_citedby_json:
    if item['type'] == 'relationship' and item['relationshipType'] == 'CITES':
        print(item)

In [None]:
# Write the citing papers and the CITES relationships to the configured Neo4j database.
import_json_to_neo4j(processed_data=ss_citedby_json, uri=neo4j_uri, username=neo4j_user, password=neo4j_password, database=database)