In [5]:
import json
import os

from neo4j import GraphDatabase

# --- Configuration ---
# 1. Neo4j connection settings
# Each value can be overridden with the environment variable of the same name.
# The fallbacks below point at a local Neo4j instance; keep real credentials
# in the environment rather than in source control.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j://127.0.0.1:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "12345678")

# 2. Path to your data file
# A relative path is resolved against the current working directory;
# provide the full path if the file lives elsewhere.
JSONL_FILE_PATH = "processed_data/cvpr_papers_cleaned.jsonl"

# --- Data Loading ---
def load_papers_from_jsonl(file_path):
    """Load paper records from a JSON Lines (.jsonl) file.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON document
            per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    papers = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines (typically a trailing newline at the end of the
            # file) carry no record and would make json.loads raise.
            if not line:
                continue
            papers.append(json.loads(line))
    return papers

# --- Neo4j Ingestion Logic ---
def get_neo4j_driver():
    """Create a Neo4j driver from the module-level connection settings.

    Returns:
        The object returned by ``GraphDatabase.driver`` for ``NEO4J_URI``,
        authenticated with ``NEO4J_USER`` and ``NEO4J_PASSWORD``.
    """
    credentials = (NEO4J_USER, NEO4J_PASSWORD)
    return GraphDatabase.driver(NEO4J_URI, auth=credentials)

# Generic words that are never used as keywords (compared case-insensitively).
_KEYWORD_STOPWORDS = frozenset({'challenge', 'paper', 'results', 'method', 'dataset'})
# Punctuation trimmed from both ends of a token before it is evaluated.
_KEYWORD_PUNCTUATION = '.,;:!?()[]{}"\''


def _extract_keywords(summary, limit=5):
    """Return up to `limit` distinct keywords from `summary`, in first-seen order."""
    keywords = []
    for token in (summary or '').split():
        if len(keywords) >= limit:
            break
        # Trim punctuation first so "results," is still caught by the
        # stop-word filter and "network." is stored as "network".
        word = token.strip(_KEYWORD_PUNCTUATION)
        if len(word) <= 5 or word.lower() in _KEYWORD_STOPWORDS:
            continue
        # De-duplicate before counting towards the limit, so repeated words
        # early in the summary do not reduce the number of keywords kept.
        if word not in keywords:
            keywords.append(word)
    return keywords


def add_paper_to_graph(tx, paper_data):
    """Add a paper, its authors, and its keywords to the graph.

    Keywords are derived from the summary with a simple heuristic: words
    longer than five characters (after trimming surrounding punctuation)
    that are not in a small stop-word list. At most five distinct keywords
    are kept. A more advanced NLP method could be substituted if needed.

    All writes use MERGE, so re-ingesting the same paper does not create
    duplicate nodes or relationships.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        paper_data: Mapping with a required ``'title'`` key and optional
            ``'authors'`` (iterable of names) and ``'summary'`` (text) keys.
            Missing or ``None`` authors/summary are treated as empty.

    Raises:
        KeyError: If ``paper_data`` has no ``'title'`` key.
    """
    title = paper_data['title']
    authors = list(paper_data.get('authors') or [])
    keywords = _extract_keywords(paper_data.get('summary'))

    # Create the Paper node
    tx.run("MERGE (p:Paper {title: $title})", title=title)

    # Create Author nodes and AUTHORED relationships.
    # UNWIND handles every author in a single query instead of one round
    # trip per author.
    if authors:
        query_authors = """
        MATCH (p:Paper {title: $paper_title})
        UNWIND $authors AS author_name
        MERGE (a:Author {name: author_name})
        MERGE (a)-[:AUTHORED]->(p)
        """
        tx.run(query_authors, authors=authors, paper_title=title)

    # Create Keyword nodes and MENTIONS relationships (batched the same way)
    if keywords:
        query_keywords = """
        MATCH (p:Paper {title: $paper_title})
        UNWIND $keywords AS keyword
        MERGE (k:Keyword {term: keyword})
        MERGE (p)-[:MENTIONS]->(k)
        """
        tx.run(query_keywords, keywords=keywords, paper_title=title)

# --- Main Execution ---
if __name__ == "__main__":
    # Read the input first so that a missing or malformed file fails before
    # a driver is created.
    papers = load_papers_from_jsonl(JSONL_FILE_PATH)
    total = len(papers)

    print(f"Found {total} papers. Ingesting into Neo4j...")
    driver = get_neo4j_driver()
    try:
        with driver.session() as session:
            for position, paper in enumerate(papers, start=1):
                # Records without both required fields cannot be linked into
                # the graph; report them instead of dropping them silently.
                if 'title' not in paper or 'authors' not in paper:
                    print(f"  ({position}/{total}) Skipped: record is missing 'title' or 'authors'")
                    continue
                session.execute_write(add_paper_to_graph, paper)
                print(f"  ({position}/{total}) Ingested '{paper['title']}'")
    finally:
        # Close the driver even when ingestion fails part-way through.
        driver.close()

    print("\nIngestion complete.")

Found 50 papers. Ingesting into Neo4j...
  (1/50) Ingested 'SLRTP2025 Sign Language Production Challenge: Methodology, Results, and Future Work'
  (2/50) Ingested 'HeCoFuse: Cross-Modal Complementary V2X Cooperative Perception with Heterogeneous Sensors'
  (3/50) Ingested 'CuriosAI Submission to the EgoExo4D Proficiency Estimation Challenge 2025'
  (4/50) Ingested 'ZERO: Industry-ready Vision Foundation Model with Multi-modal Prompts'
  (5/50) Ingested 'Prompting without Panic: Attribute-aware, Zero-shot, Test-Time Calibration'
  (6/50) Ingested 'DIVE: Deep-search Iterative Video Exploration A Technical Report for the CVRR Challenge at CVPR 2025'
  (7/50) Ingested 'End-to-End RGB-IR Joint Image Compression With Channel-wise Cross-modality Entropy Model'
  (8/50) Ingested 'Exploring Non-contrastive Self-supervised Representation Learning for Image-based Profiling'
  (9/50) Ingested 'NTIRE 2025 Challenge on HR Depth from Images of Specular and Transparent Surfaces'
  (10/50) Ingested 'Ga