In [9]:
#!pip install neo4j langchain pandas

import os
import json
import pandas as pd
from neo4j import GraphDatabase

In [10]:
# Set up Neo4j database connection
class Neo4jConnection:
    """Small convenience wrapper that owns one Neo4j driver."""

    def __init__(self, uri, user, password):
        """Build the driver for ``uri``, authenticating with ``(user, password)``."""
        credentials = (user, password)
        self.driver = GraphDatabase.driver(uri, auth=credentials)

    def close(self):
        """Close the driver held by this wrapper."""
        self.driver.close()

    def query(self, query, parameters=None):
        """Run ``query`` in a fresh session and return its records as a list.

        ``parameters`` is handed to ``session.run`` unchanged (``None`` when
        omitted). The records are collected into a list while the session is
        still open, i.e. before the ``with`` block exits.
        """
        with self.driver.session() as db_session:
            return list(db_session.run(query, parameters))

# Initialize database connection.
# Connection settings are read from the environment (NEO4J_URI, NEO4J_USERNAME,
# NEO4J_PASSWORD) so real credentials do not have to be stored in the notebook.
# When a variable is unset, the value this notebook originally hardcoded is used.
neo4j_conn = Neo4jConnection(
    uri=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    user=os.environ.get("NEO4J_USERNAME", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "neo4j"),
)

In [11]:
# Clear all nodes and relationships from the database.
# This runs before any import cell, so each top-to-bottom run of the notebook
# rebuilds the graph from an empty database. It deletes EVERYTHING in the
# database that neo4j_conn points at, not only data created by this notebook.
# The returned record list is the cell's displayed output.
neo4j_conn.query("MATCH (n) DETACH DELETE n")

[]

In [12]:
# Load manager and company data from CSV.
# Only the first 50 rows are read (nrows=50), so the cells below work on that
# subset rather than the full file.
# NOTE(review): the path is an absolute, machine-specific location; change it
# before running this notebook on another machine.
csv_file_path = r'C:/Users/jhyang/OneDrive/文档/GitHub_Projects/GraphRAG-Company-Info-Collector/resources/form13.csv'
df_csv = pd.read_csv(csv_file_path, nrows=50)

In [13]:
# Create Manager and Company nodes with relationships from CSV data.
#
# Each CSV row is written with a single Cypher statement, so the Manager node,
# the Company node and the HOLDS relationship between them are upserted by one
# neo4j_conn.query call instead of one call per element.
HOLDING_UPSERT_CYPHER = """
MERGE (m:Manager {name: $manager_name})
SET m.cik = $manager_cik, m.address = $manager_address
MERGE (c:Company {name: $company_name})
SET c.cusip6 = $cusip6, c.cusip = $cusip
MERGE (m)-[r:HOLDS]->(c)
SET r.value = $value, r.shares = $shares, r.reportCalendarOrQuarter = $report_calendar_or_quarter
RETURN r
"""

# Cypher parameter name -> CSV column the value is read from.
HOLDING_PARAM_COLUMNS = {
    "manager_name": "managerName",
    "manager_cik": "managerCik",
    "manager_address": "managerAddress",
    "company_name": "companyName",
    "cusip6": "cusip6",
    "cusip": "cusip",
    "value": "value",
    "shares": "shares",
    "report_calendar_or_quarter": "reportCalendarOrQuarter",
}

for _, row in df_csv.iterrows():
    # pandas represents empty CSV cells as NaN. Send those as None so a missing
    # value is not stored as a NaN property (or used as a NaN MERGE key, which
    # would never match an existing node).
    holding_params = {
        param_name: (None if pd.isna(row[column]) else row[column])
        for param_name, column in HOLDING_PARAM_COLUMNS.items()
    }

    # Both names are MERGE keys; a row without one of them cannot be attached
    # to a node, so it is skipped rather than failing the whole import.
    if holding_params["manager_name"] is None or holding_params["company_name"] is None:
        continue

    neo4j_conn.query(HOLDING_UPSERT_CYPHER, parameters=holding_params)

In [14]:
# Bookkeeping for the JSON import below:
# - imported_files_count: how many JSON files have been imported so far
# - max_files_to_import: upper bound on how many JSON files get imported
imported_files_count = 0
max_files_to_import = 10

In [15]:
# Process JSON files in the specified directory
json_dir_path = r'C:/Users/jhyang/OneDrive/文档/GitHub_Projects/GraphRAG-Company-Info-Collector/resources/form10k'

# os.listdir returns names in arbitrary order, so sort before truncating: the
# same files are then selected on every run. max(..., 0) keeps a negative limit
# meaning "import nothing" instead of slicing from the end of the list.
json_files_to_import = sorted(
    name for name in os.listdir(json_dir_path) if name.endswith('.json')
)[:max(max_files_to_import, 0)]

# Restart the counter in this cell so that re-running the cell on its own
# imports the files again instead of seeing a stale count from an earlier run.
imported_files_count = 0

for json_file in json_files_to_import:
    # Parse the whole document first; the file is closed before any database work.
    with open(os.path.join(json_dir_path, json_file), 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Extract relevant company data from the JSON, falling back to placeholders
    # when a key is absent.
    company_name = data.get("companyName", "Unknown Company")
    business_overview = data.get("item1", "")
    address = data.get("address", "")

    # Create or update the Company node
    neo4j_conn.query(
        """
        MERGE (c:Company {name: $company_name})
        SET c.businessOverview = $business_overview, c.address = $address
        RETURN c
        """,
        parameters={
            "company_name": company_name,
            "business_overview": business_overview,
            "address": address
        }
    )

    # A missing or null "subsidiaries" entry is treated as "no subsidiaries".
    subsidiaries = data.get("subsidiaries") or []
    for sub in subsidiaries:
        sub_name = sub.get("name", "Unknown Subsidiary")

        # Create Subsidiary node and link it to the company that owns it
        neo4j_conn.query(
            """
            MERGE (s:Subsidiary {name: $sub_name})
            WITH s
            MATCH (c:Company {name: $company_name})
            MERGE (c)-[:OWNS]->(s)
            RETURN s
            """,
            parameters={
                "sub_name": sub_name,
                "company_name": company_name
            }
        )

    # Count the file once it has been fully processed
    imported_files_count += 1

In [16]:
# Close the database connection.
# Calls Neo4jConnection.close(), which closes the driver held by neo4j_conn.
# This is the notebook's final step, after all import cells have run.
neo4j_conn.close()