In [53]:
#!pip install neo4j langchain pandas

import os
import json
import pandas as pd
from neo4j import GraphDatabase
import logging

In [54]:
# Set up Neo4j database connection
class Neo4jConnection:
    """Small wrapper around a Neo4j driver that runs each query in its own session.

    Instances also work as context managers, so the driver is closed even if
    the code using the connection raises::

        with Neo4jConnection(uri, user, password) as conn:
            conn.query("RETURN 1")
    """

    def __init__(self, uri, user, password):
        """Create the underlying driver.

        Args:
            uri: Server address handed to ``GraphDatabase.driver``.
            user: User name placed in the ``auth`` tuple.
            password: Password placed in the ``auth`` tuple.
        """
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        """Return the connection itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; never suppresses exceptions."""
        self.close()
        return False

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def query(self, query, parameters=None):
        """Run one Cypher statement and return all of its records.

        Args:
            query: Cypher text to execute.
            parameters: Optional mapping of query parameters, passed through
                to ``session.run`` unchanged.

        Returns:
            A list holding every record produced by the statement (empty when
            the statement returns nothing).
        """
        with self.driver.session() as session:
            result = session.run(query, parameters)
            # Materialise the records while the session is still open; the
            # list is what callers receive after the session has been closed.
            return list(result)

# Initialize database connection.
# The URI, user and password can be supplied through the NEO4J_URI, NEO4J_USER
# and NEO4J_PASSWORD environment variables so real credentials never need to be
# saved in the notebook. The fallbacks are the values that were previously
# hard-coded here, so an unconfigured environment behaves exactly as before.
neo4j_conn = Neo4jConnection(
    uri=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "neo4j"),
)

In [55]:
# Clear all nodes and relationships from the database.
# WARNING: destructive. The pattern matches every node in the connected
# database (not only nodes this notebook created) and DETACH DELETE removes
# each node together with its relationships. The call's return value is the
# cell output: the list of records returned by the statement.
neo4j_conn.query("MATCH (n) DETACH DELETE n")

[]

In [56]:
# Load manager and company data from CSV.
# The file location can be overridden with the FORM13_CSV_PATH environment
# variable so the notebook runs on machines other than the author's; when the
# variable is unset the original path is used unchanged.
csv_file_path = os.environ.get(
    "FORM13_CSV_PATH",
    r'C:/Users/jhyang/OneDrive/文档/GitHub_Projects/GraphRAG-Company-Info-Collector/resources/form13.csv',
)
# nrows caps the load at the first 10,000 data rows of the file.
df_csv = pd.read_csv(csv_file_path, nrows=10000)

In [57]:
# Create Manager and Company nodes with HOLDS relationships from CSV data.
#
# The three upserts for a row (manager, company, relationship) are sent as one
# Cypher statement, so each row costs a single round trip and a single session
# instead of three, and a row is never left half-written if one part fails.
# The statement text is built once, outside the loop.
#
# NOTE(review): HOLDS is merged per (manager, company) pair, so a later row for
# the same pair overwrites value / shares / reportCalendarOrQuarter rather than
# adding a second relationship — confirm that is the intended model.
HOLDING_UPSERT_QUERY = """
MERGE (m:Manager {name: $manager_name})
SET m.cik = $manager_cik, m.address = $manager_address
MERGE (c:Company {name: $company_name})
SET c.cusip6 = $cusip6, c.cusip = $cusip
MERGE (m)-[r:HOLDS]->(c)
SET r.value = $value, r.shares = $shares, r.reportCalendarOrQuarter = $report_calendar_or_quarter
"""

for _, row in df_csv.iterrows():
    # Manager columns
    manager_name = row['managerName']
    manager_cik = row['managerCik']
    manager_address = row['managerAddress']

    # Company columns
    company_name = row['companyName']
    cusip6 = row['cusip6']
    cusip = row['cusip']

    # Holding columns (stored on the HOLDS relationship)
    value = row['value']
    shares = row['shares']
    report_calendar_or_quarter = row['reportCalendarOrQuarter']

    neo4j_conn.query(
        HOLDING_UPSERT_QUERY,
        parameters={
            "manager_name": manager_name,
            "manager_cik": manager_cik,
            "manager_address": manager_address,
            "company_name": company_name,
            "cusip6": cusip6,
            "cusip": cusip,
            "value": value,
            "shares": shares,
            "report_calendar_or_quarter": report_calendar_or_quarter,
        },
    )

In [58]:
# Define the maximum number of JSON files to import.
# imported_files_count is the running tally that the JSON import loop
# increments after each file it processes successfully; re-run this cell to
# reset the tally to zero before re-running that loop.
max_files_to_import = 142
imported_files_count = 0

In [59]:
# Route log records of level INFO and above to import_log.log; every line
# carries a timestamp, the level name and the message.
logging.basicConfig(
    filename='import_log.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

In [60]:
# Directory for processing JSON files.
# Can be overridden with the FORM10K_JSON_DIR environment variable so the
# notebook is not tied to one user's machine; when the variable is unset the
# original location is used unchanged.
json_dir_path = os.environ.get(
    "FORM10K_JSON_DIR",
    r'C:/Users/jhyang/OneDrive/文档/GitHub_Projects/GraphRAG-Company-Info-Collector/resources/form10k',
)

In [61]:
# Import Form 10-K JSON files: for each file, create or update the Company
# node, then attach its subsidiaries, until max_files_to_import files have
# been processed successfully (files that fail are logged and not counted).
#
# The directory listing is sorted because os.listdir returns names in
# arbitrary order; with a cap in place, an unsorted listing would make the
# imported subset differ between runs or machines.
for json_file in sorted(os.listdir(json_dir_path)):
    # Stop if the maximum file count is reached
    if imported_files_count >= max_files_to_import:
        break
    if not json_file.endswith('.json'):
        continue

    # Reset for every file so that the error log below never reports a name
    # left over from a previous file (or an earlier cell) when the failure
    # happens before this file's company name has been read.
    company_name = "<unknown: file not parsed>"
    try:
        with open(os.path.join(json_dir_path, json_file), 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Extract company data
        company_name = data.get("companyName", "Unknown Company")
        business_overview = data.get("item1", "")
        address = data.get("address", "")

        # Check if the Company node exists
        existing_company = neo4j_conn.query(
            """
            MATCH (c:Company {name: $company_name})
            RETURN c
            """,
            parameters={"company_name": company_name}
        )

        # If the company exists, log and update it, otherwise create it
        if existing_company:
            logging.info(f"Company {company_name} already exists, updating information.")
            # COALESCE keeps a value that is already stored and only fills
            # in properties that are currently missing on the node.
            neo4j_conn.query(
                """
                MATCH (c:Company {name: $company_name})
                SET c.businessOverview = COALESCE(c.businessOverview, $business_overview),
                    c.address = COALESCE(c.address, $address)
                RETURN c
                """,
                parameters={
                    "company_name": company_name,
                    "business_overview": business_overview,
                    "address": address
                }
            )
        else:
            logging.info(f"Company {company_name} does not exist, creating new node.")
            neo4j_conn.query(
                """
                CREATE (c:Company {name: $company_name, businessOverview: $business_overview, address: $address})
                RETURN c
                """,
                parameters={
                    "company_name": company_name,
                    "business_overview": business_overview,
                    "address": address
                }
            )

        # Process subsidiary information. `or []` also covers a file whose
        # "subsidiaries" key is present but null, which would otherwise raise
        # after the company node had already been written.
        subsidiaries = data.get("subsidiaries") or []
        for sub in subsidiaries:
            sub_name = sub.get("name", "Unknown Subsidiary")

            # Check if the Subsidiary node and relationship already exist
            existing_sub = neo4j_conn.query(
                """
                MATCH (c:Company {name: $company_name})-[:OWNS]->(s:Subsidiary {name: $sub_name})
                RETURN s
                """,
                parameters={
                    "company_name": company_name,
                    "sub_name": sub_name
                }
            )

            # If the subsidiary relationship doesn't exist, create it
            if existing_sub:
                logging.info(f"Subsidiary {sub_name} under company {company_name} already exists.")
            else:
                logging.info(f"Creating subsidiary {sub_name} under company {company_name}.")
                # MERGE reuses a Subsidiary node with this name if one
                # exists (e.g. owned by another company) and only adds the
                # missing OWNS relationship.
                neo4j_conn.query(
                    """
                    MERGE (s:Subsidiary {name: $sub_name})
                    WITH s
                    MATCH (c:Company {name: $company_name})
                    MERGE (c)-[:OWNS]->(s)
                    RETURN s
                    """,
                    parameters={
                        "sub_name": sub_name,
                        "company_name": company_name
                    }
                )

        # Increment the file count after processing
        imported_files_count += 1

    except Exception as e:
        # Best effort: a bad file is logged and skipped so the remaining
        # files are still imported.
        logging.error(f"Error processing file {json_file} for company {company_name}: {e}")

In [62]:
# Close the database connection.
# This closes the underlying driver; neo4j_conn must be re-created (re-run the
# connection cell) before any query cell is run again.
neo4j_conn.close()