In [5]:
import os
import json
import time
import csv
from neo4j import GraphDatabase
from threading import Thread

# Connect to Neo4j database
uri = "bolt://localhost:7687"
user = "neo4j"
password = "Wikipedia"
driver = GraphDatabase.driver(uri, auth=(user, password))

# Path to the folder containing the JSON files
json_folder = "D:\Wiki\CleanWikiSplit"

# Create CSV writer for logging node and relationship counts
with open('node_count.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Batch Number", "Nodes", "Relationships", "Time Elapsed"])

# Number of files to process in each batch
batch_size = 10

# Get a list of all the JSON files in the folder
json_files = [f for f in os.listdir(json_folder) if f.endswith(".json")]

# Split the list of files into batches
file_batches = [json_files[i:i + batch_size] for i in range(0, len(json_files), batch_size)]

# Create a function to process a batch of files
def process_batch(batch):
    # Create a session to execute Neo4j queries
    with driver.session() as session:
        # Loop through all the files in the batch
        for filename in batch:
            json_path = os.path.join(json_folder, filename)
            print("Loading File: " + filename)

            # Open the JSON file and load the data
            with open(json_path, encoding='utf-8') as json_file:
                data = json.load(json_file)

            # Loop through all the articles in the JSON data
            for title, links in data.items():
                session.run("CREATE (b:Article {title: $title})", title=title)
                position = 0
                for link, count in links.items():
                    position += 1
                    session.run("CREATE (b:Article {title: $title})", title=link)
                    session.run("""
                    MATCH (a:Article {title: $title}) MATCH (b:Article {title: $link_title}) 
                    CREATE (a)-[r:LINKS_TO {count: $count, position: $position}]->(b)"""
                                , title=title, link_title=link, count=count, position=position)
            print("File Done: " + )
    session.close()

# Start the timer
start_time = time.time()

# Loop through all the batches of files
batchcount = 0
for batch in file_batches:
    batchcount = batchcount + 1
    # Create a list of threads to process the batch
    threads = []
    for filename in batch:
        thread = Thread(target=process_batch, args=([filename],))
        threads.append(thread)

    # Start the threads
    for thread in threads:
        thread.start()

    # Wait for all the threads to finish
    for thread in threads:
        thread.join()

    with driver.session() as session:
        # Get the number of nodes and relationships in the database
        num_nodes = session.run("MATCH (n:Article) RETURN count(n)").single()[0]
        num_rels = session.run("MATCH ()-[r:LINKS_TO]->() RETURN count(r)").single()[0]

        # Log the file name, number of nodes, number of relationships, and current time to the CSV file
        with open('node_count.csv', 'a', newline='') as node_file:
                writer = csv.writer(node_file)
                writer.writerow([batchcount, num_nodes, num_rels, time.time() - start_time])
    session.close()

# Close the Neo4j driver
driver.close()

Loading File: CleanWikiDump_part000.json
Loading File: CleanWikiDump_part001.json
Loading File: CleanWikiDump_part002.json
Loading File: CleanWikiDump_part003.json
Loading File: CleanWikiDump_part004.json
Loading File: CleanWikiDump_part005.json
Loading File: CleanWikiDump_part006.json
Loading File: CleanWikiDump_part007.json
Loading File: CleanWikiDump_part008.json
Loading File: CleanWikiDump_part009.json
Loading File: CleanWikiDump_part010.json
Loading File: CleanWikiDump_part011.json
Loading File: CleanWikiDump_part012.json
Loading File: CleanWikiDump_part013.json
Loading File: CleanWikiDump_part014.json
