In [11]:
import os
import time
import csv
from neo4j import GraphDatabase

# Connect to the Neo4j database.
# Each setting can be overridden through an environment variable so credentials
# do not need to be edited into the notebook; the fallbacks are the previous
# hard-coded local values, so existing runs behave exactly as before.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "12345678")
driver = GraphDatabase.driver(uri, auth=(user, password))

# Path to the folder containing the JSON files.
# Written as a raw string: the previous literal mixed "\\" with a bare "\d",
# which is an invalid escape sequence that only produced the right path by
# accident (and warns on recent Python versions). The resulting value is unchanged.
json_folder = r"C:\Users\josep\.Neo4jDesktop\relate-data\dbmss\dbms-81676b45-399d-492a-b2c6-8199de366b04\import"

# Progress log: one row per imported file holding the file name, the node and
# relationship counts at that point, and the cumulative elapsed time in seconds.
csv_file = 'node_count.csv'


def load_progress(path):
    """Read the progress log at ``path``, creating it when it is missing or empty.

    Args:
        path: Location of the CSV progress log.

    Returns:
        A ``(start_time, file_names)`` tuple.  ``start_time`` is a
        ``time.time()`` style timestamp shifted back by the elapsed time stored
        in the last data row, so ``time.time() - start_time`` continues the
        previous run's clock.  ``file_names`` lists the first column of every
        data row, i.e. the files that were already logged.

    Raises:
        ValueError: If the last data row's final column is not a number.
    """
    header = ["FileName", "Nodes", "Relationships", "Time Elapsed"]

    # No usable log yet: start one with just the header and a fresh timer.
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        with open(path, 'w', newline='') as file:
            csv.writer(file).writerow(header)
        return time.time(), []

    with open(path, 'r', newline='') as file:
        rows = [row for row in csv.reader(file) if row]  # ignore blank lines

    data_rows = rows[1:]  # first row is the header
    logged_files = [row[0] for row in data_rows]
    # A header-only log means nothing has been imported yet, so the clock
    # starts at zero instead of trying to parse the header as a number.
    elapsed = float(data_rows[-1][-1]) if data_rows else 0.0
    return time.time() - elapsed, logged_files


# Both names are always defined, whether or not the log already existed, so the
# import loop can skip finished files when resuming.
start_time, file_names = load_progress(csv_file)

# Get a list of all the JSON files in the folder, ordered from smallest to largest
json_files = sorted(
    (f for f in os.listdir(json_folder) if f.endswith(".json")),
    key=lambda f: os.path.getsize(os.path.join(json_folder, f)),
)

# The file name is passed as the $file parameter (forwarded to the inner
# statements through the `params` config entry) instead of being concatenated
# into the query text, so names containing quotes cannot break the query.
IMPORT_QUERY = """CALL apoc.periodic.iterate(
                    "CALL apoc.load.json($file) YIELD value
                    WITH value
                    UNWIND keys(value) as article_name
                    RETURN article_name, value[article_name] as related_articles",
                    "MERGE (article:Article {title: article_name})
                    WITH article, related_articles
                    UNWIND keys(related_articles) as related_article_name
                    MERGE (related_article:Article {title: related_article_name})
                    CREATE (article)-[r:LINKS_TO {position: related_articles[related_article_name]}]->(related_article)",
                    {batchSize: 500, parallel: false, params: {file: $file}}
                    )
                    YIELD batches, total, failedBatches, errorMessages
                    RETURN batches, total, failedBatches, errorMessages"""

already_logged = set(file_names)  # files recorded in the progress log by an earlier run

try:
    for filename in json_files:
        if filename in already_logged:
            continue
        print("Loading File: " + filename)
        with driver.session() as session:
            # .single() reads the summary row, so a query error is raised here.
            summary = session.run(IMPORT_QUERY, file=filename).single()
            if summary["failedBatches"]:
                # apoc.periodic.iterate reports batch failures in its result
                # instead of raising, so surface them explicitly.
                print("  WARNING: {} of {} batches failed: {}".format(
                    summary["failedBatches"], summary["batches"], summary["errorMessages"]))

            # Get the number of nodes and relationships in the database
            num_nodes = session.run("MATCH (n:Article) RETURN count(n)").single()[0]
            num_rels = session.run("MATCH ()-[r:LINKS_TO]->() RETURN count(r)").single()[0]

        # Log the file name, number of nodes, number of relationships, and elapsed time.
        # The `with` block above already closed the session; no explicit close is needed.
        with open('node_count.csv', 'a', newline='') as node_file:
            csv.writer(node_file).writerow([filename, num_nodes, num_rels, time.time() - start_time])
finally:
    # Release the driver's connections whether the import finished or failed.
    driver.close()

Loading File: CleanWikiDump_part451.json
Loading File: CleanWikiDump_part308.json
Loading File: CleanWikiDump_part190.json
Loading File: CleanWikiDump_part431.json
Loading File: CleanWikiDump_part111.json
Loading File: CleanWikiDump_part182.json
Loading File: CleanWikiDump_part181.json
Loading File: CleanWikiDump_part109.json
Loading File: CleanWikiDump_part110.json
Loading File: CleanWikiDump_part367.json
Loading File: CleanWikiDump_part342.json
Loading File: CleanWikiDump_part276.json
Loading File: CleanWikiDump_part449.json
Loading File: CleanWikiDump_part447.json
Loading File: CleanWikiDump_part437.json
Loading File: CleanWikiDump_part189.json
Loading File: CleanWikiDump_part450.json
Loading File: CleanWikiDump_part448.json
Loading File: CleanWikiDump_part364.json
Loading File: CleanWikiDump_part307.json
Loading File: CleanWikiDump_part363.json
Loading File: CleanWikiDump_part343.json
Loading File: CleanWikiDump_part445.json
Loading File: CleanWikiDump_part446.json
Loading File: Cl

In [5]:
import os
import json
import time
import csv
from neo4j import GraphDatabase
from threading import Thread

# Connect to Neo4j database.
# Each setting can be overridden through an environment variable so credentials
# do not need to be edited into the notebook; the fallbacks are the previous
# hard-coded local values.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "Wikipedia")
driver = GraphDatabase.driver(uri, auth=(user, password))

# Path to the folder containing the JSON files.
# Raw string: "\W" and "\C" are invalid escape sequences that only produced the
# intended path by accident. The resulting value is unchanged.
json_folder = r"D:\Wiki\CleanWikiSplit"

# Start a fresh log for node and relationship counts (any existing file is overwritten)
with open('node_count.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Batch Number", "Nodes", "Relationships", "Time Elapsed"])

# Number of files to process in each batch
batch_size = 10

# Get a list of all the JSON files in the folder.
# os.listdir returns entries in arbitrary order; sorting keeps the batch
# composition (and therefore the logged batch numbers) reproducible.
json_files = sorted(f for f in os.listdir(json_folder) if f.endswith(".json"))

# Split the list of files into batches
file_batches = [json_files[i:i + batch_size] for i in range(0, len(json_files), batch_size)]

# Create a function to process a batch of files
def process_batch(batch):
    """Load every JSON file named in ``batch`` into Neo4j.

    Each file is read as a JSON object whose keys are article titles and whose
    values are objects mapping a linked title to a count.  For every article an
    ``Article`` node is ensured for the title and for each linked title, and one
    ``LINKS_TO`` relationship per link is created carrying ``count`` and the
    1-based ``position`` of the link within the article's object.  Titles are
    lower-cased before being stored.

    Args:
        batch: Iterable of file names located inside ``json_folder``.
    """
    # MERGE (rather than CREATE) the nodes: creating a new node for every
    # mention produced duplicate Article nodes, and the subsequent MATCH then
    # linked every duplicate of the source to every duplicate of the target.
    # All links of one article are sent in a single statement via UNWIND
    # instead of three round trips per link.
    # NOTE(review): this function runs in several threads at once; without a
    # uniqueness constraint on :Article(title) concurrent MERGEs can still race
    # and create duplicates - consider adding that constraint before importing.
    link_query = """
        MERGE (a:Article {title: $title})
        WITH a
        UNWIND $links AS link
        MERGE (b:Article {title: link.title})
        CREATE (a)-[r:LINKS_TO {count: link.count, position: link.position}]->(b)
    """

    # Create a session to execute Neo4j queries (closed automatically by `with`)
    with driver.session() as session:
        # Loop through all the files in the batch
        for filename in batch:
            json_path = os.path.join(json_folder, filename)
            print("Loading File: " + filename)

            # Open the JSON file and load the data
            with open(json_path, encoding='utf-8') as json_file:
                data = json.load(json_file)

            # Loop through all the articles in the JSON data
            for title, links in data.items():
                link_rows = [
                    {"title": link.lower(), "count": count, "position": position}
                    for position, (link, count) in enumerate(links.items(), start=1)
                ]
                # consume() waits for the statement to finish so that any
                # error is raised for the article that caused it.
                session.run(link_query, title=title.lower(), links=link_rows).consume()

# Start the timer
start_time = time.time()

# Loop through all the batches of files
batchcount = 0
try:
    for batchcount, batch in enumerate(file_batches, start=1):
        # One thread per file in the batch; each thread gets a single-file list
        threads = [Thread(target=process_batch, args=([filename],)) for filename in batch]

        # Start the threads
        for thread in threads:
            thread.start()

        # Wait for all the threads to finish
        for thread in threads:
            thread.join()

        # Get the number of nodes and relationships in the database.
        # The `with` block closes the session; no explicit close is needed.
        with driver.session() as session:
            num_nodes = session.run("MATCH (n:Article) RETURN count(n)").single()[0]
            num_rels = session.run("MATCH ()-[r:LINKS_TO]->() RETURN count(r)").single()[0]

        # Log the batch number, number of nodes, number of relationships, and elapsed time
        with open('node_count.csv', 'a', newline='') as node_file:
            csv.writer(node_file).writerow([batchcount, num_nodes, num_rels, time.time() - start_time])
finally:
    # Close the Neo4j driver even if a batch fails part-way through
    driver.close()

Loading File: CleanWikiDump_part000.json
Loading File: CleanWikiDump_part001.json
Loading File: CleanWikiDump_part002.json
Loading File: CleanWikiDump_part003.json
Loading File: CleanWikiDump_part004.json
Loading File: CleanWikiDump_part005.json
Loading File: CleanWikiDump_part006.json
Loading File: CleanWikiDump_part007.json
Loading File: CleanWikiDump_part008.json
Loading File: CleanWikiDump_part009.json
Loading File: CleanWikiDump_part010.json
Loading File: CleanWikiDump_part011.json
Loading File: CleanWikiDump_part012.json
Loading File: CleanWikiDump_part013.json
Loading File: CleanWikiDump_part014.json
