In [None]:
import os
import re
from neo4j import GraphDatabase

# Function to parse a single .abs file
def parse_abs_data(abs_data):
    """Extract the metadata fields of one .abs record.

    Each field is located with its own regular expression. Header fields
    capture the rest of the line following their label; the abstract is the
    text between two delimiter lines, where the text must begin with a space.

    Args:
        abs_data: Full text content of a single .abs file.

    Returns:
        A dict with the keys ``paper_id``, ``authors``, ``title``,
        ``comments``, ``subj_class``, ``journal_ref`` and ``abstract``.
        A value is the stripped first match of its pattern, or ``None``
        when the pattern does not occur in ``abs_data``.
    """
    field_patterns = (
        ('paper_id', r'Paper: (.+)'),
        ('authors', r'Authors: (.+)'),
        ('title', r'Title: (.+)'),
        ('comments', r'Comments: (.+)'),
        ('subj_class', r'Subj-class: (.+)'),
        ('journal_ref', r'Journal-ref: (.+)'),
        ('abstract', r'\n\\\\\n ([\s\S]+?)\n\\\\\n'),
    )

    parsed = {}
    for field, regex in field_patterns:
        found = re.search(regex, abs_data)
        if found is None:
            parsed[field] = None
        else:
            parsed[field] = found.group(1).strip()
    return parsed

# Neo4j uploader class
class Neo4jUploader:
    """Write Paper nodes and CITES relationships to a Neo4j database.

    Every public write method opens its own session and runs a single
    managed write transaction.
    """

    def __init__(self, uri, user, password):
        """Create the underlying driver for ``uri`` using basic auth."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def _run_write(self, work, *args):
        """Run ``work(tx, *args)`` in a managed write transaction.

        ``Session.write_transaction`` is deprecated in the 5.x driver in
        favour of ``Session.execute_write``. The new name is used when the
        session provides it; older drivers fall back to the legacy method.
        """
        with self.driver.session() as session:
            execute = getattr(session, "execute_write", None)
            if execute is None:
                execute = session.write_transaction
            return execute(work, *args)

    def delete_all_papers(self):
        """Delete all existing Paper nodes in the database."""
        self._run_write(self._delete_all_papers)
        # Reported here rather than inside the transaction function: the
        # driver may retry that function, and it runs before the commit.
        print("All existing Paper nodes deleted.")

    @staticmethod
    def _delete_all_papers(tx):
        """Transaction function: detach-delete every Paper node."""
        query = "MATCH (p:Paper) DETACH DELETE p"
        tx.run(query)

    def create_paper_node(self, parsed_data):
        """Create or update the Paper node described by ``parsed_data``.

        Args:
            parsed_data: Mapping that may hold ``paper_id``, ``title``,
                ``authors``, ``comments``, ``subj_class``, ``journal_ref``
                and ``abstract``. Missing keys are sent as ``None``.
        """
        self._run_write(self._create_and_return_paper, parsed_data)

    @staticmethod
    def _create_and_return_paper(tx, parsed_data):
        """Transaction function: MERGE a Paper by id and set its properties."""
        query = """
        MERGE (p:Paper {paper_id: $paper_id})
        SET p.title = $title,
            p.authors = $authors,
            p.comments = $comments,
            p.subj_class = $subj_class,
            p.journal_ref = $journal_ref,
            p.abstract = $abstract
        """
        tx.run(
            query,
            paper_id=parsed_data.get('paper_id'),
            title=parsed_data.get('title'),
            authors=parsed_data.get('authors'),
            comments=parsed_data.get('comments'),
            subj_class=parsed_data.get('subj_class'),
            journal_ref=parsed_data.get('journal_ref'),
            abstract=parsed_data.get('abstract'),
        )

    def create_citation_edge(self, from_node, to_node):
        """Ensure a CITES relationship from ``from_node`` to ``to_node``.

        Both arguments are ``paper_id`` values. Nothing is created when
        either Paper node is absent, because the query MATCHes both ends
        before merging the relationship.
        """
        self._run_write(self._create_citation_if_not_exists, from_node, to_node)

    @staticmethod
    def _create_citation_if_not_exists(tx, from_node, to_node):
        """Transaction function: MERGE the CITES relationship."""
        # MERGE only creates the edge if it doesn't already exist
        query = """
        MATCH (a:Paper {paper_id: $from_node})
        MATCH (b:Paper {paper_id: $to_node})
        MERGE (a)-[:CITES]->(b)
        """
        tx.run(query, from_node=from_node, to_node=to_node)

# Function to process all .abs files and create nodes
def process_all_abs_files(root_folder, neo4j_uploader):
    """Upload one Paper node for every .abs file found under ``root_folder``.

    Only direct sub-directories of ``root_folder`` whose names consist of
    digits are scanned. The ``paper_id`` parsed from the file content is
    replaced by the file name with ``.abs`` removed and converted through
    ``int``, which drops leading zeros.

    Args:
        root_folder: Directory that contains the numeric sub-directories.
        neo4j_uploader: Object exposing ``create_paper_node(parsed_data)``.
    """
    for entry in os.listdir(root_folder):
        entry_path = os.path.join(root_folder, entry)
        # Anything that is not a digits-only directory is ignored.
        if not os.path.isdir(entry_path) or not entry.isdigit():
            continue

        for abs_name in os.listdir(entry_path):
            if not abs_name.endswith(".abs"):
                continue

            abs_path = os.path.join(entry_path, abs_name)
            with open(abs_path, 'r', encoding='utf-8') as handle:
                record = parse_abs_data(handle.read())
                # The id comes from the file name, without leading zeros.
                record['paper_id'] = str(int(abs_name.replace('.abs', '')))
                neo4j_uploader.create_paper_node(record)
                print(f"Uploaded: {record['paper_id']}")

# Function to read and parse Cit-HepTh.txt and create edges in Neo4j with progress tracking
def create_edges_from_file(file_path, neo4j_uploader, last_processed_edge=0,
                           progress_path="last_processed_edge.txt"):
    """Create one citation edge per data line of an edge-list file.

    Lines that are blank or whose first token starts with ``#`` are ignored
    and do not count as edges. Every other line is numbered from 1 and must
    hold exactly two whitespace-separated paper ids.

    Args:
        file_path: Path of the edge-list file (e.g. Cit-HepTh.txt).
        neo4j_uploader: Object exposing ``create_citation_edge(src, dst)``.
        last_processed_edge: Number of leading edges to skip because an
            earlier run already created them.
        progress_path: File that receives the number of processed edges.
            It is rewritten every 1000 edges, when the run is interrupted
            by an exception, and once more when the file is exhausted.

    Raises:
        ValueError: If a non-skipped data line does not hold two fields.
    """
    current_edge = 0
    completed_edge = None  # last edge created during this call, if any
    try:
        with open(file_path, 'r') as file:
            for line in file:
                fields = line.split()
                # Blank lines would break the two-field unpacking below and
                # must not shift the edge numbering used for resuming.
                if not fields or fields[0].startswith("#"):
                    continue
                current_edge += 1
                # Skip edges already processed
                if current_edge <= last_processed_edge:
                    continue

                if len(fields) != 2:
                    raise ValueError(
                        f"Edge {current_edge}: expected 2 fields, got {line!r}"
                    )
                from_node, to_node = fields
                neo4j_uploader.create_citation_edge(from_node, to_node)
                completed_edge = current_edge
                print(f"Created edge: {from_node} -> {to_node}")

                # Save the progress every 1000 edges to avoid starting over
                # completely if interrupted
                if current_edge % 1000 == 0:
                    _save_edge_progress(progress_path, current_edge)
    except BaseException:
        # Record the last edge that was really created so that a resumed
        # run repeats at most the edge that failed, then re-raise.
        if completed_edge is not None:
            _save_edge_progress(progress_path, completed_edge)
        raise

    # Save the final progress
    _save_edge_progress(progress_path, current_edge)


def _save_edge_progress(progress_path, edge_count):
    """Overwrite ``progress_path`` with ``edge_count`` as decimal text."""
    with open(progress_path, "w") as progress_file:
        progress_file.write(str(edge_count))

# Helper function to load the last processed edge from file
def load_last_processed_edge(progress_path="last_processed_edge.txt"):
    """Return the edge count stored in the progress file.

    Args:
        progress_path: File holding the count as decimal text.

    Returns:
        The stored integer, or 0 when the file does not exist or does not
        contain a valid integer (for example when it is empty).
    """
    try:
        with open(progress_path, "r") as file:
            content = file.read().strip()
    except FileNotFoundError:
        return 0

    try:
        return int(content)
    except ValueError:
        # An unreadable counter means the resume point is unknown, so the
        # caller starts over from the first edge.
        print(f"Ignoring invalid progress value {content!r} in {progress_path}")
        return 0

# Usage
# root_folder = "/Users/ericweng/Git/CS6400-project/abstract"

# SECURITY: the database password must not live in source control. It is read
# from the environment instead; a password that was previously committed in
# this file should be treated as leaked and rotated.
neo4j_uri = os.environ.get("NEO4J_URI", "neo4j+s://849cda2b.databases.neo4j.io")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD")
if not neo4j_password:
    raise RuntimeError("Set the NEO4J_PASSWORD environment variable before running.")

neo4j_uploader = Neo4jUploader(neo4j_uri, neo4j_user, neo4j_password)
try:
    # Delete existing nodes, then upload new nodes and edges
    # neo4j_uploader.delete_all_papers()
    # process_all_abs_files(root_folder, neo4j_uploader)

    # Load the last processed edge to continue from where it left off
    # last_processed_edge = load_last_processed_edge()
    # The literal below is a manually chosen resume offset for this run.
    create_edges_from_file("/nethome/xweng42/CS6400/cit-HepTh.txt", neo4j_uploader, 235329)
finally:
    # Close the driver even when the import is interrupted or fails.
    neo4j_uploader.close()
print("All nodes and edges uploaded successfully.")


  session.write_transaction(self._create_citation_if_not_exists, from_node, to_node)


Created edge: 202052 -> 105052
Created edge: 202052 -> 105070
Created edge: 202052 -> 105093
Created edge: 202052 -> 105117
Created edge: 202052 -> 105214
Created edge: 202052 -> 105249
Created edge: 202052 -> 106113
Created edge: 202052 -> 107238
Created edge: 202052 -> 110007
Created edge: 202052 -> 110265
Created edge: 202052 -> 111008
Created edge: 202052 -> 111093
Created edge: 202052 -> 111182
Created edge: 202052 -> 111238
Created edge: 202052 -> 112253
Created edge: 202052 -> 201247
Created edge: 202052 -> 9204046
Created edge: 202052 -> 9207016
Created edge: 202052 -> 9402002
Created edge: 202052 -> 9411187
Created edge: 202052 -> 9812013
Created edge: 202053 -> 104188
Created edge: 202053 -> 108058
Created edge: 202053 -> 111190
Created edge: 202053 -> 9208029
Created edge: 202053 -> 9211121
Created edge: 202053 -> 9312062
Created edge: 202053 -> 9403195
Created edge: 202053 -> 9407087
Created edge: 202053 -> 9408074
Created edge: 202053 -> 9408099
Created edge: 202053 -> 941