In [None]:
! pip3 install neo4j

In [None]:
from neo4j import GraphDatabase
import os
import time
from uuid import uuid4
import subprocess


class DirectoryGraph:
    """Write a directory tree (and related catalog entities) into Neo4j.

    Every node and relationship is written in its own write transaction
    through the driver created in ``__init__``.  Call ``close`` when done.
    """

    # Known node types mapped to the type-specific properties that are filled
    # with fixed placeholder values.  "directory" and "file" get their extra
    # properties computed from the filesystem in ``_create_nodes``.  A known
    # type is stored under the label ``node_<type>``; anything else falls
    # back to the generic ``node`` label.
    _NODE_EXTRAS = {
        "directory": {},
        "file": {},
        "table": {"num_cols": 0, "num_rows": 0},
        "column": {"col_type": "", "max_col_length": 50},
        "database": {"database_type": 0},
        "rdbms": {"num_tables": 0},
        "nosql": {},
        "label": {},
        "business_term": {},
        "classification": {},
        "owner": {},
    }

    # Known edge types mapped to the (source label, target label) the
    # endpoints must carry.  ``None`` means "match a node with any label".
    # The relationship type stored in the graph equals the edge type.
    _EDGE_ENDPOINTS = {
        "edge_has_dir_dir": ("node_directory", "node_directory"),
        "edge_has_dir_file": ("node_directory", "node_file"),
        "edge_has_file_table": ("node_file", "node_table"),
        "edge_has_table_col": ("node_table", "node_column"),
        "edge_assoc_term_col": ("node_column", "node_business_term"),
        "edge_assoc_class_col": ("node_column", "node_classification"),
        "edge_derive_table_table": ("node_table", "node_table"),
        "edge_joinable_table_table": ("node_table", "node_table"),
        "edge_unionable_table_table": ("node_table", "node_table"),
        "edge_own": ("node_owner", None),
    }

    def __init__(self, uri, user, password):
        """Create the Neo4j driver for ``uri`` using basic authentication."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        """Close the underlying Neo4j driver."""
        self.driver.close()

    def create_nodes(self, path, type):
        """Create one node for ``path`` and return its generated node id.

        Args:
            path: Filesystem path the node describes; it must exist because
                its creation and modification times are read.
            type: Node type such as ``"directory"`` or ``"file"``; unknown
                types are stored under the generic ``node`` label.
        """
        with self.driver.session() as session:
            return session.execute_write(self._create_nodes, path, type)

    def create_edges(self, source_node, target_node, edge_type):
        """Create or update the ``edge_type`` relationship between two nodes.

        Args:
            source_node: ``node_id`` of the relationship's start node.
            target_node: ``node_id`` of the relationship's end node.
            edge_type: Edge type such as ``"edge_has_dir_file"``; unknown
                types are stored as a generic ``edge`` between ``node`` nodes.
        """
        with self.driver.session() as session:
            return session.execute_write(self._create_edges, source_node, target_node, edge_type)

    def clean_path(self, path):
        """Return the last component of ``path``, ignoring trailing slashes.

        ``"a/b/c.txt"`` gives ``"c.txt"`` and ``"a/b/"`` gives ``"b"``.
        """
        # normpath strips trailing separators, so basename is never empty for
        # a directory written as "dir/".
        return os.path.basename(os.path.normpath(path))

    def get_directory_size(self, directory):
        """Return the human-readable size of ``directory`` as printed by ``du``.

        Relies on the external ``du`` command (the original author noted that
        this only works on Linux).

        Raises:
            ValueError: If ``du`` printed nothing on stdout.
        """
        result = subprocess.run(['du', '-s', '-h', directory], stdout=subprocess.PIPE, text=True)
        # Output is "<size>\t<path>"; split only once so that whitespace
        # inside the path cannot break the parsing.
        fields = result.stdout.split(maxsplit=1)
        if not fields:
            raise ValueError(f"du produced no output for {directory!r}")
        return fields[0]

    def _create_nodes(self, tx, path, node_type):
        """Transaction function: write one node and return its ``node_id``.

        The node gets a fresh UUID, so in practice the MERGE never matches an
        existing node and every call adds a new one.
        """
        short_name = self.clean_path(path)
        props = {
            "node_id": str(uuid4()),
            "type_id": node_type,
            "short_name": short_name,
            "long_name": "long" + short_name,  # TODO: Give the real long name later
            "description": "",
            "creation_date": time.strftime("%c", time.gmtime(os.path.getctime(path))),
            "modified_date": time.strftime("%c", time.gmtime(os.path.getmtime(path))),
        }

        extras = self._NODE_EXTRAS.get(node_type)
        label = "node" if extras is None else "node_" + node_type
        if node_type == "directory":
            props["dsize"] = self.get_directory_size(path)
        elif node_type == "file":
            props["extension"] = os.path.splitext(path)[1]
            props["fsize"] = os.path.getsize(path)  # in bytes
        elif extras:
            props.update(extras)

        # Labels cannot be passed as Cypher parameters, so the label is
        # interpolated; it always comes from the fixed table above, never from
        # caller input.  Joining the fields avoids a trailing comma, which
        # Cypher rejects in map literals.
        fields = ", ".join(f"{key}: ${key}" for key in props)
        tx.run(f"MERGE (f:{label} {{{fields}}})", **props)
        return props["node_id"]

    def _create_edges(self, tx, source_node, target_node, edge_type):
        """Transaction function: merge one relationship and set its properties.

        Nothing is written when either endpoint is not found by the MATCH.
        """
        short_name = "edge" + edge_type
        now = time.strftime("%c", time.gmtime())
        props = {
            "edge_id": str(uuid4()),
            "type_id": edge_type,
            "short_name": short_name,
            "long_name": "long" + short_name,
            "description": "",
            "source_node_id": source_node,
            "target_node_id": target_node,
            "creation_date": now,
            "modified_date": now,
        }

        if edge_type in self._EDGE_ENDPOINTS:
            rel_type = edge_type
            src_label, tgt_label = self._EDGE_ENDPOINTS[edge_type]
        else:
            rel_type, src_label, tgt_label = "edge", "node", "node"

        # Relationship types and labels cannot be Cypher parameters; only
        # values from the fixed table above are interpolated.
        src = f"src:{src_label}" if src_label else "src"
        tgt = f"tgt:{tgt_label}" if tgt_label else "tgt"
        assignments = ",\n    ".join(f"r.{key} = ${key}" for key in props)
        query = (
            f"MATCH ({src} {{node_id: $source_node_id}}), "
            f"({tgt} {{node_id: $target_node_id}})\n"
            f"MERGE (src)-[r:{rel_type}]->(tgt)\n"
            f"SET {assignments}"
        )
        tx.run(query, **props)

    def traverse_directory(self, directory_path):
        """Create nodes and edges for ``directory_path`` and all it contains.

        Each directory and file becomes exactly one node, linked to its
        immediate parent directory.  Nothing is created when
        ``directory_path`` cannot be walked.
        """
        self._walk_tree(directory_path, {})

    def traverse_sub_directory(self, directory_path, root_id):
        """Create nodes and edges for everything below ``directory_path``.

        Args:
            directory_path: Directory whose own node already exists.
            root_id: ``node_id`` of that existing directory node; direct
                children are attached to it.
        """
        self._walk_tree(directory_path, {directory_path: root_id})

    def _walk_tree(self, directory_path, node_ids):
        """Walk ``directory_path`` once, attaching each entry to its parent.

        ``node_ids`` maps directory paths to node ids that already exist, so
        a directory created while visiting its parent is not created again
        when ``os.walk`` later descends into it.
        """
        for root, dirs, files in os.walk(directory_path):
            parent_id = node_ids.pop(root, None)
            if parent_id is None:
                parent_id = self.create_nodes(root, "directory")
            for name in files:
                file_id = self.create_nodes(os.path.join(root, name), "file")
                self.create_edges(parent_id, file_id, "edge_has_dir_file")
            for name in dirs:
                # os.walk builds the child's root with the same join, so this
                # key matches when the walk reaches the sub-directory.
                child_path = os.path.join(root, name)
                child_id = self.create_nodes(child_path, "directory")
                node_ids[child_path] = child_id
                self.create_edges(parent_id, child_id, "edge_has_dir_dir")

    def delete_all_nodes(self):
        """Delete every node and relationship in the database."""
        with self.driver.session() as session:
            session.execute_write(self._delete_all)

    @staticmethod
    def _delete_all(tx):
        """Transaction function: detach-delete all nodes."""
        tx.run("MATCH (n) DETACH DELETE n")
                

In [None]:
# Connection settings for the local Neo4j instance.
uri = "neo4j://localhost:7687"
user = "neo4j"
password = "neo4j"

graph = DirectoryGraph(uri, user, password)
try:
    # Load the directory tree into the graph.
    graph.traverse_directory("./data/adventureworks/csv/")
    # Uncomment to wipe the whole graph instead:
    # graph.delete_all_nodes()
finally:
    # Release the driver even when the traversal raises.
    graph.close()