Here is a proof of concept for importing nested folders of MicroCT data and extracting the .log files, then putting it all in a labeled property graph.

In [1]:
from PIL import Image
from PIL.TiffTags import TAGS
import glob
import os


def extract_meta_dict(glob_str, verbose=False):
    meta_dict = {}
    print("Scanning files ...\n")
    for _file in glob.glob(glob_str):
        if verbose:
            print(f"  - {_file}")
        img = Image.open(_file)
        _meta = {TAGS[key] : img.tag[key] for key in img.tag_v2}
        _fn_short = os.path.basename(_file)
        meta_dict[_fn_short] = _meta
    print("\n... done.")
    return {_k: meta_dict[_k] for _k in sorted(meta_dict)}
    

def extract_tiff_tags(img):
    for _k in img.tag_v2:
        if _k in TAGS.keys():
            print(TAGS[_k], img.tag[_k])
        else:
            print(f"CUSTOM TAG[{_k}]", img.tag[_k])
        print()


def scan_directory_with_meta(base_folder):
    glob_dict = {'folder': {}, 'file': [], 'meta': {}}
    for _f in glob.glob(f"{base_folder}/**", recursive=True):
        _fp = os.path.abspath(_f)
        if os.path.exists(_fp):
            relative_path = _fp[len(os.path.abspath(base_folder)) + 1:].split(os.sep)
            current_level = glob_dict
            for part in relative_path[:-1]:
                if part not in current_level['folder']:
                    current_level['folder'][part] = {'folder': {}, 'file': [], 'meta': {}}
                current_level = current_level['folder'][part]
            if os.path.isdir(_fp):
                if relative_path[-1] not in current_level['folder']:
                    current_level['folder'][relative_path[-1]] = {'folder': {}, 'file': [], 'meta': {}}
            elif os.path.isfile(_fp):
                file_info = {
                    'name': relative_path[-1],
                    'filepath': _fp,
                    'extension': os.path.splitext(_fp)[1],
                    'size': os.path.getsize(_fp)
                }

                current_level['file'].append(file_info)
                if _fp.endswith('.log'):
                    # Parse the log file and store its contents in the 'meta' dictionary
                    current_level['meta'] = parse_log_file(_fp)
    return glob_dict
    

def summarize_and_print_tree(glob_dict, indent_level=0):
    # Print file extension summary for the current folder
    print('')
    extension_count = {}
    for file_info in glob_dict['file']:
        file_name = file_info['name']
        extension = os.path.splitext(file_name)[1]  # Extract file extension
        if extension in extension_count:
            extension_count[extension] += 1
        else:
            extension_count[extension] = 1
    for ext, count in extension_count.items():
        print('    ' * indent_level + f"{ext if ext else '(no extension)'} ({count} files)")
    # Process each folder recursively
    for folder_name, folder_content in glob_dict['folder'].items():
        print('    ' * indent_level + f"+ {folder_name}/")
        summarize_and_print_tree(folder_content, indent_level + 1)


def collect_meta_entries(glob_dict, base_folder, current_path=''):
    meta_entries = {}
    # Check if there's meta data at this level
    if glob_dict['meta']:
        full_path = os.path.join(base_folder, current_path)
        meta_entries[full_path] = glob_dict['meta']

    # Recursively process each subfolder
    for folder_name, folder_content in glob_dict['folder'].items():
        new_path = os.path.join(current_path, folder_name)
        meta_entries.update(collect_meta_entries(folder_content, base_folder, new_path))

    return meta_entries


def parse_log_file(file_path):
    meta_dict = {}
    current_section = None
    try:
        with open(file_path, 'r') as file:
            for line in file:
                line = line.strip()
                if line.startswith('[') and line.endswith(']'):
                    current_section = line[1:-1]
                    meta_dict[current_section] = {}
                elif '=' in line:
                    key, value = line.split('=', 1)
                    if current_section:
                        meta_dict[current_section][key.strip()] = value.strip()
                    else:
                        meta_dict[key.strip()] = value.strip()
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
    return meta_dict



In [2]:
base_folder = 'LaraM new study'
result = scan_directory_with_meta(base_folder)
meta_data = collect_meta_entries(result, base_folder)
summarize_and_print_tree(result)


+ /

+ Scan1/

    + 2R/

        .tif (558 files)
        .log (1 files)
        + 2R_Rec/

            .tif (411 files)
            + temp/

                .bmp (1 files)
                .tif (3 files)
                .log (1 files)
    + both/

        .tif (558 files)
        .log (1 files)
        + both_Rec/

            .tif (459 files)
            .log (1 files)
            + temp/

                .tif (4 files)
                .bmp (1 files)
                .log (1 files)
    + none/

        .tif (557 files)
        .log (1 files)
        + none_Rec/

            .tif (434 files)
            + temp/

                .tif (3 files)
                .log (1 files)
                .bmp (1 files)
    + 1R/

        .tif (558 files)
        .log (1 files)
        + 1R_Rec/

            .tif (422 files)
            + temp/

                .bmp (1 files)
                .tif (3 files)
                .log (1 files)
    + 1L/

        .tif (557 files)
        .log (1 files)
      

In [3]:
unique_key_sets = set()
for _fp, _scan in meta_data.items():
    unique_key_sets.add(frozenset(_scan.keys()))
for key_set in unique_key_sets:
    print(list(key_set))

['Acquisition', 'User', 'System', 'Reconstruction', 'File name convention']
['Acquisition', 'System', 'Reconstruction', 'User']


In [4]:
print()
for _k, _v in meta_data['LaraM new study/Scan1/1L'].items():
    print(f" {_k}")
    print()
    for _kk, _vv in _v.items():
        print(f"   {_kk:>50s}:  {_vv}")
    print()


 System

                                              Scanner:  SkyScan1276
                                       Instrument S/N:  18F17059
                                     Software Version:  1.4
                          Magnification Drive Version:  3.4
                                 Camera Drive Version:  3.4
                                Filter  Drive Version:  3.4
                                 Animal Drive Version:  3.4
                                       Home Directory:  C:\SkyScan1276
                                          Source Type:  HAMAMATSU L10321
                                          Camera Type:  XIMEA MH110XC-KK-TP
                               Camera Pixel Size (um):  17.359
                                     Camera X/Y Ratio:  0.9949

 User

                                            User Name:  vspan
                                        Computer Name:  MICROCT

 Acquisition

                                       Data Directory:  D:\Res

In [5]:
result['folder']['Scan1']['folder'].keys()

dict_keys(['2R', 'both', 'none', '1R', '1L'])

In [6]:
from neomodel import StructuredNode, StringProperty, RelationshipTo, IntegerProperty, RelationshipFrom

class Folder(StructuredNode):
    name = StringProperty()
    filepath = StringProperty(index=True)
    contains_folder = RelationshipTo('Folder', 'CONTAINS_FOLDER')
    contained_by = RelationshipFrom('Folder', 'CONTAINS_FOLDER')
    contains_file = RelationshipTo('File', 'CONTAINS_FILE')

class File(StructuredNode):
    name = StringProperty()
    filepath = StringProperty(index=True)
    extension = StringProperty()
    size = IntegerProperty()
    contained_by = RelationshipFrom('Folder', 'CONTAINS_FILE')

class Scan(StructuredNode):
    filepath = StringProperty(index=True)
    stored_in = RelationshipTo('Folder', 'STORED_IN')
    involved = RelationshipTo('Section', 'INVOLVED')

class Section(StructuredNode):
    name = StringProperty(unique_index=True)
    involved_in = RelationshipFrom('Scan', 'INVOLVED')


import os
import networkx as nx

def build_graph(data_dict, base_folder):
    G = nx.DiGraph()
    base_path = os.path.abspath(base_folder)

    def process_folder(current_dict, parent_id=None):
        """Recursively process each folder and file in the current dictionary."""
        for folder_path, folder_content in current_dict['folder'].items():
            folder_id = f"folder_{folder_path}"
            folder_absolute_path = os.path.join(base_path, folder_path)
            G.add_node(folder_id, label='Folder', name=folder_path, filepath=folder_absolute_path)
            
            # Establish a containment relationship with the parent folder
            if parent_id:
                G.add_edge(parent_id, folder_id, relationship='contains_folder')

            # Process files in the folder
            process_files(folder_content['file'], folder_id)

            # Recurse into subfolders
            process_folder(folder_content, folder_id)

            # Process metadata associated with the folder
            process_metadata(folder_content['meta'], folder_id, folder_absolute_path)

    def process_files(files, parent_folder_id):
        """Add file nodes and establish relationships to the containing folder."""
        for file_info in files:
            file_id = f"file_{file_info['filepath']}"
            G.add_node(file_id, label='File', **file_info)
            G.add_edge(parent_folder_id, file_id, relationship='contains_file')

    def process_metadata(meta_dict, folder_id, folder_path):
        """Add metadata nodes and establish involved relationships."""
        if meta_dict:
            for section, attrs in meta_dict.items():
                section_id = f"{section}_{folder_path}"
                _properties = {
                    to_lower_camel_case(convert_chars_for_neo4j(_k)): _v
                    for _k, _v in attrs.items()
                }
                G.add_node(section_id, label=section, **_properties)
                scan_id = f"scan_{folder_path}"
                G.add_node(scan_id, label='Scan', filepath=folder_path)
                G.add_edge(scan_id, section_id, relationship='involved')
                G.add_edge(scan_id, folder_id, relationship='stored_in')

    # Start processing from the top-level dictionary
    process_folder(data_dict)

    return G

import re

def to_lower_camel_case(s):
    s = re.sub(r"(_|-)+", " ", s).title().replace(" ", "")
    return s[0].lower() + s[1:]

def convert_chars_for_neo4j(s):
    s = s.replace('/','')
    s = s.replace('(','_')
    s = s.replace(')','_')
    s = s.replace('+','plus')
    return s

def collect_properties_by_meta_section(nx_graph):
    schema_labels = ['Folder', 'File', 'Scan'] 
    meta_section_labels = []
    for node, data in nx_graph.nodes(data=True):
        label = data['label']
        if label not in schema_labels and label not in meta_section_labels:
            meta_section_labels.append(label)

    props_by_section = {}    
    for section in meta_section_labels:
        props_by_section[section] = set()
        for node, data in nx_graph.nodes(data=True):
            label = data['label']
            if label == section:
                for _k, _v in data.items():
                    if _k != 'label':
                        props_by_section[section].add(_k)

    return {_s: list(_p) for _s, _p in props_by_section.items()}, meta_section_labels, schema_labels

from neomodel import StructuredNode, StringProperty, db
from tqdm import tqdm

def initialize_class_map_from_graph(nx_graph, meta_section_properties):
    sections = meta_section_properties.keys()
    class_map = {'Folder': Folder, 'File': File, 'Scan': Scan, 'Section': Section}
    
    # Create Neomodel classes for each unique section label found
    for section_name in meta_section_properties.keys():
        # for node, data in nx_graph.nodes(data=True):
        #     label = data['label']            
        class_key = section_name.replace(' ', '_')
        if class_key not in class_map:
            class_map[class_key] = type(class_key, (Section,), {
                _p: StringProperty(unique_index=True) 
                for _p in meta_section_properties[section_name]
            })
            
    return class_map


In [7]:
# Example usage
# Assuming 'result' is your data dictionary structured as expected
G = build_graph(result,'LaraM new study/')

In [8]:
def find_nodes_by_label(label):
    for node, data in G.nodes(data=True):
        if data['label'] == label:
            print(node)
            print(data)
            print()

In [9]:
find_nodes_by_label('Acquisition')

Acquisition_/home/patch/PycharmProjects/neb4neo/neb4neo/etl/experiment/LaraM new study/temp
{'label': 'Acquisition', 'dataDirectory': 'D:\\Results\\LauraM\\new study\\Scan2\\1L', 'filenamePrefix': '1L', 'numberOfFiles': '558', 'numberOfRows': '672', 'numberOfColumns': '1008', 'filenameIndexLength': '8', 'partialWidth': 'OFF', 'imageCropOriginX': '0', 'imageCropOriginY': '0', 'cameraBinning': '4x4', 'imageRotation': '0.00000', 'opticalAxisLine': '346', 'objectToSourceMm': '92.863', 'cameraToSourceMm': '160.543', 'sourceVoltageKv': '100', 'sourceCurrentUa': '200', 'imagePixelSizeUm': '40.164000', 'scaledImagePixelSizeUm': '40.164000', 'imageFormat': 'TIFF', 'depthBits': '16', 'referenceIntensity': '58000', 'exposureMs': '90', 'rotationStepDeg': '0.648', 'use360Rotation': 'YES', 'scanningPosition': '241.306 mm', 'frameAveraging': 'OFF (1)', 'flatFieldCorrection': 'ON', 'randomMovement': 'OFF (5)', 'filter': 'Al 0.5mm', 'gantryDirection': 'CC', 'rotationDirection': 'CC', 'intrinsicCsRotati

In [None]:
from ..labdataranger.model import  

In [10]:
node_list = list(G.nodes(data=True))
len(node_list), node_list[:3]

(9982,
 [('folder_',
   {'label': 'Folder',
    'name': '',
    'filepath': '/home/patch/PycharmProjects/neb4neo/neb4neo/etl/experiment/LaraM new study/'}),
  ('folder_Scan1',
   {'label': 'Folder',
    'name': 'Scan1',
    'filepath': '/home/patch/PycharmProjects/neb4neo/neb4neo/etl/experiment/LaraM new study/Scan1'}),
  ('folder_2R',
   {'label': 'Folder',
    'name': '2R',
    'filepath': '/home/patch/PycharmProjects/neb4neo/neb4neo/etl/experiment/LaraM new study/2R'})])

In [11]:
edge_list = list(G.edges(data=True))
len(edge_list), edge_list[:3]

(9988,
 [('folder_Scan1', 'folder_2R', {'relationship': 'contains_folder'}),
  ('folder_Scan1', 'folder_both', {'relationship': 'contains_folder'}),
  ('folder_Scan1', 'folder_none', {'relationship': 'contains_folder'})])

In [12]:
meta_section_properties, meta_section_labels, schema_labels = collect_properties_by_meta_section(G)
class_map = initialize_class_map_from_graph(G, meta_section_properties)
class_map

{'Folder': __main__.Folder,
 'File': __main__.File,
 'Scan': __main__.Scan,
 'Section': __main__.Section,
 'System': neomodel.sync_.core.System,
 'User': neomodel.sync_.core.User,
 'Acquisition': neomodel.sync_.core.Acquisition,
 'Reconstruction': neomodel.sync_.core.Reconstruction,
 'File_name_convention': neomodel.sync_.core.File_name_convention}

In [13]:
def list_class_attributes(model_class):
    print("Attributes:")
    for attr in dir(model_class):
        if not attr.startswith("__") and not callable(getattr(model_class, attr)):
            print(f"{attr}: {getattr(model_class, attr)}")

# Example usage
list_class_attributes(class_map['Acquisition'])

Attributes:
cameraBinning: <neomodel.properties.StringProperty object at 0x73e414184f70>
cameraToSourceMm: <neomodel.properties.StringProperty object at 0x73e4141844c0>
dataDirectory: <neomodel.properties.StringProperty object at 0x73e414184fd0>
depthBits: <neomodel.properties.StringProperty object at 0x73e41408a820>
element_id: <property object at 0x73e33bcb45e0>
exposureMs: <neomodel.properties.StringProperty object at 0x73e41408ab80>
filenameIndexLength: <neomodel.properties.StringProperty object at 0x73e414184e80>
filenamePrefix: <neomodel.properties.StringProperty object at 0x73e414184f40>
filter: <neomodel.properties.StringProperty object at 0x73e414184ee0>
flatFieldCorrection: <neomodel.properties.StringProperty object at 0x73e414184e20>
frameAveraging: <neomodel.properties.StringProperty object at 0x73e41408aeb0>
gantryDirection: <neomodel.properties.StringProperty object at 0x73e41408af70>
id: <property object at 0x73e33bcb4630>
imageCropOriginX: <neomodel.properties.StringPro

In [19]:
from tqdm import tqdm

def load_networkx_to_neo4j(nx_graph, class_map):
    node_map = {}

    with db.transaction:
        for node, data in tqdm(nx_graph.nodes(data=True), desc="Nodes"):
            label = data['label']
            properties = {
                k: v 
                for k, v in data.items() if k not in ['label', 'relationship']
            }
            
            NodeClass = class_map.get(label, None)
            if NodeClass:
                node_instance = NodeClass(**properties).save()
                node_map[node] = node_instance
            else:
                label_mod = to_lower_camel_case(convert_chars_for_neo4j(label))
                NodeClass = class_map.get(label_mod, None)
                if NodeClass:
                    node_instance = NodeClass(**properties).save()
                    node_map[node] = node_instance
                else:
                    print(f"WARNING: Node {node} was not matched with a label... contains:")
                    print(f"         {data}")

        for source, target, data in tqdm(nx_graph.edges(data=True), desc="Edges"):
            relationship_type = data.get('relationship')
            source_node = node_map.get(source)
            target_node = node_map.get(target)
            
            if source_node and target_node and relationship_type:
                # relationship = getattr(source_node, relationship_type, None)
                source_type = nx_graph.nodes[source]['label']
                relationship = getattr(class_map[source_type], relationship_type, None)
                if relationship:
                    getattr(source_node, relationship_type).connect(target_node)
                else:
                    print(f"Relationship type '{relationship_type}' not found between {source} and {target}")

    # TODO: I am still manually removing the "Section" label afterwards, but this could be a final step here

    print("Graph loading complete!")
    return node_map


In [20]:
len(list(G.nodes()))

9982

In [21]:
len(list(G.edges()))

9988

In [22]:
list(G.nodes(data=True))[:1]

[('folder_',
  {'label': 'Folder',
   'name': '',
   'filepath': '/home/patch/PycharmProjects/neb4neo/neb4neo/etl/experiment/LaraM new study/'})]

In [23]:
from neomodel import config
import ..labdataranger as ldr

ldr.store.neomodel_db_config(database='instruments')

# config.DATABASE_URL = f'bolt://{username}:{password}@ki-ed3g.mit.edu:7687/instruments'
node_map = load_networkx_to_neo4j(G, class_map)

aenter called


Nodes:  12%|███▋                          | 1237/9982 [00:01<00:07, 1131.01it/s]

         {'label': 'File name convention', 'filenameIndexLength': '8', 'filenamePrefix': '1L_rec'}


Nodes: 100%|██████████████████████████████| 9982/9982 [00:05<00:00, 1704.40it/s]
Edges: 100%|██████████████████████████████| 9988/9988 [00:04<00:00, 2078.77it/s]


aexit called
Graph loading complete!


In [19]:
for source, target, data in tqdm(G.edges(data=True), desc="Edges"):
    relationship_type = data.get('relationship')
    source_node = node_map.get(source)
    target_node = node_map.get(target)
    print(f"  Source: {source_node}")
    print(f"  Target: {target_node}")
    print(f" RelType: {relationship_type}")
    if source_node and target_node and relationship_type:
        relationship = getattr(source_node, relationship_type, None)


    if source_node and target_node and relationship_type:
        relationship = getattr(source_node, relationship_type, None)
        print()
        print(f"       Relationship: {relationship}")
        break

Edges:   0%|                                           | 0/9988 [00:00<?, ?it/s]

  Source: {'name': 'Scan1', 'filepath': '/home/patch/PycharmProjects/neb4neo/neb4neo/etl/experiment/LaraM new study/Scan1', 'element_id_property': '4:31534542-ba07-4b86-8482-b4daf2237641:18068'}
  Target: {'name': '2R', 'filepath': '/home/patch/PycharmProjects/neb4neo/neb4neo/etl/experiment/LaraM new study/2R', 'element_id_property': '4:31534542-ba07-4b86-8482-b4daf2237641:18069'}
 RelType: contains_folder

       Relationship: zero or more relationships in a outgoing direction of type CONTAINS_FOLDER on node (4:31534542-ba07-4b86-8482-b4daf2237641:18068) of class 'Folder'





In [22]:
source_node, target_node

(<Folder: {'name': 'Scan1', 'filepath': '/home/patch/PycharmProjects/neb4neo/neb4neo/etl/experiment/LaraM new study/Scan1', 'element_id_property': '4:31534542-ba07-4b86-8482-b4daf2237641:18068'}>,
 <Folder: {'name': '2R', 'filepath': '/home/patch/PycharmProjects/neb4neo/neb4neo/etl/experiment/LaraM new study/2R', 'element_id_property': '4:31534542-ba07-4b86-8482-b4daf2237641:18069'}>)

In [23]:
relationship_type

'contains_folder'

In [33]:
source_type = G.nodes(data=True)[source]['label']
class_map = {'Folder': Folder, 'File': File, 'Scan': Scan, 'Section': Section}
getattr(class_map.get(source_type), relationship_type, None)

<neomodel.sync_.relationship_manager.RelationshipTo at 0x7efff459dbb0>

In [24]:
getattr(source_node['label'], relationship_type, None)

<neomodel.sync_.relationship_manager.ZeroOrMore at 0x7eff1c752f10>

In [24]:
for node, data in tqdm(G.nodes(data=True), desc="Nodes"):
    print(node_map.get(node))
    break

Nodes:   0%|                                           | 0/9982 [00:00<?, ?it/s]

{'name': '', 'filepath': '/home/patch/PycharmProjects/neb4neo/neb4neo/etl/experiment/LaraM new study/', 'element_id_property': '4:31534542-ba07-4b86-8482-b4daf2237641:58945'}





In [23]:
node_map[]

{'folder_': <Folder: {'name': '', 'filepath': '/home/patch/PycharmProjects/neb4neo/neb4neo/etl/experiment/LaraM new study/', 'element_id_property': '4:31534542-ba07-4b86-8482-b4daf2237641:58945'}>,
 'folder_Scan1': <Folder: {'name': 'Scan1', 'filepath': '/home/patch/PycharmProjects/neb4neo/neb4neo/etl/experiment/LaraM new study/Scan1', 'element_id_property': '4:31534542-ba07-4b86-8482-b4daf2237641:58946'}>,
 'folder_2R': <Folder: {'name': '2R', 'filepath': '/home/patch/PycharmProjects/neb4neo/neb4neo/etl/experiment/LaraM new study/2R', 'element_id_property': '4:31534542-ba07-4b86-8482-b4daf2237641:58947'}>,
 'file_/home/patch/PycharmProjects/neb4neo/neb4neo/etl/experiment/LaraM new study/Scan1/2R/2R00000083.tif': <File: {'name': '2R00000083.tif', 'filepath': '/home/patch/PycharmProjects/neb4neo/neb4neo/etl/experiment/LaraM new study/Scan1/2R/2R00000083.tif', 'extension': '.tif', 'size': 1355230, 'element_id_property': '4:31534542-ba07-4b86-8482-b4daf2237641:58948'}>,
 'file_/home/patch

In [18]:
source, target, data = list(G.edges(data=True))[1]

In [19]:
G.has_node(source)

True

In [23]:
G.nodes()[source]['label']

'Folder'

In [32]:
d[source]['label']

'Folder'

In [None]:
for source, target, data in G.edges(data=True):
    print(f"Edge from {source} to {target} with data {data}")