In [5]:
import os
import logging
import networkx as nx
from pathlib import Path
import time
import pandas as pd
import glob
from PIL import Image
from PIL.TiffTags import TAGS
import pickle


class FileTreeManager:
    
    def __init__(self, base_directory, skips=None, verbose=False, log_file=None, checkpoint_file=None):
        
        self.base_directory = Path(base_directory)
        self.skips = skips if skips is not None else []
        self.verbose = verbose
        self.log_file = log_file
        self.file_tree = None
        self.file_types = (
            '.log',
            '.json',
            '.xml',
            '.xml.bak',
            '.vxml',
            '.vxml.bak',
            '.mxml',
            '.mxml.bak',
            '.txt',
            '.dcm',
            '.dicom',
            '.tif',
            '.tiff'
        )
        
        if log_file:
            self.setup_logging(log_file, verbose)
        
        if checkpoint_file:
            if Path(checkpoint_file).is_file():
                self.load_state(checkpoint_file)
                self.build_file_path_index()
            else:
                if verbose:
                    print(f"No checkpoint available at {checkpoint_file}")
                    print(f"To create, run `manager.collect_file_tree()`")

    
    def setup_logging(self, log_file, verbose):
        logging.basicConfig(filename=log_file, level=logging.INFO, format='%(asctime)s %(message)s')
        if verbose:
            console = logging.StreamHandler()
            console.setLevel(logging.INFO)
            formatter = logging.Formatter('%(asctime)s %(message)s')
            console.setFormatter(formatter)
            logging.getLogger().addHandler(console)

    def log_message(self, message):
        logging.info(message)

    def remove_contents_key(self, meta):
        if isinstance(meta, dict) and 'contents' in meta:
            meta_copy = meta.copy()  # Create a shallow copy of the dictionary
            del meta_copy['contents']  # Remove the 'contents' key
            return meta_copy
        return meta

    def parse_metadata_file(self, file_path):
        parsers = {
            '.log': self.parse_log_file,
            '.json': self.parse_json_file,
            '.xml': self.parse_xml_file,
            '.dcm': self.parse_dicom_file,
            '.dicom': self.parse_dicom_file,
            '.tif': self.parse_tif_file,
            '.tiff': self.parse_tif_file,
            # Add other specific file type parsers here
        }
        _, ext = os.path.splitext(file_path)
        parser = parsers.get(ext)
        if parser:
            return parser(file_path)
        else:
            self.log_message(f"No parser available for file with extension {ext}")
            return {}

    # Dummy parser methods for demonstration
    def parse_log_file(self, file_path):
        meta_dict = {}
        current_section = None
        try:
            with open(file_path, 'r') as file:
                for line in file:
                    line = line.strip()
                    if line.startswith('[') and line.endswith(']'):
                        current_section = line[1:-1]
                        meta_dict[current_section] = {}
                    elif '=' in line:
                        key, value = line.split('=', 1)
                        if current_section:
                            meta_dict[current_section][key.strip()] = value.strip()
                        else:
                            meta_dict[key.strip()] = value.strip()
        except FileNotFoundError:
            print(f"File not found: {file_path}")
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
        return meta_dict


    def parse_json_file(self, file_path):
        return {"json": "metadata"}

    def parse_xml_file(self, file_path):
        return {"xml": "metadata"}

    def parse_dicom_file(self, file_path):
        return {"dicom": "metadata"}

    def parse_tif_file(self, file_path):
        try:
            img = Image.open(file_path)
            meta = {}
            for key in img.tag_v2:
                if key in TAGS:
                    meta[TAGS[key]] = img.tag[key]
                else:
                    meta[f"TAG_{key}"] = img.tag[key]
            return meta
        except Exception as e:
            self.log_message(f"Error processing TIFF file {file_path}: {e}")
            return {}

    def update_folder_sizes(self, parts, file_size, tree):
        current_tree = tree
        for part in parts:
            if part not in current_tree:
                current_tree[part] = {'type': 'folder', 'size': 0, 'created': None, 'modified': None, 'contents': {}}
            current_tree = current_tree[part]['contents']
            if 'size' in current_tree:
                current_tree['size'] += file_size
            else:
                current_tree['size'] = file_size

    def collect_file_tree(self):
        def add_to_tree(path, tree):
            parts = path.relative_to(self.base_directory).parts
            current_tree = tree
            for part in parts[:-1]:
                if part not in current_tree:
                    current_tree[part] = {'type': 'folder', 'size': 0, 'created': None, 'modified': None, 'contents': {}}
                current_tree = current_tree[part]['contents']
            if path.is_dir():
                stats = path.stat()
                current_tree[parts[-1]] = {
                    'type': 'folder',
                    'size': 0,  # Placeholder size for folders
                    'created': time.ctime(stats.st_ctime),
                    'modified': time.ctime(stats.st_mtime),
                    'contents': {}
                }
                self.log_message(f"Added directory: {path}")
            elif path.is_file():
                stats = path.stat()
                meta_data = {}
                if path.suffix in self.file_types:
                    meta_data = self.parse_metadata_file(path)
                current_tree[parts[-1]] = {
                    'type': f'{path.suffix}',
                    'size': stats.st_size,
                    'created': time.ctime(stats.st_ctime),
                    'modified': time.ctime(stats.st_mtime),
                    'contents': None,
                    'metadata': meta_data
                }
                self.log_message(f"Added file: {path}")
                # Update the size of all parent directories
                self.update_folder_sizes(parts[:-1], stats.st_size, tree)

        # Ensure the base directory itself is included
        file_tree = {
            'base': {
                'type': 'folder',
                'size': 0,  # Placeholder size for base directory
                'created': time.ctime(self.base_directory.stat().st_ctime),
                'modified': time.ctime(self.base_directory.stat().st_mtime),
                'contents': {}
            }
        }
        
        for item in self.base_directory.rglob('*'):
            if not any(skip in str(item) for skip in self.skips):
                add_to_tree(item, file_tree['base']['contents'])
                if item.is_dir():
                    self.log_message(f"Processing directory: {item}")
                elif item.is_file():
                    self.log_message(f"Processing file: {item}")
        
        self.file_tree = file_tree
        return file_tree

    def build_graph_from_file_tree(self):
        G = nx.DiGraph()
        
        def add_nodes_and_edges(tree, parent_path):
            for name, meta in tree.items():
                current_path = f"{parent_path}/{name}" if parent_path else name
                node_data = self.remove_contents_key(meta)
                if isinstance(node_data, dict):
                    G.add_node(current_path, **node_data)
                    if parent_path:
                        G.add_edge(parent_path, current_path)
                    if meta['type'] == 'folder' and 'contents' in meta:
                        add_nodes_and_edges(meta['contents'], current_path)
                else:
                    self.log_message(f"Skipping node {current_path} with non-dict metadata: {node_data}")
        
        add_nodes_and_edges(self.file_tree['base']['contents'], 'base')
        return G

    def build_file_path_index(self):
        """ Build an index of all file paths for autocompletion. """
        self.file_path_index = []

        def recurse_tree(tree, current_path):
            for name, meta in tree.items():
                if isinstance(meta, dict) and 'type' in meta:
                    new_path = f"{current_path}/{name}" if current_path else name
                    self.file_path_index.append(new_path)
                    if meta['type'] == 'folder' and 'contents' in meta:
                        recurse_tree(meta['contents'], new_path)

        if self.file_tree:
            recurse_tree(self.file_tree['base']['contents'], 'base')
        print("File path index built.")


    def autocomplete_path(self, prefix):
        """ Autocomplete potential directory paths based on the index. """
        return [path for path in self.file_path_index if path.startswith(prefix)]

    def get_directory_contents(self, path):
        parts = path.split('/')[1:]
        current_tree = self.file_tree['base']['contents']
        for part in parts:
            if part in current_tree:
                current_tree = current_tree[part]['contents']
            else:
                raise ValueError(f"Path '{path}' not found in the directory structure.")
        return current_tree

    def list_files(self, directory='base'):
        _l = []

        def process_files(tree):
            for _k, _v in tree.items():
                if isinstance(_v, dict) and _v.get('type') != 'folder':
                    _d = _v.copy()
                    del _d['contents']
                    _d['name'] = _k
                    _l.append(_d)

        directory_contents = self.get_directory_contents(directory)
        process_files(directory_contents)
        return pd.DataFrame(_l)

    def list_folders(self, directory='base'):
        _l = []

        def folder_size_sum(tree):
            _sum_size = 0
            for _i, _contents in tree.items():
                if isinstance(_contents, dict):
                    if _contents['type'] == 'folder':
                        _sum_size += folder_size_sum(_contents['contents'])
                    else:
                        _sum_size += _contents['size']
            return _sum_size

        def process_folders(tree):
            for _k, _v in tree.items():
                if isinstance(_v, dict) and _v.get('type') == 'folder':
                    _d = {
                        'type': _v['type'],
                        'size': folder_size_sum(_v['contents']),
                        'created': _v['created'],
                        'modified': _v['modified'],
                        'name': _k
                    }
                    _l.append(_d)

        directory_contents = self.get_directory_contents(directory)
        process_folders(directory_contents)
        return pd.DataFrame(_l)

    def list_all(self, directory='base'):
        files_df = self.list_files(directory)
        folders_df = self.list_folders(directory)
        return pd.concat([files_df, folders_df], ignore_index=True)

    def extract_tiff_tags(self, img):
        for _k in img.tag_v2:
            if _k in TAGS.keys():
                print(TAGS[_k], img.tag[_k])
            else:
                print(f"CUSTOM TAG[{_k}]", img.tag[_k])

    def save_state(self, file_name):
        """ Save the necessary data structures of the FileTreeManager to a file. """
        state = {
            'base_directory': self.base_directory,
            'file_tree': self.file_tree
        }
        with open(file_name, 'wb') as f:
            pickle.dump(state, f)
        print(f"State saved to {file_name}.")

    def load_state(self, file_name):
        """ Load the necessary data structures of the FileTreeManager from a file. """
        with open(file_name, 'rb') as f:
            state = pickle.load(f)
            self.base_directory = state['base_directory']
            self.file_tree = state['file_tree']
        print(f"State loaded from {file_name}.")
    

In [6]:
import os
import glob
import concurrent.futures
from pathlib import Path

# Define your FileTreeManager class here or import it if defined elsewhere

base_path = '/mnt/data/archive/LaraM_Full_Dataset'
skips = [
    'System Volume Information', 
    '$RECYCLE.BIN'
]
checkpoint_fstr = 'file_tree_scrape.pkl'
log_fstr = 'file_tree_scrape.log'

verbose = False

def process_directory(base_directory_path):
    checkpoint_file = os.path.join(base_directory_path, checkpoint_fstr)
    log_file = os.path.join(base_directory_path, log_fstr)
    
    if verbose:
        print(base_directory_path)
        print(checkpoint_file)
        print(log_file)
        print()
    
    manager = FileTreeManager(
        base_directory_path,
        skips, 
        log_file=log_file, 
        checkpoint_file=checkpoint_file
    )
    manager.collect_file_tree()
    manager.save_state(checkpoint_file)
    return base_directory_path

def get_base_dirs(base_path):
    base_dirs = []
    for _dir in glob.glob(f"{base_path}/*"):
        if Path(_dir).is_dir():
            base_dirs.append(_dir)
    return base_dirs

In [7]:
base_dirs = get_base_dirs(base_path)
len(base_dirs)

112

In [8]:
%%time


# Use ThreadPoolExecutor for parallel processing
with concurrent.futures.ThreadPoolExecutor(max_workers=56) as executor:

    futures = {executor.submit(process_directory, base_dir): base_dir 
               for base_dir in base_dirs}

    for future in concurrent.futures.as_completed(futures):
    
        base_dir = futures[future]
        
        try:
            result = future.result()
            if verbose:
                print(f"Processing completed for: {result}")
                
        except Exception as exc:
            print(f"Error processing {base_dir}: {exc}")


State loaded from /mnt/data/archive/LaraM_Full_Dataset/JnJ_MIT_uCTshare/file_tree_scrape.pkl.
File path index built.
State loaded from /mnt/data/archive/LaraM_Full_Dataset/$RECYCLE.BIN/file_tree_scrape.pkl.
File path index built.
State loaded from /mnt/data/archive/LaraM_Full_Dataset/System Volume Information/file_tree_scrape.pkl.
File path index built.
State saved to /mnt/data/archive/LaraM_Full_Dataset/$RECYCLE.BIN/file_tree_scrape.pkl.
State saved to /mnt/data/archive/LaraM_Full_Dataset/System Volume Information/file_tree_scrape.pkl.
State saved to /mnt/data/archive/LaraM_Full_Dataset/JnJ_MIT_uCTshare/file_tree_scrape.pkl.
State saved to /mnt/data/archive/LaraM_Full_Dataset/1_post_Rec/file_tree_scrape.pkl.
State saved to /mnt/data/archive/LaraM_Full_Dataset/1.10.22_SW/file_tree_scrape.pkl.
Error reading file /mnt/data/archive/LaraM_Full_Dataset/4.06.22_SW_U26/327/._327.log: 'utf-8' codec can't decode byte 0xb0 in position 37: invalid start byte
State saved to /mnt/data/archive/LaraM

In [8]:
log_file = os.path.join(base_dirs[0], 'file_tree_scrape.pkl')
checkpoint_file = os.path.join(base_dirs[0], 'file_tree_scrape.pkl')
manager = FileTreeManager(base_dirs[0], skips, checkpoint_file=checkpoint_file, log_file=log_file)

State loaded from /mnt/data/archive/LaraM_Full_Dataset/U38/file_tree.pkl.
File path index built.


In [16]:
_df = manager.list_all('base/2022_10_24/Cage 6/1L')

In [17]:
_df[_df['type'] == '.log']

Unnamed: 0,type,size,created,modified,metadata,name
125,.log,3390,Tue May 21 11:04:18 2024,Tue Oct 25 10:12:44 2022,{'log': 'metadata'},1L.log


In [25]:
!cat /mnt/data/archive/LaraM_Full_Dataset/U38/2022_10_24/Cage\ 6/1L/1L.log

[System]
Scanner=SkyScan1276
Instrument S/N=18F17059
Software Version=1.6
Magnification Drive Version=3.4
Camera Drive Version=3.4
Filter  Drive Version=3.4
Animal Drive Version=3.4
Home Directory=C:\SkyScan1276
Source Type=HAMAMATSU L10321
Camera Type=XIMEA MH110XC-KK-TP
Camera Pixel Size (um)=17.359
Camera X/Y Ratio=0.9949
[User]
User Name=qureshik
Computer Name=MICROCT
[Acquisition]
Data Directory=D:\Results\LauraM\U38\2022_10_24\Cage 6\1L
Filename Prefix=1L
Number Of Files=  926
Number Of Rows=  672
Number Of Columns= 1008
Filename Index Length=8
Partial Width=OFF
Image crop origin X=0
Image crop origin Y=0
Camera binning=4x4
Image Rotation=-0.01000
Optical Axis (line)=  346
Object to Source (mm)=92.863
Camera to Source (mm)=160.543
Source Voltage (kV)= 100
Source Current (uA)= 200
Image Pixel Size (um)=40.164000
Scaled Image Pixel Size (um)=40.164000
Image Format=TIFF
Depth (bits)=16
Reference Intensity=55000
Exposure (ms)=111
Rotation Step (deg)=0.389
Use 360 Rotation=YES
Scannin

Now that this seems to be working, I need something to find all of the .pkl files, export them as graphs, and then write them correctly to a knowledge graph database.

Note: If this becomes too cumbersome, there is probably a nice way to "compress" metadata on Tiff stacks so that redundant information isn't repeated.

In [2]:
base_directory_path = '/mnt/data/archive/LaraM_Full_Dataset/U10'
# base_directory_path = '/mnt/data/archive/LaraM_Full_Dataset/02.27.22_SW/'
skips = [
    'System Volume Information', 
    '$RECYCLE.BIN'
]
checkpoint_file = 'file_tree.pkl'
log_file = 'file_tree.log'

manager = FileTreeManager(
    base_directory_path,
    skips, 
    log_file=log_file, 
    checkpoint_file=checkpoint_file)

State loaded from file_tree.pkl.
File path index built.


In [3]:
paths = manager.autocomplete_path('')

In [4]:
manager.list_all("base/2021_0321")

Unnamed: 0,type,size,created,modified,name
0,folder,2857577690,Tue May 21 11:04:28 2024,Sun Mar 21 13:44:44 2021,Cage 6
1,folder,4751325695,Tue May 21 11:04:28 2024,Sun Mar 21 13:35:04 2021,Cage 1
2,folder,3837622281,Tue May 21 11:04:28 2024,Sun Mar 21 13:39:32 2021,Cage 4
3,folder,4812148812,Tue May 21 11:04:28 2024,Sun Mar 21 13:41:24 2021,Cage 5
4,folder,2970612415,Tue May 21 11:04:28 2024,Sun Mar 21 13:38:04 2021,Cage 3
5,folder,4846813889,Tue May 21 11:04:28 2024,Sun Mar 21 13:36:58 2021,Cage 2


In [8]:
manager.list_all("base/2021_0316/Cage 4/2R/")

Unnamed: 0,type,size,created,modified,metadata,name
0,.tif,1355230,Tue May 21 11:04:29 2024,Tue Mar 16 18:24:30 2021,"{'ImageWidth': (1008,), 'ImageLength': (672,),...",2R00000296.tif
1,.tif,1355230,Tue May 21 11:04:29 2024,Tue Mar 16 18:24:30 2021,"{'ImageWidth': (1008,), 'ImageLength': (672,),...",2R00000251.tif
2,.tif,1355230,Tue May 21 11:04:29 2024,Tue Mar 16 18:24:32 2021,"{'ImageWidth': (1008,), 'ImageLength': (672,),...",2R00000422.tif
3,.tif,1355230,Tue May 21 11:04:29 2024,Tue Mar 16 18:24:30 2021,"{'ImageWidth': (1008,), 'ImageLength': (672,),...",2R00000291.tif
4,.tif,1355230,Tue May 21 11:04:29 2024,Tue Mar 16 18:24:28 2021,"{'ImageWidth': (1008,), 'ImageLength': (672,),...",2R00000117.tif
...,...,...,...,...,...,...
572,.tif,1355230,Tue May 21 11:04:29 2024,Tue Mar 16 18:24:32 2021,"{'ImageWidth': (1008,), 'ImageLength': (672,),...",2R00000426.tif
573,.tif,1355230,Tue May 21 11:04:29 2024,Tue Mar 16 18:24:26 2021,"{'ImageWidth': (1008,), 'ImageLength': (672,),...",2R00000057.tif
574,.tif,1355230,Tue May 21 11:04:29 2024,Tue Mar 16 18:24:26 2021,"{'ImageWidth': (1008,), 'ImageLength': (672,),...",2R00000066.tif
575,.tif,1355230,Tue May 21 11:04:29 2024,Tue Mar 16 18:24:32 2021,"{'ImageWidth': (1008,), 'ImageLength': (672,),...",2R00000449.tif


In [23]:
contents.keys()

dict_keys(['Cage 6', 'Cage 1', 'Cage 4', 'Cage 5', 'Cage 3', 'Cage 2', 'size'])

In [25]:
contents['Cage 6'].keys()

dict_keys(['type', 'size', 'created', 'modified', 'contents'])