In [1]:
from pathlib import Path
import time
import pandas as pd

def collect_file_tree(base_directory):
    base_path = Path(base_directory)
    
    def add_to_tree(path, tree):
        parts = path.relative_to(base_path).parts
        current_tree = tree
        for part in parts[:-1]:
            current_tree = current_tree.setdefault(part, {'type': 'folder', 'size': 0, 'created': None, 'modified': None, 'contents': {}})['contents']
        if path.is_dir():
            stats = path.stat()
            current_tree[parts[-1]] = {
                'type': 'folder',
                'size': 0,  # Placeholder size for folders
                'created': time.ctime(stats.st_ctime),
                'modified': time.ctime(stats.st_mtime),
                'contents': {}
            }
        elif path.is_file():
            stats = path.stat()
            current_tree[parts[-1]] = {
                'type': f'{path.suffix}',
                'size': stats.st_size,
                'created': time.ctime(stats.st_ctime),
                'modified': time.ctime(stats.st_mtime),
                'contents': None
            }

    # Ensure the base directory itself is included
    file_tree = {
        'base': {
            'type': 'folder',
            'size': 0,  # Placeholder size for base directory
            'created': time.ctime(base_path.stat().st_ctime),
            'modified': time.ctime(base_path.stat().st_mtime),
            'contents': {}
        }
    }
    
    for item in base_path.rglob('*'):
        add_to_tree(item, file_tree['base']['contents'])
    
    return file_tree


def list_files(file_tree):
    _l = []
    
    def process_files(tree):
        for _k, _v in tree.items():
            if isinstance(_v, dict) and _v['type'] != 'folder':
                _d = _v.copy()
                _d['name'] = _k
                _l.append(_d)

    process_files(file_tree)
    return pd.DataFrame(_l)

def list_folders(file_tree):
    _l = []

    def folder_size_sum(tree):
        _sum_size = 0
        for _i, _contents in tree.items():
            if isinstance(_contents, dict):
                if _contents['type'] == 'folder':
                    _sum_size += folder_size_sum(_contents['contents'])
                else:
                    _sum_size += _contents['size']
        return _sum_size

    def process_folders(tree):
        for _k, _v in tree.items():
            if isinstance(_v, dict) and 'type' in _v and _v['type'] == 'folder':
                _d = {
                    'type': _v['type'],
                    'size': folder_size_sum(_v['contents']),
                    'created': _v['created'],
                    'modified': _v['modified'],
                    'name': _k
                }
                _l.append(_d)

    process_folders(file_tree)
    return pd.DataFrame(_l)

def list_all(file_tree):
    files_df = list_files(file_tree)
    folders_df = list_folders(file_tree)
    return pd.concat([files_df, folders_df], ignore_index=True)


In [3]:
# Example usage:
base_directory_path = '.'
file_tree = collect_file_tree(base_directory_path)


In [4]:
# List files
files_df = list_files(file_tree['base']['contents'])
print("Files:")
files_df

Files:


Unnamed: 0,type,size,created,modified,contents,name
0,.ipynb,458708,Wed May 15 11:26:37 2024,Wed May 15 11:26:37 2024,,MicroCT_Filetree_to_KG.ipynb
1,.py,0,Wed May 15 11:45:52 2024,Wed May 15 11:45:52 2024,,__init__.py
2,,94,Wed May 15 15:00:25 2024,Wed May 15 15:00:25 2024,,.gitignore
3,.ipynb,35253,Thu May 9 12:27:29 2024,Thu May 9 12:27:29 2024,,MicroCT_KG_Meta_Analysis.ipynb
4,.ipynb,9434,Tue May 14 15:43:05 2024,Tue May 14 15:43:05 2024,,CFT_Filetree_to_KG.ipynb
5,.py,4907,Thu May 9 15:16:11 2024,Thu May 9 15:16:11 2024,,kg_property_summary_microct-Copy3.py
6,.py,3710,Thu May 9 14:31:15 2024,Thu May 9 14:31:15 2024,,kg_property_summary_microct-Copy2.py
7,.ipynb,13144,Wed May 15 12:44:31 2024,Wed May 15 12:44:31 2024,,DICOM_Reader.ipynb
8,.py,4999,Thu May 16 10:17:58 2024,Thu May 16 10:17:58 2024,,kg_property_summary_microct.py
9,.ipynb,12517,Wed May 15 13:20:36 2024,Wed May 15 13:20:36 2024,,US_Filetree_to_KG.ipynb


In [5]:
# List folders with total size
folders_df = list_folders(file_tree['base']['contents'])
print("Folders:")
folders_df

Folders:


Unnamed: 0,type,size,created,modified,name
0,folder,16024,Mon Nov 6 15:09:25 2023,Mon Nov 6 15:09:25 2023,eda
1,folder,865838216,Tue May 14 15:25:54 2024,Tue May 14 15:25:54 2024,US v Caliper analyzed
2,folder,10598002342,Tue May 7 10:37:46 2024,Tue Feb 7 16:12:30 2023,LaraM new study
3,folder,8984944257,Thu May 2 14:57:15 2024,Thu May 2 13:28:01 2024,LungData
4,folder,44671,Thu May 16 14:37:12 2024,Thu May 16 14:37:12 2024,filetree
5,folder,4728462,Thu May 2 12:46:06 2024,Thu May 2 12:45:39 2024,PancreaticTumor
6,folder,50720,Mon Nov 6 15:09:25 2023,Mon Nov 6 15:09:25 2023,intake_form
7,folder,337,Wed May 15 11:45:52 2024,Wed May 15 11:45:52 2024,__pycache__
8,folder,773145,Fri May 17 12:21:12 2024,Fri May 17 12:21:12 2024,.ipynb_checkpoints
9,folder,793597790,Thu May 2 14:57:15 2024,Thu May 2 14:56:40 2024,Example mcroCT tiff files


In [6]:
# List all files and folders
all_df = list_all(file_tree['base']['contents'])
print("All files and folders:")
all_df

All files and folders:


Unnamed: 0,type,size,created,modified,contents,name
0,.ipynb,458708,Wed May 15 11:26:37 2024,Wed May 15 11:26:37 2024,,MicroCT_Filetree_to_KG.ipynb
1,.py,0,Wed May 15 11:45:52 2024,Wed May 15 11:45:52 2024,,__init__.py
2,,94,Wed May 15 15:00:25 2024,Wed May 15 15:00:25 2024,,.gitignore
3,.ipynb,35253,Thu May 9 12:27:29 2024,Thu May 9 12:27:29 2024,,MicroCT_KG_Meta_Analysis.ipynb
4,.ipynb,9434,Tue May 14 15:43:05 2024,Tue May 14 15:43:05 2024,,CFT_Filetree_to_KG.ipynb
5,.py,4907,Thu May 9 15:16:11 2024,Thu May 9 15:16:11 2024,,kg_property_summary_microct-Copy3.py
6,.py,3710,Thu May 9 14:31:15 2024,Thu May 9 14:31:15 2024,,kg_property_summary_microct-Copy2.py
7,.ipynb,13144,Wed May 15 12:44:31 2024,Wed May 15 12:44:31 2024,,DICOM_Reader.ipynb
8,.py,4999,Thu May 16 10:17:58 2024,Thu May 16 10:17:58 2024,,kg_property_summary_microct.py
9,.ipynb,12517,Wed May 15 13:20:36 2024,Wed May 15 13:20:36 2024,,US_Filetree_to_KG.ipynb


In [26]:
list_folders(file_tree)

Unnamed: 0,type,size,created,modified,name
0,folder,16024,Mon Nov 6 15:09:25 2023,Mon Nov 6 15:09:25 2023,eda
1,folder,16024,Wed Jun 14 14:33:30 2023,Wed Jun 14 14:33:30 2023,eda/__pycache__
2,folder,865838216,Tue May 14 15:25:54 2024,Tue May 14 15:25:54 2024,US v Caliper analyzed
3,folder,2084,Tue May 14 15:55:04 2024,Tue May 14 15:55:04 2024,US v Caliper analyzed/.ipynb_checkpoints
4,folder,865833551,Tue May 14 16:16:08 2024,Tue May 14 16:16:08 2024,US v Caliper analyzed/20211110180827403
5,folder,357292,Tue May 14 16:20:52 2024,Tue May 14 16:20:52 2024,US v Caliper analyzed/20211110180827403/.ipynb...
6,folder,10598002342,Tue May 7 10:37:46 2024,Tue Feb 7 16:12:30 2023,LaraM new study
7,folder,5236700074,Tue May 7 10:37:46 2024,Tue Feb 7 16:12:37 2023,LaraM new study/Scan1
8,folder,1028833491,Tue May 7 10:38:27 2024,Tue Feb 7 17:49:53 2023,LaraM new study/Scan1/2R
9,folder,272612009,Tue May 7 10:38:31 2024,Tue Feb 7 17:03:37 2023,LaraM new study/Scan1/2R/2R_Rec


In [7]:
def get_directory_contents(file_tree, path):
    parts = path.split('/')
    current_tree = file_tree
    for part in parts:
        if part in current_tree:
            current_tree = current_tree[part]['contents']
        else:
            raise ValueError(f"Path '{path}' not found in the directory structure.")
    return current_tree

In [13]:
get_directory_contents(file_tree, 'base/LaraM new study/Scan1').keys()

dict_keys(['2R', 'both', 'none', '1R', '1L'])

In [8]:
# Example usage:
base_directory_path = '.'
file_tree = collect_file_tree(base_directory_path)

# Get contents of the base directory
base_contents = get_directory_contents(file_tree, 'base')

# List files in the base directory
files_df = list_files(base_contents)
print("Files in base directory:")
print(files_df)

# List folders in the base directory
folders_df = list_folders(base_contents)
print("Folders in base directory:")
print(folders_df)

# List all files and folders in the base directory
all_df = list_all(base_contents)
print("All files and folders in base directory:")
print(all_df)

# Get contents of a subdirectory (e.g., 'base/folder_name')
subfolder_contents = get_directory_contents(file_tree, 'base/folder_name')

# List files in the subdirectory
files_df = list_files(subfolder_contents)
print("Files in subdirectory:")
print(files_df)

# List folders in the subdirectory
folders_df = list_folders(subfolder_contents)
print("Folders in subdirectory:")
print(folders_df)

# List all files and folders in the subdirectory
all_df = list_all(subfolder_contents)
print("All files and folders in subdirectory:")
print(all_df)

Files in base directory:
      type    size                   created                  modified  \
0   .ipynb  458708  Wed May 15 11:26:37 2024  Wed May 15 11:26:37 2024   
1      .py       0  Wed May 15 11:45:52 2024  Wed May 15 11:45:52 2024   
2               94  Wed May 15 15:00:25 2024  Wed May 15 15:00:25 2024   
3   .ipynb   35253  Thu May  9 12:27:29 2024  Thu May  9 12:27:29 2024   
4   .ipynb    9434  Tue May 14 15:43:05 2024  Tue May 14 15:43:05 2024   
5      .py    4907  Thu May  9 15:16:11 2024  Thu May  9 15:16:11 2024   
6      .py    3710  Thu May  9 14:31:15 2024  Thu May  9 14:31:15 2024   
7   .ipynb   13144  Wed May 15 12:44:31 2024  Wed May 15 12:44:31 2024   
8      .py    4999  Thu May 16 10:17:58 2024  Thu May 16 10:17:58 2024   
9   .ipynb   12517  Wed May 15 13:20:36 2024  Wed May 15 13:20:36 2024   
10  .ipynb   80102  Fri May 17 12:37:57 2024  Fri May 17 12:37:57 2024   
11  .ipynb   10686  Wed May 15 16:48:19 2024  Wed May 15 16:32:17 2024   
12     .py   

ValueError: Path 'base/folder_name' not found in the directory structure.

In [1]:
from pathlib import Path
import time
import pandas as pd


def list_files(file_tree):
    _l = []
    for _k, _v in file_tree.items():
        if _v['type'] != 'folder':
            _d = _v.copy()
            _d['name'] = _k
            _l.append(_d)
    return pd.DataFrame(_l)


def list_folders(file_tree):
    _l = []

    def folder_size_sum(tree):
        _sum_size = 0
        for _i, _contents in tree.items():
            if isinstance(_contents, dict) and 'type' in _contents:
                if _contents['type'] == 'folder':
                    _sum_size += folder_size_sum(_contents)
                else:
                    _sum_size += _contents['size']
        return _sum_size

    def process_folders(tree, path=""):
        for _k, _v in tree.items():
            if isinstance(_v, dict) and 'type' in _v and _v['type'] == 'folder':
                if path == "":
                    folder_path = f"{path}/{_k}" if path else _k
                    _d = {
                        'type': _v['type'],
                        'size': folder_size_sum(_v),
                        'created': _v['created'],
                        'modified': _v['modified'],
                        'name': folder_path
                    }
                    _l.append(_d)

    process_folders(file_tree)
    return pd.DataFrame(_l)


def list_all(file_tree):
    files_df = list_files(file_tree)
    folders_df = list_folders(file_tree)
    return pd.concat([files_df, folders_df], ignore_index=True)


def collect_file_tree(base_directory):
    base_path = Path(base_directory)
    
    def add_to_tree(path, tree):
        parts = path.relative_to(base_path).parts
        for part in parts[:-1]:
            tree = tree.setdefault(part, {})
        if path.is_dir():
            stats = path.stat()
            tree[parts[-1]] = {
                'type': 'folder',
                'size': 0,
                'created': time.ctime(stats.st_ctime),
                'modified': time.ctime(stats.st_mtime)
            }
        elif path.is_file():
            stats = path.stat()
            tree[parts[-1]] = {
                'type': f'{path.suffix}',
                'size': stats.st_size,
                'created': time.ctime(stats.st_ctime),
                'modified': time.ctime(stats.st_mtime)
            }

    file_tree = {}
    
    for item in base_path.rglob('*'):
        add_to_tree(item, file_tree)
    
    return file_tree


In [2]:
base_directory_path = '.'
file_tree = collect_file_tree(base_directory_path)

files_df = list_files(file_tree)
folders_df = list_folders(file_tree)
all_df = list_all(file_tree)

In [3]:
all_df.sort_values(by=['type','name'], ascending=False).head()

Unnamed: 0,type,size,created,modified,name
22,folder,50720,Mon Nov 6 15:09:25 2023,Mon Nov 6 15:09:25 2023,intake_form
20,folder,44671,Thu May 16 14:37:12 2024,Thu May 16 14:37:12 2024,filetree
16,folder,16024,Mon Nov 6 15:09:25 2023,Mon Nov 6 15:09:25 2023,eda
23,folder,337,Wed May 15 11:45:52 2024,Wed May 15 11:45:52 2024,__pycache__
17,folder,865838216,Tue May 14 15:25:54 2024,Tue May 14 15:25:54 2024,US v Caliper analyzed


In [4]:
folders_df.head()

Unnamed: 0,type,size,created,modified,name
0,folder,16024,Mon Nov 6 15:09:25 2023,Mon Nov 6 15:09:25 2023,eda
1,folder,865838216,Tue May 14 15:25:54 2024,Tue May 14 15:25:54 2024,US v Caliper analyzed
2,folder,10598002342,Tue May 7 10:37:46 2024,Tue Feb 7 16:12:30 2023,LaraM new study
3,folder,8984944257,Thu May 2 14:57:15 2024,Thu May 2 13:28:01 2024,LungData
4,folder,44671,Thu May 16 14:37:12 2024,Thu May 16 14:37:12 2024,filetree


In [5]:
files_df.sort_values(by='modified').head()

Unnamed: 0,type,size,created,modified,name
15,.ipynb,322858,Fri May 17 11:00:16 2024,Fri May 17 11:00:16 2024,Demo_Scrape_and_Store_Metadata.ipynb
11,.ipynb,18573,Fri May 17 12:20:18 2024,Fri May 17 12:20:18 2024,Untitled.ipynb
14,.ipynb,12813,Thu May 2 17:45:17 2024,Thu May 2 17:45:17 2024,TIFF_Reader.ipynb
3,.ipynb,35253,Thu May 9 12:27:29 2024,Thu May 9 12:27:29 2024,MicroCT_KG_Meta_Analysis.ipynb
13,.py,3902,Thu May 9 14:23:09 2024,Thu May 9 14:23:09 2024,kg_property_summary_microct-Copy1.py


In [6]:
list_folders(file_tree['intake_form']).sort_values(['type', 'name'], ascending=False).reset_index(drop=True).head()

Unnamed: 0,type,size,created,modified,name
0,folder,17315,Mon Nov 6 15:09:25 2023,Mon Nov 6 15:09:25 2023,schema
1,folder,20897,Thu Jun 29 14:52:11 2023,Thu Jun 29 14:52:11 2023,__pycache__
2,folder,12508,Thu Jun 29 14:05:36 2023,Thu Jun 29 14:05:36 2023,.ipynb_checkpoints


In [7]:
file_tree.keys()

dict_keys(['MicroCT_Filetree_to_KG.ipynb', '__init__.py', 'eda', '.gitignore', 'MicroCT_KG_Meta_Analysis.ipynb', 'US v Caliper analyzed', 'CFT_Filetree_to_KG.ipynb', 'LaraM new study', 'LungData', 'kg_property_summary_microct-Copy3.py', 'kg_property_summary_microct-Copy2.py', 'filetree', 'DICOM_Reader.ipynb', 'kg_property_summary_microct.py', 'US_Filetree_to_KG.ipynb', 'PancreaticTumor', 'intake_form', '__pycache__', '.ipynb_checkpoints', 'IVIS_Filetree_Parse.ipynb', 'Untitled.ipynb', 'kg_property_summary_microct-Copy4.py', 'Example mcroCT tiff files', 'kg_property_summary_microct-Copy1.py', 'TIFF_Reader.ipynb', 'Demo_Scrape_and_Store_Metadata.ipynb'])