This notebook validates that the featurization is equivalent on both CPU and GPU.
Regardless of the processor used, the output should be the same.

In [11]:
import argparse
import os
import pathlib
import sys
import time

import numpy as np
import pandas as pd
import psutil
import skimage

try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False
if in_notebook:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

import gc

# Get the current working directory
cwd = pathlib.Path.cwd()

if (cwd / ".git").is_dir():
    root_dir = cwd

else:
    root_dir = None
    for parent in cwd.parents:
        if (parent / ".git").is_dir():
            root_dir = parent
            break

# Check if a Git root directory was found
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")
sys.path.append(str(root_dir / "3.cellprofiling" / "featurization_utils"))
from featurization_parsable_arguments import parse_featurization_args

In [None]:
if not in_notebook:
    arguments_dict = parse_featurization_args()
    patient = arguments_dict["patient"]
    well_fov = arguments_dict["well_fov"]


else:
    well_fov = "C4-2"
    patient = "NF0014_T1"


output_parent_path = pathlib.Path(
    f"{root_dir}/data/{patient}/extracted_features/{well_fov}/"
).resolve(strict=True)

In [13]:
features_dict = {
    "feature_name": [],
    "feature_processor": [],
    "file_path": [],
}
dict_of_dfs = {}
# get each of the features
feature_files = list(output_parent_path.glob("*parquet"))
feature_files = [f for f in feature_files if f.is_file()]
for file in feature_files:
    if "Area" in file.name and "CPU" in file.name:
        features_dict["feature_name"].append("Area")
        features_dict["feature_processor"].append("CPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Area_CPU"] = pd.read_parquet(file)
    elif "Area" in file.name and "GPU" in file.name:
        features_dict["feature_name"].append("Area")
        features_dict["feature_processor"].append("GPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Area_GPU"] = pd.read_parquet(file)
    elif "Coloc" in file.name and "CPU" in file.name:
        features_dict["feature_name"].append("Coloc")
        features_dict["feature_processor"].append("CPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Coloc_CPU"] = pd.read_parquet(file)
    elif "Coloc" in file.name and "GPU" in file.name:
        features_dict["feature_name"].append("Coloc")
        features_dict["feature_processor"].append("GPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Coloc_GPU"] = pd.read_parquet(file)
    elif "Intensity" in file.name and "CPU" in file.name:
        features_dict["feature_name"].append("Intensity")
        features_dict["feature_processor"].append("CPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Intensity_CPU"] = pd.read_parquet(file)
    elif "Intensity" in file.name and "GPU" in file.name:
        features_dict["feature_name"].append("Intensity")
        features_dict["feature_processor"].append("GPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Intensity_GPU"] = pd.read_parquet(file)
    elif "Gran" in file.name and "CPU" in file.name:
        features_dict["feature_name"].append("Granularity")
        features_dict["feature_processor"].append("CPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Granularity_CPU"] = pd.read_parquet(file)
    elif "Gran" in file.name and "GPU" in file.name:
        features_dict["feature_name"].append("Granularity")
        features_dict["feature_processor"].append("GPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Granularity_GPU"] = pd.read_parquet(file)
    elif "Neighbors" in file.name and "CPU" in file.name:
        features_dict["feature_name"].append("Neighbors")
        features_dict["feature_processor"].append("CPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Neighbors_CPU"] = pd.read_parquet(file)
    elif "Texture" in file.name and "CPU" in file.name:
        features_dict["feature_name"].append("Texture")
        features_dict["feature_processor"].append("CPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Texture_CPU"] = pd.read_parquet(file)
    else:
        print(f"Unknown feature file: {file.name}")

In [14]:
import hashlib


def get_file_hash(file_path):
    """Calculate SHA256 hash of a file without loading it as an image."""
    try:
        with open(file_path, "rb") as f:
            return hashlib.sha256(f.read()).hexdigest()
    except Exception as e:
        return f"Error: {e}"

In [15]:
features_df = pd.DataFrame.from_dict(features_dict)

features_df.rename(columns={"index": "feature_processor"}, inplace=True)
# get the file size in KB
features_df["file_size_KB"] = features_df["file_path"].apply(
    lambda x: x.stat().st_size / (1024)
)
# get the sha256 hash of the file
features_df["sha256"] = features_df["file_path"].apply(get_file_hash)
features_df

Unnamed: 0,feature_name,feature_processor,file_path,file_size_KB,sha256
0,Intensity,CPU,~/Documents/GFF_3D_organoid_profi...,28.285156,cf20b4ef5bf383b5b8f2b4a0b919454d96aaa85068cc71...
1,Coloc,GPU,~/Documents/GFF_3D_organoid_profi...,31.249023,a3d62069e0d87d2663a5c42969be44ba762ab77ab0e71b...
2,Coloc,CPU,~/Documents/GFF_3D_organoid_profi...,31.249023,a3d62069e0d87d2663a5c42969be44ba762ab77ab0e71b...
3,Texture,CPU,~/Documents/GFF_3D_organoid_profi...,15.901367,2a691c2c3c902f5aca484ab5ae31faed3f7b1ad733b771...
4,Granularity,CPU,~/Documents/GFF_3D_organoid_profi...,15.957031,c024a7766ffb295726239b9e2d6c19136f0b4e2f02699e...
5,Neighbors,CPU,~/Documents/GFF_3D_organoid_profi...,3.459961,5057656d8fa4d0330c74ea1d7de4d0c41ca3085f437dc8...
6,Area,GPU,~/Documents/GFF_3D_organoid_profi...,14.212891,63dc8d3f3049e77d6730f6b32376a74395df893df6b9a1...
7,Intensity,GPU,~/Documents/GFF_3D_organoid_profi...,28.285156,cf20b4ef5bf383b5b8f2b4a0b919454d96aaa85068cc71...
8,Area,CPU,~/Documents/GFF_3D_organoid_profi...,14.212891,63dc8d3f3049e77d6730f6b32376a74395df893df6b9a1...


In [16]:
# pivot the dataframe to have one row per feature and processor type
features_df = features_df.pivot(
    index=["feature_name"], columns="feature_processor", values="file_path"
).reset_index()
features_df["CPU_file_size_KB"] = features_df["CPU"].apply(
    lambda x: x.stat().st_size / (1024) if isinstance(x, pathlib.Path) else None
)
features_df["GPU_file_size_KB"] = features_df["GPU"].apply(
    lambda x: x.stat().st_size / (1024) if isinstance(x, pathlib.Path) else None
)
features_df["CPU_sha256"] = features_df["CPU"].apply(get_file_hash)
features_df["GPU_sha256"] = features_df["GPU"].apply(get_file_hash)
features_df.insert(
    1,
    "sha256_match",
    features_df.apply(lambda row: row["CPU_sha256"] == row["GPU_sha256"], axis=1),
)
features_df

feature_processor,feature_name,sha256_match,CPU,GPU,CPU_file_size_KB,GPU_file_size_KB,CPU_sha256,GPU_sha256
0,Area,True,~/Documents/GFF_3D_organoid_profi...,~/Documents/GFF_3D_organoid_profi...,14.212891,14.212891,63dc8d3f3049e77d6730f6b32376a74395df893df6b9a1...,63dc8d3f3049e77d6730f6b32376a74395df893df6b9a1...
1,Coloc,True,~/Documents/GFF_3D_organoid_profi...,~/Documents/GFF_3D_organoid_profi...,31.249023,31.249023,a3d62069e0d87d2663a5c42969be44ba762ab77ab0e71b...,a3d62069e0d87d2663a5c42969be44ba762ab77ab0e71b...
2,Granularity,False,~/Documents/GFF_3D_organoid_profi...,,15.957031,,c024a7766ffb295726239b9e2d6c19136f0b4e2f02699e...,"Error: expected str, bytes or os.PathLike obje..."
3,Intensity,True,~/Documents/GFF_3D_organoid_profi...,~/Documents/GFF_3D_organoid_profi...,28.285156,28.285156,cf20b4ef5bf383b5b8f2b4a0b919454d96aaa85068cc71...,cf20b4ef5bf383b5b8f2b4a0b919454d96aaa85068cc71...
4,Neighbors,False,~/Documents/GFF_3D_organoid_profi...,,3.459961,,5057656d8fa4d0330c74ea1d7de4d0c41ca3085f437dc8...,"Error: expected str, bytes or os.PathLike obje..."
5,Texture,False,~/Documents/GFF_3D_organoid_profi...,,15.901367,,2a691c2c3c902f5aca484ab5ae31faed3f7b1ad733b771...,"Error: expected str, bytes or os.PathLike obje..."
