<a href="https://colab.research.google.com/github/leesuyee/connectivity-edu-tutorial/blob/main/mesoscale_connectivity_coding_activity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Set up environment**

Run the cell below to install the required packages with pip. When the installation finishes, restart the session and continue from the next cell.

## **DO NOT RERUN THIS CELL AFTER RESTARTING THE SESSION**



In [None]:
# s3fs: access to the S3 bucket; pandas: DataFrames;
# lxml: presumably needed as the parser backend for pandas.read_xml — confirm.
!pip install s3fs lxml pandas


Collecting botocore<1.42.31,>=1.41.0 (from aiobotocore<4.0.0,>=2.5.4->s3fs)
  Using cached botocore-1.42.30-py3-none-any.whl.metadata (5.9 kB)
Using cached botocore-1.42.30-py3-none-any.whl (14.6 MB)
Installing collected packages: botocore
  Attempting uninstall: botocore
    Found existing installation: botocore 1.42.33
    Uninstalling botocore-1.42.33:
      Successfully uninstalled botocore-1.42.33
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
boto3 1.42.33 requires botocore<1.43.0,>=1.42.33, but you have botocore 1.42.30 which is incompatible.[0m[31m
[0mSuccessfully installed botocore-1.42.30
Collecting botocore<1.43.0,>=1.42.33 (from boto3->aind-data-access-api==1.2.1)
  Using cached botocore-1.42.33-py3-none-any.whl.metadata (5.9 kB)
Using cached botocore-1.42.33-py3-none-any.whl (14.6 MB)
Installing collected packages: botocore
  Attempting un

In [None]:
from pathlib import Path
import s3fs
import pandas as pd


class load_data:
    """
    Minimal Colab-compatible loader for SmartSPIM data (CCF coordinates and region counts) streamed directly from S3

    Parameters
    ----------
    mouse_ID : str | int
        Mouse ID (e.g. 689305)

        bucket : str
        S3 bucket name (e.g. "s3://aind-open-data")

        anon : bool
        Whether to use anonymous credentials (e.g. True)

        prefer_stitched : bool
        Whether to prefer stitched data (e.g. True)

    Attributes
    ----------
    rootDir : str
        Resolved S3 path to the selected SmartSPIM dataset.

    quantPaths : dict[str, str]
        Mapping from imaging channel (e.g., ``"488"``) to the
        corresponding ``cell_count_by_region.csv`` file path.

    ccfCellsPaths : dict[str, str]
        Mapping from imaging channel to the corresponding
        ``transformed_cells.xml`` file path containing CCF
        coordinates.

    channels : list[str]
        Sorted list of available imaging channels discovered
        for the dataset.

    Methods
   ----------
   resolve_paths()
        Method to get path to whole brain volume data

    getCellsCCFdf(ch: list[str])
        Retrieves and formats CCF transformed coordinates of segmented cells into a DataFrame

    getcellcounts(ch: list[str])
        Imports the cell_counts_by_region.csv

    """

    def __init__(
        self,
        mouse_ID: str | int,
        bucket: str = "s3://aind-open-data",
        anon: bool = True,
        prefer_stitched: bool = True,
    ):
        self.mouse_ID = str(mouse_ID)
        self.bucket = bucket
        self.fs = s3fs.S3FileSystem(anon=anon)
        self.prefer_stitched = prefer_stitched

        self._resolve_paths()

    # ------------------------------------------------------------------
    # Path resolution
    # ------------------------------------------------------------------
    def _resolve_paths(self):
      """
      Method to get path to whole brain volume data
      """
      roots = self.fs.ls(self.bucket)
      matches = [r for r in roots if self.mouse_ID in r]

      if not matches:
          raise FileNotFoundError(f"No datasets found for mouse_ID {self.mouse_ID}")

      if self.prefer_stitched:
          stitched = [r for r in matches if "stitched" in r.lower()]
          if len(stitched) == 1:
              self.rootDir = stitched[0]
          elif len(stitched) > 1:
              raise ValueError(f"Multiple stitched datasets found: {stitched}")
          else:
              self.rootDir = matches[0]
      else:
          self.rootDir = matches[0]

      quant_dir = f"{self.rootDir}/image_cell_quantification"
      if not self.fs.exists(quant_dir):
          raise FileNotFoundError("image_cell_quantification directory not found")

      quant_paths = self.fs.glob(f"{quant_dir}/Ex*")

      self.quantPaths = {
          Path(p).name.split("_")[1]: f"{p}/cell_count_by_region.csv"
          for p in quant_paths
      }
      self.ccfCellsPaths = {
          Path(p).name.split("_")[1]: f"{p}/transformed_cells.xml"
          for p in quant_paths
      }

      self.channels = sorted(self.quantPaths.keys())

    # ------------------------------------------------------------------
    # Cell coordinates in CCF
    # ------------------------------------------------------------------
    def getCellsCCFdf(self, ch: list[str]):
        """
        Retrieves and formats CCF transformed coordinates of segmented cells into a DataFrame

        Parameters
        ----------
        ch : list[str]
            List of imaging channels to retrieve coordinates from (e.g., ["488", "561"])

        Returns
        -------
        dfs : pd.DataFrame
            DataFrame cwhere each row is a cell and each column is a coordinate:
            AP (anterior-posterior), DV(dorsal-ventral), ML(medial-lateral),
            with an additional "channel column indicating the channel of origin
        """
        ccfDim = [528, 320, 456]
        dfs = []

        for channel in ch:
            if channel not in self.ccfCellsPaths:
                raise KeyError(f"Channel {channel} not found")

            with self.fs.open(self.ccfCellsPaths[channel], "rb") as f:
                df = pd.read_xml(
                    f,
                    xpath="//CellCounter_Marker_File//Marker_Data//Marker_Type//Marker",
                )

            # export data in XYZ order and rename columns to AP, DV, ML
            df = (
                df[["MarkerX", "MarkerY", "MarkerZ"]]
                .rename(
                    columns={
                        "MarkerX": "AP",
                        "MarkerY": "DV",
                        "MarkerZ": "ML",
                    }
                )
                .assign(channel=channel)
            )
            # Clip coordinates within specified dimensions
            df["AP"] = df["AP"].clip(0, ccfDim[0] - 1)
            df["DV"] = df["DV"].clip(0, ccfDim[1] - 1)
            df["ML"] = df["ML"].clip(0, ccfDim[2] - 1)

            dfs.append(df)

        return pd.concat(dfs, ignore_index=True)

    # ------------------------------------------------------------------
    # Cell counts by region
    # ------------------------------------------------------------------
    def getcellcounts(self, ch: list[str]):
        """
        Imports the cell_counts_by_region.csv (quantifiction of detected cells in brain regions) as a DataFrame

        Parameters
        ----------
        ch : list[str]
            List of imaging channels to retrieve coordinates from (e.g., ["488", "561"]

        Returns
        -------
        dfs : pd.DataFrame
            DataFrame where each row is a brain region cell count in a given channel
        """
        required_columns = [
            "ID", "Acronym", "Name", "Struct_Info", "Struct_area_um3",
            "Left", "Right", "Total",
            "Left_Density", "Right_Density", "Total_Density",
        ]

        # Initialize an empty list to hold DataFrames
        cell_counts_list = []

        for channel in ch:
            if channel not in self.quantPaths:
                raise KeyError(f"Channel {channel} not found")

            # Load csv
            with self.fs.open(self.quantPaths[channel], "rb") as f:
                df = pd.read_csv(f)

                # Check if all required columns are present
                if set(required_columns).issubset(df.columns):
                  # Truncate the DataFrame to specific columns
                  cell_counts = df[required_columns]
                  # Add a new column indicating the channel
                  cell_counts = cell_counts.assign(channel=channel)
                  # Append to list
                  cell_counts_list.append(cell_counts)

                # Throw error if missing columns
                if not set(required_columns).issubset(df.columns):
                    raise ValueError(f"Missing required columns in {channel}")

            # Concatenate list into a single DataFrame
            if cell_counts_list:
              cell_counts_df = pd.concat(cell_counts_list, ignore_index=True)
            else:
            # return empty DataFrame if no data is found
              cell_counts_df = pd.DataFrame(columns = required_columns + ["channel"])

        return cell_counts_df

In [None]:
# Mouse to load; load_data resolves the dataset paths when it is constructed.
mouse_ID = "689238"
data = load_data(mouse_ID)
# Every imaging channel discovered for this dataset.
channels = data.channels

# One row per detected cell (AP, DV, ML coordinates plus its channel).
ccf_df = data.getCellsCCFdf(channels)
# One row per brain region and channel with the region's cell counts.
cell_counts_df = data.getcellcounts(channels)

In [None]:
# Display the cell-coordinate table.
ccf_df

In [None]:
# List the distinct channels present in the cell-count table.
cell_counts_df.channel.unique()

In [None]:
# Display the per-region cell-count table.
cell_counts_df