- **Originally created by: Samuel Hobbs on 12/8/2024**
- **Last edited by: Ben Harris 1/9/2024**

## The script’s primary goal is to:

- Read hyperspectral .tiff files from designated directories.

- Check that each .tiff file matches a valid resolution range (e.g., 4.0 - 7.0 currently).

- Verify that each .tiff file has a label in a corresponding CSV (via sample number).

- Convert each .tiff file into a pandas DataFrame where each row represents one pixel, and columns represent the spectral bands plus additional metadata (e.g., filename, label, resolution shape).

- Concatenate all the individual DataFrames into a single large DataFrame of pixel-level data for all valid .tiff files.

- Save the resulting DataFrame (and an accompanying file-UID map) as a CSV file.

- Once the main CSV is saved, add each pixel's (row, col) coordinates within its image; these are used for displaying results and for post-processing morphology.



## Setup


### Installs

In [4]:
!pip install rasterio



### Imports

In [5]:
import rasterio
from rasterio.plot import show
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join, basename, normpath
from google.colab import drive
from ast import literal_eval

### Connect Google Drive

In [None]:
# Mount Google Drive under /content/drive/ so the Drive paths configured below can be read and written.
drive.mount('/content/drive/')

### File Paths
NOTE: Please update these paths as needed!

In [None]:
# Directories
# Root of the shared project folder on the mounted Drive; every other path is built from it.
main_dir     = '/content/drive/Shareddrives/Land_Classification_Training_shared/Land_Classification_training_work/'
samples_dir  = join(main_dir, 'Samples/')  # Holds the .tiff sample directory and the CSV listing each file's resolution

# Directory containing all .tiff files
tiff_dir     = join(samples_dir, 'ANG_L2A_v2_sample_subset')

# CSV that contains resolution info (columns: file_name, x_res, y_res)
res_csv_path = join(samples_dir, 'ANG_L2A_v2_sample_subset_resolutions.csv')

# CSV containing labels for each sample_num
labels_csv   = join(main_dir, 'labels.csv')

# Output CSV file names and paths
csv_dir                 = join(main_dir, 'Updated/')
csv_file_name_samples   = 'samples_single_dir.csv'   # main pixel-level data
csv_file_name_uid       = 'files_single_dir.csv'     # file-UID map
path_to_save_sample_csv = join(csv_dir, csv_file_name_samples)
path_to_save_uid_csv    = join(csv_dir, csv_file_name_uid)

# Names of the columns in the labels CSV that hold the sample number and the class label.
# Update these if labels.csv uses different headers. The same names are relied on
# by the later modeling steps, so keep them consistent across notebooks.
label_col_sample_num = 'Sample_num'
label_col_label      = 'Class'

## Functions


### General Functions

In [None]:
def get_labels(
  file_path,
  col1='Sample_num',
  col2='Class',
  name_col_id='Sample_num',
  name_col_label='Label'
):
  """
  Load the labels CSV and return only its sample-number and label columns,
  renamed and ordered by sample number.

  Parameters:
    file_path: Path to the labels CSV.
    col1: Header of the CSV column holding the sample number.
    col2: Header of the CSV column holding the label/class.
    name_col_id: Name given to the sample-number column in the result.
    name_col_label: Name given to the label column in the result.

  Returns:
    A pandas DataFrame with columns [name_col_id, name_col_label], sorted
    ascending by name_col_id. The original CSV row index is kept.
  """
  full_table = pd.read_csv(file_path)

  # Keep just the two columns of interest, under their output names.
  id_and_label = full_table[[col1, col2]]
  renamed = id_and_label.rename(columns={col1: name_col_id, col2: name_col_label})

  return renamed.sort_values(by=name_col_id, ascending=True)


In [None]:
def trim_data_files(
  filenames,
  labels,
  sample_num_col_name='Sample_num'
):
  """
  Keeps only the filenames whose leading integer (the text before the first
  '_') appears as a sample number in `labels`.

  Matching is done by set membership, so neither `filenames` nor `labels`
  has to be sorted; the returned names keep the order of `filenames`.

  Parameters:
    filenames: List or array of filename strings (ex: "12_abcdef.tiff").
    labels: Pandas DataFrame of labels.
    sample_num_col_name: Column in `labels` that matches the file prefix (sample_num).

  Returns:
    A numpy array of matching filenames.

  Raises:
    AssertionError: If no filename matches any label.
  """
  # Missing sample numbers cannot match any file, so drop them before int().
  labelled_nums = {int(num) for num in labels[sample_num_col_name].dropna()}

  trimmed_filenames = [
    name for name in filenames
    if int(name.split('_')[0]) in labelled_nums
  ]

  # Raised explicitly (not via `assert`) so the check survives `python -O`,
  # while callers still see the same exception type as before.
  if not trimmed_filenames:
    raise AssertionError("No filenames match the given labels.")

  return np.array(trimmed_filenames)


In [None]:
def check_res(
  filename,
  res_dict,
  min_res_bound=4.0,
  max_res_bound=7.0
):
  """
  Checks if the x/y resolution of a given filename is within the min/max bound.
  Instead of reading resolution from Rasterio, we read from a dictionary
  (filename -> (xres, yres)) that was loaded from a CSV file.

  Parameters:
    filename: The .tiff filename (string).
    res_dict: Dictionary with structure {filename: (xres, yres)}.
    min_res_bound: Minimum allowed resolution (inclusive).
    max_res_bound: Maximum allowed resolution (inclusive).

  Returns:
    True if both resolutions are within bounds; otherwise False. Files that
    are missing from `res_dict`, or whose resolution is NaN, return False.
  """
  try:
    resolution = res_dict[filename]
  except KeyError:
    return False

  xres, yres = resolution

  # Test for "inside the range" rather than "outside the range": every
  # comparison against NaN is False, so the positive form rejects NaN
  # resolutions (e.g. an empty CSV cell) instead of letting them through.
  return bool(
    min_res_bound <= xres <= max_res_bound
    and min_res_bound <= yres <= max_res_bound
  )


In [None]:
def tiff_to_arr(filepath):
  """
  Reads a .tiff file with Rasterio and hands back its full pixel data
  together with the image's shape rendered as text.

  Parameters:
    filepath: Full path to the .tiff file.

  Returns:
    (pixels, shape_text)
    pixels -> numpy array as returned by the dataset's read(), i.e. (bands, rows, cols)
    shape_text -> str() of the dataset's shape, e.g. '(10, 10)'
  """
  # The context manager closes the file handle as soon as the data is in memory.
  with rasterio.open(filepath) as src:
    pixels = src.read()
    dims = src.shape

  return pixels, str(dims)


In [None]:
def convert_3D_to_1D(data_3D):
  """
  Turns a band-first image cube into a per-pixel table.

  Parameters:
    data_3D: NumPy array of shape (num_bands, num_rows, num_cols).

  Returns:
    NumPy array of shape (num_rows*num_cols, num_bands); row i holds every
    band value of the i-th pixel in row-major order.
  """
  band_count = data_3D.shape[0]

  # One flat row of pixels per band ...
  bands_by_pixel = data_3D.reshape(band_count, -1)

  # ... then swap axes so pixels run down the rows and bands across the columns.
  return bands_by_pixel.transpose()


In [None]:
def get_filenames(directory_path):
  """
  Lists the .tiff files directly inside a directory (subfolders are not
  searched), ordered by the integer that precedes the first '_' in each name.

  Parameters:
    directory_path: Path to the directory containing .tiff files.

  Returns:
    A np.array of filenames in ascending order of their numeric prefix.
  """
  def leading_number(name):
    # "12_abcdef.tiff" -> 12
    return int(name.split('_')[0])

  tiff_names = []
  for entry in listdir(directory_path):
    # Skip directories and anything else that is not a regular file.
    if not isfile(join(directory_path, entry)):
      continue
    # Extension check is case-insensitive.
    if entry.lower().endswith('.tiff'):
      tiff_names.append(entry)

  tiff_names.sort(key=leading_number)
  return np.array(tiff_names)


In [None]:
def make_pandas_dataframe(
  dir_path,
  filename,
  label=pd.NA,
  uid=0
):
  """
  Builds the pixel-level DataFrame for a single .tiff file (whose resolution
  has already been validated). The number of band columns follows the file.

  Parameters:
    dir_path: Directory where the .tiff file resides.
    filename: The .tiff file name.
    label: The label/class applied to every pixel of this image (default: pd.NA).
    uid: A unique integer ID for the file (default: 0).

  Returns:
    A pandas DataFrame of shape (num_pixels, num_bands+4) with columns
    [frq0, frq1, ..., 'Label', 'Shape', 'File_UID_Num', 'File'].
  """
  # Load the band cube plus the image shape rendered as a string.
  cube, shape_text = tiff_to_arr(join(dir_path, filename))

  # (bands, rows, cols) --> one row per pixel, one column per band
  pixel_rows = convert_3D_to_1D(cube)

  # One "frq<i>" column per spectral band found in the file.
  band_names = [f"frq{band}" for band in range(cube.shape[0])]
  frame = pd.DataFrame(pixel_rows, columns=band_names)

  # Per-file metadata, broadcast to every pixel row (insertion order = column order).
  metadata = {
    'Label': label,
    'Shape': shape_text,
    'File_UID_Num': uid,
    'File': filename,
  }
  for column, value in metadata.items():
    frame[column] = value

  return frame


In [None]:
def load_data_and_add_positions(csv_path):
  """
  Loads the pixel-level CSV that was saved from the main aggregator and adds,
  for every pixel row:
    'img_pxl_index' -> the row's 0-based index within its own image
    'img_pos'       -> the (row, col) location of that pixel in the image

  The per-image index is counted per file ('File_UID_Num', or 'File' if that
  column is absent), in the order the rows appear in the CSV. It is NOT taken
  from the table-wide row number: the aggregated table stacks all images, so
  a table-wide index is only a valid pixel index for the first image.

  'img_pxl_index' ends up as the first column. A saved, unnamed index column
  is reused for it; otherwise a new column is inserted, so no band column is
  ever renamed or overwritten.

  Parameters:
    csv_path: Path (or file-like object) of the pixel-level CSV. Must contain
      a 'Shape' column holding "(rows, cols)" strings.

  Returns:
    The loaded DataFrame with 'Shape' converted to tuples and the
    'img_pxl_index' and 'img_pos' columns added.

  Raises:
    KeyError: If 'Shape', or both 'File_UID_Num' and 'File', are missing.
  """
  df = pd.read_csv(csv_path)

  # Running count of rows inside each file = pixel index within that image.
  file_key = 'File_UID_Num' if 'File_UID_Num' in df.columns else 'File'
  pxl_index = df.groupby(file_key, sort=False).cumcount()

  first_col = df.columns[0]
  if 'img_pxl_index' in df.columns:
    # CSV already went through this step once; just refresh the values.
    df['img_pxl_index'] = pxl_index
  elif str(first_col).lower().startswith('unnamed'):
    # CSV was written with its index: reuse that column's slot.
    df = df.rename(columns={first_col: 'img_pxl_index'})
    df['img_pxl_index'] = pxl_index
  else:
    # CSV was written without an index: add the column, leave bands untouched.
    df.insert(0, 'img_pxl_index', pxl_index)

  # Convert shape from string to actual tuple. There are only a handful of
  # distinct shapes, so parse each one once instead of once per pixel row.
  parsed_shapes = {text: literal_eval(text) for text in df['Shape'].unique()}
  shapes = [parsed_shapes[text] for text in df['Shape']]
  df['Shape'] = shapes

  # Vectorised (row, col): row is floor division by num_cols, col is modulus.
  num_cols = np.array([shape[1] for shape in shapes], dtype=np.int64)
  idx = df['img_pxl_index'].to_numpy()
  df['img_pos'] = list(zip((idx // num_cols).tolist(), (idx % num_cols).tolist()))

  return df

In [None]:
def add_img_lvl_pixel_loc(row):
  """
  Maps one pixel record to its (row, col) location inside its image.

  `row` must provide 'Shape' -- a (rows, cols) tuple, or the string form of
  one -- and 'img_pxl_index', the pixel's row-major index within the image.
  """
  shape = row['Shape']
  if isinstance(shape, str):
    # Shape came straight from a CSV cell, e.g. "(100, 100)".
    shape = literal_eval(shape)

  # Only the image width is needed to unflatten a row-major index.
  _, width = shape

  # divmod yields (index // width, index % width) == (row, col).
  return divmod(row['img_pxl_index'], width)

### Main Function Call

In [None]:
def get_all_data_single_dir(
    tiff_directory,
    label_csv_path,
    res_dict,
    min_res=4.0,
    max_res=7.0,
    label_col_1='Sample_num',
    label_col_2='Class'
):
    """
    Builds the combined pixel-level DataFrame for every usable .tiff file in
    `tiff_directory`. A file is usable when it has a label in `label_csv_path`
    and its resolution in `res_dict` lies within [min_res, max_res]; files
    that fail the resolution check are skipped without a message.

    Parameters:
      tiff_directory: Path to the directory containing all .tiff files.
      label_csv_path: Path to the CSV that contains (sample_num, label) info.
      res_dict: A dictionary of {filename: (x_res, y_res)} loaded from the resolution CSV.
      min_res: Minimum allowed resolution. (default: 4.0)
      max_res: Maximum allowed resolution. (default: 7.0)
      label_col_1: Column name in label CSV that is the sample_num. (default: 'Sample_num')
      label_col_2: Column name in label CSV that is the class/label. (default: 'Class')

    Returns:
      (df, df_files)
      df -> A pandas DataFrame of all pixel data across valid .tiff files.
      df_files -> A pandas DataFrame with columns ['Label','UID','Filename'] for each included .tiff file.

    Raises:
      ValueError: If no labelled file passes the resolution check.
    """
    print("=== Starting Data Aggregation ===")

    # Labels come back with fixed column names so the lookups below are stable.
    labels = get_labels(
        file_path=label_csv_path,
        col1=label_col_1,
        col2=label_col_2,
        name_col_id='Sample_num',
        name_col_label='Label'
    )

    # All .tiff files, ordered by their numeric prefix.
    filenames = get_filenames(tiff_directory)

    print("Trimming filenames to match labels...")
    trimmed_filenames = trim_data_files(filenames, labels, 'Sample_num')
    print(f"Number of files with matching labels: {len(trimmed_filenames)} / {len(filenames)}")

    frames = []
    file_records = []
    next_uid = 1  # UIDs are handed out only to files that are actually included

    for tiff_name in trimmed_filenames:
        sample_id = int(tiff_name.split('_')[0])
        matching_labels = labels.loc[labels['Sample_num'] == sample_id, 'Label']
        file_label = matching_labels.values[0]

        # Resolution comes from the CSV-based res_dict, not from the file itself.
        if not check_res(tiff_name, res_dict, min_res_bound=min_res, max_res_bound=max_res):
            continue

        frames.append(
            make_pandas_dataframe(
                dir_path=tiff_directory,
                filename=tiff_name,
                label=file_label,
                uid=next_uid
            )
        )
        file_records.append((file_label, next_uid, tiff_name))
        next_uid += 1

    if not frames:
        raise ValueError("No valid .tiff files passed the resolution check.")

    # Stack every per-file table into one, with a fresh continuous index.
    df_all = pd.concat(frames, ignore_index=True)

    # One row per included file: its label, UID and filename.
    df_files = pd.DataFrame(file_records, columns=['Label','UID','Filename'])

    print("=== Data Aggregation Complete ===")
    return df_all, df_files


## Main Run



In [None]:
#############################################
#               LOAD RESOLUTIONS
#############################################
# Loading the CSV of x_res and y_res so that we can check
# resolutions without relying on Rasterio for this information. (bit quicker given we have the metadata)

# The CSV (res_csv_path) must have these columns: file_name, x_res, y_res

df_resolutions = pd.read_csv(res_csv_path)

# Build the lookup { "some_file.tiff": (x_res, y_res), ... } in one pass.
# ASSUMPTION: file_name is EXACTLY the .tiff file's name, since check_res()
# looks files up by that exact string.
res_dict = {
  file_name: (float(x_res), float(y_res))
  for file_name, x_res, y_res in zip(
    df_resolutions['file_name'],
    df_resolutions['x_res'],
    df_resolutions['y_res'],
  )
}

In [None]:
# Run the main data retrieval using our single directory approach
# CAUTION: slow -- this took 22 to 41 minutes to run; the run time varies between sessions.
df, valid_files = get_all_data_single_dir(
    tiff_directory = tiff_dir,
    label_csv_path = labels_csv,
    res_dict       = res_dict,     # loaded from ANG_L2A_v2_sample_subset_resolutions.csv
    min_res        = 4.0,          # Defined resolution boundaries
    max_res        = 7.0,
    # Use the column names configured in the File Paths cell rather than
    # repeating the literals, so changing them there actually takes effect.
    label_col_1    = label_col_sample_num,
    label_col_2    = label_col_label
)

In [None]:
# Adding Pixel-Level Locations
# load_data_and_add_positions() reads the pixel table back from disk, so the
# freshly aggregated data has to be written first. Without this the cell fails
# on a first run (no file yet) or silently loads a stale CSV from an earlier run.
# The index is written on purpose: the loader renames the FIRST column to
# 'img_pxl_index', and that must be the index column, not the first band.
df.to_csv(path_to_save_sample_csv)
df = load_data_and_add_positions(path_to_save_sample_csv)

### View Data Frames

In [None]:
# Let's check it out: the pixel-level pandas DataFrame
df

In [None]:
# Check out the file/UID map: a pandas DataFrame with columns ['Label', 'UID', 'Filename']
valid_files

### Save To CSV

In [None]:
# Write both outputs without the DataFrame index: the pixel-level table first,
# then the file-UID map.
outputs = (
    (df, path_to_save_sample_csv),
    (valid_files, path_to_save_uid_csv),
)
for frame, destination in outputs:
    frame.to_csv(destination, index=False)

print(f"Saved pixel-level data to: {path_to_save_sample_csv}")
print(f"Saved file-UID map to:     {path_to_save_uid_csv}")