In [11]:
import pandas as pd
import os
import sys

def _day_folder_sort_key(folder_name: str) -> tuple:
    """Sort key that orders 'DAY_<n>' folder names numerically (DAY_2 before DAY_10)."""
    suffix = folder_name[len('DAY_'):]
    if suffix.isdecimal():
        return (0, int(suffix), folder_name)
    # Folders without a purely numeric suffix go after all numbered days, alphabetically.
    return (1, 0, folder_name)


def aggregate_data_part(base_path: str, data_part_name: str) -> pd.DataFrame:
    """
    Aggregates all CSV files for a specific data part across multiple day folders.

    Files are looked up at '<base_path>/DAY_X/DAY_X/<data_part_name>/*.csv' (each day
    folder contains a sub-folder of the same name). Day folders are processed in
    numeric order and the CSV files of each day in alphabetical order, so the row
    order of the result is reproducible. Two columns, 'source_day' and
    'source_file', are added to every row to record where it came from.

    Files that cannot be read are reported on stdout and skipped; day folders that
    do not contain the requested part are skipped and summarised in one message.

    Args:
        base_path (str): The path to the main directory containing all 'DAY_X' folders.
        data_part_name (str): The name of the data folder to aggregate (e.g., 'Activities').

    Returns:
        pd.DataFrame: A single DataFrame containing all the aggregated data.
                      Returns an empty DataFrame if no data is found.
    """
    if not os.path.isdir(base_path):
        print(f"Error: Base directory not found at '{base_path}'")
        print("Please ensure your directory structure is correct: project_root/data/SDHAR/")
        return pd.DataFrame()

    # Plain string sorting would put 'DAY_10' before 'DAY_2'; use a numeric key instead.
    day_folders = sorted(
        (
            entry for entry in os.listdir(base_path)
            if entry.startswith('DAY_') and os.path.isdir(os.path.join(base_path, entry))
        ),
        key=_day_folder_sort_key,
    )

    if not day_folders:
        print(f"Warning: No 'DAY_X' folders found in '{base_path}'")
        return pd.DataFrame()

    print(f"Found {len(day_folders)} day folders. Processing...")

    all_dataframes = []
    days_without_part = []

    for day_folder in day_folders:
        # The day folder name appears twice: DAY_X/DAY_X/<data_part_name>.
        part_path = os.path.join(base_path, day_folder, day_folder, data_part_name)

        if not os.path.isdir(part_path):
            days_without_part.append(day_folder)
            continue

        # os.listdir returns entries in arbitrary order; sort for a stable row order.
        csv_files = sorted(f for f in os.listdir(part_path) if f.lower().endswith('.csv'))

        for csv_file in csv_files:
            file_path = os.path.join(part_path, csv_file)
            try:
                temp_df = pd.read_csv(file_path)
                temp_df['source_day'] = day_folder
                temp_df['source_file'] = csv_file
                all_dataframes.append(temp_df)
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole aggregation.
                print(f"Could not read or process file '{file_path}' because {e}")

    if days_without_part:
        # Surfacing this makes a misspelled (or wrong-case) part name easy to notice.
        print(
            f"Note: no '{data_part_name}' folder in {len(days_without_part)} of "
            f"{len(day_folders)} day folders; those days were skipped."
        )

    if not all_dataframes:
        print(f"No CSV files found for '{data_part_name}'.")
        return pd.DataFrame()

    # Concatenate all dataframes into one
    return pd.concat(all_dataframes, ignore_index=True)

In [12]:
# Name of the per-day sub-folder whose CSV files are aggregated.
PART_TO_AGGREGATE = 'Activities'

# Start from an empty frame so that a failed run can never report a stale
# `aggregated_df` left in the kernel by an earlier execution of this cell.
aggregated_df = pd.DataFrame()

try:
    # The data directory is resolved relative to the parent of the current
    # working directory: <cwd>/../data/SDHAR
    notebook_dir = os.getcwd()
    project_root = os.path.abspath(os.path.join(notebook_dir, '..'))
    DATASET_BASE_DIR = os.path.join(project_root, 'data', 'SDHAR')

    print(f"Starting data aggregation for part: '{PART_TO_AGGREGATE}'")
    print(f"Detected data directory: '{DATASET_BASE_DIR}'\n")

    aggregated_df = aggregate_data_part(DATASET_BASE_DIR, PART_TO_AGGREGATE)

except Exception as e:
    print(f"An error occurred: {e}")


if not aggregated_df.empty:
    print("\nAggregation complete!")
    print(f"Shape of the final DataFrame: {aggregated_df.shape} (rows, columns)")

    print("\nDataFrame Info:")
    aggregated_df.info()

    # Jupyter only auto-renders the last top-level expression of a cell; a bare
    # `.head()` inside an `if` block shows nothing, so call display() explicitly
    # (display is available as a builtin in IPython/Jupyter kernels).
    print("\nFirst 5 rows of aggregated data:")
    display(aggregated_df.head())

    print("\nLast 5 rows of aggregated data:")
    display(aggregated_df.tail())
else:
    print("Aggregation did not produce any data.")

Starting data aggregation for part: 'Activities'
Detected data directory: 'C:\Users\jesse\OneDrive - University of Georgia\25-26\Fall 2025\Data Mining\SDHAR\SDHAR-Dataset-Project\data\SDHAR'

Found 62 day folders. Processing...

Aggregation complete!
Shape of the final DataFrame: (8854, 7) (rows, columns)

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8854 entries, 0 to 8853
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   time                       8854 non-null   object
 1   activity_predicted_user_1  3664 non-null   object
 2   source_day                 8854 non-null   object
 3   source_file                8854 non-null   object
 4   activity_predicted_user_2  2713 non-null   object
 5   activity_user_1            1545 non-null   object
 6   activity_user_2            932 non-null    object
dtypes: object(7)
memory usage: 484.3+ KB

First 5 rows of aggregated data: