In [None]:
# --- Step 1: Mount Google Drive ---
from google.colab import drive
drive.mount('/content/drive')

# --- Step 2: Import Libraries ---
import os
import pandas as pd
import numpy as np

# --- Step 3: Define File Paths ---
# Update these paths to match the location of your files in Google Drive.
# For example, if your files are in "MyDrive/YourFolder", adjust the paths accordingly.
sensor_file = '/content/drive/My Drive/Folder_data_ABC_challenge/Acc_Train-20250213T050129Z-001/Acc_Train/Acc_Train_2024-09-01 21_42_00.csv'
activities_file = '/content/drive/My Drive/TrainingDataPD25/TrainActivities.csv'

# --- Step 4: Read the Data and Prepare Time Columns ---
# Read the sensor data. The time column is expected to be called 'Started';
# files that only carry a 'Timestamp' column get it renamed so the rest of the
# notebook can rely on a single name.
sensor_df = pd.read_csv(sensor_file)
if 'Started' not in sensor_df.columns:
    # Reassign instead of renaming in place so re-running the cell is harmless.
    sensor_df = sensor_df.rename(columns={'Timestamp': 'Started'})

# Convert sensor file times (originally in UTC+1) to UTC+0 by subtracting 1 hour.
sensor_df['Started'] = pd.to_datetime(sensor_df['Started']) - pd.Timedelta(hours=1)

# Create a 'day' column from the sensor data (using UTC+0 date).
# 'D' is the documented day alias; the lowercase 'd' spelling is deprecated in
# recent pandas releases.
sensor_df['day'] = sensor_df['Started'].dt.floor('D')

# Read TrainActivities data.
activities_df = pd.read_csv(activities_file)
# Convert the Updated column (originally in UTC+9) to UTC+0 by subtracting 9 hours.
activities_df['Updated'] = pd.to_datetime(activities_df['Updated']) - pd.Timedelta(hours=9)

# Create a 'day' column from the TrainActivities data (same flooring as above so
# the two 'day' columns can be compared directly).
activities_df['day'] = activities_df['Updated'].dt.floor('D')

# --- Step 5: Check Unique Days for Linking ---
# Sanity check: a sensor day with no counterpart in TrainActivities cannot be labeled.
print("Unique days in sensor file (UTC+0):")
print(sensor_df['day'].unique())

print("\nUnique days in TrainActivities (UTC+0):")
print(activities_df['day'].unique())

# --- Step 6: Define a Function to Assign Activity Labels for a Day ---
def assign_activity_for_day(sensor_rows, activities_rows):
    """
    Label one day's sensor samples with TrainActivities events.

    The samples are ordered by 'Started' and cut into as many consecutive,
    (near-)equal-sized chunks as there are events; the k-th chunk receives the
    'Activity Type' of the k-th event when events are ordered by 'Updated'.

    Parameters
    ----------
    sensor_rows : DataFrame with a 'Started' column. Not modified.
    activities_rows : DataFrame with 'Updated' and 'Activity Type' columns.

    Returns
    -------
    A sorted copy of ``sensor_rows`` with an added 'Activity Type' column
    (all NaN when ``activities_rows`` is empty).
    """
    ordered = sensor_rows.sort_values('Started').copy()
    event_count = len(activities_rows)

    # Nothing to assign: mark every sample as unlabeled and stop early.
    if event_count == 0:
        ordered['Activity Type'] = np.nan
        return ordered

    # event_count + 1 integer cut points running from 0 to the number of samples.
    cut_points = np.linspace(0, len(ordered), event_count + 1, dtype=int)
    labels_in_time_order = (
        activities_rows.sort_values('Updated')['Activity Type'].tolist()
    )

    ordered['Activity Type'] = None
    label_col = ordered.columns.get_loc('Activity Type')

    # Walk consecutive (lower, upper) cut-point pairs together with their label.
    for lower, upper, label in zip(cut_points[:-1], cut_points[1:], labels_in_time_order):
        ordered.iloc[lower:upper, label_col] = label

    return ordered

# --- Step 7: Process the Data by Day ---
# Collect each day's labeled frame in a list and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all previous rows
# on every iteration.
labeled_days = []

for day, group in sensor_df.groupby('day'):
    # Get all TrainActivities events for this day.
    acts = activities_df[activities_df['day'] == day]
    print(f"Processing day: {day} - Found {len(acts)} matching TrainActivities")

    # Assign activity labels for this day.
    labeled_days.append(assign_activity_for_day(group.copy(), acts.copy()))

if labeled_days:
    # Sort the final DataFrame by the sensor's Started time.
    labeled_sensor_df = pd.concat(labeled_days, ignore_index=True).sort_values('Started')
else:
    # groupby produced no groups (e.g. an empty sensor file). Keep the sensor
    # columns and add an all-NaN label column instead of failing on a
    # column-less frame.
    labeled_sensor_df = sensor_df.assign(**{'Activity Type': np.nan})

# --- Step 8: Display the Result ---
print("\nLabeled Sensor Data (first few rows):")
print(labeled_sensor_df.head())

# In Colab, you can also simply display the DataFrame:
labeled_sensor_df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Unique days in sensor file (UTC+0):
<DatetimeArray>
['2024-09-01 00:00:00']
Length: 1, dtype: datetime64[ns]

Unique days in TrainActivities (UTC+0):
<DatetimeArray>
['2024-09-01 00:00:00', '2024-09-03 00:00:00', '2024-09-05 00:00:00',
 '2024-09-09 00:00:00', '2024-09-10 00:00:00']
Length: 5, dtype: datetime64[ns]
Processing day: 2024-09-01 00:00:00 - Found 8 matching TrainActivities

Labeled Sensor Data (first few rows):
   MotionType                 Started      X      Y      Z        day  \
0        2806 2024-09-01 20:42:00.024  1.831  7.473  5.411 2024-09-01   
1        2806 2024-09-01 20:42:00.089  2.447  7.348  5.222 2024-09-01   
2        2806 2024-09-01 20:42:00.141  3.098  7.501  5.265 2024-09-01   
3        2806 2024-09-01 20:42:00.171  3.424  7.580  4.862 2024-09-01   
4        2806 2024-09-01 20:42:00.198  2.854  7.621  4.817 2024-09-01   

      

Unnamed: 0,MotionType,Started,X,Y,Z,day,Activity Type
0,2806,2024-09-01 20:42:00.024,1.831,7.473,5.411,2024-09-01,1 (FACING camera) Sit and stand
1,2806,2024-09-01 20:42:00.089,2.447,7.348,5.222,2024-09-01,1 (FACING camera) Sit and stand
2,2806,2024-09-01 20:42:00.141,3.098,7.501,5.265,2024-09-01,1 (FACING camera) Sit and stand
3,2806,2024-09-01 20:42:00.171,3.424,7.58,4.862,2024-09-01,1 (FACING camera) Sit and stand
4,2806,2024-09-01 20:42:00.198,2.854,7.621,4.817,2024-09-01,1 (FACING camera) Sit and stand


In [None]:
# Summary statistics (count, mean, min, quartiles, max, std) of the labeled
# sensor frame, shown as the cell's output.
labeled_sensor_df.describe()

Unnamed: 0,MotionType,Started,X,Y,Z,day
count,858.0,858,858.0,858.0,858.0,858
mean,2806.0,2024-09-01 20:42:26.389299712,-0.228332,3.11604,7.639671,2024-08-31 23:59:59.999999744
min,2806.0,2024-09-01 20:42:00.024000,-15.272,-4.12,-8.164,2024-09-01 00:00:00
25%,2806.0,2024-09-01 20:42:12.247749888,-0.382,-0.327,5.041,2024-09-01 00:00:00
50%,2806.0,2024-09-01 20:42:24.545000192,-0.131,-0.272,9.84,2024-09-01 00:00:00
75%,2806.0,2024-09-01 20:42:39.640749824,-0.088,7.957,9.852,2024-09-01 00:00:00
max,2806.0,2024-09-01 20:42:59.985000,15.319,23.399,16.781,2024-09-01 00:00:00
std,0.0,,1.776744,4.394888,3.047447,


In [None]:
# Inspect which labels ended up in the "Activity Type" column.
activity_column = labeled_sensor_df['Activity Type']

# Distinct labels, in order of first appearance.
unique_values = activity_column.unique()
print("Unique Activity Type values:", unique_values, sep="\n")

# Number of sensor rows carrying each label.
print("\nActivity Type value counts:")
print(activity_column.value_counts())


Unique Activity Type values:
['1 (FACING camera) Sit and stand'
 '2 (FACING camera) both hands SHAKING (sitting position)'
 '3 Stand up from chair - both hands with SHAKING']

Activity Type value counts:
Activity Type
1 (FACING camera) Sit and stand                            430
2 (FACING camera) both hands SHAKING (sitting position)    321
3 Stand up from chair - both hands with SHAKING            107
Name: count, dtype: int64


In [None]:
# --- Step 1: Mount Google Drive ---
from google.colab import drive
drive.mount('/content/drive')

# --- Step 2: Import Libraries ---
import os
import glob
import pandas as pd
import numpy as np

# --- Step 3: Define File Paths ---
# Folder containing sensor files
sensor_folder = '/content/drive/My Drive/Folder_data_ABC_challenge/Acc_Train-20250213T050129Z-001/Acc_Train'
# TrainActivities file (common for all sensor files)
activities_file = '/content/drive/My Drive/TrainingDataPD25/TrainActivities.csv'
# Folder where output files will be saved (created if it does not exist yet)
output_folder = '/content/drive/MyDrive/TrainingDataPD25/Acc_Train_Labeled'
os.makedirs(output_folder, exist_ok=True)

# --- Step 4: Read and Process the TrainActivities File ---
activities_df = pd.read_csv(activities_file)
# Convert the 'Updated' column from UTC+9 to UTC+0 by subtracting 9 hours
activities_df['Updated'] = pd.to_datetime(activities_df['Updated']) - pd.Timedelta(hours=9)
# Create a new column 'day' by flooring to day (UTC+0).
# 'D' is the documented day alias; the lowercase 'd' spelling is deprecated in
# recent pandas releases.
activities_df['day'] = activities_df['Updated'].dt.floor('D')

# --- Step 5: Define a Function to Process a Single Sensor File ---
def _label_rows_evenly(sensor_rows, activities_rows):
    """Split one day's time-ordered sensor rows evenly among that day's events.

    Returns a sorted copy of ``sensor_rows`` with an 'Activity Type' column;
    the column is all NaN when ``activities_rows`` is empty.
    """
    sensor_rows = sensor_rows.sort_values('Started').copy()
    n = len(sensor_rows)
    m = len(activities_rows)
    if m == 0:
        sensor_rows['Activity Type'] = np.nan
        return sensor_rows

    # m + 1 integer cut points from 0 to n delimit m consecutive segments.
    boundaries = np.linspace(0, n, m + 1, dtype=int)
    # Labels in chronological order of the events' 'Updated' time.
    activities_list = activities_rows.sort_values('Updated')['Activity Type'].tolist()

    sensor_rows['Activity Type'] = None
    label_col = sensor_rows.columns.get_loc('Activity Type')
    for i in range(m):
        sensor_rows.iloc[boundaries[i]:boundaries[i + 1], label_col] = activities_list[i]
    return sensor_rows


def process_sensor_file(file_path, activities_df):
    """Read one sensor CSV and label its rows with TrainActivities events.

    Parameters
    ----------
    file_path : path of the sensor CSV. It must contain a 'Started' column, or a
        'Timestamp' column that is renamed to 'Started'.
    activities_df : DataFrame with 'Updated', 'day' and 'Activity Type' columns,
        where 'day' is 'Updated' floored to the day.

    Returns
    -------
    DataFrame sorted by 'Started' holding the sensor columns plus 'day' and
    'Activity Type'. A file without any rows yields an empty frame with the
    same columns instead of raising.
    """
    # Read the sensor file
    sensor_df = pd.read_csv(file_path)

    # Use the 'Started' column for sensor timestamps; fall back to 'Timestamp'.
    if 'Started' not in sensor_df.columns:
        sensor_df = sensor_df.rename(columns={'Timestamp': 'Started'})

    # Convert sensor timestamps from UTC+1 to UTC+0 by subtracting 1 hour
    sensor_df['Started'] = pd.to_datetime(sensor_df['Started']) - pd.Timedelta(hours=1)
    # Create a 'day' column (UTC+0) for matching against activities_df['day'].
    # 'D' is the documented day alias; lowercase 'd' is deprecated in recent pandas.
    sensor_df['day'] = sensor_df['Started'].dt.floor('D')

    # Label each day separately; gather the pieces and concatenate once, since
    # repeated pd.concat inside the loop re-copies all earlier rows every time.
    labeled_days = []
    for day, group in sensor_df.groupby('day'):
        acts = activities_df[activities_df['day'] == day]
        print(f"Processing day: {day} - Found {len(acts)} matching TrainActivities")
        labeled_days.append(_label_rows_evenly(group, acts))

    if not labeled_days:
        # No day groups (e.g. header-only file): return the sensor columns with
        # an all-NaN label column rather than sorting a column-less frame.
        return sensor_df.assign(**{'Activity Type': np.nan})

    return pd.concat(labeled_days, ignore_index=True).sort_values('Started')

# --- Step 6: Process All Sensor Files and Save Output ---
# Get list of all CSV files in the sensor folder. glob returns matches in
# arbitrary order, so sort them to make the processing order reproducible.
sensor_files = sorted(glob.glob(os.path.join(sensor_folder, '*.csv')))

for file_path in sensor_files:
    print("Processing file:", file_path)
    labeled_df = process_sensor_file(file_path, activities_df)

    # Create a new file name: original name + '_labeled_activity_type.csv'.
    # splitext strips only the trailing extension, whereas str.replace would
    # also rewrite any '.csv' that happens to appear inside the name.
    base_name = os.path.basename(file_path)
    stem, _ = os.path.splitext(base_name)
    new_file_name = stem + '_labeled_activity_type.csv'
    output_path = os.path.join(output_folder, new_file_name)

    # Save the labeled DataFrame to CSV
    labeled_df.to_csv(output_path, index=False)
    print("Saved labeled file to:", output_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Processing file: /content/drive/My Drive/Folder_data_ABC_challenge/Acc_Train-20250213T050129Z-001/Acc_Train/Acc_Train_2024-09-03 11_21_00.csv
Processing day: 2024-09-03 00:00:00 - Found 167 matching TrainActivities
Saved labeled file to: /content/drive/MyDrive/TrainingDataPD25/Acc_Train_Labeled/Acc_Train_2024-09-03 11_21_00_labeled_activity_type.csv
Processing file: /content/drive/My Drive/Folder_data_ABC_challenge/Acc_Train-20250213T050129Z-001/Acc_Train/Acc_Train_2024-09-01 21_42_00.csv
Processing day: 2024-09-01 00:00:00 - Found 8 matching TrainActivities
Saved labeled file to: /content/drive/MyDrive/TrainingDataPD25/Acc_Train_Labeled/Acc_Train_2024-09-01 21_42_00_labeled_activity_type.csv
Processing file: /content/drive/My Drive/Folder_data_ABC_challenge/Acc_Train-20250213T050129Z-001/Acc_Train/Acc_Train_2024-09-03 11_02_00.csv
Processing day: 2024-09-03 

In [None]:
# --- Step 1: Mount Google Drive ---
from google.colab import drive
drive.mount('/content/drive')

# --- Step 2: Import Libraries ---
import os
import glob
import pandas as pd
import numpy as np

# --- Step 3: Define File Paths ---
# Where the raw accelerometer CSVs live.
sensor_folder = '/content/drive/My Drive/Folder_data_ABC_challenge/Acc_Train-20250213T050129Z-001/Acc_Train'
# Single TrainActivities table shared by every sensor file.
activities_file = '/content/drive/My Drive/TrainingDataPD25/TrainActivities.csv'
# Destination for the labeled copies; created on first run.
output_folder = '/content/drive/MyDrive/TrainingDataPD25/Acc_Train_Labeled'
os.makedirs(output_folder, exist_ok=True)

# --- Step 4: Read and Process the TrainActivities File ---
activities_df = pd.read_csv(activities_file)

# Shift 'Updated' from UTC+9 to UTC+0 (subtract 9 hours).
nine_hours = pd.Timedelta(hours=9)
activities_df['Updated'] = pd.to_datetime(activities_df['Updated']) - nine_hours
# Put the events in chronological order.
activities_df = activities_df.sort_values('Updated')
# Slim lookup table for the merge: only the timestamp and its label.
activities_merge = activities_df.loc[:, ['Updated', 'Activity Type']].copy()

# --- Step 5: Define a Function to Process a Single Sensor File Using Merge_Asof ---
def process_sensor_file(file_path, activities_merge):
    """Read one sensor CSV and tag each row with the latest preceding activity.

    Parameters
    ----------
    file_path : path of the sensor CSV. It must contain a 'Started' column, or a
        'Timestamp' column that is renamed to 'Started'.
    activities_merge : DataFrame with a datetime 'Updated' column and an
        'Activity Type' column. It does not need to be pre-sorted and is not
        modified.

    Returns
    -------
    DataFrame ordered by 'Started' with the sensor columns plus the columns of
    ``activities_merge`` other than 'Updated'. Rows earlier than every
    'Updated' value get NaN labels.
    """
    # Read the sensor file.
    sensor_df = pd.read_csv(file_path)

    # If the sensor file doesn't already have a 'Started' column, rename 'Timestamp'
    if 'Started' not in sensor_df.columns:
        sensor_df = sensor_df.rename(columns={'Timestamp': 'Started'})

    # Convert sensor timestamps from UTC+1 to UTC+0 by subtracting 1 hour.
    sensor_df['Started'] = pd.to_datetime(sensor_df['Started']) - pd.Timedelta(hours=1)

    # merge_asof requires BOTH key columns to be sorted and raises otherwise.
    # Sort the sensor rows and, defensively, the activities as well, so the
    # function does not depend on what the caller did beforehand.
    sensor_sorted = sensor_df.sort_values('Started')
    activities_sorted = activities_merge.sort_values('Updated')

    # For each sensor row, pick the last activity whose 'Updated' is less than
    # or equal to the sensor's 'Started' time.
    labeled_df = pd.merge_asof(sensor_sorted, activities_sorted,
                               left_on='Started', right_on='Updated',
                               direction='backward')

    # The matched 'Updated' timestamp is only a join key; drop it from the result.
    return labeled_df.drop(columns=['Updated'])

# --- Step 6: Process All Sensor Files and Save the Output ---
# glob returns matches in arbitrary order; sort them so runs are reproducible.
sensor_files = sorted(glob.glob(os.path.join(sensor_folder, '*.csv')))

for file_path in sensor_files:
    print("Processing file:", file_path)
    labeled_df = process_sensor_file(file_path, activities_merge)

    # Create new file name: original sensor file name + '_labeled_activity_type.csv'.
    # splitext strips only the trailing extension, whereas str.replace would
    # also rewrite any '.csv' that happens to appear inside the name.
    base_name = os.path.basename(file_path)
    stem, _ = os.path.splitext(base_name)
    new_file_name = stem + '_labeled_activity_type.csv'
    output_path = os.path.join(output_folder, new_file_name)

    # Save the labeled sensor DataFrame to CSV.
    labeled_df.to_csv(output_path, index=False)
    print("Saved labeled file to:", output_path)


Mounted at /content/drive
Processing file: /content/drive/My Drive/Folder_data_ABC_challenge/Acc_Train-20250213T050129Z-001/Acc_Train/Acc_Train_2024-09-01 21_42_00.csv
Saved labeled file to: /content/drive/MyDrive/TrainingDataPD25/Acc_Train_Labeled/Acc_Train_2024-09-01 21_42_00_labeled_activity_type.csv
Processing file: /content/drive/My Drive/Folder_data_ABC_challenge/Acc_Train-20250213T050129Z-001/Acc_Train/Acc_Train_2024-09-03 11_02_00.csv
Saved labeled file to: /content/drive/MyDrive/TrainingDataPD25/Acc_Train_Labeled/Acc_Train_2024-09-03 11_02_00_labeled_activity_type.csv
Processing file: /content/drive/My Drive/Folder_data_ABC_challenge/Acc_Train-20250213T050129Z-001/Acc_Train/Acc_Train_2024-09-03 10_57_00.csv
Saved labeled file to: /content/drive/MyDrive/TrainingDataPD25/Acc_Train_Labeled/Acc_Train_2024-09-03 10_57_00_labeled_activity_type.csv
Processing file: /content/drive/My Drive/Folder_data_ABC_challenge/Acc_Train-20250213T050129Z-001/Acc_Train/Acc_Train_2024-09-03 10_56_00