In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Read the train/test CSV files and discard the 'Unnamed: 0' column
# (presumably a row index written by an earlier to_csv call -- confirm).
train_path = "/Users/gabemiller/Desktop/MIT/Edge/project/df_train.csv"
test_path = "/Users/gabemiller/Desktop/MIT/Edge/project/df_test.csv"

train = pd.read_csv(train_path).drop(columns=['Unnamed: 0'])
test = pd.read_csv(test_path).drop(columns=['Unnamed: 0'])

# Sanity check: row counts of both splits, then the remaining column names
print(len(train), len(test))
print(train.columns)

2456025 408031
Index(['activityID', 'heart_rate', 'hand temperature (°C)',
       'hand acceleration X ±16g', 'hand acceleration Y ±16g',
       'hand acceleration Z ±16g', 'hand gyroscope X', 'hand gyroscope Y',
       'hand gyroscope Z', 'hand magnetometer X', 'hand magnetometer Y',
       'hand magnetometer Z', 'chest temperature (°C)',
       'chest acceleration X ±16g', 'chest acceleration Y ±16g',
       'chest acceleration Z ±16g', 'chest gyroscope X', 'chest gyroscope Y',
       'chest gyroscope Z', 'chest magnetometer X', 'chest magnetometer Y',
       'chest magnetometer Z', 'ankle temperature (°C)',
       'ankle acceleration X ±16g', 'ankle acceleration Y ±16g',
       'ankle acceleration Z ±16g', 'ankle gyroscope X', 'ankle gyroscope Y',
       'ankle gyroscope Z', 'ankle magnetometer X', 'ankle magnetometer Y',
       'ankle magnetometer Z', 'PeopleId'],
      dtype='object')


In [23]:
# Normalize per person
def normalize_per_person(df, cols_to_normalize, group_col):
    """
    Z-score the selected columns separately within each group of `group_col`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is NOT modified; the result is returned as a new frame.
    cols_to_normalize : list of str
        Names of the columns to rescale.
    group_col : str
        Column identifying the group (e.g. the person) each row belongs to.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` in which every column listed in `cols_to_normalize` has
        its group mean subtracted and is divided by its group standard
        deviation (pandas' default sample std). All other columns are
        carried over unchanged.

    Notes
    -----
    * A group standard deviation of exactly 0 (constant column within the
      group) is replaced by 1, so those values become 0 instead of dividing
      by zero.
    * Any NaN left after scaling (e.g. missing inputs, or a single-row group
      whose sample std is undefined) is replaced by 0.
    """
    cols = list(cols_to_normalize)

    # Work on a copy so the caller's frame is never altered as a side effect;
    # this keeps notebook cells safe to re-run in any order.
    out = df.copy()
    if not cols:
        return out

    # Build the grouping once and reuse it for both statistics.
    grouped = df.groupby(group_col)[cols]
    group_means = grouped.transform('mean')
    # std == 0 would divide by zero; use 1 so the column is only centred.
    group_stds = grouped.transform('std').replace(0, 1)

    normalized = ((df[cols] - group_means) / group_stds).fillna(0)

    out[cols] = normalized
    return out

# Add magnitude columns
def calculate_magnitude(df, x_col, y_col, z_col, new_col_name):
    """
    Store the Euclidean norm of the (x_col, y_col, z_col) vector of each row
    in `df[new_col_name]`.

    `df` is modified in place and nothing is returned.
    """
    x_squared = df[x_col].pow(2)
    y_squared = df[y_col].pow(2)
    z_squared = df[z_col].pow(2)
    sum_of_squares = x_squared + y_squared + z_squared
    df[new_col_name] = np.sqrt(sum_of_squares)

def add_magnitude_columns(df):
    """
    Append one acceleration-magnitude column per sensor position to `df`.

    For each of 'hand', 'chest' and 'ankle', the X/Y/Z '±16g' acceleration
    columns are combined by `calculate_magnitude` into the column
    '<position>_acceleration_magnitude'. `df` is modified in place.
    """
    for position in ('hand', 'chest', 'ankle'):
        calculate_magnitude(
            df,
            f'{position} acceleration X ±16g',
            f'{position} acceleration Y ±16g',
            f'{position} acceleration Z ±16g',
            f'{position}_acceleration_magnitude',
        )

# Rolling window features
def rolling_window_features(df, window_size, step_size, cols_to_aggregate, group_col):
    """
    Summarise `cols_to_aggregate` over sliding windows (mean and std only).

    Windows never mix rows from different groups (`group_col`) or from
    different consecutive runs of the same 'activityID' within a group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `group_col`, 'activityID' and every column in
        `cols_to_aggregate`. Rows are processed in their existing order.
    window_size : int
        Number of consecutive rows per window (>= 1).
    step_size : int
        Number of rows between the ends of successive windows (>= 1).
    cols_to_aggregate : list of str
        Numeric columns to summarise.
    group_col : str
        Column identifying the group (e.g. 'PeopleId').

    Returns
    -------
    pandas.DataFrame
        One row per window with columns `group_col`, 'activityID', and
        '<col>_mean' / '<col>_std' for each aggregated column. If no activity
        run is at least `window_size` rows long, an empty DataFrame with
        those same columns is returned.

    Raises
    ------
    ValueError
        If `window_size` or `step_size` is smaller than 1.
    """
    if window_size < 1 or step_size < 1:
        raise ValueError("window_size and step_size must be positive integers")

    cols_to_aggregate = list(cols_to_aggregate)
    results = []

    for group_id, group in df.groupby(group_col):
        # Positional 0..n-1 index so the run ids below line up with the rows
        group = group.reset_index(drop=True)

        # Run id: increases by one every time activityID differs from the previous row.
        # Kept as a separate Series so it can never clash with a real column.
        activity_run = (group['activityID'] != group['activityID'].shift()).cumsum()

        for _, activity_group in group.groupby(activity_run):
            # Runs shorter than one full window yield no features
            if len(activity_group) < window_size:
                continue

            rolling_stats = activity_group[cols_to_aggregate].rolling(window=window_size, min_periods=1)

            # Keep only full windows: the first one ends at row window_size-1,
            # the following ones every step_size rows after that.
            rolling_means = rolling_stats.mean().iloc[window_size-1::step_size].reset_index(drop=True)
            rolling_stds = rolling_stats.std().iloc[window_size-1::step_size].reset_index(drop=True)

            # Label of each window = activityID of the window's last row
            metadata = activity_group.iloc[window_size-1::step_size].reset_index(drop=True)

            # Collect all columns first and build the frame in one go
            # (avoids repeated single-column insertion).
            features = {
                group_col: group_id,  # PeopleId
                'activityID': metadata['activityID'],  # Activity
            }
            for col in cols_to_aggregate:
                features[f'{col}_mean'] = rolling_means[col]
                features[f'{col}_std'] = rolling_stds[col]

            results.append(pd.DataFrame(features))

    if not results:
        # pd.concat([]) raises "No objects to concatenate"; return an empty
        # frame with the expected schema instead.
        feature_names = [f'{col}_{stat}' for col in cols_to_aggregate for stat in ('mean', 'std')]
        return pd.DataFrame(columns=[group_col, 'activityID', *feature_names])

    return pd.concat(results, ignore_index=True)

# Create rolling feature dataset
def create_rolling_feature_dataset(df, window_size, step_size):
    """
    Build the windowed feature table for one dataset.

    The acceleration-magnitude columns are first added to `df` in place
    (via `add_magnitude_columns`). Every numeric column other than
    'activityID' and 'PeopleId' is then summarised by
    `rolling_window_features`, windowing separately per 'PeopleId'.
    """
    add_magnitude_columns(df)

    # Numeric columns only, minus the label and the person identifier
    excluded = ['activityID', 'PeopleId']
    numeric_cols = [name for name in df.columns if np.issubdtype(df[name].dtype, np.number)]
    feature_cols = [name for name in numeric_cols if name not in excluded]

    return rolling_window_features(df, window_size, step_size, feature_cols, 'PeopleId')

# Columns to z-score: every numeric column of train except the label and the person id
cols_to_normalize = [
    col for col in train.columns
    if np.issubdtype(train[col].dtype, np.number) and col not in ['activityID', 'PeopleId']
]

# Scale each split independently, person by person
train = normalize_per_person(train, cols_to_normalize, group_col='PeopleId')
test = normalize_per_person(test, cols_to_normalize, group_col='PeopleId')

# Window both splits: 10 rows per window, advancing 5 rows at a time (50% overlap).
# NOTE(review): the original comment called this a 0.1s window, which presumes
# 100 Hz sampling -- confirm against the dataset.
train_rolling = create_rolling_feature_dataset(train, window_size=10, step_size=5)
test_rolling = create_rolling_feature_dataset(test, window_size=10, step_size=5)

# Quick look at the windowed tables: first rows, column names, row counts
print(train_rolling.head())
print(test_rolling.head())
print(train_rolling.columns)
print(len(train_rolling), len(test_rolling))


   PeopleId            activityID  heart_rate_mean  heart_rate_std  \
0         1  transient activities        -0.839332             0.0   
1         1  transient activities        -0.839332             0.0   
2         1  transient activities        -0.839332             0.0   
3         1  transient activities        -0.839332             0.0   
4         1  transient activities        -0.839332             0.0   

   hand temperature (°C)_mean  hand temperature (°C)_std  \
0                   -1.963838                        0.0   
1                   -1.963838                        0.0   
2                   -1.963838                        0.0   
3                   -1.963838                        0.0   
4                   -1.963838                        0.0   

   hand acceleration X ±16g_mean  hand acceleration X ±16g_std  \
0                       0.885088                      0.015354   
1                       0.898852                      0.016356   
2                   

In [25]:
# Write the windowed feature tables to 'train.csv' and 'test.csv' in the
# current working directory, leaving out the DataFrame index.
train_rolling.to_csv(path_or_buf='train.csv', index=False)
test_rolling.to_csv(path_or_buf='test.csv', index=False)


In [5]:
# Targets: the activity label of every window
y_train, y_test = train_rolling['activityID'], test_rolling['activityID']

# Feature arrays: everything except the label and the person identifier
X_train = train_rolling.drop(['activityID', 'PeopleId'], axis='columns').to_numpy()
X_test = test_rolling.drop(['activityID', 'PeopleId'], axis='columns').to_numpy()

print(train_rolling.head())

   PeopleId            activityID  Unnamed: 0_mean  heart_rate_mean  \
0         1  transient activities        -1.732002        -0.839332   
1         1  transient activities        -1.731956        -0.839332   
2         1  transient activities        -1.731910        -0.839332   
3         1  transient activities        -1.731864        -0.839332   
4         1  transient activities        -1.731818        -0.839332   

   hand temperature (°C)_mean  hand acceleration X ±16g_mean  \
0                   -1.963838                       0.885088   
1                   -1.963838                       0.898852   
2                   -1.963838                       0.904960   
3                   -1.963838                       0.907269   
4                   -1.963838                       0.904964   

   hand acceleration Y ±16g_mean  hand acceleration Z ±16g_mean  \
0                       0.486930                       0.044555   
1                       0.507069                      