# Feature Engineering

In [1]:
# Importing Necessary Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from glob import glob
import os

## Loading Data

In [2]:
# load the data
def load_data(data_dir):
    """Load every ``boxing_data_*.csv`` file found in ``data_dir``.

    File names are expected to look like
    ``boxing_data_<movement>_<date>_<time>.csv``; the third-from-last
    underscore-separated token of the base name is written to a
    ``movement_type`` column of the loaded frame (replacing any existing
    column of that name).

    Parameters
    ----------
    data_dir : str
        Directory that is searched (non-recursively) for matching CSV files.

    Returns
    -------
    dict
        Maps each file's base name to its DataFrame, in sorted path order.
    """
    dataframes = {}
    # glob() yields paths in arbitrary, OS-dependent order; sorting keeps the
    # dict (and everything printed from it) reproducible across runs.
    for file in sorted(glob(os.path.join(data_dir, "boxing_data_*.csv"))):
        file_name = os.path.basename(file)
        df = pd.read_csv(file)
        # Parse the label from the base name only, so underscores in the
        # directory part of the path can never leak into it.
        df['movement_type'] = file_name.split("_")[-3]
        dataframes[file_name] = df
    return dataframes

In [3]:
data_dir = '../data/processed'
dfs = load_data(data_dir)

# display basic information about the datasets before cleaning
for file_name, df in dfs.items():
    print(f"File: {file_name}")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() only appended a stray "None" line.
    df.info()
    print(df.head())
    print(df.describe())

    print("\n" + "="*50 + "\n")

File: boxing_data_cross_20241002_190731.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7768 entries, 0 to 7767
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   frame           7768 non-null   int64  
 1   timestamp       7768 non-null   float64
 2   movement_type   7768 non-null   object 
 3   left_shoulder   7768 non-null   object 
 4   right_shoulder  7768 non-null   object 
 5   left_elbow      7768 non-null   object 
 6   right_elbow     7768 non-null   object 
 7   left_wrist      7768 non-null   object 
 8   right_wrist     7768 non-null   object 
 9   left_hip        7768 non-null   object 
 10  right_hip       7768 non-null   object 
dtypes: float64(1), int64(1), object(9)
memory usage: 667.7+ KB
None
   frame  timestamp movement_type          left_shoulder  \
0      0      0.000         cross  0.7521,0.2374,-1.9390   
1      1      0.017         cross  0.7521,0.2362,-2.0204   
2      2      0

## Data Cleaning Functions

In [4]:
def drop_zero_coords(df, columns):
    """Return ``df`` without the rows whose ``columns`` values are all 0.

    A row survives as soon as at least one of the listed columns holds
    something other than 0; the original index labels are kept.
    """
    has_nonzero_value = (df[columns] != 0).any(axis=1)
    return df[has_nonzero_value]

In [6]:
def remove_outliers(df, columns, n_std=3):
    """Keep only rows lying within ``n_std`` standard deviations of the mean.

    The columns are processed one after another: each column's mean and
    standard deviation are computed on the rows that survived the previous
    columns, and rows outside the inclusive band
    ``[mean - n_std * std, mean + n_std * std]`` are discarded. Rows whose
    value in a processed column is NaN fail the comparison and are discarded
    as well.
    """
    kept = df
    for name in columns:
        values = kept[name]
        center = values.mean()
        half_width = n_std * values.std()
        inside_band = values.between(center - half_width, center + half_width)
        kept = kept[inside_band]
    return kept

In [7]:
def interpolate_missing_values(df):
    """Fill missing values in the numeric columns by linear interpolation.

    Only numeric columns are interpolated; every other column (labels, raw
    coordinate strings, ...) is passed through untouched. Calling
    ``DataFrame.interpolate`` on a frame that still contains object columns
    is deprecated or rejected by recent pandas versions, and fails outright
    when every column is object-typed.

    ``limit_direction='both'`` also fills gaps at the very start and end of a
    column. The input frame is not modified; a new frame with the same
    columns and index is returned.
    """
    numeric_cols = df.select_dtypes(include='number').columns
    result = df.copy()
    if len(numeric_cols) == 0:
        # Nothing that can be interpolated.
        return result

    result[numeric_cols] = df[numeric_cols].interpolate(
        method='linear', limit_direction='both'
    )
    return result

In [8]:
def smooth_trajectories(df, columns, window=5, min_periods=None):
    """Apply centered moving-average smoothing to the specified columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is left unmodified.
    columns : iterable
        Names of the columns to smooth.
    window : int, default 5
        Number of consecutive rows averaged for each output value.
    min_periods : int or None, default None
        Minimum number of non-NaN observations required inside a window.
        ``None`` keeps the pandas default (the full window), which leaves
        NaN at both ends of each smoothed column; pass ``1`` to average
        whatever part of the window is available instead.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the listed columns are smoothed.
    """
    # Work on a copy: assigning into ``df`` would silently change the
    # caller's frame (and triggers SettingWithCopyWarning on filtered slices).
    smoothed = df.copy()
    for col in columns:
        smoothed[col] = (
            df[col]
            .rolling(window=window, min_periods=min_periods, center=True)
            .mean()
        )
    return smoothed

## Feature Engineering Functions

In [9]:
def extract_coordinates(df, column):
    """Split a comma-separated coordinate column into float x/y/z columns.

    Returns a DataFrame with the columns ``<column>_x``, ``<column>_y`` and
    ``<column>_z`` that shares the index of ``df``. When the split does not
    yield exactly three components, a warning is printed and an empty
    DataFrame is returned instead.
    """
    pieces = df[column].str.split(',', expand=True)
    coords = pieces.astype(float)

    if coords.shape[1] == 3:
        coords.columns = [f'{column}_{axis}' for axis in ('x', 'y', 'z')]
        return coords

    print(f"Warning {column} doesn't contain valid coordinate data")
    return pd.DataFrame()

for file_name, df in dfs.items():
    for landmark in ['left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow', 'left_wrist', 'right_wrist', 'left_hip', 'right_hip']:
        # Skip landmarks that were already expanded so that re-running this
        # cell does not append a second set of identically named columns.
        if f'{landmark}_x' in df.columns:
            continue

        extracted_coords = extract_coordinates(df, landmark)

        if not extracted_coords.empty:
            df = pd.concat([df, extracted_coords], axis=1)

    dfs[file_name] = df
    print(f"Columns in {file_name}: {df.columns}")

Columns in boxing_data_cross_20241002_190731.csv: Index(['frame', 'timestamp', 'movement_type', 'left_shoulder',
       'right_shoulder', 'left_elbow', 'right_elbow', 'left_wrist',
       'right_wrist', 'left_hip', 'right_hip', 'left_shoulder_x',
       'left_shoulder_y', 'left_shoulder_z', 'right_shoulder_x',
       'right_shoulder_y', 'right_shoulder_z', 'left_elbow_x', 'left_elbow_y',
       'left_elbow_z', 'right_elbow_x', 'right_elbow_y', 'right_elbow_z',
       'left_wrist_x', 'left_wrist_y', 'left_wrist_z', 'right_wrist_x',
       'right_wrist_y', 'right_wrist_z', 'left_hip_x', 'left_hip_y',
       'left_hip_z', 'right_hip_x', 'right_hip_y', 'right_hip_z'],
      dtype='object')
Columns in boxing_data_jab_20241002_190259.csv: Index(['frame', 'timestamp', 'movement_type', 'left_shoulder',
       'right_shoulder', 'left_elbow', 'right_elbow', 'left_wrist',
       'right_wrist', 'left_hip', 'right_hip', 'left_shoulder_x',
       'left_shoulder_y', 'left_shoulder_z', 'right_shoulder

In [10]:
def calculate_joint_angle(df, joint, proximal, distal):
    """Calculate the angle at a joint given proximal and distal points.

    For every row, the angle is measured between the vector joint->proximal
    and the vector joint->distal, using the ``<name>_x``, ``<name>_y`` and
    ``<name>_z`` columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the x/y/z columns of the three landmarks.
    joint, proximal, distal : str
        Column-name prefixes of the vertex and the two end points.

    Returns
    -------
    pandas.Series
        Angle in degrees (0-180), indexed like ``df``. Rows where either
        vector has zero length, or a coordinate is NaN, yield NaN.
    """
    def _xyz(name):
        # Plain arrays on purpose: arithmetic between DataFrames aligns on
        # column labels, and the labels of different landmarks never match,
        # which would turn every result into NaN.
        return df[[f'{name}_x', f'{name}_y', f'{name}_z']].to_numpy(dtype=float)

    joint_xyz = _xyz(joint)
    v1 = _xyz(proximal) - joint_xyz
    v2 = _xyz(distal) - joint_xyz

    v1_mag = np.sqrt((v1**2).sum(axis=1))
    v2_mag = np.sqrt((v2**2).sum(axis=1))

    dot_product = (v1 * v2).sum(axis=1)
    # A zero-length vector gives 0/0; let that become NaN without a warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        cos_angle = dot_product / (v1_mag * v2_mag)
    # Clip guards against rounding pushing the cosine slightly outside [-1, 1].
    angle = np.arccos(np.clip(cos_angle, -1.0, 1.0))

    return pd.Series(np.degrees(angle), index=df.index)

## Main Processing Pipeline
- Step 1: Data Cleaning
- Step 2: Feature Engineering

In [11]:
def process_dataframe(df):
    """Clean one recording and add joint-angle features.

    Steps:
      1. Expand each landmark column into ``_x``/``_y``/``_z`` float columns
         (skipped for landmarks that are already expanded).
      2. Drop rows in which every landmark coordinate is 0.
      3. Remove outlier rows, interpolate missing values and smooth the
         numeric columns.
      4. Add elbow and shoulder angles (in degrees) for both sides.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the landmark columns, either as raw ``"x,y,z"``
        strings or with the expanded coordinate columns alongside.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with ``right_elbow_angle``, ``left_elbow_angle``,
        ``right_shoulder_angle`` and ``left_shoulder_angle`` added.
    """
    landmarks = ['left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow', 'left_wrist', 'right_wrist', 'left_hip', 'right_hip']

    # Data Cleaning
    coordinate_columns = [col for col in df.columns if col in landmarks]

    for col in coordinate_columns:
        # A frame that already carries the expanded columns must not be
        # expanded again: duplicate labels would make ``df[f'{col}_x']``
        # return two columns and break the steps below.
        if f'{col}_x' in df.columns:
            continue
        df = pd.concat([df, extract_coordinates(df, col)], axis=1)

    # NOTE(review): this also selects non-coordinate numeric columns such as
    # ``frame`` and ``timestamp``, so they are outlier-filtered and smoothed
    # too - confirm that this is intended.
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    zero_coordinate_columns = [f'{col}_{axis}' for axis in ('x', 'y', 'z') for col in coordinate_columns]

    df = drop_zero_coords(df, zero_coordinate_columns)

    # remove_outliers is a module-level function, not a DataFrame method.
    df = remove_outliers(df, numeric_columns)
    df = interpolate_missing_values(df)
    df = smooth_trajectories(df, numeric_columns)

    # Feature Engineering
    df['right_elbow_angle'] = calculate_joint_angle(df, 'right_elbow', 'right_shoulder', 'right_wrist')
    # proximal and distal are two separate arguments.
    df['left_elbow_angle'] = calculate_joint_angle(df, 'left_elbow', 'left_shoulder', 'left_wrist')
    df['right_shoulder_angle'] = calculate_joint_angle(df, 'right_shoulder', 'right_hip', 'right_elbow')
    df['left_shoulder_angle'] = calculate_joint_angle(df, 'left_shoulder', 'left_hip', 'left_elbow')

    return df