In [2]:
!pip install numpy>=1.21.0 pandas>=1.3.0 scikit-learn>=1.0.0 lightgbm>=3.3.0 xgboost>=1.5.0 matplotlib>=3.4.0 seaborn>=0.11.0 tqdm>=4.62.0 joblib>=1.1.0 ipykernel>=6.0.0

In [49]:

from sklearn.base import clone
from sklearn.model_selection import GridSearchCV, ParameterGrid, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb


import warnings
warnings.filterwarnings('ignore')


np.random.seed(42)

In [4]:
df_train = pd.read_csv('df_train.csv')
df_test = pd.read_csv('df_test.csv')

In [66]:
class RobustActivityClassifier:
    def __init__(self, df_train, df_test, random_state=42):
        self.random_state = random_state
        self.models = {}

        self.locations = ['hand', 'chest', 'ankle']
        self.measurements = {
            'temperature': 'temperature (°C)',
            'acceleration': ['acceleration X ±16g', 'acceleration Y ±16g', 'acceleration Z ±16g'],
            'gyroscope': ['gyroscope X', 'gyroscope Y', 'gyroscope Z'],
            'magnetometer': ['magnetometer X', 'magnetometer Y', 'magnetometer Z']
        }

    def normalize_by_person(self, df):
        """
        Normalize features by person-specific baseline metrics, handling IMU data
        """
        normalized_df = df.copy()

        # Group features by sensor type and location
        for location in self.locations:
            # Temperature normalization
            temp_col = f'{location} {self.measurements["temperature"]}'

            # Calculate person-specific temperature baselines
            temp_baselines = df.groupby('PeopleId')[temp_col].mean()

            # Normalize temperatures
            for person_id in df['PeopleId'].unique():
                mask = df['PeopleId'] == person_id
                baseline = temp_baselines[person_id]
                if baseline != 0:
                    normalized_df.loc[mask, temp_col] = df.loc[mask, temp_col] / baseline

            # Normalize IMU sensors (using Z-score normalization per person)
            for measure_type in ['acceleration', 'gyroscope', 'magnetometer']:
                for axis in ['X', 'Y', 'Z']:
                    col = f'{location} {measure_type} {axis}'
                    if measure_type == 'acceleration':
                        col += ' ±16g'

                    # Normalize per person
                    for person_id in df['PeopleId'].unique():
                        mask = df['PeopleId'] == person_id
                        person_data = df.loc[mask, col]

                        # Use robust scaling to handle outliers
                        median = person_data.median()
                        iqr = person_data.quantile(0.75) - person_data.quantile(0.25)
                        if iqr != 0:
                            normalized_df.loc[mask, col] = (person_data - median) / iqr

        return normalized_df

    def engineer_features(self, df):
        """
        Engineer features from IMU sensor data
        """
        df_features = df.copy()

        for location in self.locations:
            #  magnitude acceleration
            accel_cols = [f'{location} acceleration {axis} ±16g' for axis in ['X', 'Y', 'Z']]
            df_features[f'{location}_acceleration_magnitude'] = np.sqrt(
                df[accel_cols].pow(2).sum(axis=1)
            )

            # magnitude of angular velocity
            gyro_cols = [f'{location} gyroscope {axis}' for axis in ['X', 'Y', 'Z']]
            df_features[f'{location}_angular_velocity_magnitude'] = np.sqrt(
                df[gyro_cols].pow(2).sum(axis=1)
            )

            # magintude of magnetic field
            mag_cols = [f'{location} magnetometer {axis}' for axis in ['X', 'Y', 'Z']]
            df_features[f'{location}_magnetic_magnitude'] = np.sqrt(
                df[mag_cols].pow(2).sum(axis=1)
            )

            # rolling statistics for each magnitude
            windows = [5, 10, 20]  # window sizes
            for window in windows:
                # Group by person to avoid window spillover
                for person_id in df['PeopleId'].unique():
                    mask = df['PeopleId'] == person_id

                    # Calculate rolling statistics for each magnitude
                    for feature in ['acceleration', 'angular_velocity', 'magnetic']:
                        col = f'{location}_{feature}_magnitude'

                        # Rolling mean
                        df_features.loc[mask, f'{col}_roll_mean_{window}'] = (
                            df_features.loc[mask, col]
                            .rolling(window=window, min_periods=1)
                            .mean()
                        )

                        # Rolling standard deviation
                        df_features.loc[mask, f'{col}_roll_std_{window}'] = (
                            df_features.loc[mask, col]
                            .rolling(window=window, min_periods=1)
                            .std()
                        )

            # Cross-sensor features
            df_features[f'{location}_accel_gyro_ratio'] = (
                df_features[f'{location}_acceleration_magnitude'] /
                (df_features[f'{location}_angular_velocity_magnitude'] + 1e-6)
            )

        # Inter-location features
        for loc1, loc2 in [('hand', 'chest'), ('hand', 'ankle'), ('chest', 'ankle')]:
            df_features[f'{loc1}_{loc2}_accel_correlation'] = (
                df_features[f'{loc1}_acceleration_magnitude'] *
                df_features[f'{loc2}_acceleration_magnitude']
            )

        return df_features

    def prepare_data(self, df_train, df_test):
        """
        Prepare and preprocess the data
        """
        print("Normalizing and engineering features...")

        # normalize by person
        train_normalized = self.normalize_by_person(df_train)
        test_normalized = self.normalize_by_person(df_test)


        self.processed_train = self.engineer_features(train_normalized)
        self.processed_test = self.engineer_features(test_normalized)

        # target labels encoding
        self.le = LabelEncoder()
        self.processed_train['activityID'] = self.le.fit_transform(
            self.processed_train['activityID']
        )
        self.processed_test['activityID'] = self.le.transform(
            self.processed_test['activityID']
        )


        exclude_cols = ['activityID', 'PeopleId', 'Unnamed: 0']
        feature_cols = [col for col in self.processed_train.columns
                       if col not in exclude_cols]

        self.X_train = self.processed_train[feature_cols]
        self.y_train = self.processed_train['activityID']
        self.X_test = self.processed_test[feature_cols]
        self.y_test = self.processed_test['activityID']


        self.scaler = StandardScaler()
        self.X_train = self.scaler.fit_transform(self.X_train)
        self.X_test = self.scaler.transform(self.X_test)

        print(f"Final feature shape: {self.X_train.shape}")
        print(f"Number of classes: {len(np.unique(self.y_train))}")

    def train_models(self):
        """
        Train LightGBM with progress tracking
        """
        print("\nTraining LightGBM model...")

        # Initialize base model with good defaults
        model = LGBMClassifier(
            random_state=self.random_state,
            n_jobs=-1,
            verbose=-1,
            n_estimators=100,
            max_depth=7,
            learning_rate=0.1,
            reg_alpha=0.1,  # L1 regularization
            reg_lambda=0.1,  # L2 regularization
            min_child_samples=20,  # Minimum number of samples in a leaf
            min_split_gain=0.1,  # Minimum loss reduction for split
            feature_fraction=0.8,  # Feature subsampling
            bagging_fraction=0.8,  # Row subsampling
            bagging_freq=5,  # Bagging frequency
        )

        param_grid = {
            'max_depth': [7],  # single value for time
            'feature_fraction': [0.8]
        }

        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=self.random_state)

        # Calculate total iterations for progress tracking
        n_candidates = len(list(ParameterGrid(param_grid)))
        n_splits = cv.get_n_splits()
        total_fits = n_candidates * n_splits

        print(f"\nRunning {total_fits} fits ({n_candidates} candidates × {n_splits} splits)")

        # Run grid search with progress bar
        print("\nStarting grid search with cross-validation...")
        best_score = -np.inf
        best_params = None
        best_model = None

        with tqdm(total=total_fits, desc="Cross-validation progress") as pbar:
            for parameters in ParameterGrid(param_grid):
                scores = []
                for train, test in cv.split(self.X_train, self.y_train):
                    model_clone = clone(model)
                    model_clone.set_params(**parameters)

                    # Add early stopping
                    model_clone.fit(
                        self.X_train[train],
                        self.y_train[train],
                        eval_set=[(self.X_train[test], self.y_train[test])],
                        callbacks=[
                            lgb.early_stopping(stopping_rounds=20),
                            lgb.log_evaluation(period=0)
                        ]
                    )


                    score = model_clone.score(self.X_train[test], self.y_train[test])
                    scores.append(score)

                    pbar.update(1)
                    pbar.set_postfix({'score': f'{score:.4f}'})

                avg_score = np.mean(scores)
                if avg_score > best_score:
                    best_score = avg_score
                    best_params = parameters
                    best_model = model_clone

        print(f"\nBest CV score: {best_score:.4f}")
        print(f"Best parameters: {best_params}")


        print("\nTraining final model with best parameters...")
        final_model = LGBMClassifier(
            **best_params,
            random_state=self.random_state,
            n_jobs=-1,
            verbose=-1
        )

        def lgb_progress_callback(env):
            """Callback for progress bar"""
            train_pbar.update(1)
            if env.evaluation_result_list:
                train_pbar.set_postfix({'loss': f"{env.evaluation_result_list[0][2]:.4f}"})

        with tqdm(total=100, desc="Final training progress") as train_pbar:
            final_model.fit(
                self.X_train,
                self.y_train,
                eval_set=[(self.X_test, self.y_test)],
                callbacks=[
                    lgb_progress_callback,
                    lgb.early_stopping(stopping_rounds=20),
                    lgb.log_evaluation(period=0)
                ]
            )
        self.model = final_model

        # Evaluate
        y_pred = self.model.predict(self.X_test)
        train_pred = self.model.predict(self.X_train)

        train_accuracy = accuracy_score(self.y_train, train_pred)
        test_accuracy = accuracy_score(self.y_test, y_pred)
        report = classification_report(self.y_test, y_pred,
                                    target_names=self.le.classes_)

        print(f"\nResults:")
        print(f"Training accuracy: {train_accuracy:.4f}")
        print(f"Test accuracy: {test_accuracy:.4f}")
        print("\nClassification Report:")
        print(report)

        return self.model

    def plot_results(self):
        """
        Comprehensive visualization of results
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
        self.feature_names = [col for col in self.processed_train.columns
                            if col not in exclude_cols]
        # Confusion Matrix
        y_pred = self.model.predict(self.X_test)
        cm = confusion_matrix(self.y_test, y_pred)

        # Get activity names
        activities = self.le.classes_

        # Plot confusion matrix
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1,
                    xticklabels=activities,
                    yticklabels=activities)
        ax1.set_title('Confusion Matrix')
        ax1.set_xlabel('Predicted')
        ax1.set_ylabel('Actual')
        plt.setp(ax1.get_xticklabels(), rotation=45, ha='right')

        # Feature Importance
        importance = pd.DataFrame({
            'feature': self.feature_names,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=True)

        # Show top 15 features
        top_features = importance.tail(15)
        ax2.barh(range(len(top_features)), top_features['importance'])
        ax2.set_yticks(range(len(top_features)))
        ax2.set_yticklabels(top_features['feature'])
        ax2.set_title('Top 15 Feature Importance')

        plt.tight_layout()
        plt.show()

        return importance

    def predict(self, X):
        """
        Make predictions on new data
        """
        return self.model.predict(X)

    def predict_proba(self, X):
        """
        Get probability predictions
        """
        return self.model.predict_proba(X)

In [67]:
classifier = RobustActivityClassifier(df_train, df_test, random_state=42)


In [68]:
classifier.prepare_data(df_train, df_test)

Normalizing and engineering features...
Final feature shape: (2456025, 100)
Number of classes: 13


In [69]:
classifier.train_models()


Training LightGBM model...

Running 3 fits (1 candidates × 3 splits)

Starting grid search with cross-validation...


Cross-validation progress:   0%|          | 0/3 [00:00<?, ?it/s]

Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's multi_logloss: 0.0151698


Cross-validation progress:  33%|███▎      | 1/3 [03:08<06:16, 188.07s/it, score=0.9982]

Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's multi_logloss: 0.0150485


Cross-validation progress:  67%|██████▋   | 2/3 [06:11<03:05, 185.60s/it, score=0.9983]

Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's multi_logloss: 0.0154166


Cross-validation progress: 100%|██████████| 3/3 [09:24<00:00, 188.32s/it, score=0.9983]



Best CV score: 0.9983
Best parameters: {'feature_fraction': 0.8, 'max_depth': 7}

Training final model with best parameters...


Final training progress:   1%|          | 1/100 [00:07<12:36,  7.64s/it, loss=1.8370]

Training until validation scores don't improve for 20 rounds


Final training progress:  41%|████      | 41/100 [01:41<02:26,  2.48s/it, loss=1.5167]


Early stopping, best iteration is:
[21]	valid_0's multi_logloss: 1.2686

Results:
Training accuracy: 0.9638
Test accuracy: 0.6324

Classification Report:
                      precision    recall  f1-score   support

      Nordic walking       0.00      0.00      0.00     28888
    ascending stairs       0.49      0.30      0.38     11683
             cycling       0.99      0.91      0.95     25475
   descending stairs       0.33      0.49      0.40      9655
             ironing       0.00      0.00      0.00     32990
               lying       0.97      0.95      0.96     24165
        rope jumping       0.43      0.18      0.25      8806
             running       0.70      0.83      0.76     16532
             sitting       0.99      0.46      0.62     22923
            standing       0.00      0.00      0.00     25160
transient activities       0.53      0.92      0.68    145929
     vacuum cleaning       0.70      0.66      0.68     24292
             walking       0.98      0.

In [71]:
import joblib
import os
import datetime
from datetime import datetime

def save_model(classifier, directory='models', prefix='activity_classifier'):
    """
    Save a trained classifier model and its associated data with robust attribute checking
    """
    # Create directory if it doesn't exist
    os.makedirs(directory, exist_ok=True)

    # Create timestamp for unique filename
    model = 'class_l1_l2'
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f"{prefix}_{timestamp}_{model}.joblib"
    filepath = os.path.join(directory, filename)

    # Initialize model data dictionary with required model
    model_data = {'model': classifier.model}

    # Optional attributes to save if they exist
    optional_attributes = {
        'scaler': 'scaler',
        'label_encoder': 'le',
        'feature_names': 'feature_names',
        'X_train': 'X_train'
    }

    # Add existing attributes to model_data
    for key, attr in optional_attributes.items():
        if hasattr(classifier, attr):
            model_data[key] = getattr(classifier, attr)

    # Print what's being saved
    print(f"\nSaving the following components:")
    for key in model_data.keys():
        print(f"- {key}")

    # Save the model data
    joblib.dump(model_data, filepath)
    print(f"\nModel saved to {filepath}")
    return filepath

In [72]:
model_path = save_model(classifier)


Saving the following components:
- model
- scaler
- label_encoder
- X_train

Model saved to models/activity_classifier_20241207_182735_class_l1_l2.joblib
