# Human Activity Recognition — Simple Notebook

Authors: girdauskaite + partner

A compact, easy-to-run notebook that assumes you already have separate train and test CSV files. It:
- Loads train/test CSVs
- Aligns features and drops constant columns
- Fits a StandardScaler on train and applies to test
- Trains and evaluates: RandomForest, XGBoost, CatBoost (if available), and k-NN
- Saves final models (optional)

Run cells in order. Edit file paths in the config cell.

In [None]:
# Uncomment to install missing packages in a clean environment
# !pip install xgboost catboost scikit-learn seaborn matplotlib joblib


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
import xgboost as xgb
try:
    from catboost import CatBoostClassifier
except Exception:
    CatBoostClassifier = None

import joblib
print('imports ready')

In [None]:
# CONFIG: adjust the input/output locations below to match your environment
RANDOM_STATE = 42
MODELS_DIR = 'models'
TRAIN_CSV = 'activity_train.csv'
TEST_CSV = 'activity_test.csv'

# Create the output folder up front so later cells can write fitted models into it.
os.makedirs(MODELS_DIR, exist_ok=True)

# Integer activity code -> readable activity name; codes are consecutive, starting at 1.
label_map = dict(enumerate(
    (
        'WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS',
        'SITTING', 'STANDING', 'LAYING',
        'STAND_TO_SIT', 'SIT_TO_STAND', 'SIT_TO_LIE',
        'LIE_TO_SIT', 'STAND_TO_LIE', 'LIE_TO_STAND',
    ),
    start=1,
))
print('config OK')

In [None]:
# Read both splits from disk, then report their shapes and whether each carries a label column
df_train, df_test = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

for caption, frame in (('train shape:', df_train), ('test  shape:', df_test)):
    print(caption, frame.shape)

for caption, frame in (('train has activity?', df_train), ('test  has activity?', df_test)):
    print(caption, 'activity' in frame.columns)


In [None]:
# Split each frame into features (X) and target (y), then keep only shared feature columns
train_has_target = 'activity' in df_train.columns
if not train_has_target:
    raise ValueError('Train CSV must contain column `activity`')

X_train = df_train.drop(columns=['activity'])
y_train = df_train['activity'].copy()

# Test labels are optional: without them y_test is None and later cells skip evaluation.
test_has_target = 'activity' in df_test.columns
y_test = df_test['activity'].copy() if test_has_target else None
X_test = df_test.drop(columns=['activity']) if test_has_target else df_test.copy()

# Shared columns, in the order they appear in the training frame
test_feature_names = X_test.columns
common_cols = [col for col in X_train.columns if col in test_feature_names]
X_train, X_test = X_train[common_cols].copy(), X_test[common_cols].copy()
print('aligned feature count:', X_train.shape[1])

In [None]:
# Remove features with at most one distinct value in train; the same columns are removed from test
distinct_counts = X_train.nunique()
const_cols = distinct_counts[distinct_counts <= 1].index.tolist()
if const_cols:
    print('dropping', len(const_cols), 'constant columns')
    X_train = X_train.drop(columns=const_cols)
    X_test = X_test.drop(columns=const_cols)
print('features after drop:', X_train.shape[1])

In [None]:
# Standardize features using statistics learned from train only; the scaled copies feed k-NN
scaler = StandardScaler().fit(X_train)

train_values = scaler.transform(X_train)
test_values = scaler.transform(X_test)

# Wrap the raw arrays back into DataFrames so column names and row index are preserved.
X_train_scaled = pd.DataFrame(train_values, index=X_train.index, columns=X_train.columns)
X_test_scaled = pd.DataFrame(test_values, index=X_test.index, columns=X_test.columns)

scaler_path = os.path.join(MODELS_DIR, 'scaler.joblib')
joblib.dump(scaler, scaler_path)
print('scaler fitted and saved')

In [None]:
def evaluate(model, X, y, name):
    """Report classification metrics for a fitted model and plot its confusion matrix.

    Prints accuracy, macro-averaged F1 and the per-class classification report,
    then shows the confusion matrix as a heatmap whose axes are labelled with
    the actual class values rather than positional indices.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)``.
    X : features to predict on.
    y : true labels for the rows of ``X``.
    name : str used in the printed header and in the plot title.

    Returns
    -------
    None
    """
    y_pred = model.predict(X)
    print('---', name)
    print('Accuracy:', accuracy_score(y, y_pred))
    print('Macro F1:', f1_score(y, y_pred, average='macro'))
    print(classification_report(y, y_pred, zero_division=0))

    # Sorted union of true and predicted classes. Passing it explicitly keeps the
    # matrix rows/columns and the tick labels in the same order; without tick
    # labels the heatmap would show indices 0..n-1 instead of the class values.
    # np.ravel flattens column-shaped predictions such as (n, 1) arrays.
    labels = np.unique(np.concatenate([np.ravel(y), np.ravel(y_pred)]))
    cm = confusion_matrix(y, y_pred, labels=labels)

    plt.figure(figsize=(6,5))
    sns.heatmap(cm, cmap='Blues', annot=False, xticklabels=labels, yticklabels=labels)
    # confusion_matrix puts true classes on rows and predicted classes on columns.
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title(f'Confusion matrix: {name}')
    plt.show()


In [None]:
# 1) RandomForest (quick): class-weighted forest trained on the unscaled features
rf = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    random_state=RANDOM_STATE,
    n_jobs=-1,
).fit(X_train, y_train)

# Evaluation needs test labels; skip it when the test CSV had none.
if y_test is not None:
    evaluate(rf, X_test, y_test, 'RandomForest')

rf_path = os.path.join(MODELS_DIR, 'rf.joblib')
joblib.dump(rf, rf_path)
print('RF saved')

In [None]:
# 2) XGBoost (quick)
# XGBClassifier only accepts class labels of the form 0..n_classes-1, while the
# activity codes in this dataset start at 1. Fitting on the raw labels raises a
# ValueError, so labels are encoded before fitting and decoded after predicting.
from sklearn.preprocessing import LabelEncoder


class _LabelDecodingModel:
    """Adapter whose ``predict`` returns original labels instead of encoded ones."""

    def __init__(self, model, encoder):
        self.model = model
        self.encoder = encoder

    def predict(self, X):
        return self.encoder.inverse_transform(self.model.predict(X))


xgb_label_encoder = LabelEncoder().fit(y_train)
y_train_xgb = xgb_label_encoder.transform(y_train)

# Count classes from the encoder so the value always matches the encoded label range.
n_classes = len(xgb_label_encoder.classes_)
xgb_clf = xgb.XGBClassifier(objective='multi:softprob', num_class=n_classes, use_label_encoder=False,
                            eval_metric='mlogloss', random_state=RANDOM_STATE, n_jobs=-1)
xgb_clf.fit(X_train, y_train_xgb)

if y_test is not None:
    # Evaluate against the original label values so the report is comparable with the other models.
    evaluate(_LabelDecodingModel(xgb_clf, xgb_label_encoder), X_test, y_test, 'XGBoost')

xgb_clf.save_model(os.path.join(MODELS_DIR, 'xgb.json'))
# The saved booster predicts encoded labels; keep the encoder next to it for decoding.
joblib.dump(xgb_label_encoder, os.path.join(MODELS_DIR, 'xgb_label_encoder.joblib'))
print('XGB saved')

In [None]:
# 3) CatBoost (if available): CatBoostClassifier is None when the optional import failed
if CatBoostClassifier is not None:
    cb = CatBoostClassifier(
        iterations=300,
        learning_rate=0.1,
        random_seed=RANDOM_STATE,
        verbose=0,
    )
    cb.fit(X_train, y_train)

    # Evaluation needs test labels; skip it when the test CSV had none.
    if y_test is not None:
        evaluate(cb, X_test, y_test, 'CatBoost')

    cb_path = os.path.join(MODELS_DIR, 'catboost.cbm')
    cb.save_model(cb_path)
    print('CatBoost saved')
else:
    print('CatBoost not installed — skip')

In [None]:
# 4) k-NN (using scaled data): distance-weighted vote over the 5 nearest neighbours
knn = KNeighborsClassifier(n_neighbors=5, weights='distance').fit(X_train_scaled, y_train)

# Evaluation needs test labels; skip it when the test CSV had none.
if y_test is not None:
    evaluate(knn, X_test_scaled, y_test, 'k-NN (scaled)')

knn_path = os.path.join(MODELS_DIR, 'knn.joblib')
joblib.dump(knn, knn_path)
print('k-NN saved')

## Notes
- All preprocessing (dropping constant columns, scaler) was fit on the training set only and then applied to the test set.
- Use cross-validation on the training set (StratifiedKFold) for any hyperparameter tuning — do not use the test set for tuning.
- If you prefer not to save models, remove the `joblib.dump` / `save_model` lines.
- If the test CSV has no `activity` column, the notebook sets `y_test` to None automatically and skips evaluation; in that case, save predictions instead of evaluating.

Possible next steps:
- add a small, lightweight cross-validation tuning cell for each method, or
- add code to produce and save predictions when test labels are not present.