# XGBoost

## Imports

In [25]:
import xgboost as xgb
from sklearn.utils import compute_sample_weight
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import numpy as np, matplotlib.pyplot as plt, seaborn as sns
import pandas as pd

In [26]:
TRAIN_CSV = '../data/activity_train.csv'   # path to train CSV
TEST_CSV  = '../data/activity_test.csv'    # path to test CSV

label_map = {
    1:'WALKING', 2:'WALKING_UPSTAIRS', 3:'WALKING_DOWNSTAIRS',
    4:'SITTING', 5:'STANDING', 6:'LAYING',
    7:'STAND_TO_SIT', 8:'SIT_TO_STAND', 9:'SIT_TO_LIE',
    10:'LIE_TO_SIT', 11:'STAND_TO_LIE', 12:'LIE_TO_STAND'
}

RANDOM_STATE = 42

In [27]:
df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)
print('train shape:', df_train.shape)
print('test  shape:', df_test.shape)

X_train = df_train.drop(columns=['activity'])
y_train = df_train['activity'].copy()
X_test = df_test.drop(columns=['activity'])
y_test = df_test['activity'].copy()

train_cols = list(X_train.columns)
test_cols = list(X_test.columns)
X_test = X_test[train_cols]

labels = np.unique(y_test)
label_names = [label_map.get(int(l), str(l)) for l in labels]

train shape: (7767, 562)
test  shape: (3162, 562)


In [28]:
sw = compute_sample_weight(class_weight='balanced', y=y_train)

base_params = dict(objective='multi:softprob',
                   use_label_encoder=False,
                   eval_metric='mlogloss',
                   n_estimators=1000,
                   learning_rate=0.05,
                   max_depth=6,
                   subsample=0.8,
                   colsample_bytree=0.8,
                   random_state=RANDOM_STATE,
                   n_jobs=-1)

In [29]:
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE)
val_scores = []
best_iters = []

In [30]:
for train_idx, val_idx in cv.split(X_train, y_train):
    Xtr, Xval = X_train.iloc[train_idx], X_train.iloc[val_idx]
    ytr, yval = y_train.iloc[train_idx], y_train.iloc[val_idx]
    swtr = compute_sample_weight(class_weight='balanced', y=ytr)

    clf = xgb.XGBClassifier(**base_params)
    clf.fit(Xtr, ytr, sample_weight=swtr,
            eval_set=[(Xval, yval)], verbose=False)
    preds = clf.predict(Xval)
    val_scores.append(f1_score(yval, preds, average='macro'))
    best_iters.append(clf.best_iteration or base_params['n_estimators'])

print('CV f1_macro mean/std:', np.mean(val_scores), np.std(val_scores))
print('median best_iteration:', int(np.median(best_iters)))

# final train on full training set using median best_iteration
final_n_estimators = int(np.median(best_iters))
clf_final = xgb.XGBClassifier(**{**base_params, 'n_estimators': final_n_estimators})
clf_final.fit(X_train, y_train, sample_weight=sw, verbose=False)

# evaluate on test
y_pred = clf_final.predict(X_test)
print('Test F1 macro:', f1_score(y_test, y_pred, average='macro'))
print(classification_report(y_test, y_pred, digits=4))
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix:\n', cm)

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [ 0  1  2  3  4  5  6  7  8  9 10 11], got [ 1  2  3  4  5  6  7  8  9 10 11 12]