In [None]:
%pip install lightgbm

In [45]:
import json
import numpy as np
import pandas as pd
import lightgbm as lgb
import shap
from pathlib import Path
from sklearn.preprocessing import StandardScaler

#### Feature Selection

In [139]:
# Config
RAW_DATA_PATH = "D:\\DataStorm\\datasets\\raw\\CaddieSet.csv"
OUTPUT_DIR = Path("D:\\DataStorm\\datasets\\processed")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

STAGE1_FEATURE_PATH = OUTPUT_DIR / "selected_features_stage1.json"
STAGE2_DATA_PATH = OUTPUT_DIR / "nam_stage2_classification.csv"

# Proxy-label threshold
PROXY_QUANTILE = 0.6

# SHAP cumulative contribution threshold
SHAP_CUM_RATIO = 0.85

RANDOM_STATE = 42

In [140]:
test_df = pd.read_csv("D:/DataStorm/datasets/raw/test.csv")
df = pd.read_csv("D:/DataStorm/datasets/raw/train.csv")

In [141]:
# Load raw data
df = df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print(f"   Loaded {len(df)} samples")
print(f"   Loaded {len(test_df)} test samples")

   Loaded 61 samples
   Loaded 49 test samples


In [142]:
df.columns.tolist()

['video_id',
 'view',
 'target',
 '0-SPINE-ANGLE',
 '0-STANCE-RATIO',
 '0-UPPER-TILT',
 '1-HEAD-LOC',
 '1-HIP-LINE',
 '1-HIP-ROTATION',
 '1-HIP-SHIFTED',
 '1-LEFT-ARM-ANGLE',
 '1-RIGHT-ARM-ANGLE',
 '1-SHOULDER-ANGLE',
 '1-SHOULDER-LOC',
 '1-SPINE-ANGLE',
 '2-HEAD-LOC',
 '2-HIP-ANGLE',
 '2-HIP-LINE',
 '2-HIP-ROTATION',
 '2-HIP-SHIFTED',
 '2-LEFT-ARM-ANGLE',
 '2-SHOULDER-ANGLE',
 '2-SHOULDER-LOC',
 '2-UPPER-TILT',
 '3-HEAD-LOC',
 '3-HIP-ANGLE',
 '3-HIP-LINE',
 '3-HIP-ROTATION',
 '3-HIP-SHIFTED',
 '3-LEFT-LEG-ANGLE',
 '3-RIGHT-ARM-ANGLE',
 '3-RIGHT-DISTANCE',
 '3-RIGHT-LEG-ANGLE',
 '3-SHOULDER-ANGLE',
 '3-SHOULDER-LOC',
 '4-HEAD-LOC',
 '4-HIP-ANGLE',
 '4-HIP-HANGING-BACK',
 '4-HIP-LINE',
 '4-HIP-ROTATION',
 '4-HIP-SHIFTED',
 '4-RIGHT-ARM-ANGLE',
 '4-RIGHT-ARMPIT-ANGLE',
 '4-SHOULDER-HANGING-BACK',
 '4-SPINE-ANGLE',
 '5-HEAD-LOC',
 '5-HIP-HANGING-BACK',
 '5-HIP-LINE',
 '5-HIP-SHIFTED',
 '5-LEFT-ARM-ANGLE',
 '5-LEFT-LEG-ANGLE',
 '5-RIGHT-ARM-ANGLE',
 '5-SHOULDER-ANGLE',
 '5-SHOULDER-HANGING

In [143]:
test_df.columns.tolist()

['video_id',
 'view',
 'target',
 '0-SPINE-ANGLE',
 '0-STANCE-RATIO',
 '0-UPPER-TILT',
 '1-HEAD-LOC',
 '1-HIP-LINE',
 '1-HIP-ROTATION',
 '1-HIP-SHIFTED',
 '1-LEFT-ARM-ANGLE',
 '1-RIGHT-ARM-ANGLE',
 '1-SHOULDER-ANGLE',
 '1-SHOULDER-LOC',
 '1-SPINE-ANGLE',
 '2-HEAD-LOC',
 '2-HIP-ANGLE',
 '2-HIP-LINE',
 '2-HIP-ROTATION',
 '2-HIP-SHIFTED',
 '2-LEFT-ARM-ANGLE',
 '2-SHOULDER-ANGLE',
 '2-SHOULDER-LOC',
 '2-UPPER-TILT',
 '3-HEAD-LOC',
 '3-HIP-ANGLE',
 '3-HIP-LINE',
 '3-HIP-ROTATION',
 '3-HIP-SHIFTED',
 '3-LEFT-LEG-ANGLE',
 '3-RIGHT-ARM-ANGLE',
 '3-RIGHT-DISTANCE',
 '3-RIGHT-LEG-ANGLE',
 '3-SHOULDER-ANGLE',
 '3-SHOULDER-LOC',
 '4-HEAD-LOC',
 '4-HIP-ANGLE',
 '4-HIP-HANGING-BACK',
 '4-HIP-LINE',
 '4-HIP-ROTATION',
 '4-HIP-SHIFTED',
 '4-RIGHT-ARM-ANGLE',
 '4-RIGHT-ARMPIT-ANGLE',
 '4-SHOULDER-HANGING-BACK',
 '4-SPINE-ANGLE',
 '5-HEAD-LOC',
 '5-HIP-HANGING-BACK',
 '5-HIP-LINE',
 '5-HIP-SHIFTED',
 '5-LEFT-ARM-ANGLE',
 '5-LEFT-LEG-ANGLE',
 '5-RIGHT-ARM-ANGLE',
 '5-SHOULDER-ANGLE',
 '5-SHOULDER-HANGING

In [144]:
# Convert all numeric columns safely
for col in df.columns:
    if col != "video_id" and col != "view":
        df[col] = pd.to_numeric(df[col], errors="coerce")

# Replace NaN / Inf
df = df.replace([np.inf, -np.inf], np.nan)
df = df.fillna(0.0)

In [145]:
# Convert all numeric columns safely
for col in test_df.columns:
    if col != "video_id" and col != "view":
        test_df[col] = pd.to_numeric(test_df[col], errors="coerce")

# Replace NaN / Inf
test_df = test_df.replace([np.inf, -np.inf], np.nan)
test_df = test_df.fillna(0.0)

In [146]:
df = df.drop(columns=["video_id", "view"])
df.columns.tolist()

['target',
 '0-SPINE-ANGLE',
 '0-STANCE-RATIO',
 '0-UPPER-TILT',
 '1-HEAD-LOC',
 '1-HIP-LINE',
 '1-HIP-ROTATION',
 '1-HIP-SHIFTED',
 '1-LEFT-ARM-ANGLE',
 '1-RIGHT-ARM-ANGLE',
 '1-SHOULDER-ANGLE',
 '1-SHOULDER-LOC',
 '1-SPINE-ANGLE',
 '2-HEAD-LOC',
 '2-HIP-ANGLE',
 '2-HIP-LINE',
 '2-HIP-ROTATION',
 '2-HIP-SHIFTED',
 '2-LEFT-ARM-ANGLE',
 '2-SHOULDER-ANGLE',
 '2-SHOULDER-LOC',
 '2-UPPER-TILT',
 '3-HEAD-LOC',
 '3-HIP-ANGLE',
 '3-HIP-LINE',
 '3-HIP-ROTATION',
 '3-HIP-SHIFTED',
 '3-LEFT-LEG-ANGLE',
 '3-RIGHT-ARM-ANGLE',
 '3-RIGHT-DISTANCE',
 '3-RIGHT-LEG-ANGLE',
 '3-SHOULDER-ANGLE',
 '3-SHOULDER-LOC',
 '4-HEAD-LOC',
 '4-HIP-ANGLE',
 '4-HIP-HANGING-BACK',
 '4-HIP-LINE',
 '4-HIP-ROTATION',
 '4-HIP-SHIFTED',
 '4-RIGHT-ARM-ANGLE',
 '4-RIGHT-ARMPIT-ANGLE',
 '4-SHOULDER-HANGING-BACK',
 '4-SPINE-ANGLE',
 '5-HEAD-LOC',
 '5-HIP-HANGING-BACK',
 '5-HIP-LINE',
 '5-HIP-SHIFTED',
 '5-LEFT-ARM-ANGLE',
 '5-LEFT-LEG-ANGLE',
 '5-RIGHT-ARM-ANGLE',
 '5-SHOULDER-ANGLE',
 '5-SHOULDER-HANGING-BACK',
 '5-SPINE-ANGL

In [147]:
test_df = test_df.drop(columns=["video_id", "view"])
test_df.columns.tolist()

['target',
 '0-SPINE-ANGLE',
 '0-STANCE-RATIO',
 '0-UPPER-TILT',
 '1-HEAD-LOC',
 '1-HIP-LINE',
 '1-HIP-ROTATION',
 '1-HIP-SHIFTED',
 '1-LEFT-ARM-ANGLE',
 '1-RIGHT-ARM-ANGLE',
 '1-SHOULDER-ANGLE',
 '1-SHOULDER-LOC',
 '1-SPINE-ANGLE',
 '2-HEAD-LOC',
 '2-HIP-ANGLE',
 '2-HIP-LINE',
 '2-HIP-ROTATION',
 '2-HIP-SHIFTED',
 '2-LEFT-ARM-ANGLE',
 '2-SHOULDER-ANGLE',
 '2-SHOULDER-LOC',
 '2-UPPER-TILT',
 '3-HEAD-LOC',
 '3-HIP-ANGLE',
 '3-HIP-LINE',
 '3-HIP-ROTATION',
 '3-HIP-SHIFTED',
 '3-LEFT-LEG-ANGLE',
 '3-RIGHT-ARM-ANGLE',
 '3-RIGHT-DISTANCE',
 '3-RIGHT-LEG-ANGLE',
 '3-SHOULDER-ANGLE',
 '3-SHOULDER-LOC',
 '4-HEAD-LOC',
 '4-HIP-ANGLE',
 '4-HIP-HANGING-BACK',
 '4-HIP-LINE',
 '4-HIP-ROTATION',
 '4-HIP-SHIFTED',
 '4-RIGHT-ARM-ANGLE',
 '4-RIGHT-ARMPIT-ANGLE',
 '4-SHOULDER-HANGING-BACK',
 '4-SPINE-ANGLE',
 '5-HEAD-LOC',
 '5-HIP-HANGING-BACK',
 '5-HIP-LINE',
 '5-HIP-SHIFTED',
 '5-LEFT-ARM-ANGLE',
 '5-LEFT-LEG-ANGLE',
 '5-RIGHT-ARM-ANGLE',
 '5-SHOULDER-ANGLE',
 '5-SHOULDER-HANGING-BACK',
 '5-SPINE-ANGL

In [148]:
# Feature matrix

X_stage1 = df.drop(
    columns = ["target"],
    errors="ignore"
)

feature_names = X_stage1.columns.tolist()

# Standardize (important for SHAP stability)
scaler = StandardScaler()
X_stage1_scaled = scaler.fit_transform(X_stage1)

In [149]:
df.isnull().sum()  # Should be 0

target               0
0-SPINE-ANGLE        0
0-STANCE-RATIO       0
0-UPPER-TILT         0
1-HEAD-LOC           0
                    ..
5-RIGHT-LEG-ANGLE    0
6-LEFT-LEG-ANGLE     0
0-SHOULDER-ANGLE     0
6-RIGHT-ARM-ANGLE    0
7-HEAD-LOC           0
Length: 76, dtype: int64

In [150]:
# Training LightGBM model
lgb_model = lgb.LGBMClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=RANDOM_STATE
)

lgb_model.fit(X_stage1_scaled, df["target"])


[LightGBM] [Info] Number of positive: 31, number of negative: 30
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000485 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 61, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.508197 -> initscore=0.032790
[LightGBM] [Info] Start training from score 0.032790


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,6
,learning_rate,0.05
,n_estimators,400
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [151]:
print("üîç Computing SHAP values...")

explainer = shap.TreeExplainer(lgb_model)
shap_values = explainer.shap_values(X_stage1_scaled)[1]

mean_abs_shap = np.abs(shap_values).mean(axis=0)

shap_df = pd.DataFrame({
    "feature": feature_names,
    "mean_abs_shap": mean_abs_shap
}).sort_values("mean_abs_shap", ascending=False)

shap_df["cum_ratio"] = (
    shap_df["mean_abs_shap"].cumsum()
    / shap_df["mean_abs_shap"].sum()
)

üîç Computing SHAP values...




In [152]:
# Select top features
selected_features = shap_df[
    shap_df["cum_ratio"] <= SHAP_CUM_RATIO
]["feature"].tolist()

print(f"‚úÖ Selected {len(selected_features)} features "
      f"(covering {SHAP_CUM_RATIO*100:.0f}% SHAP importance)")

# Save feature list
with open(STAGE1_FEATURE_PATH, "w") as f:
    json.dump(selected_features, f, indent=2)

‚úÖ Selected 63 features (covering 85% SHAP importance)


In [153]:
df_stage2 = df[selected_features + ["target"]].copy()


df_stage2.to_csv(STAGE2_DATA_PATH, index=False)

print("‚úÖ Stage 2 dataset saved:")
print(f"   {STAGE2_DATA_PATH}")

‚úÖ Stage 2 dataset saved:
   D:\DataStorm\datasets\processed\nam_stage2_classification.csv


In [154]:
# Chia d·ªØ li·ªáu stage 2 th√†nh train/val/test
from sklearn.model_selection import train_test_split
df_stage2 = pd.read_csv(STAGE2_DATA_PATH)
assert "target" in df_stage2.columns, "‚ùå target column not found"

In [155]:

train_df, val_df = train_test_split(
    df_stage2,
    test_size=0.2,
    stratify=df_stage2["target"],
    random_state=RANDOM_STATE
)
test_df = test_df[selected_features + ['target']]

# Save splits
TRAIN_PATH = OUTPUT_DIR / "train_stage2.csv"
VAL_PATH   = OUTPUT_DIR / "val_stage2.csv"
TEST_PATH  = OUTPUT_DIR / "test_stage2.csv"

train_df.to_csv(TRAIN_PATH, index=False)
val_df.to_csv(VAL_PATH, index=False)
test_df.to_csv(TEST_PATH, index=False)


In [156]:
def print_split_stats(name, df):
    print(f"{name}: {len(df)} samples")
    print(df["target"].value_counts(normalize=True).round(3))
    print("-" * 30)

print("üìä Split statistics:")
print_split_stats("Train", train_df)
print_split_stats("Val", val_df)
print_split_stats("Test", test_df)


print("‚úÖ Stage 2 data split completed")

üìä Split statistics:
Train: 48 samples
target
1    0.5
0    0.5
Name: proportion, dtype: float64
------------------------------
Val: 13 samples
target
1    0.538
0    0.462
Name: proportion, dtype: float64
------------------------------
Test: 49 samples
target
0    0.612
1    0.388
Name: proportion, dtype: float64
------------------------------
‚úÖ Stage 2 data split completed
