In [1]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle

In [4]:
FEATURE_DEFS_PATH = "D:\\DataStorm\\datasets\\feature_definitions.json"

with open(FEATURE_DEFS_PATH, "r", encoding="utf-8") as f:
    FEATURE_DEFS = json.load(f)["features"]

FEATURE_NAMES = [f["name"] for f in FEATURE_DEFS]

print(f"Loaded {len(FEATURE_NAMES)} features:")
FEATURE_NAMES


Loaded 17 features:


['spine_tilt',
 'stance_width',
 'shoulder_angle_setup',
 'hip_shoulder_separation',
 'hip_rotation_top',
 'left_arm_angle_top',
 'arm_plane_mid',
 'hip_rotation_mid',
 'right_arm_angle_mid',
 'spine_angle_impact',
 'hip_rotation_impact',
 'head_motion_impact',
 'shaft_lean_impact',
 'spine_angle_release',
 'arm_extension_release',
 'balance_finish',
 'hip_angle_finish']

In [5]:
RAW_DATA_PATH = "D:\\DataStorm\\datasets\\raw\\CaddieSet.csv"

df_raw = pd.read_csv(RAW_DATA_PATH)

print(f"Loaded {len(df_raw)} samples")
df_raw.head()

Loaded 1757 samples


Unnamed: 0,View,ClubType,Distance,Carry,LrDistanceOut,DirectionAngle,SpinBack,SpinSide,SpinAxis,BallSpeed,...,6-RIGHT-LEG-ANGLE,6-SHOULDER-ANGLE,6-SPINE-ANGLE,6-WEIGHT-SHIFT,7-FINISH-ANGLE,7-HIP-ANGLE,7-HIP-LINE,7-HIP-SHIFTED,7-SHOULDER-ANGLE,7-SPINE-ANGLE
0,FACEON,W1,241.0,221.9,0.0,3.9,1705,-331,-10.98645,63.0,...,163.74,,,92.8,72.84,,,0.41,,
1,FACEON,I7,137.7,119.0,0.0,3.0,1868,-571,-16.997057,45.1,...,,,,,,,,,,
2,FACEON,W1,204.7,189.3,0.1,-0.6,2162,55,1.457256,57.9,...,180.0,,,90.77,74.24,,,0.22,,
3,FACEON,I9,125.3,118.5,0.1,3.3,6442,-734,-6.500235,45.2,...,168.52,,,96.78,88.64,,,1.12,,
4,FACEON,W1,214.1,202.9,-32.1,-4.5,2393,-462,-10.927259,59.0,...,160.05,,,95.82,76.96,,,0.53,,


In [None]:
# Handle missing values
def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # Replace Excel errors
    df = df.replace("#NAME?", np.nan)

    # Convert object columns to numeric
    for col in df.columns:
        if df[col].dtype == "object":
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Fill NaN with median
    df = df.fillna(df.median())

    return df


df_clean = handle_missing_values(df_raw)

In [20]:
df_clean.isnull().sum()  # Should be 0

View                1757
ClubType            1757
Distance               0
Carry                  0
LrDistanceOut          0
                    ... 
7-HIP-ANGLE            0
7-HIP-LINE             0
7-HIP-SHIFTED          0
7-SHOULDER-ANGLE       0
7-SPINE-ANGLE          0
Length: 80, dtype: int64

In [8]:
def extract_17_features(df: pd.DataFrame, feature_defs: list) -> pd.DataFrame:
    data = {}

    for feat in feature_defs:
        col = feat["column"]
        name = feat["name"]

        if col in df.columns:
            data[name] = df[col].values
        else:
            print(f"⚠️ Missing column {col}, filled with zeros")
            data[name] = np.zeros(len(df))

    return pd.DataFrame(data)


X_features = extract_17_features(df_clean, FEATURE_DEFS)
X_features.head()

Unnamed: 0,spine_tilt,stance_width,shoulder_angle_setup,hip_shoulder_separation,hip_rotation_top,left_arm_angle_top,arm_plane_mid,hip_rotation_mid,right_arm_angle_mid,spine_angle_impact,hip_rotation_impact,head_motion_impact,shaft_lean_impact,spine_angle_release,arm_extension_release,balance_finish,hip_angle_finish
0,70.525,1.5,10.08,24.295,3.34,124.75,143.13,11.73,73.14,74.94,3.15,-0.66,55.25,77.305,171.89,72.84,5.07
1,70.525,0.96,8.21,24.295,0.0,108.66,143.13,0.0,73.14,74.94,9.3,-0.26,55.25,77.305,148.21,84.48,5.07
2,70.525,1.6,10.19,24.295,1.0,168.52,143.13,3.84,73.14,74.94,3.84,-0.35,55.25,77.305,171.89,74.24,5.07
3,70.525,1.23,12.83,24.295,8.3,171.89,143.13,13.56,73.14,74.94,8.27,0.07,55.25,77.305,165.93,88.64,5.07
4,70.525,2.11,17.72,24.295,12.74,180.0,143.13,12.74,73.14,74.94,177.14,0.76,55.25,77.305,180.0,76.96,5.07


In [9]:
print(f"Extracted features shape: {X_features.shape}")

Extracted features shape: (1757, 17)


In [None]:
# Xây dựng technical score
def compute_technical_score(df: pd.DataFrame) -> np.ndarray:
    distance_score = np.clip((df["Distance"] - 150) / 100 * 3, 0, 3)
    direction_score = np.clip(3 - np.abs(df["DirectionAngle"]) / 5, 0, 3)
    speed_score = np.clip(df["BallSpeed"] / 20, 0, 2)
    spin_score = np.clip(2 - np.abs(df["SpinBack"] - 2500) / 1000, 0, 2)

    total = distance_score + direction_score + speed_score + spin_score
    return np.clip(total, 0, 10)


y_score = compute_technical_score(df_clean)

print(
    f"Score range: {y_score.min():.2f} – {y_score.max():.2f}, "
    f"mean = {y_score.mean():.2f}"
)

Score range: 2.22 – 9.66, mean = 6.29


In [11]:
# Tạo score band
def score_to_band(scores: np.ndarray) -> np.ndarray:
    bands = np.zeros(len(scores), dtype=int)

    bands[scores < 2] = 1
    bands[(scores >= 2) & (scores < 4)] = 2
    bands[(scores >= 4) & (scores < 6)] = 3
    bands[(scores >= 6) & (scores < 8)] = 4
    bands[scores >= 8] = 5

    return bands


y_band = score_to_band(y_score)

In [13]:
# Chia tập train/test
X_temp, X_test, y_temp, y_test, band_temp, band_test = train_test_split(
    X_features, y_score, y_band,
    test_size=0.15,
    random_state=42,
    stratify=y_band
)

val_ratio = 0.15 / 0.85

X_train, X_val, y_train, y_val, band_train, band_val = train_test_split(
    X_temp, y_temp, band_temp,
    test_size=val_ratio,
    random_state=42,
    stratify=band_temp
)

print(f"Train: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}")

Train: 1229 | Val: 264 | Test: 264


In [14]:
# Chuẩn hóa dữ liệu
scaler = StandardScaler()

X_train_norm = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns
)

X_val_norm = pd.DataFrame(
    scaler.transform(X_val),
    columns=X_val.columns
)

X_test_norm = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns
)

In [17]:
# Save dữ liệu
Path("D:\DataStorm\datasets\processed").mkdir(parents=True, exist_ok=True)

train_df = X_train_norm.assign(score=y_train, band=band_train)
val_df   = X_val_norm.assign(score=y_val, band=band_val)
test_df  = X_test_norm.assign(score=y_test, band=band_test)

train_df.to_csv("D:\DataStorm\datasets\processed/train.csv", index=False)
val_df.to_csv("D:\DataStorm\datasets\processed/val.csv", index=False)
test_df.to_csv("D:\DataStorm\datasets\processed/test.csv", index=False)

with open("D:\DataStorm\datasets\processed/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

In [18]:
# Kiểm tra phân phối
for name, df in {
    "TRAIN": train_df,
    "VAL": val_df,
    "TEST": test_df
}.items():
    print(f"\n{name}")
    print(df["score"].describe())
    print("Band distribution:")
    print(df["band"].value_counts().sort_index())


TRAIN
count    864.000000
mean       6.301990
std        1.421935
min        2.221000
25%        4.999500
50%        6.292500
75%        7.523000
max        9.662000
Name: score, dtype: float64
Band distribution:
band
2     41
3    507
4    495
5    186
Name: count, dtype: int64

VAL
count    41.000000
mean      6.221024
std       1.692628
min       2.568000
25%       4.900000
50%       6.656000
75%       7.412000
max       9.370000
Name: score, dtype: float64
Band distribution:
band
2      9
3    109
4    106
5     40
Name: count, dtype: int64

TEST
count    49.000000
mean      6.528388
std       1.568011
min       3.972000
25%       4.974000
50%       6.854000
75%       7.785000
max       9.662000
Name: score, dtype: float64
Band distribution:
band
2      9
3    109
4    106
5     40
Name: count, dtype: int64
