In [None]:
import os
from pathlib import Path
# Make the repository checkout the working directory so the relative
# "trainings/logs/..." paths used further down are looked up from the repo root.
# NOTE(review): presumably this is also what lets the project-local `utils` and
# `models` packages be imported below — confirm.
os.chdir(Path('~/git/eye_tracking/').expanduser())

import pandas as pd

import json

from utils.data_processing import EyeTrackingProcessor, GazeMetricsProcessor, MouseMetricsProcessor
from models.ensemble_inference import load_xgboost_model, load_jcafnet_model, run_ensemble_inference
from utils.train import evaluate_pytorch_model

: 

****
# Load models and datasets
****

In [2]:
# Artifact locations (relative paths are resolved against the current working directory).
# Pickled XGBoost classifier.
xgb_model_path = "trainings/logs/xgboost_classifier/best_model.pkl"
# JCAFNet checkpoint and the JSON metadata file passed with it to load_jcafnet_model.
jcafnet_ckpt = "trainings/logs/jcafnet_classifier/hardy-water-3/epoch40-val_acc0.72.ckpt"
jcafnet_meta = "trainings/logs/jcafnet_classifier/hardy-water-3/model_metadata.json"
# Directory holding the train/val/test parquet splits, with "~" expanded and kept as a string.
data_path = str(Path('~/store/eye_tracking/splits').expanduser())

In [4]:
# One parquet file per split lives under `data_path`.
# NOTE: pd.read_parquet needs an optional parquet engine (pyarrow or fastparquet)
# installed in the kernel environment, otherwise it raises ImportError.
split_files = [f"{data_path}/{s}.parquet" for s in ["train", "val", "test"]]

# Fail loudly here instead of silently skipping the load: later cells use
# train_df / val_df / test_df unconditionally and would otherwise die with a
# NameError far away from the real cause.
missing_files = [f for f in split_files if not Path(f).exists()]
if missing_files:
    raise FileNotFoundError(f"Missing dataset split file(s): {missing_files}")

print("Loading dataset splits...")
train_df, val_df, test_df = (pd.read_parquet(f) for f in split_files)

Loading dataset splits...


ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [4]:
# Load both trained classifiers from the artifact paths defined in the config cell.
# NOTE(review): `xg_boost` is not referenced by any cell visible in this notebook
# section (predict_xgboost reloads the model from disk itself) — confirm it is needed.
xg_boost = load_xgboost_model(xgb_model_path)
# `meta_jcafnet` is indexed below with "features", "num_classes", "mean" and "std".
jcafnet, meta_jcafnet = load_jcafnet_model(jcafnet_ckpt, jcafnet_meta)



****
# Inference
****

## JCAFNet

In [5]:
# Column names grouped by modality.
# NOTE(review): this dict is not passed to the evaluation call below, which uses
# meta_jcafnet["features"] instead — confirm whether it is still needed.
features = {
    "gaze": ["Gaze point X", "Gaze point Y", "Gaze Velocity", "Gaze Acceleration"],
    "mouse": ["Mouse position X", "Mouse position Y", "Mouse Velocity", "Mouse Acceleration"],
    "joint": ["Gaze-Mouse Distance", "Angle Between Gaze and Mouse"]
}

# Evaluate JCAFNet on the *test* split, using the feature list, class count and
# mean/std values stored in the model metadata.
# NOTE(review): despite the "train" in its name, `res_train_pytorch` holds
# test-set results; the name is kept because later cells reference it.
res_train_pytorch = evaluate_pytorch_model(jcafnet,
                                           test_df,
                                           meta_jcafnet["features"],
                                           meta_jcafnet["num_classes"],
                                           meta_jcafnet["mean"], 
                                           meta_jcafnet["std"])

🧠 PyTorch Evaluation: Loss = 1.3059, Accuracy = 0.7083


In [13]:
# Show which fields the evaluation result dictionary exposes.
res_train_pytorch.keys()

dict_keys(['labels', 'predictions', 'probs', 'correct_flags', 'loss', 'accuracy'])

## XGBoost

In [6]:
def predict_xgboost(test_df: pd.DataFrame,
                    model_path: str,
                    selected_features_path: str,
                    tsfresh_config: dict,
                    label_column: str = "Task_id",
                    label_offset: int = 1) -> pd.DataFrame:
    """
    Run a saved classifier on TSFresh features extracted from ``test_df``.

    The frame is sorted, TSFresh features are extracted per ``id``, the feature
    matrix is reduced to the saved feature selection, and the model's class
    predictions and class probabilities are collected per ``id``.

    Args:
        test_df: Long-format frame. Must contain the columns ``id``,
            ``Participant name``, ``Task_id``, ``Task_execution``,
            ``Recording timestamp``, ``label_column`` and every column listed
            in ``tsfresh_config["columns"]``.
        model_path: Path of the joblib-serialised model; the loaded object must
            provide ``predict`` and ``predict_proba``.
        selected_features_path: Path of the joblib-serialised collection of
            feature names to keep.
        tsfresh_config: Dict with a required ``"columns"`` list (signal columns
            handed to TSFresh) and optional ``"n_jobs"`` (default 8) and
            ``"fc_parameters"`` entries.
        label_column: Column holding the ground-truth label; the first value
            found for each ``id`` is used.
        label_offset: Value subtracted from the raw label (presumably to turn
            1-based task ids into 0-based class indices — confirm against
            the training code).

    Returns:
        pd.DataFrame with one row per ``id`` and the columns ``id``,
        ``true_label``, ``pred_label`` and ``class_<i>_prob`` for each column of
        the probability matrix.

    Raises:
        KeyError: If a selected feature is absent from the extracted features,
            or an ``id`` of the feature matrix has no ground-truth label.
    """
    # Heavy optional dependencies are imported lazily, only when inference runs.
    import joblib
    from tsfresh import extract_features
    from tsfresh.utilities.dataframe_functions import impute

    # SECURITY: joblib.load unpickles arbitrary Python objects — only point
    # these paths at trusted files.
    model = joblib.load(model_path)
    selected_features = list(joblib.load(selected_features_path))

    # Extract TSFresh features, one row per id.
    test_df = test_df.sort_values(by=["Participant name", "Task_id", "Task_execution", "Recording timestamp"])
    features_test = extract_features(
        test_df[["id", "Recording timestamp"] + tsfresh_config["columns"]],
        column_id="id",
        column_sort="Recording timestamp",
        n_jobs=tsfresh_config.get("n_jobs", 8),
        default_fc_parameters=tsfresh_config.get("fc_parameters")
    )
    impute(features_test)  # called for its in-place effect, as before

    # Keep only the features in the saved selection; name what is missing
    # instead of surfacing a bare pandas KeyError.
    missing_features = [name for name in selected_features if name not in features_test.columns]
    if missing_features:
        raise KeyError(f"Selected features missing from extracted features: {missing_features}")
    X_test = features_test[selected_features]

    # Ground truth, looked up *by id* in the row order of the feature matrix.
    # Pairing the groupby result positionally with the feature index would
    # silently mislabel rows whenever the two orders differ.
    labels_by_id = test_df.groupby("id")[label_column].first() - label_offset
    y_true = labels_by_id.loc[features_test.index]

    # Predict probabilities and class
    probs = model.predict_proba(X_test)
    preds = model.predict(X_test)

    # Construct result DataFrame
    result_df = pd.DataFrame({
        "id": features_test.index.tolist(),
        "true_label": y_true.to_numpy(),
        "pred_label": preds
    })

    # Add one probability column per class
    for i in range(probs.shape[1]):
        result_df[f"class_{i}_prob"] = probs[:, i]

    return result_df


In [None]:
from tsfresh.feature_extraction import MinimalFCParameters

# TSFresh settings forwarded to predict_xgboost.
tsfresh_config = {
    "columns": ["Gaze point X", "Gaze point Y", "Mouse position X", "Mouse position Y"],
    # Up to 100 workers, but never more than the machine has CPUs, so smaller
    # hosts are not oversubscribed.
    "n_jobs": min(100, os.cpu_count() or 1),
    # None is passed through to tsfresh as default_fc_parameters.
    "fc_parameters": None # or MinimalFCParameters() if used in training
}

# Reuse the model path from the config cell instead of repeating the literal,
# so the two cannot drift apart.
preds_df = predict_xgboost(
    test_df=test_df,
    model_path=xgb_model_path,
    selected_features_path="trainings/logs/xgboost_classifier/selected_features.pkl",
    tsfresh_config=tsfresh_config
)

# Last expression of the cell -> rendered with the rich DataFrame display.
preds_df.head()

Feature Extraction:   1%|          | 2/288 [00:46<1:49:57, 23.07s/it]Process ForkPoolWorker-22:
Process ForkPoolWorker-76:
Process ForkPoolWorker-92:
Process ForkPoolWorker-27:
Process ForkPoolWorker-100:
Traceback (most recent call last):
Process ForkPoolWorker-23:
Process ForkPoolWorker-1:
Process ForkPoolWorker-86:
  File "/cluster/home/kruu/.local/share/uv/python/cpython-3.12.7-linux-x86_64-gnu/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Process ForkPoolWorker-21:
Traceback (most recent call last):
Process ForkPoolWorker-58:
Process ForkPoolWorker-24:
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-51:
Process ForkPoolWorker-52:
Process ForkPoolWorker-84:
  File "/cluster/home/kruu/.local/share/uv/python/cpython-3.12.7-linux-x86_64-gnu/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Process ForkPoolWorker-13:
Process ForkPoolWorker-25:
Process ForkPoolWorker-75:
  