In [None]:
import pandas as pd
from eyefeatures.features.extractor import Extractor
import eyefeatures.features.stats as eye_stats
import numpy as np
import eyefeatures.features.scanpath_dist as eye_dist

# Extraction of simple features

In [None]:
# Load the three participant groups from the fixation report. Passing a list
# to `sheet_name` parses the workbook once and returns a dict keyed by sheet
# position, instead of re-reading the file for every sheet.
fixation_sheets = pd.read_excel('Fixation_report.xlsx', sheet_name=[0, 1, 2])

# Class labels: dyslexia -> 2, norm -> 1, risk -> 0.
# `.assign` always writes a real column. Attribute-style assignment
# (`df.Group = 2`) only does so when the column already exists; otherwise it
# just sets a Python attribute on the frame and no `Group` column is created.
dyslexia = fixation_sheets[0].assign(Group=2)
norm = fixation_sheets[1].assign(Group=1)
risk = fixation_sheets[2].assign(Group=0)
print(dyslexia.shape, norm.shape, risk.shape)

# Stack the groups into one frame with a fresh 0..n-1 index.
data = pd.concat([dyslexia, norm, risk], ignore_index=True)
# Synthetic time column: simply the global row order of the stacked frame.
data['timestamp'] = range(len(data))

# Statistics requested from each eyefeatures feature family.
saccade_features = eye_stats.SaccadeFeatures(
    features_stats={'length': ['mean', 'std', 'max', 'min', 'sum', 'count']},
)
fixation_features = eye_stats.FixationFeatures(
    features_stats={'duration': ['mean', 'std', 'min', 'max', 'sum']},
)
# NOTE(review): `rule`/`deviation` are passed through unchanged; their exact
# meaning (presumably an angle and its tolerance) should be confirmed against
# the eyefeatures documentation.
regression_features = eye_stats.RegressionFeatures(
    rule=(90,),
    deviation=70,
    features_stats={'length': ['mean', 'std', 'max', 'min', 'count']},
)

extractor = Extractor(
    features=[saccade_features, fixation_features, regression_features],
    x='FIX_X',                        # column with x-coordinate of fixations
    y='FIX_Y',                        # column with y-coordinate of fixations
    duration='FIX_DURATION',          # column with duration in ms
    t='timestamp',                    # time column
    pk=['SubjectID', 'Sentence_ID'],  # list of columns being primary key
    return_df=True,                   # return as pd.DataFrame
    extra=['Group'],                  # extra column carried into the output
    aggr_extra='mean',                # aggregation applied to the extra column
    leave_pk=True,
)

simple_stats = extractor.fit_transform(data)

# Split the extracted table into model inputs and the class label. The primary
# key columns and the label itself are not used as predictors.
non_feature_columns = ['Group', 'SubjectID', 'Sentence_ID']
X_train = simple_stats.drop(columns=non_feature_columns)
y_train = simple_stats['Group'].astype(int)


# Demographic data: parse the workbook once (a list `sheet_name` returns a
# dict keyed by sheet position) rather than re-reading the file per sheet.
demo_sheets = pd.read_excel('demo.xlsx', sheet_name=[0, 1, 2])
norm_demo = demo_sheets[0]
risk_demo = demo_sheets[1]
dyslexia_demo = demo_sheets[2]

# The label already lives in `simple_stats`, so the demo copy of `Group` is
# dropped before merging to avoid duplicated Group_x / Group_y columns.
data_demo = pd.concat(
    [dyslexia_demo, norm_demo, risk_demo],
    ignore_index=True,
).drop(columns=['Group'])

# NOTE(review): `merge` defaults to an inner join, so subjects missing from
# demo.xlsx are dropped here — confirm that this is intended.
simple_stats_demo = simple_stats.merge(data_demo, on='SubjectID')

# Same predictor/label split as for the plain features, now including the
# demographic columns.
X_train_demo = simple_stats_demo.drop(columns=['Group', 'SubjectID', 'Sentence_ID'])
y_train_demo = simple_stats_demo['Group'].astype(int)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer, classification_report
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical

# No held-out test set is defined here; both placeholders stay None.
X_test = y_test = None

# Micro-averaged F1 wrapped as a scorer object.
f1_micro = make_scorer(f1_score, average="micro")
# NOTE(review): sets a private attribute on the scorer to a placeholder value;
# presumably a compatibility workaround for code that reads
# `_deprecation_msg` — confirm whether it is still required.
f1_micro._deprecation_msg = 'xui'

# Hyperparameter search space for RandomForestClassifier.
rf_search_space = dict(
    n_estimators=Integer(100, 1000),
    max_depth=Categorical([None, *range(2, 33)]),  # None or an int in 2..32
    min_samples_split=Integer(2, 20),
    min_samples_leaf=Integer(1, 10),
    max_features=Categorical(["sqrt", "log2", None]),
    bootstrap=Categorical([True, False]),
)

# Hyperparameter search space for GradientBoostingClassifier.
gb_search_space = dict(
    n_estimators=Integer(50, 1000),
    learning_rate=Real(1e-3, 0.3, prior="log-uniform"),
    max_depth=Integer(1, 10),
    subsample=Real(0.5, 1.0),
    min_samples_split=Integer(2, 20),
    min_samples_leaf=Integer(1, 20),
    max_features=Categorical(["sqrt", "log2", None]),
)

def bayes_search(
    model,
    search_space,
    name: str,
    X_train,
    y_train,
    X_test=None,
    y_test=None,
    *,
    n_iter=64,
    cv=10,
    random_state=228,
    scoring=f1_micro,
    verbose=0,
    n_jobs=-1,
    return_train_score=False
):
    """
    Tune ``model`` with ``BayesSearchCV`` and summarise the outcome.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``BayesSearchCV``.
    search_space : dict
        Search space passed as ``search_spaces``.
    name : str
        Label prefixed to the printed summary lines.
    X_train, y_train : array-like
        Data the search is fitted on.
    X_test, y_test : array-like, optional
        When both are given, test predictions are made, a classification
        report is printed and the predictions are stored in the results.
    n_iter : int
        Number of parameter settings evaluated.
    cv : int or CV splitter
        Cross-validation strategy.
    random_state : int
        Seed forwarded to ``BayesSearchCV``.
    scoring : str or callable
        Scoring metric; a non-string value is shown as ``custom`` in the
        printed summary.
    verbose : int
        Verbosity forwarded to ``BayesSearchCV``.
    n_jobs : int
        Number of parallel jobs forwarded to ``BayesSearchCV``.
    return_train_score : bool
        Forwarded to ``BayesSearchCV``; when true, predictions on the
        training data are also added to the results.

    Returns
    -------
    opt : BayesSearchCV
        The fitted search object (refit on the best parameters).
    results : dict
        Always holds ``"best_estimator"`` and ``"cv_best_score"``;
        ``"y_test_pred"`` and ``"y_train_pred"`` are added only under the
        conditions described above.
    """
    search_kwargs = dict(
        estimator=model,
        search_spaces=search_space,
        n_iter=n_iter,
        scoring=scoring,
        cv=cv,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose,
        refit=True,
        return_train_score=return_train_score,
    )
    opt = BayesSearchCV(**search_kwargs)
    opt.fit(X_train, y_train)

    # Scorer objects have no short readable name, so they are shown as 'custom'.
    metric_label = scoring if isinstance(scoring, str) else 'custom'
    print(f"\n{name} best parameters → {opt.best_params_}")
    print(f"{name} CV-best ({metric_label}) → {opt.best_score_:.4f}")

    results = {}
    results["best_estimator"] = opt.best_estimator_
    results["cv_best_score"] = opt.best_score_

    # Evaluate on the test set only when both features and labels were supplied.
    have_test_data = not (X_test is None or y_test is None)
    if have_test_data:
        test_predictions = opt.predict(X_test)
        print("\nTest-set report:")
        print(classification_report(y_test, test_predictions))
        results["y_test_pred"] = test_predictions

    if return_train_score:
        results["y_train_pred"] = opt.predict(X_train)

    return opt, results



# Running the experiments

In [None]:
# Hyperparameter search on the plain eye-movement features.
rf_opt, rf_res = bayes_search(
    RandomForestClassifier(random_state=228),
    rf_search_space,
    "Random Forest",
    X_train, y_train, n_iter=256,
)

# The label is now "Gradient Boosting": it was "Random Forest" (copy-paste
# slip), which made the two printed summaries indistinguishable.
# Missing feature values are replaced by the sentinel -10000 for this model
# only (presumably because it does not accept NaN input — confirm).
gb_opt, gb_res = bayes_search(
    GradientBoostingClassifier(random_state=228),
    gb_search_space,
    "Gradient Boosting",
    X_train.fillna(-10000), y_train, n_iter=256,
)

# Best cross-validated score of each model, shown as the cell output.
rf_res['cv_best_score'], gb_res['cv_best_score']

In [None]:
# Hyperparameter search on the eye-movement features joined with the
# demographic columns. Note that this rebinds rf_opt / rf_res / gb_opt /
# gb_res, replacing the results of the previous experiment cell.
rf_opt, rf_res = bayes_search(
    RandomForestClassifier(random_state=228),
    rf_search_space,
    "Random Forest",
    X_train_demo, y_train_demo, n_iter=256,
)

# The label is now "Gradient Boosting": it was "Random Forest" (copy-paste
# slip), which made the two printed summaries indistinguishable.
# Missing feature values are replaced by the sentinel -10000 for this model
# only (presumably because it does not accept NaN input — confirm).
gb_opt, gb_res = bayes_search(
    GradientBoostingClassifier(random_state=228),
    gb_search_space,
    "Gradient Boosting",
    X_train_demo.fillna(-10000), y_train_demo, n_iter=256,
)

# Best cross-validated score of each model, shown as the cell output.
rf_res['cv_best_score'], gb_res['cv_best_score']