# Setup 

## Imports

In [55]:
import pandas as pd
import numpy as np
import time
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from pathlib import Path
from typing import Dict, List, Tuple



## Configs e paths

In [56]:
FEATURE_DIR   = Path("../data/features/hmog_None")
RESULT_DIR    = Path("../data/results/svm")
SEED          = 42
SENSOR        = 'accelerometer'

feature_cols = [
    'acc_x_mean', 'acc_x_median', 'acc_x_std', 'acc_x_min', 'acc_x_max',
    'acc_y_mean', 'acc_y_median', 'acc_y_std', 'acc_y_min', 'acc_y_max',
    'acc_z_mean', 'acc_z_median', 'acc_z_std', 'acc_z_min', 'acc_z_max',
    'magnitude_mean', 'magnitude_std', 'magnitude_min', 'magnitude_max',
    'corr_xz', 'corr_yz'
]
drop_cols = {"accelerometer": ['acc_x_skewness', 'acc_x_kurtosis','acc_y_skewness', 'acc_y_kurtosis',
                                'acc_z_skewness', 'acc_z_kurtosis','magnitude_skewness', 'magnitude_kurtosis'],
            "gyroscope": ['gyro_x_skewness', 'gyro_x_kurtosis','gyro_y_skewness', 'gyro_y_kurtosis',
                                'gyro_z_skewness', 'gyro_z_kurtosis','magnitude_skewness', 'magnitude_kurtosis']
        }


In [57]:
pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ("scaler", StandardScaler()),
    ("svc", SVC(kernel="linear", class_weight="balanced", probability=False, random_state=SEED)),
])
param_grid = {
    'svc__C': [0.1, 1, 10]
}

## Util functions

In [58]:
def compute_eer(y_true: np.ndarray, scores: np.ndarray) -> Tuple[float, float]:
    fpr, tpr, _ = roc_curve(y_true, scores)
    fnr = 1 - tpr
    idx = np.nanargmin(np.abs(fpr - fnr))
    eer = (fpr[idx] + fnr[idx]) / 2.0
    tar = tpr[idx]
    return float(eer), float(tar)

## Main

### Load dataframe

In [59]:
df = pd.read_parquet(FEATURE_DIR / f"{SENSOR}_features.parquet")
df = df.drop(columns=drop_cols.get(SENSOR, []))

### Dataframe Info

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2446046 entries, 0 to 2446045
Data columns (total 26 columns):
 #   Column          Dtype  
---  ------          -----  
 0   sensor          object 
 1   subject_id      object 
 2   session_number  int64  
 3   start_ms        int64  
 4   end_ms          int64  
 5   acc_x_mean      float64
 6   acc_x_median    float64
 7   acc_x_std       float64
 8   acc_x_min       float64
 9   acc_x_max       float64
 10  acc_y_mean      float64
 11  acc_y_median    float64
 12  acc_y_std       float64
 13  acc_y_min       float64
 14  acc_y_max       float64
 15  acc_z_mean      float64
 16  acc_z_median    float64
 17  acc_z_std       float64
 18  acc_z_min       float64
 19  acc_z_max       float64
 20  magnitude_mean  float64
 21  magnitude_std   float64
 22  magnitude_min   float64
 23  magnitude_max   float64
 24  corr_xz         float64
 25  corr_yz         float64
dtypes: float64(21), int64(3), object(2)
memory usage: 485.2+ MB


### User picking

In [61]:
users = df['subject_id'].unique()
rng = np.random.default_rng(SEED)
user = rng.choice(users, 1)[0]

print(f"Selected user: {user}")

results = {}

Selected user: 220962


### Train

In [62]:
user_df = df.copy()
user_df['label'] = (user_df['subject_id'] == user).astype(int)
sessions = user_df.loc[user_df['label'] == 1, 'session_number'].unique()
user_results = []

In [63]:
for test_session in sessions[0:24]: # limitado a 10 sessoes
    train_mask = user_df['session_number'] != test_session
    test_mask = user_df['session_number'] == test_session

    X_train_full = user_df.loc[train_mask, feature_cols]
    y_train_full = user_df.loc[train_mask, 'label']

    ## Cria subamostragem de impostores 
    pos_idx = y_train_full[y_train_full == 1].index
    neg_idx = y_train_full[y_train_full == 0].index
    n_pos = len(pos_idx)
    n_neg_sample = min(len(neg_idx), n_pos * 2) # 2x impostores
    neg_sample_idx = np.random.RandomState(SEED).choice(neg_idx, size=n_neg_sample, replace=False) 
    
    selected_idx = np.concatenate([pos_idx, neg_sample_idx])

    max_samples = 10000
    if len(selected_idx) > max_samples:
        selected_idx = np.random.RandomState(SEED).choice(selected_idx, size=max_samples, replace=False)
    
    X_train = X_train_full.loc[selected_idx]
    y_train = y_train_full.loc[selected_idx]

    X_test = user_df.loc[test_mask, feature_cols]
    y_test = user_df.loc[test_mask, 'label']

    mask = X_test.notna().all(axis=1)
    X_test = X_test.loc[mask]
    y_test = y_test.loc[mask]

    print(f"\nUsuário {user} - Sessão {test_session}")
    print(f" Treino original: {X_train_full.shape[0]} (pos={sum(y_train_full)}, neg={len(y_train_full)-sum(y_train_full)})")
    print(f" Treino subamostrado limitado a {max_samples} amostras: (pos={sum(y_train)}, neg={len(y_train)-sum(y_train)})")
    #print(f" Teste: {X_test.shape[0]} (pos={sum(y_test)}, neg={len(y_test)-sum(y_test)})")
    
    cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
    grid = GridSearchCV(pipe, param_grid, cv=cv_inner, scoring='roc_auc', n_jobs=-1)
    t0 = time.perf_counter()
    grid.fit(X_train, y_train)
    t1 = time.perf_counter()
    print(f" Treinamento concluído em {t1 - t0:.2f} segundos")

    y_score = grid.decision_function(X_test)
    eer, tar = compute_eer(y_test, y_score)

    user_results.append(
        {
        'sensor': SENSOR,
        'session': test_session,
        'EER': eer,
        'TAR': tar,
        'train_samples': len(X_train),
        'best_C': grid.best_params_['svc__C'],
        }
    )

results[user] = pd.DataFrame(user_results)


Usuário 220962 - Sessão 1
 Treino original: 2319392 (pos=22697, neg=2296695)
 Treino subamostrado limitado a 10000 amostras: (pos=3304, neg=6696)
 Treinamento concluído em 10.03 segundos

Usuário 220962 - Sessão 2
 Treino original: 2318962 (pos=22859, neg=2296103)
 Treino subamostrado limitado a 10000 amostras: (pos=3370, neg=6630)
 Treinamento concluído em 8.66 segundos

Usuário 220962 - Sessão 3
 Treino original: 2339723 (pos=23329, neg=2316394)
 Treino subamostrado limitado a 10000 amostras: (pos=3321, neg=6679)
 Treinamento concluído em 4.90 segundos

Usuário 220962 - Sessão 4
 Treino original: 2326287 (pos=23216, neg=2303071)
 Treino subamostrado limitado a 10000 amostras: (pos=3299, neg=6701)
 Treinamento concluído em 7.94 segundos

Usuário 220962 - Sessão 5
 Treino original: 2342364 (pos=23493, neg=2318871)
 Treino subamostrado limitado a 10000 amostras: (pos=3324, neg=6676)
 Treinamento concluído em 7.98 segundos

Usuário 220962 - Sessão 6
 Treino original: 2342986 (pos=22918,

### Results

In [75]:
print(f"Resultados para usuário {user}:")
print(results[user])
df_results = pd.DataFrame(results[user])
df_results.to_parquet(
    Path(RESULT_DIR / f"{SENSOR}_verification_results.parquet"),
    index=False,
)

Resultados para usuário 220962:
           sensor  session       EER       TAR  train_samples  best_C
0   accelerometer        1  0.028230  0.971622          10000      10
1   accelerometer        2  0.148894  0.851290          10000      10
2   accelerometer        3  0.144864  0.854953          10000       1
3   accelerometer        4  0.086703  0.913632          10000      10
4   accelerometer        5  0.028897  0.970760          10000      10
5   accelerometer        6  0.040657  0.959492          10000      10
6   accelerometer        7  0.279009  0.721145          10000      10
7   accelerometer        8  0.042376  0.957447          10000      10
8   accelerometer        9  0.343627  0.656148          10000      10
9   accelerometer       10  0.167793  0.831956          10000      10
10  accelerometer       11  0.034342  0.965517          10000      10
11  accelerometer       12  0.037234  0.962582          10000      10
12  accelerometer       13  0.027104  0.972452          10

In [79]:
summary = df_results.groupby("sensor")["EER"].agg(["mean", "std"]).sort_values("mean")


In [74]:
sel = results[user].iloc[0]
X_test = df[(df['subject_id'] == user) & (df['session_number'] == sel['session'])][feature_cols]
y_test = (df['subject_id'] == user).astype(int)
y_score = grid.predict_proba(X_test)[:, 1]

AttributeError: This 'GridSearchCV' has no attribute 'predict_proba'