In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import neurokit2 as nk
import matplotlib.pyplot as plt

# Display / plotting configuration
pd.options.display.max_rows = 200
pd.set_option('display.float_format', '{:.2f}'.format)
plt.rcParams.update({"figure.figsize": (20, 6)})
plt.style.use('ggplot')  # applied after the figsize override, same order as before
pd.options.display.max_columns = None

# Load the combined feature CSV and convert its 'datetime' column to datetimes
df = pd.read_csv('output/combined_feature_engineered_tnt_only.csv')
df['datetime'] = pd.to_datetime(df['datetime'])

In [25]:
# Render the preview explicitly: a notebook only displays a cell's *last*
# expression, so a bare `df.head()` followed by another expression is discarded.
display(df.head())

# Number of distinct participants in the dataset
df['participant'].nunique()

14

In [3]:
# Remove the columns that are not used in the rest of this notebook.
# `df` is rebound (no inplace=True) and errors='ignore' is passed so the cell
# can be re-run: a second execution no longer raises KeyError for columns
# that were already removed by the first one.
df = df.drop(
    columns=['datetime', 'unix_time', 'source', 'response', 'intrusion',
             'intrusion_nothink', 'trialcode', 'session_id'],
    errors='ignore',
)
df.columns

Index(['acc_x', 'acc_y', 'acc_z', 'temp', 'eda', 'bvp', 'hr', 'intrusion_tnt',
       'participant', 'eda_tonic', 'eda_phasic', 'eda_scr_onsets',
       'eda_scr_peaks', 'eda_scr_height', 'eda_scr_amplitude',
       'eda_scr_risetime', 'eda_scr_recovery', 'eda_mean', 'eda_std',
       'eda_min', 'eda_max', 'eda_skew', 'eda_kurt', 'eda_tonic_mean',
       'eda_phasic_mean', 'eda_scr_onsets_mean', 'eda_scr_peaks_mean',
       'eda_scr_height_mean', 'eda_scr_amplitude_mean',
       'eda_scr_risetime_mean', 'eda_scr_recovery_mean', 'acc_x_mean',
       'acc_y_mean', 'acc_z_mean', 'acc_x_std', 'acc_y_std', 'acc_z_std',
       'acc_x_min', 'acc_y_min', 'acc_z_min', 'acc_x_max', 'acc_y_max',
       'acc_z_max', 'acc_x_skew', 'acc_y_skew', 'acc_z_skew', 'acc_x_kurt',
       'acc_y_kurt', 'acc_z_kurt', 'temp_mean', 'temp_std', 'temp_min',
       'temp_max', 'temp_skew', 'temp_kurt', 'hr_mean', 'hr_std', 'hr_min',
       'hr_max', 'hr_skew', 'hr_kurt'],
      dtype='object')

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# # 'intrusion_tnt' is the column with class values
# for col in df.columns:
#     if col != 'intrusion_tnt' and col != 'participant':
#         plt.figure(figsize=(10, 4))
#         sns.boxplot(x='intrusion_tnt', y=col, data=df)
#         plt.title(f'Box plot of {col} by class')
#         plt.show()

In [4]:
from sklearn.model_selection import GroupShuffleSplit

# Target vector and feature frame (every column except the target).
# NOTE(review): only 'intrusion_tnt' is removed here, so 'participant' stays in X
# and is passed on as a feature — confirm this is intended.
y = df['intrusion_tnt']
X = df.drop(columns=['intrusion_tnt'])

# Single 80/20 split that keeps all rows of a participant on the same side
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
split_iter = gss.split(X, y, groups=df['participant'])
train_idx, test_idx = next(split_iter)

# Positional selection of the train / test partitions
X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]

In [5]:
# Report row counts of the feature and label partitions
n_train_rows, n_test_rows = len(X_train), len(X_test)
n_train_labels, n_test_labels = len(y_train), len(y_test)
print(f"Length of training set: {n_train_rows} Length of test set: {n_test_rows}")
print(f"Length of training labels: {n_train_labels} Length of test labels: {n_test_labels}")

Length of training set: 1966 Length of test set: 602
Length of training labels: 1966 Length of test labels: 602


In [6]:
# X_train.index / X_test.index contain index *labels*, so they are looked up
# with .loc. The previous .iloc lookup treated them as row positions, which
# only gives the same rows while df carries a default 0..n-1 RangeIndex.
print('Number of unique participants in training set:', df.loc[X_train.index, 'participant'].nunique())
print('Number of unique participants in test set:', df.loc[X_test.index, 'participant'].nunique())

Number of unique participants in training set: 11
Number of unique participants in test set: 3


### Imputation (KNN)

In [7]:
from sklearn.impute import KNNImputer

# Build a 5-neighbour KNN imputer and fit it on the training partition only
# (fit() returns the fitted imputer itself)
knn_imputer = KNNImputer(n_neighbors=5).fit(X_train)

# Apply the training-fitted imputer to both partitions
X_train_imputed, X_test_imputed = (
    knn_imputer.transform(X_train),
    knn_imputer.transform(X_test),
)

In [9]:
# Count of NaNs remaining in each imputed partition (train, test)
tuple(np.isnan(imputed).sum() for imputed in (X_train_imputed, X_test_imputed))

(0, 0)

## To copy to next notebook

In [None]:
from sklearn.model_selection import GroupShuffleSplit
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import neurokit2 as nk
import matplotlib.pyplot as plt

# scikit-learn pieces used below, imported together instead of mid-cell
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# Global settings
pd.set_option('display.max_rows', 200)
pd.options.display.float_format = '{:.2f}'.format
plt.rcParams["figure.figsize"] = (20, 6)
plt.style.use('ggplot') # nicer plots
pd.set_option('display.max_columns', None)

# Data loading
df = pd.read_csv('output/combined_feature_engineered_tnt_only.csv')

# Remove the columns that are not used for modelling.
# 'datetime' is among them, so it is no longer parsed with pd.to_datetime first:
# the parsed values were discarded immediately by this drop.
df = df.drop(
    columns=['datetime', 'unix_time', 'source', 'response', 'intrusion',
             'intrusion_nothink', 'trialcode', 'session_id'],
)

X = df.drop('intrusion_tnt', axis=1)  # Features: All columns except 'intrusion_tnt'
y = df['intrusion_tnt']  # Labels: 'intrusion_tnt' column
# NOTE(review): 'participant' is still a column of X, so it is imputed and
# scaled as a feature below — confirm this is intended.

# Single 80/20 split grouped by participant, so that all rows of a participant
# end up on the same side of the split
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

train_idx, test_idx = next(gss.split(X, y, groups=df['participant']))

# Create the training and test sets
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Impute missing values: the imputer is fitted on the training data only and
# then applied to the test data. From here on X_train / X_test are NumPy arrays.
knn_imputer = KNNImputer(n_neighbors=5)
X_train = knn_imputer.fit_transform(X_train)
X_test = knn_imputer.transform(X_test)

# Normalize the data: the scaler is likewise fitted on the training data only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
