In [None]:
import pandas as pd
import numpy as np
import sklearn as skl
import matplotlib.pyplot as plt

In [None]:
# Seed NumPy's global random generator so NumPy-driven randomness in this
# notebook is repeatable from one run to the next.
np.random.seed(42)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Number of fit/evaluate repetitions averaged for each model at each test size.
EPOCHS: int = 10
# Spacing between consecutive test-set fractions in the evaluated grid.
STEP_SIZE: float = 0.1

In [None]:
# Load, shuffle and clean the dataset in a single idempotent chain.
# NOTE(review): the CSV path is empty in this notebook and must be set to the
# dataset location before this cell can run.
train_df = (
    pd.read_csv('')
    # Shuffle all rows; with no random_state given, the order is drawn from
    # NumPy's global random state.
    .sample(frac=1)
    # Fill missing values with per-column medians. numeric_only=True keeps the
    # median computable when the frame also holds non-numeric columns
    # (pandas >= 2.0 raises TypeError otherwise); such columns are left as-is.
    # NOTE(review): these medians are computed on the full dataset, i.e.
    # before any train/test split is made downstream.
    .pipe(lambda frame: frame.fillna(frame.median(numeric_only=True)))
    .drop_duplicates()
)

In [None]:
# Candidate classifiers, keyed by the display name used in logs and the plot
# legend. Stochastic estimators are pinned to random_state=42.
models = {
    'AdaBoost': AdaBoostClassifier(algorithm='SAMME', random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Support Vector Classifier': SVC(probability=True, random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
}

# Soft-voting ensemble over its own fresh forest and SVC instances (not the
# ones registered above). Soft voting averages predicted class probabilities,
# hence probability=True on the SVC member.
models['Voting Classifier'] = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(random_state=42)),
        ('svc', SVC(probability=True, random_state=42)),
    ],
    voting='soft',
)

In [None]:
# Evaluate every model over a grid of test-set fractions, averaging accuracy
# across EPOCHS differently-seeded train/test splits per fraction.

# Model inputs and target. The author's note listed these candidate columns:
# 'eye_category', 'eye_position', 'gaze_direction', 'heart_rates',
# 'p2p_intervals', 'sys_peaks', 'dys_peaks' -- only the last four are used.
feature_columns = ['heart_rates', 'p2p_intervals', 'sys_peaks', 'dys_peaks']
target_column = 'engagement_labels'

# train_test_split only accepts float test sizes strictly inside (0, 1); a
# grid that reaches 1.0 raises ValueError on its final step. Rounding strips
# float drift from np.arange so the bounds filter is reliable.
test_sizes = np.round(np.arange(0.1, 1.0, STEP_SIZE), 10)
test_sizes = test_sizes[(test_sizes > 0.0) & (test_sizes < 1.0)]

# model name -> mean accuracy per test size, aligned with test_sizes.
model_accuracies = {model_name: [] for model_name in models}

for test_size in test_sizes:
    # Build the splits once per test size and share them across all models:
    # the split and imputation do not depend on the model being fitted.
    epoch_splits = []
    for epoch in range(EPOCHS):
        # Vary the seed per epoch; a fixed seed would make every epoch an
        # identical split and the averaged accuracy a single repeated value.
        # Epoch 0 keeps seed 42.
        train_set, test_set = train_test_split(
            train_df, test_size=test_size, random_state=42 + epoch
        )

        # Fit the imputer on the training portion only, then apply to both.
        imputer = SimpleImputer(strategy='median')
        X_train = imputer.fit_transform(train_set[feature_columns])
        X_test = imputer.transform(test_set[feature_columns])

        epoch_splits.append(
            (X_train, X_test, train_set[target_column], test_set[target_column])
        )

    # Per-epoch scores are only logged for the 20% test split; compare with a
    # tolerance because grid values are floats.
    report_epochs = bool(np.isclose(test_size, 0.2))

    for model_name, model in models.items():
        epoch_accuracies = []

        for epoch, (X_train, X_test, y_train, y_test) in enumerate(epoch_splits):
            model.fit(X_train, y_train)
            accuracy = accuracy_score(y_test, model.predict(X_test))
            epoch_accuracies.append(accuracy)

            if report_epochs:
                print(f'{model_name} Epoch {epoch + 1}: {accuracy:.4f}')

        model_accuracies[model_name].append(float(np.mean(epoch_accuracies)))

# Draw one accuracy curve per model against the test-set fraction.
fig, ax = plt.subplots(figsize=(14, 10))

for curve_label, curve_values in model_accuracies.items():
    ax.plot(test_sizes, curve_values, label=curve_label)

ax.set_xlabel('Test Size')
ax.set_ylabel('Accuracy')
ax.legend()
ax.set_title('Model Accuracy vs Test Size')

plt.show()