# 1. Sleep well (35 points)

In [56]:
from pathlib import Path

import pandas as pd

# Directory holding the Sleep-EDF-15 U-Time CSV splits.
# Built with pathlib so the path separator is correct on every OS; the previous
# 'data\Sleep-...' literals relied on a Windows backslash and on '\S' not being
# a recognised string escape sequence.
DATA_DIR = Path("data") / "Sleep-EDF-15_U-Time"

# Load the data. The files are read with header=None, so the first row is
# treated as data and the columns get integer labels.
X_train = pd.read_csv(DATA_DIR / "X_train.csv", header=None)
y_train = pd.read_csv(DATA_DIR / "y_train.csv", header=None)
X_test = pd.read_csv(DATA_DIR / "X_test.csv", header=None)
y_test = pd.read_csv(DATA_DIR / "y_test.csv", header=None)

## 1.1 Data understanding and preprocessing

In [57]:
# Calculate class frequencies for the training data, normalized to [0, 1].
# y_train is loaded as a one-column DataFrame; DataFrame.value_counts would key
# the result by 1-tuples such as (0.0,), so select the label column as a Series
# first to get plain labels in the printout.
class_frequencies = y_train.iloc[:, 0].value_counts(normalize=True)

# Print each class frequency for training data (most frequent class first)
print("Class frequencies for training data:")
for label, freq in class_frequencies.items():
    print(f"Class {label}: {freq*100:.2f}%")

Class frequencies for training data:
Class (0.0,): 52.09%
Class (2.0,): 25.27%
Class (1.0,): 9.55%
Class (4.0,): 8.39%
Class (3.0,): 4.69%


## 1.2 Classification

In [58]:
from sklearn.metrics import confusion_matrix, zero_one_loss, accuracy_score, precision_score, recall_score, f1_score

def evaluate_classifier(y_true_train, y_pred_train, y_true_test, y_pred_test, classifier_name):
    """Print a train/test evaluation report for one classifier.

    The report contains, in order: confusion matrix, zero-one loss, accuracy,
    and macro-averaged precision, recall and F1 score. Each metric is printed
    first for the training split and then for the test split.

    Parameters
    ----------
    y_true_train, y_pred_train : true and predicted labels of the training split.
    y_true_test, y_pred_test : true and predicted labels of the test split.
    classifier_name : str
        Name shown in the report header.

    Returns
    -------
    None. The report is written to stdout only.
    """
    train_split = (y_true_train, y_pred_train)
    test_split = (y_true_test, y_pred_test)

    def macro_averaged(metric):
        # Wrap a per-class metric so it is called with average='macro'.
        def scorer(y_true, y_pred):
            return metric(y_true, y_pred, average='macro')
        return scorer

    print(f"Evaluation Metrics for {classifier_name}:")

    # The confusion matrix is a 2-D array, so it gets its own multi-line layout.
    print("\nConfusion Matrix:")
    print(f"Training:\n {confusion_matrix(*train_split)}")
    print(f"Test:\n {confusion_matrix(*test_split)}")

    # Scalar metrics share one layout: heading, then train and test values
    # rounded to three decimals. Headings are kept exactly as printed before.
    scalar_metrics = (
        ("Zero-One Loss:", zero_one_loss),
        ("Accuracy", accuracy_score),
        ("Precision:", macro_averaged(precision_score)),
        ("Recall:", macro_averaged(recall_score)),
        ("F1 Score:", macro_averaged(f1_score)),
    )
    for heading, score in scalar_metrics:
        print(f"\n{heading}")
        print(f"Training: {score(*train_split):.3f}")
        print(f"Test: {score(*test_split):.3f}")

### Multinomial Logistic Regression

In [59]:
from sklearn.linear_model import LogisticRegression

# Fit the logistic regression classifier on the training split.
# penalty is left at the scikit-learn default (L2); max_iter is raised to 10000.
# fit() returns the estimator itself, so construction and training are chained.
logistic_model = LogisticRegression(max_iter=10000).fit(
    X_train,
    y_train.values.ravel(),  # flatten the one-column label frame to a 1-D array
)

# Predict on both splits (training first, then test), then print the report
train_pred, test_pred = (
    logistic_model.predict(features) for features in (X_train, X_test)
)

evaluate_classifier(
    y_true_train=y_train,
    y_pred_train=train_pred,
    y_true_test=y_test,
    y_pred_test=test_pred,
    classifier_name="Logistic Regression",
)

Evaluation Metrics for Logistic Regression:

Confusion Matrix:
Training:
 [[16993   404    37    12   120]
 [  532  1322  1012     5   350]
 [   62   446  7226   314   475]
 [   10     7   662   896     8]
 [   64   180   342     0  2245]]
Test:
 [[7437  124    9    3    6]
 [  78  412  182    1  102]
 [  15  188 3376   83  179]
 [   0    0  135  260    0]
 [  53  161   59    0 1038]]

Zero-One Loss:
Training: 0.150
Test: 0.099

Accuracy
Training: 0.850
Test: 0.901

Precision:
Training: 0.747
Test: 0.775

Recall:
Training: 0.717
Test: 0.768

F1 Score:
Training: 0.727
Test: 0.771


### Random Forests

In [60]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed for the forests: bootstrap sampling and per-split feature
# sub-sampling are random, so without it every run reports different numbers.
RF_RANDOM_STATE = 42

for n_trees in [50, 100, 200]:
    # Build and train the model
    rf_model = RandomForestClassifier(n_estimators=n_trees, random_state=RF_RANDOM_STATE)
    rf_model.fit(X_train, y_train.values.ravel())

    # Predict on both splits
    train_pred = rf_model.predict(X_train)
    test_pred = rf_model.predict(X_test)

    # Evaluate the model
    evaluate_classifier(y_train, train_pred, y_test, test_pred, f"Random Forest (n_estimators={n_trees})")

Evaluation Metrics for Random Forest (n_estimators=50):

Confusion Matrix:
Training:
 [[17566     0     0     0     0]
 [    6  3214     1     0     0]
 [    0     0  8522     0     1]
 [    0     0     1  1582     0]
 [    0     0     0     0  2831]]
Test:
 [[7448  109   13    2    7]
 [ 110  395  174    1   95]
 [  24  228 3346   78  165]
 [   0    0  177  218    0]
 [ 125  161   95    0  930]]

Zero-One Loss:
Training: 0.000
Test: 0.113

Accuracy
Training: 1.000
Test: 0.887

Precision:
Training: 1.000
Test: 0.759

Recall:
Training: 0.999
Test: 0.725

F1 Score:
Training: 1.000
Test: 0.739


### k-Nearest-Neighbor Classification

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {'n_neighbors': list(range(1, 31))}  # Considering 1 to 30 neighbors

# Use GridSearchCV to find the best number of neighbors.
# 30 candidates x 5 folds = 150 fits; n_jobs=-1 spreads them over all CPU cores
# (the single-process search was slow enough to be interrupted). Parallelism
# does not change which candidate is selected.
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=5, n_jobs=-1)  # 5-fold cross-validation
grid_search.fit(X_train, y_train.values.ravel())

# Report the selected hyperparameter so the result can be interpreted
print(f"Best number of neighbors: {grid_search.best_params_['n_neighbors']}")

# best_estimator_ has already been refit on the full training set
# (GridSearchCV default refit=True), so it can be used for prediction directly.
best_knn = grid_search.best_estimator_
train_pred = best_knn.predict(X_train)
test_pred = best_knn.predict(X_test)

# Evaluate the model
evaluate_classifier(y_train, train_pred, y_test, test_pred, "K-Nearest Neighbors")

KeyboardInterrupt: 

# Invariance and normalization (30 points)