<a href="https://colab.research.google.com/github/kotosham/sleep-stages/blob/main/sleep_phase_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

class SleepPhaseDataset(Dataset):
    def __init__(self, data, labels, k):
        self.data = data
        self.labels = labels
        self.k = k

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
      cumulative_length = 0
      for i, data in enumerate(self.data):
          cumulative_length += len(data)
          if idx < cumulative_length:
              data_idx = idx - (cumulative_length - len(data))

              # Проверка длины данных для извлечения фрагмента
              if len(data) < self.k:
                raise ValueError(f"Data length {len(data)} is less than k {self.k}")

              start_idx = random.randint(0, len(data) - self.k)  # Случайный стартовый индекс
              end_idx = start_idx + self.k

              fragment = data[start_idx:end_idx]  # Извлекаем фрагмент
              label = self.labels[i][data_idx]     # Получаем метку

              return torch.tensor(fragment, dtype=torch.float32), torch.tensor(label, dtype=torch.int64)

# Файлы с данными по классам
file_paths = [
    ['https://raw.githubusercontent.com/kotosham/sleep-stages/refs/heads/main/sleep_phase_0-1.csv',
     'https://raw.githubusercontent.com/kotosham/sleep-stages/refs/heads/main/sleep_phase_0-2.csv'],
    ['https://raw.githubusercontent.com/kotosham/sleep-stages/refs/heads/main/sleep_phase_1.csv'],
    ['https://raw.githubusercontent.com/kotosham/sleep-stages/refs/heads/main/sleep_phase_2.csv'],
    ['https://raw.githubusercontent.com/kotosham/sleep-stages/refs/heads/main/sleep_phase_3.csv']
]

In [2]:
# Инициализация списков для хранения данных и меток
all_data = []
all_labels = []

# Загрузка данных из файлов
for label, file_path in enumerate(file_paths):
    combined_df = pd.concat([pd.read_csv(part) for part in file_path], ignore_index=True)
    all_data.append(combined_df.iloc[:, :2].values)  # Первые две колонки (данные)
    all_labels.append(combined_df.iloc[:, -1].values)  # Последняя колонка (метки)

# Разделение данных на обучающие, валидационные и тестовые наборы
train_data = []
val_data = []
test_data = []
train_labels = []
val_labels = []
test_labels = []

for data, labels in zip(all_data, all_labels):
    # Разделяем данные на тренировочные и тестовые (80% - 20%)
    X_train_val, X_test, y_train_val, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

    # Теперь разделяем тренировочные на тренировочные и валидационные (80% - 20%)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 0.25 от 0.8 дает 0.2 от общего

    # Добавляем данные в общий список
    train_data.append(X_train)
    val_data.append(X_val)
    test_data.append(X_test)

    train_labels.append(y_train)
    val_labels.append(y_val)
    test_labels.append(y_test)

# Объединяем все данные и метки в один массив для каждого набора
train_data_combined = np.vstack(train_data)
val_data_combined = np.vstack(val_data)
test_data_combined = np.vstack(test_data)

train_labels_combined = np.concatenate(train_labels)
val_labels_combined = np.concatenate(val_labels)
test_labels_combined = np.concatenate(test_labels)

# Создание DataLoader для каждого набора
k = 50
train_dataset = SleepPhaseDataset(train_data_combined, train_labels_combined,k)
val_dataset = SleepPhaseDataset(val_data_combined, val_labels_combined,k)
test_dataset = SleepPhaseDataset(test_data_combined, test_labels_combined,k)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

# Проверка уникальных меток в каждом наборе
print("Unique labels in training set:", set(train_labels_combined))
print("Unique labels in validation set:", set(val_labels_combined))
print("Unique labels in test set:", set(test_labels_combined))

Unique labels in training set: {0, 1, 2, 3}
Unique labels in validation set: {0, 1, 2, 3}
Unique labels in test set: {0, 1, 2, 3}


In [9]:
print(len(train_loader))
print(len(val_loader))
print(len(test_loader))

39744
13248
13248


In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Извлечение данных из DataLoader для обучения дерева решений
train_features = []
train_targets = []
for batch_data, batch_labels in train_loader:
    train_features.append(batch_data.numpy())
    train_targets.append(batch_labels.numpy())

# Объединяем все фрагменты в один массив
train_features = np.concatenate(train_features)
train_targets = np.concatenate(train_targets)

# Аналогично для валидационного и тестового наборов
val_features = []
val_targets = []
for batch_data, batch_labels in val_loader:
    val_features.append(batch_data.numpy())
    val_targets.append(batch_labels.numpy())

val_features = np.concatenate(val_features)
val_targets = np.concatenate(val_targets)

test_features = []
test_targets = []
for batch_data, batch_labels in test_loader:
    test_features.append(batch_data.numpy())
    test_targets.append(batch_labels.numpy())

test_features = np.concatenate(test_features)
test_targets = np.concatenate(test_targets)

# Создание и обучение дерева решений
clf = DecisionTreeClassifier(random_state=42)
clf.fit(train_features, train_targets)

# Прогнозирование на валидационном наборе
val_predictions = clf.predict(val_features)
val_accuracy = accuracy_score(val_targets, val_predictions)
print(f'Validation Accuracy: {val_accuracy * 100:.2f}%')

ValueError: Data length 2 is less than k 50

In [None]:
# Прогнозирование на тестовом наборе
test_predictions = clf.predict(test_features)
test_accuracy = accuracy_score(test_targets, test_predictions)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')