In [2]:
import os

import torch
import numpy as np

from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn import functional as F

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Apply seaborn's 'whitegrid' theme to the plots drawn in this notebook.
sns.set_theme(style='whitegrid')

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"device: {device}")

device: cuda


In [4]:
# Load the spectra table; the first CSV column is used as the row index.
df = pd.read_csv('denoised.csv', index_col=0)
# Preview the first three rows.
df.head(3)

Unnamed: 0,name,0.0,0.2,0.4,0.6,0.8,1.0,1.2,1.4,1.6,...,117.2,117.4,117.6,117.8,118.0,118.2,118.4,118.6,stage,target
0,15_10_2022 (Usk DA 67) SL,0.007351,0.007263,0.007172,0.007077,0.006982,0.006884,0.006814,0.006773,0.006749,...,0.168498,0.168498,0.168498,0.168498,0.168498,0.168498,0.168498,0.168498,1,0
1,15_10_2022 (Usk DA 67) SL,0.005696,0.005715,0.005735,0.005755,0.005776,0.005798,0.005821,0.005846,0.005872,...,0.120602,0.120602,0.120602,0.120602,0.120602,0.120602,0.120602,0.120602,2,0
2,29_09_2022 (Kos PV 54) SL,0.022135,0.023749,0.025386,0.027042,0.0288,0.030638,0.032025,0.032918,0.033549,...,0.146193,0.146193,0.146193,0.146193,0.146193,0.146193,0.146193,0.146193,1,0


Данные таблицы — интенсивность сигнала некоего силового метода аналитической химии (известно, какого), значение целевого параметра и стадия (до — 1, после — 2).
Суть эксперимента: регистрируется спектр слюны опытной группы, после чего проводится некий психологический эксперимент, затем снимается спектр слюны после
эксперимента. Необходимо построить модель, которая могла бы оценить, болен ли член опытной группы болезнью X (1 — да, 0 — нет).


Что нужно от датасета ниже:
1) выдавать данные по стадиям и таргет (до, после, таргет)
2) возможность добавить аугментацию для данных (например, добавить шум)

In [ ]:
class StagedDataset(Dataset):
    """Paired before/after spectra per subject, with optional noise augmentation.

    Rows of ``df`` are grouped by the ``name`` column. Every group must hold
    exactly two rows: the stage-1 ("before") and the stage-2 ("after")
    measurement. Each item is ``[stage_1, stage_2, target]`` as float32 tensors.

    Args:
        df: Table with a ``name`` column, a ``stage`` column, the target
            column and one column per feature.
        features_cols: Labels of the feature columns. When omitted (or empty),
            every column except ``name``, ``stage``, ``target`` and the chosen
            target column is used.
        target: Name of the target column; defaults to ``'target'``.
        augment: If true, noise is added to both spectra on every access.
        noise_lvl: Scale handed to the noise function (the standard deviation
            for the default ``np.random.normal``).
    """

    def __init__(self, df: pd.DataFrame,
                 features_cols: list[str] = None,
                 target: str = None, augment: bool = True,
                 noise_lvl=2e-3):
        super().__init__()

        # Order rows by stage so that, inside every group, the stage-1 row
        # precedes the stage-2 row regardless of the row order in `df`.
        # groupby keeps the row order within each group, and the stable sort
        # keeps the incoming order among rows that share a stage.
        if 'stage' in df.columns:
            df = df.sort_values('stage', kind='stable')

        self.data = [*df.groupby('name')]
        self.augment = augment
        self.len_ = len(self.data)
        self.target = target or 'target'

        # `features_cols or ...` would raise for a pd.Index / ndarray (ambiguous
        # truth value), so test for "not provided" explicitly. An empty
        # selection falls back to the default, as it did before.
        if features_cols is None or len(features_cols) == 0:
            # Exclude bookkeeping columns and the chosen target column so the
            # label never ends up among the features.
            reserved = list(dict.fromkeys(['name', 'stage', 'target', self.target]))
            features_cols = df.columns.drop(reserved, errors='ignore')
        self.features_cols = features_cols

        # Feature labels parsed as numbers (the column names are numeric
        # strings such as '0.0', '0.2', ...).
        self.T_ = [float(x) for x in self.features_cols]
        self.noise_lvl = noise_lvl

    def apply_noise(self, x, noise_f=np.random.normal):
        """Return ``x`` plus noise drawn as ``noise_f(0, self.noise_lvl, x.shape)``.

        NOTE(review): the default draws from NumPy's global RNG. With a
        multi-worker DataLoader, NumPy seeds may be duplicated across workers
        (see the PyTorch data-loading notes); pass a ``worker_init_fn`` or a
        custom ``noise_f`` if independent noise per worker matters.
        """
        return x + noise_f(0, self.noise_lvl, x.shape)

    @staticmethod
    def prepare_data(*data):
        """Convert every positional argument to a float32 tensor; returns a list."""
        return [torch.tensor(x, dtype=torch.float32) for x in data]

    def __getitem__(self, index):
        """Return ``[stage_1, stage_2, target]`` for the ``index``-th subject.

        Raises:
            ValueError: if the subject does not have exactly two rows.
        """
        name, sample_df = self.data[index]

        spectra = sample_df[self.features_cols].values
        if spectra.shape[0] != 2:
            raise ValueError(
                f"expected exactly 2 rows (one per stage) for {name!r}, "
                f"got {spectra.shape[0]}"
            )
        s1, s2 = spectra

        # The target is taken from the first row of the group.
        target = sample_df[self.target].values[0].item()
        if self.augment:
            s1 = self.apply_noise(s1)
            s2 = self.apply_noise(s2)

        return self.prepare_data(s1, s2, target)

    def __len__(self):
        """Number of subjects (distinct ``name`` values)."""
        return self.len_

    def get_num_features(self):
        """Number of feature columns in each spectrum."""
        return len(self.features_cols)