In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import torch
print(os.listdir('../input/snu-2021-1-ds-project-3'))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Competition input directories; read_files() expects each to hold `<id>.txt` header
# files next to `<id>.npy` recordings.
training_dir = '../input/snu-2021-1-ds-project-3/train'
test_dir = '../input/snu-2021-1-ds-project-3/test'

### Helper functions

In [None]:
def extract_age(info_file):
    '''
        Extract the age from an info file (###.txt).

        Scans the file for lines starting with "#Age" and parses the text after
        the first colon as a float. If several such lines exist, the last one wins.

        Args:
            info_file: path to the header text file.

        Returns:
            The age as a float.

        Raises:
            ValueError: if the file has no "#Age" line or its value is not a
                valid float.
    '''
    age = None
    with open(info_file, 'r') as f:
        for line in f.read().split("\n"):
            if line.startswith("#Age"):
                # partition() accepts both "#Age: 53" and "#Age:53".
                age = float(line.partition(":")[2].strip())
    if age is None:
        # Fail with a descriptive error instead of an unbound local variable.
        raise ValueError(f"no '#Age' field found in {info_file}")
    return age

def extract_sex(info_file):
    '''
        Extract the sex field from an info file (###.txt).

        Scans the file for lines starting with "#Sex" and returns the stripped
        text after the first colon. If several such lines exist, the last one wins.

        Args:
            info_file: path to the header text file.

        Returns:
            The sex value as a string, exactly as written in the file.

        Raises:
            ValueError: if the file has no "#Sex" line.
    '''
    sex = None
    with open(info_file, 'r') as f:
        for line in f.read().split("\n"):
            if line.startswith("#Sex"):
                # partition() accepts both "#Sex: F" and "#Sex:F".
                sex = line.partition(":")[2].strip()
    if sex is None:
        # Fail with a descriptive error instead of an unbound local variable.
        raise ValueError(f"no '#Sex' field found in {info_file}")
    return sex

def extract_labels(info_file):
    '''
        Extract the diagnosis label(s) from an info file (###.txt).

        Scans the file for lines starting with "#Dx" and splits the text after
        the first colon on whitespace. If several such lines exist, the last one wins.

        Args:
            info_file: path to the header text file.

        Returns:
            A list of label strings (empty when the "#Dx" line has no value).

        Raises:
            ValueError: if the file has no "#Dx" line.
    '''
    labels = None
    with open(info_file, 'r') as f:
        for line in f.read().split("\n"):
            if line.startswith("#Dx"):
                # partition() accepts both "#Dx: 1 3" and "#Dx:1 3".
                labels = line.partition(":")[2].split()
    if labels is None:
        # Fail with a descriptive error instead of an unbound local variable.
        raise ValueError(f"no '#Dx' field found in {info_file}")
    return labels

def read_files(data_directory, is_training=True):
    '''
        Load every sample stored in a data directory (train or test).

        A sample is any non-hidden `<id>.txt` header file; its recording is read
        from the `<id>.npy` file with the same stem.

        Args:
            data_directory: directory containing the header and recording files.
            is_training: when True the labels are read as well.

        Returns:
            A list of (id, age, sex, recording, labels) tuples, or
            (id, age, sex, recording) tuples when is_training is False, in the
            order returned by os.listdir.
    '''
    samples = []
    for entry in os.listdir(data_directory):
        stem, ext = os.path.splitext(entry)
        if stem.startswith(".") or ext != ".txt":
            continue
        sample_id = int(stem)
        header_path = os.path.join(data_directory, stem + ".txt")
        signal_path = os.path.join(data_directory, stem + ".npy")
        age = extract_age(header_path)
        sex = extract_sex(header_path)
        with open(signal_path, 'rb') as handle:
            recording = np.load(handle)
        if is_training:
            samples.append((sample_id, age, sex, recording, extract_labels(header_path)))
        else:
            samples.append((sample_id, age, sex, recording))
    return samples

### PyTorch Custom Dataset
training sample을 batch 단위로 처리할 수 있도록 torch.utils.data.Dataset을 이용한 custom dataset을 만들어 줍니다.

In [None]:
class Dataset_ECG(torch.utils.data.Dataset):
    """
        In-memory ECG dataset built from (id, age, sex, recording, labels) tuples.
    """
    def __init__(self, dataset, num_classes=12):
        """
            Convert every sample of `dataset` into tensors and store them in
            per-field lists.

            Args:
                dataset: indexable collection of (id, age, sex, recording, labels)
                    tuples; `labels` is an iterable of class indices.
                num_classes: length of the multi-hot label vector.
        """
        self.num_samples = len(dataset)
        self.sample_id = []
        self.sample_age = []
        self.sample_sex = []
        self.sample_recording = []
        self.sample_labels = []

        for position in range(self.num_samples):
            raw_id, raw_age, raw_sex, raw_recording, raw_labels = dataset[position]

            # Multi-hot target vector (float64, like a tensor built from np.zeros).
            multi_hot = torch.zeros(num_classes, dtype=torch.float64)
            for label in raw_labels:
                multi_hot[int(label)] = 1

            self.sample_id.append(raw_id)
            # Model inputs are stored as torch tensors.
            self.sample_age.append(torch.tensor(raw_age))
            # "F" is encoded as 0, every other value as 1.
            self.sample_sex.append(torch.tensor(0 if raw_sex == "F" else 1))
            self.sample_recording.append(torch.tensor(raw_recording))
            self.sample_labels.append(multi_hot)

        print(f'Loaded {self.num_samples} samples...')

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        return dict(
            id=self.sample_id[idx],
            age=self.sample_age[idx],
            sex=self.sample_sex[idx],
            recording=self.sample_recording[idx],
            labels=self.sample_labels[idx],
        )

In [None]:
class Dataset_ECG_pytorch(torch.utils.data.Dataset):
    """
        ECG dataset that stores every field as one stacked tensor.
    """
    def __init__(self, list_id, list_age, list_sex, list_recording, list_labels_oh=None, num_classes=12):
        """
            Convert the per-sample lists into tensors.

            Args:
                list_id, list_age, list_sex, list_recording: equally long
                    sequences accepted by torch.tensor.
                list_labels_oh: optional label vectors, one per sample; when
                    omitted the dataset is unlabeled.
                num_classes: unused; kept for interface compatibility.
        """
        self.sample_id = torch.tensor(list_id)
        self.sample_age = torch.tensor(list_age)
        self.sample_sex = torch.tensor(list_sex)
        self.sample_recording = torch.tensor(list_recording)

        length = len(list_id)
        assert length==len(self.sample_id)
        assert length==len(self.sample_age)
        assert length==len(self.sample_sex)
        assert length==len(self.sample_recording)

        # Always define the flag so unlabeled datasets can be indexed too.
        self.train = list_labels_oh is not None
        if self.train:
            self.sample_labels = torch.tensor(list_labels_oh)
            assert length==len(self.sample_labels)

        self.num_samples = length

        print(f'Loaded {self.num_samples} samples...')

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        """Return the sample at `idx` as a dict; 'labels' is present only for labeled datasets."""
        result = {
            "id": self.sample_id[idx],
            "age": self.sample_age[idx],
            "sex": self.sample_sex[idx],
            "recording": self.sample_recording[idx],
        }
        # getattr keeps instances without the attribute (e.g. unlabeled objects
        # pickled before the flag was always set) usable.
        if getattr(self, "train", False):
            result['labels'] = self.sample_labels[idx]
        return result

### PyTorch CNN model
간단한 CNN model을 만들어 보겠습니다.

In [None]:
class Example_CNN_v1(torch.nn.Module):
    """
        Five Conv1d+ReLU stages followed by two fully connected layers that
        emit one logit per class.

        fc1 expects 32*64 flattened features, so the input length must make the
        last convolution output 64 time steps.
    """
    def __init__(self, num_classes=12, num_leads=2):
        super().__init__()
        nn = torch.nn
        self.num_classes = num_classes
        self.num_leads = num_leads
        # Layers are created in the same order and under the same attribute
        # names so parameter initialisation and state_dict keys are unchanged.
        self.conv1 = nn.Conv1d(in_channels=num_leads, out_channels=32, kernel_size=15, stride=3, padding=2)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=13, stride=3, padding=1)
        self.relu2 = nn.ReLU()
        self.conv3 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=10, stride=2)
        self.relu3 = nn.ReLU()
        self.conv4 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=8, stride=2)
        self.relu4 = nn.ReLU()
        self.conv5 = nn.Conv1d(in_channels=64, out_channels=32, kernel_size=7, stride=2)
        self.relu5 = nn.ReLU()
        self.fc1 = nn.Linear(32*64, 128)
        self.relu6 = nn.ReLU()
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        # Only the recording is used as input; extra features could be added later.
        conv_stages = (
            (self.conv1, self.relu1),
            (self.conv2, self.relu2),
            (self.conv3, self.relu3),
            (self.conv4, self.relu4),
            (self.conv5, self.relu5),
        )
        for conv, activation in conv_stages:
            x = activation(conv(x))
        # Flatten every dimension except the batch dimension.
        features = torch.flatten(x, 1)
        hidden = self.relu6(self.fc1(features))
        return self.fc2(hidden)

### Setup

In [None]:
import pickle

# Load the preprocessed test/train datasets that were pickled ahead of time.
# NOTE(review): pickle.load can execute arbitrary code stored in the file, so only
# load pickles that you produced yourself.
# NOTE(review): later cells read `.sample_recording` and `.sample_id`, so these are
# presumably Dataset_ECG_pytorch instances; unpickling would then require that class
# to be defined first — confirm.
base_dir = '../input/preprocessed/'
with open(base_dir+'test_data_torch.pkl', 'rb') as f:
    testing_dataset = pickle.load(f)
with open(base_dir+'train_data_torch.pkl', 'rb') as f:
    training_dataset = pickle.load(f)

In [None]:
# Display the stored training recordings (the whole collection is rendered).
training_dataset.sample_recording

In [None]:
# Run on the CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

In [None]:
# Number of training recordings.
len(training_dataset.sample_recording)

In [None]:
# Length of the first row of the first recording (presumably samples per lead — confirm).
len(training_dataset.sample_recording[0][0])

In [None]:
import matplotlib.pyplot as plt
# Use a wide figure for this and the following plots.
plt.rcParams['figure.figsize'] = (18, 5)
# Row 1 of the first training recording; `test_graph` is reused by later cells.
test_graph = training_dataset.sample_recording[0][1]
plt.plot(test_graph)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import find_peaks

# Candidate R-peaks on `test_graph`: non-negative height, at least 200 samples apart
# and a prominence of at least 0.25.
peaks, properties = find_peaks(test_graph, height = 0, width =0, distance=200, prominence = 0.25)

print(f'Index of each peak : {peaks}')
print(f'Height of each peak : {properties["peak_heights"]}')


# Overlay the detected peaks on the trace.
plt.plot(test_graph)
plt.plot(peaks, test_graph[peaks], "x")
      

# height = peaks[1]['peak_heights'] #list of the heights of the peaks
# peak_pos = x[peaks[0]] #list of the peaks positions

In [None]:
# RR intervals in samples: the distance between each pair of consecutive peaks.
# Pairing peaks[j] with peaks[j+1] keeps every interval one beat long and avoids
# negative indexing at the first peak.
rr_interval = [later - earlier for earlier, later in zip(peaks[:-1], peaks[1:])]

In [None]:
# Mean RR interval; NaN when fewer than two peaks were found, so the cell does not
# fail with a ZeroDivisionError.
mean = sum(rr_interval)/len(rr_interval) if len(rr_interval) > 0 else float("nan")
print(mean)

In [None]:
import numpy as np
!pip install PyWavelets
import pywt as pw

def QRS_detection(signal,sample_rate,max_bpm):
    """
    Locate QRS complexes in a 1-D signal with a stationary wavelet transform.

    Steps: take a detail band of a 2-level Haar SWT, zero every coefficient whose
    deviation from the mean is at most 2 standard deviations, keep the largest
    surviving coefficient in each window of `60 / max_bpm` seconds, merge
    neighbouring candidates closer than 0.2 s, and finally move each candidate to
    the largest absolute deviation of the raw signal within `sample_rate / 6`
    samples on either side.

    Args:
        signal: 1-D array of samples (sliced and passed to NumPy / pywt).
        sample_rate: samples per second.
        max_bpm: highest heart rate expected; sets the search window length.

    Returns:
        A list with one sample index per detected beat (empty if none).

    Notes:
        Samples after the last complete window are not scanned for candidates.
        NOTE(review): pywt.swt is documented to need len(signal) divisible by
        2**level — confirm the inputs satisfy this.
    """

    ## Stationary Wavelet Transform
    coeffs = pw.swt(signal, wavelet = "haar", level=2, start_level=0, axis=-1)
    # NOTE(review): pywt.swt appears to return levels from deepest to shallowest, so
    # index 1 may be the level-1 detail band rather than level 2 — confirm intent.
    d2 = coeffs[1][1]

    ## Threshold the detail coefficients
    avg = np.mean(d2)
    std = np.std(d2)
    sig_thres = [abs(i) if abs(i)>2.0*std else 0 for i in d2-avg]

    ## Find the maximum modulus in each window
    window = int((60.0/max_bpm)*sample_rate)
    sig_len = len(signal)
    n_windows = int(sig_len/window)
    modulus,qrs = [],[]

    ## Loop through windows and find max modulus
    for i in range(n_windows):
        start = i*window
        end = min([(i+1)*window,sig_len])
        mx = max(sig_thres[start:end])
        if mx>0:
            modulus.append( (start + np.argmax(sig_thres[start:end]),mx))

    ## Merge neighbouring candidates that are closer than 0.2 s
    merge_width = int((0.2)*sample_rate)
    i=0
    # Iterate over every candidate, including the last one: stopping one short
    # would drop the final beat whenever it was not merged into its predecessor.
    while i < len(modulus):
        ann = modulus[i][0]
        if i+1 < len(modulus) and modulus[i+1][0]-modulus[i][0] < merge_width:
            if modulus[i+1][1]>modulus[i][1]: # Take larger modulus
                ann = modulus[i+1][0]
            i+=1

        qrs.append(ann)
        i+=1

    ## Pin point exact qrs peak
    window_check = int(sample_rate/6)
    r_peaks = [0]*len(qrs)

    for i,loc in enumerate(qrs):
        start = max(0,loc-window_check)
        end = min(sig_len,loc+window_check)
        # Largest absolute deviation from the local mean marks the peak.
        wdw = np.absolute(signal[start:end] - np.mean(signal[start:end]))
        pk = np.argmax(wdw)
        r_peaks[i] = start+pk

    return r_peaks

In [None]:
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (18, 5)
# Switch the working trace to row 1 of training recording 507 and plot it.
test_graph = training_dataset.sample_recording[507][1]
plt.plot(test_graph)

In [None]:
# Run the wavelet-based detector on the NumPy view of the trace.
# NOTE(review): sample_rate is 600 here, while other cells convert samples to seconds
# with 0.002 s per sample — confirm which sampling rate is correct.
testing = test_graph.numpy()
peaks = QRS_detection(testing, 600, 220)

In [None]:
# Overlay the peaks returned by QRS_detection on the trace.
plt.plot(test_graph)
plt.plot(peaks, test_graph[peaks], "x")

In [None]:
def outliers_iqr(data):
    """
    Return the positions of values far outside the quartile range.

    Despite the name, the bounds are ratio based rather than IQR based: a value
    is an outlier when it is below `q1 / 2.5` or above `q3 * 2.5`, where q1 and
    q3 are the 25th and 75th percentiles.

    Args:
        data: array-like of numbers (lists are accepted).

    Returns:
        The tuple produced by np.where; element 0 holds the outlier indices.
        Empty input yields a tuple with one empty index array.
    """
    values = np.asarray(data)
    if values.size == 0:
        # Percentiles are undefined for empty input; report no outliers.
        return (np.array([], dtype=np.intp),)

    q1, q3 = np.percentile(values, [25, 75])
    lower_bound = q1/2.5
    upper_bound = q3*2.5

    return np.where((values > upper_bound) | (values < lower_bound))

In [None]:
# Check the amplitudes at the detected peaks for outliers.
data = test_graph[peaks]
outliers = outliers_iqr(data)

In [None]:
# Display the outlier positions found in the previous cell.
outliers

In [None]:
def is_peak(graph, peaks):
    """
    Filter candidate peak indices, keeping those that look like local maxima.

    1. Height check: candidate i is dropped when the amplitude difference to
       candidate i+1 exceeds 1 in magnitude.
    2. Shape check on the first difference of `graph`, looking at most 10
       difference samples away from the candidate:
         * rising at the candidate  -> needs a falling step at or after it;
         * falling at the candidate -> needs a rising step at or before it;
         * flat at the candidate    -> needs both.

    The search windows are clipped to the signal, so candidates near either end
    are judged only on samples that exist (no negative-index wrap-around and no
    IndexError). A candidate on the very last sample is treated as flat.

    Args:
        graph: 1-D signal supporting integer-array indexing.
        peaks: candidate sample indices.

    Returns:
        A list of the candidate indices that passed both checks.
    """
    search_span = 10
    peaks = np.asarray(peaks, dtype=int)

    # Height check
    height_steps = np.diff(graph[peaks], n=1, axis=-1)
    to_exclude = [i for i in range(len(height_steps))
                  if height_steps[i] > 1 or height_steps[i] < -1]
    peaks = np.delete(peaks, to_exclude)

    # Shape check (concave around the candidate)
    diff_graph = np.diff(graph, n=1, axis=-1)
    n_diff = len(diff_graph)

    def _falls_after(p):
        # Any falling step among diff indices p .. p+9, clipped to the signal.
        return any(diff_graph[k] < 0 for k in range(p, min(p + search_span, n_diff)))

    def _rises_before(p):
        # Any rising step among diff indices p-9 .. p, clipped to the signal.
        return any(diff_graph[k] > 0
                   for k in range(max(p - search_span + 1, 0), min(p + 1, n_diff)))

    validated_peaks = []
    for p in peaks:
        slope = diff_graph[p] if p < n_diff else 0
        if slope > 0:
            confirmed = _falls_after(p)
        elif slope < 0:
            confirmed = _rises_before(p)
        else:
            confirmed = _falls_after(p) and _rises_before(p)
        if confirmed:
            validated_peaks.append(p)
    return validated_peaks

                            
                    
                
    

In [None]:
def is_peak_2(graph, peaks):
    """
    Keep only the candidates that scipy's find_peaks also reports as peaks.

    Args:
        graph: 1-D signal.
        peaks: candidate sample indices.

    Returns:
        The candidates, in their original order, that are among the peaks found
        by find_peaks(graph, height=0, width=0).
    """
    local_maxima, _ = find_peaks(graph, height = 0, width =0)
    return [candidate for candidate in peaks if candidate in local_maxima]
    

In [None]:
# Keep only the detector output that find_peaks confirms as peaks.
validated_peaks = is_peak_2(test_graph, peaks)

In [None]:
# Display the confirmed peak indices.
validated_peaks

In [None]:
# Overlay the confirmed peaks on the trace.
plt.plot(test_graph)
plt.plot(validated_peaks, test_graph[validated_peaks], "x")

In [None]:
graph = training_dataset.sample_recording[0][1]
peaks, properties = find_peaks(graph, height = 0, width =0, distance=150, prominence = 0.25)
# RR intervals between consecutive peaks, scaled by 0.002 (presumably seconds per
# sample — confirm). Pairing peaks[j] with peaks[j+1] keeps every interval one beat
# long and avoids negative indexing at the first peak.
rr_interval = [(later - earlier)*0.002 for earlier, later in zip(peaks[:-1], peaks[1:])]


In [None]:
# Display the peak indices found in the previous cell.
peaks

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import find_peaks

# RR intervals of every training recording, taken from row 1 of each recording:
# differences between consecutive find_peaks hits, scaled by 0.002.
rr_intervals = []

for recording in training_dataset.sample_recording:
    graph = recording[1]
    peaks, properties = find_peaks(graph, height = 0, width =0, distance=200, prominence = 0.25)
    rr_interval = [(later - earlier)*0.002 for earlier, later in zip(peaks[:-1], peaks[1:])]
    rr_intervals.append(rr_interval)


In [None]:
# Optional step: for recordings where find_peaks produced no interval, retry with
# the wavelet-based detector and keep only the candidates that is_peak_2 confirms.
for i, intervals in enumerate(rr_intervals):
    if len(intervals) != 0:
        continue
    testing = training_dataset.sample_recording[i][1].numpy()
    candidate_peaks = QRS_detection(testing, 600, 220)
    validated_peaks = is_peak_2(testing, candidate_peaks)
    rr_interval2 = [(later - earlier)*0.002
                    for earlier, later in zip(validated_peaks[:-1], validated_peaks[1:])]
    rr_intervals[i] = rr_interval2
        
        

In [None]:
# Optional step: drop outlying RR intervals from every non-empty interval list.
for i, intervals in enumerate(rr_intervals):
    if len(intervals) == 0:
        continue
    outliers = outliers_iqr(intervals)
    rr_intervals[i] = np.delete(intervals, outliers)

In [None]:
# Heart-rate features per recording. Recordings without any RR interval get the
# placeholder value 10000 in every feature.

# hr_min: 60 divided by the longest RR interval
hr_min = [60/max(intervals) if len(intervals) != 0 else 10000
          for intervals in rr_intervals]

# hr_max: 60 divided by the shortest RR interval
hr_max = [60/min(intervals) if len(intervals) != 0 else 10000
          for intervals in rr_intervals]

# rr_interval_median: middle element of the sorted intervals
# (the upper of the two middle elements for even-length lists)
rr_interval_median = [sorted(intervals)[len(intervals)//2] if len(intervals) != 0 else 10000
                      for intervals in rr_intervals]

# hr_mean: mean of the per-interval rates 60 / interval
hr_mean = [sum(60/(val) for val in intervals)/len(intervals) if len(intervals) != 0 else 10000
           for intervals in rr_intervals]

# NOTE: replacing the placeholder with the feature mean was sketched here earlier
# (as commented-out code) but is not applied.
    

In [None]:
import numpy as np

# diff_rr_min: smallest difference between successive RR intervals of a recording.
# The placeholder 10000 is stored when there are no intervals at all, or only one
# interval (no difference can be formed); the latter case is counted.
cnt_rpeak_ones = 0
diff_rr_min = []
for intervals in rr_intervals:
    if len(intervals) == 0:
        diff_rr_min.append(10000)
        continue
    diff_rr = np.diff(intervals, n=1, axis=-1)
    if len(diff_rr) == 0:
        diff_rr_min.append(10000)
        cnt_rpeak_ones+=1
        continue
    diff_rr_min.append(min(diff_rr))

print(cnt_rpeak_ones)

# NOTE: replacing the placeholder with the mean was sketched here earlier
# (as commented-out code) but is not applied.


In [None]:
# Collect the per-recording features into one table, one row per training recording.
df = pd.DataFrame(hr_min, columns = ['hr_min'])
df['hr_max'] = hr_max
df['rr_interval_median'] = rr_interval_median
df['hr_mean'] = hr_mean
df['diff_rr_min'] = diff_rr_min
# NOTE(review): sample_id is presumably a tensor; check how pandas stores it in this column.
df['id'] = training_dataset.sample_id

In [None]:
# Display the feature table.
df

In [None]:
# Write the feature table (including its index) to the working directory.
df.to_csv('feature_extraction_train_data1.csv')

In [None]:
import numpy as np

class real_time_peak_detection():
    """
    Streaming z-score peak detector.

    Each new value is compared with the moving average of the filtered signal:
    it is flagged +1 / -1 when it deviates from the previous average by more than
    `threshold` times the previous standard deviation, otherwise 0.

    Only the statistics at index `lag - 1` are seeded from the initial array; no
    other initial sample is processed. The initial array may be shorter than
    `lag`, in which case the detector returns 0 until `lag + 1` values have been
    seen and then seeds itself.
    """
    def __init__(self, array, lag, threshold, influence):
        """
        Args:
            array: initial samples.
            lag: number of samples in the moving window.
            threshold: number of standard deviations that triggers a signal.
            influence: weight (0..1) of a flagged value in the filtered signal.
        """
        self.y = list(array)
        self.length = len(self.y)
        self.lag = lag
        self.threshold = threshold
        self.influence = influence
        self.signals = [0] * len(self.y)
        self.filteredY = np.array(self.y).tolist()
        self.avgFilter = [0] * len(self.y)
        self.stdFilter = [0] * len(self.y)
        # Seed the statistics only when a full window is available; shorter
        # inputs are seeded later by thresholding_algo's warm-up branch.
        if 1 <= self.lag <= self.length:
            self.avgFilter[self.lag - 1] = np.mean(self.y[0:self.lag]).tolist()
            self.stdFilter[self.lag - 1] = np.std(self.y[0:self.lag]).tolist()

    def thresholding_algo(self, new_value):
        """
        Append `new_value` to the signal and classify it.

        Returns:
            1 for a positive peak, -1 for a negative peak, 0 otherwise (also
            while fewer than `lag + 1` values are available).
        """
        self.y.append(new_value)
        i = len(self.y) - 1
        self.length = len(self.y)
        if i < self.lag:
            return 0
        elif i == self.lag:
            # First full window: (re)build the state and seed the statistics.
            self.signals = [0] * len(self.y)
            self.filteredY = np.array(self.y).tolist()
            self.avgFilter = [0] * len(self.y)
            self.stdFilter = [0] * len(self.y)
            self.avgFilter[self.lag - 1] = np.mean(self.y[0:self.lag]).tolist()
            self.stdFilter[self.lag - 1] = np.std(self.y[0:self.lag]).tolist()
            return 0

        self.signals += [0]
        self.filteredY += [0]
        self.avgFilter += [0]
        self.stdFilter += [0]

        if abs(self.y[i] - self.avgFilter[i - 1]) > self.threshold * self.stdFilter[i - 1]:
            if self.y[i] > self.avgFilter[i - 1]:
                self.signals[i] = 1
            else:
                self.signals[i] = -1
            # A flagged value enters the filtered signal only partially.
            self.filteredY[i] = self.influence * self.y[i] + (1 - self.influence) * self.filteredY[i - 1]
        else:
            self.signals[i] = 0
            self.filteredY[i] = self.y[i]

        # Statistics over the `lag` filtered values preceding index i.
        self.avgFilter[i] = np.mean(self.filteredY[(i - self.lag):i])
        self.stdFilter[i] = np.std(self.filteredY[(i - self.lag):i])
        return self.signals[i]

In [None]:
# Build a detector from the whole trace (lag 5, threshold 3.5, influence 0.5), then feed
# one extra value (10); `signals` is rebound from the detector object to that result.
# NOTE(review): the constructor only seeds the statistics at index lag - 1, so the
# statistics next to the appended value may never have been computed — confirm this
# usage is intended.
signals = real_time_peak_detection(test_graph,5, 3.5, 0.5)
signals = signals.thresholding_algo(10)

In [None]:
#!/usr/bin/env python
# Implementation of algorithm from https://stackoverflow.com/a/22640362/6029703
import numpy as np
import pylab

def thresholding_algo(y, lag, threshold, influence):
    """
    Batch z-score peak detection over a whole signal.

    A sample is flagged +1 / -1 when it deviates from the previous moving average
    by more than `threshold` times the previous moving standard deviation. Flagged
    samples enter the filtered signal weighted by `influence`.

    Args:
        y: 1-D sequence of numbers with at least `lag` elements.
        lag: window length for the moving mean and standard deviation.
        threshold: number of standard deviations that triggers a signal.
        influence: weight (0..1) of a flagged sample in the filtered signal.

    Returns:
        dict with NumPy arrays `signals`, `avgFilter` and `stdFilter`, each as
        long as `y`. Entries before index `lag - 1` stay 0.
    """
    signals = np.zeros(len(y))
    filteredY = np.array(y)
    # Integer input would truncate the influence-weighted values on assignment,
    # so work in floating point; floating inputs keep their dtype.
    if not np.issubdtype(filteredY.dtype, np.floating):
        filteredY = filteredY.astype(float)
    avgFilter = [0]*len(y)
    stdFilter = [0]*len(y)
    avgFilter[lag - 1] = np.mean(y[0:lag])
    stdFilter[lag - 1] = np.std(y[0:lag])
    for i in range(lag, len(y)):
        if abs(y[i] - avgFilter[i-1]) > threshold * stdFilter [i-1]:
            if y[i] > avgFilter[i-1]:
                signals[i] = 1
            else:
                signals[i] = -1
            filteredY[i] = influence * y[i] + (1 - influence) * filteredY[i-1]
        else:
            signals[i] = 0
            filteredY[i] = y[i]
        # Statistics over the window that ends at (and includes) index i.
        avgFilter[i] = np.mean(filteredY[(i-lag+1):i+1])
        stdFilter[i] = np.std(filteredY[(i-lag+1):i+1])

    return dict(signals = np.asarray(signals),
                avgFilter = np.asarray(avgFilter),
                stdFilter = np.asarray(stdFilter))

In [None]:
# Data: NumPy view of the current working trace
y = test_graph.numpy()

# Settings: lag = 30, threshold = 5, influence = 0
lag = 30
threshold = 5
influence = 0

# Run algo with settings from above
result = thresholding_algo(y, lag=lag, threshold=threshold, influence=influence)

# Plot result: top panel shows the signal, its moving average (cyan) and the
# +/- threshold band (green)
pylab.subplot(211)
pylab.plot(np.arange(1, len(y)+1), y)

pylab.plot(np.arange(1, len(y)+1),
           result["avgFilter"], color="cyan", lw=2)

pylab.plot(np.arange(1, len(y)+1),
           result["avgFilter"] + threshold * result["stdFilter"], color="green", lw=2)

pylab.plot(np.arange(1, len(y)+1),
           result["avgFilter"] - threshold * result["stdFilter"], color="green", lw=2)

# Bottom panel shows the +1 / 0 / -1 signal
pylab.subplot(212)
pylab.step(np.arange(1, len(y)+1), result["signals"], color="red", lw=2)
pylab.ylim(-1.5, 1.5)
pylab.show()

In [None]:
# Display the value returned by the streaming detector for the appended sample.
signals

In [None]:
# Indices of the 100 largest samples of the trace, in ascending order of value,
# followed by the values themselves.
topid= sorted(range(len(test_graph)),key= lambda i: test_graph[i])[-100:]
print(topid)
test_graph[topid]


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# First difference (sample-to-sample change) of the working trace.
test_diff = np.diff(test_graph, n=1, axis=-1)

In [None]:
# Plot the first difference of the trace.
plt.plot(test_diff)

In [None]:
# MIN_HR
# One [min of row 0, min of row 1] pair per training recording.
# NOTE(review): these are minimum sample values of the recording, not heart rates —
# confirm the intended feature.
min_hr = []
for i in range(len(training_dataset.sample_recording)):
    arr = []
    min_hr_1 = min(training_dataset.sample_recording[i][0])
    min_hr_2 = min(training_dataset.sample_recording[i][1])
    arr.append(min_hr_1)
    arr.append(min_hr_2)
    # Actually call append so the pair is stored.
    min_hr.append(arr)
    

In [None]:
# MAX_HR


In [None]:
# Hyperparameters used for training.
EPOCHS = 15            # passes over the training loader
BATCH_SIZE = 32        # samples per batch
LEARNING_RATE = 1e-3   # learning rate handed to the optimizer

In [None]:
# Wrap the training dataset in a DataLoader so it can be read batch by batch.
# NOTE(review): no shuffle argument is passed, so batches come in dataset order every
# epoch — confirm this is intended for training.
training_loader = torch.utils.data.DataLoader(training_dataset, pin_memory=True, batch_size=BATCH_SIZE)

### Training

In [None]:
# Build the CNN on the selected device and switch it to training mode.
model = Example_CNN_v1(num_classes=12, num_leads=2).to(device)
model.train()

# Multi-label classification: binary cross-entropy on the raw logits.
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.RMSprop(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Training loop
# Number of samples seen per epoch; used to turn the summed, size-weighted batch
# losses into a per-sample mean. Derived from the loader so the cell does not rely
# on a variable defined elsewhere.
num_training = len(training_loader.dataset)

for epoch in range(1, EPOCHS+1):
    print(f'***** Epoch {epoch} *****')
    epoch_training_loss_sum = 0.0
    for i_batch, sample_batched in enumerate(training_loader):
        # Cast explicitly to float32 to match the default dtype of the model
        # parameters and the float targets the loss expects; this is a no-op
        # when the stored tensors are already float32.
        b_recording = sample_batched["recording"].to(device).float()
        b_labels = sample_batched["labels"].to(device).float()
        optimizer.zero_grad()
        b_out = model(b_recording)
        loss = criterion(b_out, b_labels)
        loss.backward()
        optimizer.step()
        # The criterion returns the batch mean; weight it by the batch size.
        epoch_training_loss_sum += loss.item() * b_labels.shape[0]

    epoch_training_loss = epoch_training_loss_sum / num_training
    print(f'training loss of epoch {epoch}: {epoch_training_loss}\n')

### Evaluation
Evaluate on the validation set.

In [None]:
# Predict label sets for every validation sample and collect predictions and ground
# truth as space-separated label strings.
# NOTE(review): `validation_set` is not defined in any visible cell; it appears to be
# a list of (id, age, sex, recording, labels) tuples held out from training — define
# it before running this cell.
model.eval()

validation_prediction_df = pd.DataFrame(columns=['labels'])
validation_prediction_df.index.name = 'id'
validation_true_labels_df = pd.DataFrame(columns=['labels'])
validation_true_labels_df.index.name = 'id'

with torch.no_grad():
    for idx in range(len(validation_set)):
        validation_sample = validation_set[idx]
        _, _, _, recording, labels = validation_sample
        out = model(torch.tensor(recording).unsqueeze(0).to(device)) # unsqueeze adds the batch dimension
        sample_prediction = torch.sigmoid(out).squeeze() > 0.5 # Use 0.5 as a threshold / squeeze removes the batch dimension
        indices_of_1s = np.where(sample_prediction.cpu())[0]
        str_indices_of_1s = ' '.join(map(str, indices_of_1s))
        # Rows are keyed by position in validation_set, not by the sample's own id.
        validation_prediction_df.loc[idx] = [str_indices_of_1s]
        
        str_true_labels = ' '.join(labels)
        validation_true_labels_df.loc[idx] = [str_true_labels]

In [None]:
# First ten predicted label strings.
print(validation_prediction_df[:10])

In [None]:
# First ten ground-truth label strings.
print(validation_true_labels_df[:10])

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

# Multi-hot encode the true and predicted label strings over the 12 classes and
# score them with the macro-averaged F1.
class_names = [str(class_index) for class_index in range(12)]
mlb = MultiLabelBinarizer(classes=class_names)

true_label_sets = list(map(str.split, validation_true_labels_df['labels'].values))
predicted_label_sets = list(map(str.split, validation_prediction_df['labels'].values))
mlb.fit(true_label_sets)

y_true = mlb.transform(true_label_sets)
y_pred = mlb.transform(predicted_label_sets)
macro_f1_validation = f1_score(y_true, y_pred, average='macro')
print(f'macro f1 score on validation set: {macro_f1_validation}')

### Test Prediction
학습된 모델로 test_set에 대한 prediction을 진행합니다.

In [None]:
# Read the unlabeled test samples and order them by sample id.
test_set = read_files(test_dir, is_training=False)
test_set.sort(key=lambda sample: sample[0])
num_test = len(test_set)
print(f'Number of test samples: {num_test}')

In [None]:
# Predict the label set of every test sample; each prediction is stored as a
# space-separated string of the class indices whose probability exceeds 0.5.
model.eval()

test_predictions = []
with torch.no_grad():
    for test_sample in test_set:
        _, _, _, recording = test_sample
        # float32 matches the default dtype of the model parameters; unsqueeze adds
        # the batch dimension.
        batch = torch.tensor(recording, dtype=torch.float32).unsqueeze(0).to(device)
        out = model(batch)
        # Use 0.5 as the threshold; squeeze(0) removes only the batch dimension.
        sample_prediction = torch.sigmoid(out).squeeze(0) > 0.5
        indices_of_1s = np.where(sample_prediction.cpu().numpy())[0]
        test_predictions.append(' '.join(map(str, indices_of_1s)))

# Build the frame once instead of growing it row by row.
# NOTE(review): rows are keyed by position in the id-sorted test set, not by the
# sample's own id — confirm the test ids are 0..N-1.
test_prediction_df = pd.DataFrame(
    {'labels': test_predictions},
    index=pd.RangeIndex(len(test_predictions), name='id'),
)

In [None]:
# First ten test predictions.
test_prediction_df[:10]

In [None]:
# Write the submission file (index column `id`, column `labels`) to the working directory.
test_prediction_df.to_csv('my_submission.csv')