In [1]:
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import os
import glob
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the CSV file that stores the reference labels.

# Most recent revision of the label data (REFERENCE-v3).
path_test = './physionet.org/files/challenge-2017/1.0.0/validation/REFERENCE-v3.csv'
# The file has no header row, so the two column names are assigned after reading.
df_label = pd.read_csv(path_test, header=None).set_axis(['filename', 'label'], axis='columns')

In [3]:
# Index the labels by filename and keep only the label column (as a Series).
# NOTE(review): 2855 caps the number of rows kept; the class counts printed
# further down total 300, so the cap does not seem to bind here - confirm intent.
df_label = df_label.set_index('filename').iloc[:2855, 0]
display(df_label.head())

filename
A00001    N
A00002    N
A00003    N
A00004    A
A00005    A
Name: label, dtype: object

In [4]:
# Convert the label symbols to numeric codes.
# numpy is used so the value assigned to each class stays explicit:
#   'N' normal ECG           -> 1
#   'A' atrial fibrillation  -> 2
#   'O' other arrhythmia     -> 3
#   '~' noisy recording      -> 4
# Each array holds its class code where the symbol matches and 0 elsewhere,
# so the element-wise sum yields one code per record.
symbols = df_label.to_numpy()
normal_ecg = np.where(symbols == 'N', 1, 0)
af_ecg = np.where(symbols == 'A', 2, 0)
other = np.where(symbols == 'O', 3, 0)
noise = np.where(symbols == '~', 4, 0)
labels = np.sum([normal_ecg, af_ecg, other, noise], axis=0)
display(labels[:22])

array([1, 1, 1, 2, 2, 1, 1, 3, 2, 1, 1, 1, 3, 1, 2, 1, 3, 1, 1, 3, 1, 4])

In [5]:
# Wrap the numeric codes in a DataFrame that reuses df_label's index.
df_labels = pd.DataFrame({'label_num': labels}, index=df_label.index)
display(df_labels.head())

Unnamed: 0_level_0,label_num
filename,Unnamed: 1_level_1
A00001,1
A00002,1
A00003,1
A00004,2
A00005,2


In [6]:
# Shift the codes down by one so they start at zero:
#   normal ECG          -> 0
#   atrial fibrillation -> 1
#   other arrhythmia    -> 2
#   noisy recording     -> 3
df_labels = df_labels.sub(1)
# Put the symbol column and the numeric column side by side.
df_label = pd.concat([df_label, df_labels], axis='columns')
display(df_label.head())

Unnamed: 0_level_0,label,label_num
filename,Unnamed: 1_level_1,Unnamed: 2_level_1
A00001,N,0
A00002,N,0
A00003,N,0
A00004,A,1
A00005,A,1


In [7]:
def count_arrhythmia(df):
    """
    Print how many records fall into each rhythm class.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'label_num' column coded as 0 = normal ECG,
        1 = atrial fibrillation, 2 = other arrhythmia, 3 = noisy recording.

    Returns
    -------
    None
        The counts are only written to stdout.

    Notes
    -----
    Values outside 0-3 (for example NaN or -1) are no longer counted as
    noise. They are reported on one extra line, which is printed only when
    such values exist, so the output for fully labelled data is unchanged.
    """
    codes = pd.Series(df['label_num'])

    # Vectorised equality counts. NaN never compares equal to a number, so
    # rows without a label cannot leak into any class.
    N = int((codes == 0).sum())
    A = int((codes == 1).sum())
    O = int((codes == 2).sum())
    noise = int((codes == 3).sum())
    # Whatever is left over is neither a known class nor noise.
    unknown = len(codes) - (N + A + O + noise)

    print(f'正常心電図 : {N}個')
    print(f'心房細動 : {A}個')
    print(f'その他の不整脈 : {O}個')
    print(f'雑音 : {noise}個')
    if unknown:
        print(f'不明 : {unknown}個')
    
# Print the per-class record counts for the label table.
count_arrhythmia(df_label)

正常心電図 : 148個
心房細動 : 47個
その他の不整脈 : 65個
雑音 : 40個


In [8]:
# Load the feature data and attach the labels.

path_test = './test_DataFrame'
path_test = glob.glob(path_test + '/*pkl')

# This notebook later saves 'labeled_test_dataset.pkl' into the same folder.
# It must be excluded here; otherwise a second run finds two pickles and
# joining the matches into one string produces a path that does not exist.
feature_pickles = sorted(
    p for p in path_test
    if os.path.basename(p) != 'labeled_test_dataset.pkl'
)
if not feature_pickles:
    raise FileNotFoundError("No feature pickle found in './test_DataFrame'")
if len(feature_pickles) > 1:
    raise ValueError(
        f"Expected exactly one feature pickle in './test_DataFrame', found: {feature_pickles}"
    )
path_test_str = feature_pickles[0]

# Read the features as a DataFrame.
# NOTE: unpickling can execute arbitrary code - only load trusted, locally produced files.
df_test = pd.read_pickle(path_test_str)

# Combine with the label data column-wise (rows are aligned on the index).
df_test = pd.concat([df_test, df_label], axis=1)
display(df_test.tail())

Unnamed: 0_level_0,rmssd,sdnn,sdsd,nn50,pnn50,mrri,mhr,ln(sdnn),stdhr,cvrr,label,label_num
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A04452,362.25384,255.891114,364.421572,72.0,85.714286,704.563492,95.075894,5.544752,29.741756,36.319099,~,3
A04522,184.814622,147.266007,186.689746,35.0,68.627451,580.522876,109.35554,4.992241,24.690151,25.367821,~,3
A04701,310.048305,249.421513,316.958329,19.0,79.166667,727.638889,91.82389,5.519144,30.166032,34.2782,~,3
A04735,,,,0.0,0.0,543.333333,110.429448,,,,~,3
A04805,364.435324,264.374645,369.198371,31.0,77.5,708.583333,95.263725,5.577367,30.976125,37.310311,~,3


In [9]:
# Discard every sample (row) that has at least one missing value.
df_test = df_test.dropna()
df_test.tail()

Unnamed: 0_level_0,rmssd,sdnn,sdsd,nn50,pnn50,mrri,mhr,ln(sdnn),stdhr,cvrr,label,label_num
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A04282,466.49329,298.864285,472.097852,40.0,95.238095,825.238095,83.606291,5.69999,33.179573,36.215522,~,3
A04452,362.25384,255.891114,364.421572,72.0,85.714286,704.563492,95.075894,5.544752,29.741756,36.319099,~,3
A04522,184.814622,147.266007,186.689746,35.0,68.627451,580.522876,109.35554,4.992241,24.690151,25.367821,~,3
A04701,310.048305,249.421513,316.958329,19.0,79.166667,727.638889,91.82389,5.519144,30.166032,34.2782,~,3
A04805,364.435324,264.374645,369.198371,31.0,77.5,708.583333,95.263725,5.577367,30.976125,37.310311,~,3


In [10]:
# Per-class record counts after the rows with missing values were dropped
count_arrhythmia(df_test)

正常心電図 : 148個
心房細動 : 47個
その他の不整脈 : 65個
雑音 : 38個


In [11]:
# Save the DataFrame

# Destination folder for the saved files.
save_path_dir = './test_DataFrame'

def dataframe_save(df, save_path_dir, file_name):
    """
    Save a DataFrame in both CSV and pickle format.

    Pickle is fast to read and write; the CSV copy is plain text.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame to save.
    save_path_dir : str
        Path of the destination folder. It is created (including parents)
        when it does not exist. An empty string means the current directory.
    file_name : str
        File name without extension; '.csv' and '.pkl' are appended.
    """
    # Create the destination folder up front so the writes below cannot fail
    # just because it is missing. '' has nothing to create.
    if save_path_dir:
        os.makedirs(save_path_dir, exist_ok=True)

    # Build the extension-less target path from its two components.
    path_save = os.path.join(save_path_dir, file_name)

    # Save as CSV, keeping the index and the header row.
    df.to_csv(path_save + '.csv', index=True, header=True, sep=',')
    # Save as pickle.
    df.to_pickle(path_save + '.pkl')

# Write the labelled dataset into save_path_dir under the base name 'labeled_test_dataset'.
dataframe_save(df=df_test, save_path_dir=save_path_dir, file_name='labeled_test_dataset')