In [1]:
import numpy as np 
import pandas as pd
import torch

training_dir = '../input/snu-2021-1-ds-project-3/train'
test_dir = '../input/snu-2021-1-ds-project-3/test'

def extract_age(info_file):
    '''
        info file(###.txt)로부터 나이 정보를 뽑아냅니다.
    '''
    with open(info_file, 'r') as f:
        info = f.read()
        for i, line in enumerate(info.split("\n")):
            if line.startswith("#Age"):
                age = float(line.split(": ")[1].strip())
    return age

def extract_sex(info_file):
    '''
        info file(###.txt)로부터 성별 정보를 뽑아냅니다.
    '''
    with open(info_file, 'r') as f:
            info = f.read()
            for i, line in enumerate(info.split("\n")):
                if line.startswith("#Sex"):
                    sex = line.split(": ")[1].strip()
    return sex

def extract_labels(info_file):
    '''
        info file(###.txt)로부터 label(들) 정보를 뽑아냅니다.
    '''
    with open(info_file, 'r') as f:
            info = f.read()
            for i, line in enumerate(info.split("\n")):
                if line.startswith("#Dx"):
                    labels = line.split(": ")[1].strip()
                    labels = labels.split()
    return labels

def read_files(data_directory, is_training=True):
    '''
        data directory(train 또는 test)로부터 모든 sample들의
        id, age, sex, recording, labels 정보를 읽어들여
        (id, age, sex, recording, labels)의 list를 반환합니다.
        is_training=False일 경우엔 labels 정보를 읽어들이지 않습니다.
    '''
    list_id = []
    list_age = []
    list_sex = []
    list_recording = []
    list_labels = []
    for f in os.listdir(data_directory):
        root, extension = os.path.splitext(f)
        if not root.startswith(".") and extension == ".txt":
            list_id.append(int(root))
            info_file = os.path.join(data_directory, root + ".txt")
            recording_file = os.path.join(data_directory, root + ".npy")
            age = extract_age(info_file)
            list_age.append(age)
            sex = extract_sex(info_file)
            list_sex.append(sex)
            with open(recording_file, 'rb') as g:
                recording = np.load(g)
                list_recording.append(recording)
            if is_training:
                labels = extract_labels(info_file)
                list_labels.append(labels)
    if is_training:
        return list(zip(list_id, list_age, list_sex, list_recording, list_labels))
    else:
        return list(zip(list_id, list_age, list_sex, list_recording))
    
class Dataset_ECG(torch.utils.data.Dataset):
    """
        Build ECG dataset
    """
    def __init__(self, dataset, num_classes=12):
        """
            dataset을 읽어들여 id, age, sex, recording, labels를 저장한 list를 만들어 줍니다.
        """
        self.sample_id = []
        self.sample_age = []
        self.sample_sex = []
        self.sample_recording = []
        self.sample_labels = []
        self.num_samples = len(dataset)
        
        for idx in range(self.num_samples):
            _id, _age, _sex, _recording, _labels = dataset[idx]
            
            # model에 input으로 들어가는 data는 torch.Tensor 타입으로 변환해 줍니다.
            age = torch.tensor(_age)
            sex = torch.tensor(0) if _sex == "F" else torch.tensor(1)
            #여자면 0, 남자면 1
            
            recording = torch.tensor(_recording)
            labels = torch.tensor(np.zeros(num_classes))
            
            for label in _labels:
                labels[int(label)] = 1
            #원 핫 인코딩으로 분류
            
            
            self.sample_id.append(_id)
            self.sample_age.append(age)
            self.sample_sex.append(sex)
            self.sample_recording.append(recording)
            self.sample_labels.append(labels)

        print(f'Loaded {self.num_samples} samples...')

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        return {
            "id": self.sample_id[idx],
            "age": self.sample_age[idx],
            "sex": self.sample_sex[idx],
            "recording": self.sample_recording[idx],
            "labels": self.sample_labels[idx]
        }


#num_classes -> label / num_leads -> recording1,2 / out_channel -> recording shape
class Example_CNN_v1(torch.nn.Module):
    def __init__(self, num_classes=12, num_leads=2):
        super(Example_CNN_v1, self).__init__()
        self.num_classes = num_classes
        self.num_leads = num_leads
        self.conv1 = torch.nn.Conv1d(in_channels=self.num_leads, out_channels=32, kernel_size=15, stride=3, padding=2)
        self.relu1 = torch.nn.ReLU()
        self.conv2 = torch.nn.Conv1d(in_channels=32, out_channels=64, kernel_size=13, stride=3, padding=1)
        self.relu2 = torch.nn.ReLU()
        self.conv3 = torch.nn.Conv1d(in_channels=64, out_channels=128, kernel_size=10, stride=2)
        self.relu3 = torch.nn.ReLU()
        self.conv4 = torch.nn.Conv1d(in_channels=128, out_channels=64, kernel_size=8, stride=2)
        self.relu4 = torch.nn.ReLU()
        self.conv5 = torch.nn.Conv1d(in_channels=64, out_channels=32, kernel_size=7, stride=2)
        self.relu5 = torch.nn.ReLU()
        self.fc1 = torch.nn.Linear(32*64, 128)
        self.relu6 = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(128, self.num_classes)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        # 이 모델은 recording만을 input으로 받습니다. feature를 추가적으로 사용하도록 할 수도 있습니다.
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.conv3(x)
        x = self.relu3(x)
        x = self.conv4(x)
        x = self.relu4(x)
        x = self.conv5(x)
        x = self.relu5(x)
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = self.fc1(x)
        x = self.relu6(x)
        x = self.fc2(x)
        x = torch.flatten(x)
        out = self.sigmoid(x)
        return out

print("완료")


완료


In [2]:
class Example_CNN_v2(torch.nn.Module):
    def __init__(self, num_classes=12, num_leads=2):
        super(Example_CNN_v2, self).__init__()
        self.num_classes = num_classes
        self.num_leads = num_leads
        
        self.conv1 = torch.nn.Conv1d(in_channels=self.num_leads, out_channels=32, kernel_size=15, stride=3, padding=2)
        self.relu1 = torch.nn.ReLU()
        self.batch1 = torch.nn.BatchNorm1d(32)
        
        self.conv2 = torch.nn.Conv1d(in_channels=32, out_channels=64, kernel_size=13, stride=3, padding=1)
        self.relu2 = torch.nn.ReLU()
        self.batch2 = torch.nn.BatchNorm1d(64)
        
        
        self.conv3 = torch.nn.Conv1d(in_channels=64, out_channels=128, kernel_size=10, stride=2)
        self.relu3 = torch.nn.ReLU()
        self.batch3 = torch.nn.BatchNorm1d(128)
        
        self.conv4 = torch.nn.Conv1d(in_channels=128, out_channels=128, kernel_size=8, stride=2)
        self.relu4 = torch.nn.ReLU()
        self.batch4 = torch.nn.BatchNorm1d(128)
                                             
        self.conv5 = torch.nn.Conv1d(in_channels=128, out_channels=256, kernel_size=7, stride=2)
        self.relu5 = torch.nn.ReLU()
        self.batch5 = torch.nn.BatchNorm1d(256)
        
        self.conv6 = torch.nn.Conv1d(in_channels=256, out_channels=128, kernel_size=7, stride=2)
        self.relu6 = torch.nn.ReLU()
        self.batch6 = torch.nn.BatchNorm1d(128)
        
        self.conv7 = torch.nn.Conv1d(in_channels=128, out_channels=64, kernel_size=6, stride=2)
        self.relu7 = torch.nn.ReLU()
        self.batch7 = torch.nn.BatchNorm1d(64)
        
        self.conv8 = torch.nn.Conv1d(in_channels=64, out_channels=32, kernel_size=6, stride=2)
        self.relu8 = torch.nn.ReLU()
        self.batch8 = torch.nn.BatchNorm1d(32)
        
        self.conv9 = torch.nn.Conv1d(in_channels=32, out_channels=32, kernel_size=5, stride=2)
        self.relu9 = torch.nn.ReLU()

        self.fc1 = torch.nn.Linear(32*4, 256)
        self.relu_fc = torch.nn.ReLU()

        self.fc2 = torch.nn.Linear(256, 128)
        self.fc3 = torch.nn.Linear(128, 64)
        self.fc4 = torch.nn.Linear(64, 32)
        self.fc5 = torch.nn.Linear(32, self.num_classes)

    def forward(self, x):
        # 이 모델은 recording만을 input으로 받습니다. feature를 추가적으로 사용하도록 할 수도 있습니다.
    
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.batch1(x)
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.batch2(x)
        x = self.conv3(x)
        x = self.relu3(x)
        x = self.batch3(x)
        x = self.conv4(x)
        x = self.relu4(x)
        x = self.batch4(x)
        x = self.conv5(x)
        x = self.relu5(x)
        x = self.batch5(x)
    
        x = self.conv6(x)
        x = self.relu6(x)
        x = self.batch6(x)
        
        x = self.conv7(x)
        x = self.relu7(x)
        x = self.batch7(x)
        x = self.conv8(x)
        x = self.relu8(x)
        x = self.batch8(x)

        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = self.fc1(x)
        x = self.relu_fc(x)
        
        x = self.fc2(x)
        x = self.relu_fc(x)
        
        x = self.fc3(x)
        x = self.relu_fc(x)
        
        x = self.fc4(x)
        x = self.relu_fc(x)
        
        out = self.fc5(x)
       
        
        return out

In [3]:
import os

total_training_set = sorted(read_files(training_dir), key=lambda sample: sample[0])
total_num_training = len(total_training_set)
print(f"Number of total training samples: {total_num_training}")

#num_validation = int(total_num_training * 0.2)
#num_training = total_num_training - num_validation
num_training = total_num_training
#validation_set = total_training_set[:num_validation]
#training_set = total_training_set[num_validation:]
training_set = total_training_set#
#print(f'Number of validation samples: {num_validation}')
print(f'Number of training samples: {num_training}')

Number of total training samples: 19212
Number of training samples: 19212


In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [5]:
training_dataset = Dataset_ECG(training_set, num_classes=12)
print("완료")

Loaded 19212 samples...
완료


In [6]:
EPOCHS = 64
BATCH_SIZE = 32
LEARNING_RATE = 0.001

training_loader = torch.utils.data.DataLoader(training_dataset, pin_memory=True, batch_size=BATCH_SIZE)

model3 = Example_CNN_v2(num_classes=12, num_leads=2)

model3.to(device)
model3.train()

criterion = torch.nn.BCEWithLogitsLoss() # for multi-label classification
optimizer = torch.optim.Adam(model3.parameters(), lr=LEARNING_RATE)

for epoch in range(1, EPOCHS+1):
    print(f'***** Epoch {epoch} *****')
    epoch_training_loss_sum = 0.0
    for i_batch, sample_batched in enumerate(training_loader):
        b_recording = sample_batched["recording"].to(device)
        b_labels = sample_batched["labels"].to(device)
        optimizer.zero_grad()
        b_out = model3(b_recording)
        loss = criterion(b_out, b_labels)
        loss.backward()
        optimizer.step()
        epoch_training_loss_sum += loss.item() * b_labels.shape[0]

    epoch_training_loss = epoch_training_loss_sum / num_training
    print(f'training loss of epoch {epoch}: {epoch_training_loss}\n')

print("완료")


***** Epoch 1 *****
training loss of epoch 1: 0.2507011435826544

***** Epoch 2 *****
training loss of epoch 2: 0.19691715932755727

***** Epoch 3 *****
training loss of epoch 3: 0.18163216320453007

***** Epoch 4 *****
training loss of epoch 4: 0.16973982069067656

***** Epoch 5 *****
training loss of epoch 5: 0.16066806130067782

***** Epoch 6 *****
training loss of epoch 6: 0.15281839563584418

***** Epoch 7 *****
training loss of epoch 7: 0.14658229929146452

***** Epoch 8 *****
training loss of epoch 8: 0.1397230401285928

***** Epoch 9 *****
training loss of epoch 9: 0.13318993072178478

***** Epoch 10 *****
training loss of epoch 10: 0.1266883983988548

***** Epoch 11 *****
training loss of epoch 11: 0.12011686464083242

***** Epoch 12 *****
training loss of epoch 12: 0.11241597621271592

***** Epoch 13 *****
training loss of epoch 13: 0.1047648793599527

***** Epoch 14 *****
training loss of epoch 14: 0.09760419131000317

***** Epoch 15 *****
training loss of epoch 15: 0.091210

In [7]:

idx_set = [[], []]

idx = [i for i in range(len(training_dataset)) if training_dataset[i]["labels"][8]==1]
idx_set[0] = idx
idx = [i for i in range(len(training_dataset)) if training_dataset[i]["labels"][8]==0]
idx_set[1] = idx
    

print(len(idx_set[0]))
print(len(idx_set[1]))
print("완료")

12836
6376
완료


In [8]:
abnormal_train_set = []
for i in idx_set[1] :
    abnormal_train_set.append(training_set[i])
print(len(abnormal_train_set))

6376


In [9]:
abnormal_training_dataset = Dataset_ECG(abnormal_train_set, num_classes=12)

Loaded 6376 samples...


In [10]:
EPOCHS = 64
BATCH_SIZE = 32
LEARNING_RATE = 0.001

training_loader = torch.utils.data.DataLoader(abnormal_training_dataset, pin_memory=True, batch_size=BATCH_SIZE)

model2 = Example_CNN_v2(num_classes=12, num_leads=2)

model2.to(device)
model2.train()

criterion = torch.nn.BCEWithLogitsLoss() # for multi-label classification
optimizer = torch.optim.Adam(model2.parameters(), lr=LEARNING_RATE)


print("완료")

완료


In [11]:
for epoch in range(1, EPOCHS+1):
    print(f'***** Epoch {epoch} *****')
    epoch_training_loss_sum = 0.0
    for i_batch, sample_batched in enumerate(training_loader):
        b_recording = sample_batched["recording"].to(device)
        b_labels = sample_batched["labels"].to(device)
        optimizer.zero_grad()
        b_out = model2(b_recording)
        loss = criterion(b_out, b_labels)
        loss.backward()
        optimizer.step()
        epoch_training_loss_sum += loss.item() * b_labels.shape[0]

    epoch_training_loss = epoch_training_loss_sum / len(abnormal_training_dataset)
    print(f'training loss of epoch {epoch}: {epoch_training_loss}\n')

print("완료")

***** Epoch 1 *****
training loss of epoch 1: 0.36978485797066685

***** Epoch 2 *****
training loss of epoch 2: 0.26933320135299743

***** Epoch 3 *****
training loss of epoch 3: 0.24717542348128238

***** Epoch 4 *****
training loss of epoch 4: 0.23284003305468137

***** Epoch 5 *****
training loss of epoch 5: 0.22502387241549207

***** Epoch 6 *****
training loss of epoch 6: 0.21746177730256683

***** Epoch 7 *****
training loss of epoch 7: 0.21004485032944478

***** Epoch 8 *****
training loss of epoch 8: 0.201786508777551

***** Epoch 9 *****
training loss of epoch 9: 0.19422492597579227

***** Epoch 10 *****
training loss of epoch 10: 0.18562438125453323

***** Epoch 11 *****
training loss of epoch 11: 0.17946087014117074

***** Epoch 12 *****
training loss of epoch 12: 0.1707531334441582

***** Epoch 13 *****
training loss of epoch 13: 0.16582388535208703

***** Epoch 14 *****
training loss of epoch 14: 0.15530063897117707

***** Epoch 15 *****
training loss of epoch 15: 0.14543

In [12]:

train_8 = [[]]*len(training_dataset)
for i in range(len(training_dataset)):
    train_8[i] = training_dataset.sample_labels[i][8]

training_dataset.sample_labels = train_8
print("완료")

완료


In [13]:
training_dataset.sample_labels = torch.FloatTensor(training_dataset.sample_labels)

In [14]:
'''
idxs = []
for i in range(len(validation_set)):
    if '8' in validation_set[i][4] :
        idxs.append(i)
print("완료")
'''

'\nidxs = []\nfor i in range(len(validation_set)):\n    if \'8\' in validation_set[i][4] :\n        idxs.append(i)\nprint("완료")\n'

In [15]:
'''
tmp = list(validation_set)

for i in range(len(validation_set)) :
    validation_set[i] = list(validation_set[i])
    validation_set[i][4] = ['0']
    
for i in idxs :
    validation_set[i][4] = ["1"]

for i in range(len(validation_set)) :
    validation_set[i] = tuple(validation_set[i])

print(validation_set[0][4])
'''

'\ntmp = list(validation_set)\n\nfor i in range(len(validation_set)) :\n    validation_set[i] = list(validation_set[i])\n    validation_set[i][4] = [\'0\']\n    \nfor i in idxs :\n    validation_set[i][4] = ["1"]\n\nfor i in range(len(validation_set)) :\n    validation_set[i] = tuple(validation_set[i])\n\nprint(validation_set[0][4])\n'

In [16]:
EPOCHS = 64
BATCH_SIZE = 32
LEARNING_RATE = 0.001

training_loader = torch.utils.data.DataLoader(training_dataset, pin_memory=True, batch_size=BATCH_SIZE)

model = Example_CNN_v1(num_classes=1, num_leads=2)

model.to(device)
model.train()

criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
#optimizer = Nadam(model.parameters(), lr=LEARNING_RATE)


#adagrad vs. adam

print("완료")

완료


In [17]:

for epoch in range(1, EPOCHS+1):
    print(f'***** Epoch {epoch} *****')
    epoch_training_loss_sum = 0.0
    for i_batch, sample_batched in enumerate(training_loader):
        b_recording = sample_batched["recording"].to(device)
        b_labels = sample_batched["labels"].to(device)
        optimizer.zero_grad()
        b_out = model(b_recording)
        loss = criterion(b_out, b_labels)
        loss.backward()
        optimizer.step()
        epoch_training_loss_sum += loss.item() * b_labels.shape[0]

    epoch_training_loss = epoch_training_loss_sum / num_training
    print(f'training loss of epoch {epoch}: {epoch_training_loss}\n')

    
print("완료")


***** Epoch 1 *****
training loss of epoch 1: 0.5954829398639595

***** Epoch 2 *****
training loss of epoch 2: 0.43833740060835263

***** Epoch 3 *****
training loss of epoch 3: 0.38529905416234295

***** Epoch 4 *****
training loss of epoch 4: 0.34743333690127504

***** Epoch 5 *****
training loss of epoch 5: 0.30217521075976034

***** Epoch 6 *****
training loss of epoch 6: 0.2554659780499595

***** Epoch 7 *****
training loss of epoch 7: 0.20634518938835975

***** Epoch 8 *****
training loss of epoch 8: 0.1726889009980123

***** Epoch 9 *****
training loss of epoch 9: 0.1540869129144932

***** Epoch 10 *****
training loss of epoch 10: 0.11235026998068054

***** Epoch 11 *****
training loss of epoch 11: 0.08639278305900387

***** Epoch 12 *****
training loss of epoch 12: 0.06599922728609502

***** Epoch 13 *****
training loss of epoch 13: 0.055967352855054754

***** Epoch 14 *****
training loss of epoch 14: 0.04746641277570402

***** Epoch 15 *****
training loss of epoch 15: 0.04824

In [18]:
'''
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

model.eval()

validation_prediction_df = pd.DataFrame(columns=['labels'])
validation_prediction_df.index.name = 'id'
validation_true_labels_df = pd.DataFrame(columns=['labels'])
validation_true_labels_df.index.name = 'id'

with torch.no_grad():
    for idx in range(len(validation_set)):
        validation_sample = validation_set[idx]
        _, _, _, recording, labels = validation_sample
        
        sample_prediction = model(torch.tensor(recording).unsqueeze(0).to(device)) > 0.5
        
        if (sample_prediction.cpu()==True) :
            rlt = 1
        else :
            rlt = 0
        validation_prediction_df.loc[idx] = [rlt]
        validation_true_labels_df.loc[idx] = int(labels[0])
        
print("완료")
'''

'\nfrom sklearn.metrics import f1_score, accuracy_score, confusion_matrix\n\nmodel.eval()\n\nvalidation_prediction_df = pd.DataFrame(columns=[\'labels\'])\nvalidation_prediction_df.index.name = \'id\'\nvalidation_true_labels_df = pd.DataFrame(columns=[\'labels\'])\nvalidation_true_labels_df.index.name = \'id\'\n\nwith torch.no_grad():\n    for idx in range(len(validation_set)):\n        validation_sample = validation_set[idx]\n        _, _, _, recording, labels = validation_sample\n        \n        sample_prediction = model(torch.tensor(recording).unsqueeze(0).to(device)) > 0.5\n        \n        if (sample_prediction.cpu()==True) :\n            rlt = 1\n        else :\n            rlt = 0\n        validation_prediction_df.loc[idx] = [rlt]\n        validation_true_labels_df.loc[idx] = int(labels[0])\n        \nprint("완료")\n'

In [19]:
'''
cnt = len(validation_set)
correct = 0
for i in range(len(validation_set)) :
    m = validation_prediction_df["labels"].iloc[i] - validation_true_labels_df["labels"].iloc[i]
    if (m==0) :
        correct+=1
        
print("accuracy : %f" % (correct/cnt))
'''

'\ncnt = len(validation_set)\ncorrect = 0\nfor i in range(len(validation_set)) :\n    m = validation_prediction_df["labels"].iloc[i] - validation_true_labels_df["labels"].iloc[i]\n    if (m==0) :\n        correct+=1\n        \nprint("accuracy : %f" % (correct/cnt))\n'

In [20]:
'''
import sklearn.metrics


for i in range(len(validation_set)) :
    validation_prediction_df["labels"].iloc[i] = bin(validation_prediction_df["labels"].iloc[i])
    validation_true_labels_df["labels"].iloc[i] = bin(validation_true_labels_df["labels"].iloc[i])

    
print(sklearn.metrics.confusion_matrix(validation_true_labels_df["labels"], validation_prediction_df["labels"]))
'''

'\nimport sklearn.metrics\n\n\nfor i in range(len(validation_set)) :\n    validation_prediction_df["labels"].iloc[i] = bin(validation_prediction_df["labels"].iloc[i])\n    validation_true_labels_df["labels"].iloc[i] = bin(validation_true_labels_df["labels"].iloc[i])\n\n    \nprint(sklearn.metrics.confusion_matrix(validation_true_labels_df["labels"], validation_prediction_df["labels"]))\n'

In [21]:
"""
model.eval()

validation_prediction_df = pd.DataFrame(columns=['labels'])
validation_prediction_df.index.name = 'id'
validation_true_labels_df = pd.DataFrame(columns=['labels'])
validation_true_labels_df.index.name = 'id'

with torch.no_grad():
    for idx in range(len(validation_set)):
        validation_sample = validation_set[idx]
        _, _, _, recording, labels = validation_sample
        out = model(torch.tensor(recording).unsqueeze(0).to(device)) # unsqueeze는 batch dimension을 추가해주기 위함
        sample_prediction = torch.nn.functional.sigmoid(out).squeeze() > 0.5 # Use 0.5 as a threshold / squeeze는 batch dimension을 제거해주기 위함
        indices_of_1s = np.where(sample_prediction.cpu())[0]
        str_indices_of_1s = ' '.join(map(str, indices_of_1s))
        validation_prediction_df.loc[idx] = [str_indices_of_1s]
        
        str_true_labels = ' '.join(labels)
        validation_true_labels_df.loc[idx] = [str_true_labels]

        

print("완료")
"""

'\nmodel.eval()\n\nvalidation_prediction_df = pd.DataFrame(columns=[\'labels\'])\nvalidation_prediction_df.index.name = \'id\'\nvalidation_true_labels_df = pd.DataFrame(columns=[\'labels\'])\nvalidation_true_labels_df.index.name = \'id\'\n\nwith torch.no_grad():\n    for idx in range(len(validation_set)):\n        validation_sample = validation_set[idx]\n        _, _, _, recording, labels = validation_sample\n        out = model(torch.tensor(recording).unsqueeze(0).to(device)) # unsqueeze는 batch dimension을 추가해주기 위함\n        sample_prediction = torch.nn.functional.sigmoid(out).squeeze() > 0.5 # Use 0.5 as a threshold / squeeze는 batch dimension을 제거해주기 위함\n        indices_of_1s = np.where(sample_prediction.cpu())[0]\n        str_indices_of_1s = \' \'.join(map(str, indices_of_1s))\n        validation_prediction_df.loc[idx] = [str_indices_of_1s]\n        \n        str_true_labels = \' \'.join(labels)\n        validation_true_labels_df.loc[idx] = [str_true_labels]\n\n        \n\nprint("완료")\

In [22]:
"""
for i in range(len(validation_prediction_df)) :
    
    tmp = validation_prediction_df['labels'].iloc[i]
    
    if ("8" in tmp) :
        if (len(tmp)>=2) :
            validation_prediction_df['labels'].iloc[i]="8"
    '''
    if ("2" in tmp) :
        if ("3" not in tmp) :
             validation_prediction_df['labels'].iloc[i]=tmp.replace("2", "2 3", 1)
    ''' 

print(validation_prediction_df.head())
"""

'\nfor i in range(len(validation_prediction_df)) :\n    \n    tmp = validation_prediction_df[\'labels\'].iloc[i]\n    \n    if ("8" in tmp) :\n        if (len(tmp)>=2) :\n            validation_prediction_df[\'labels\'].iloc[i]="8"\n    \'\'\'\n    if ("2" in tmp) :\n        if ("3" not in tmp) :\n             validation_prediction_df[\'labels\'].iloc[i]=tmp.replace("2", "2 3", 1)\n    \'\'\' \n\nprint(validation_prediction_df.head())\n'

In [23]:
"""
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score


mlb = MultiLabelBinarizer(classes=['0','1','2','3','4','5','6','7','8','9','10','11'])
mlb.fit(map(str.split, validation_true_labels_df['labels'].values))

macro_f1_validation = f1_score(mlb.transform(map(str.split, validation_true_labels_df['labels'].values)), mlb.transform(map(str.split, validation_prediction_df['labels'].values)), average='macro')
print(f'macro f1 score on validation set: {macro_f1_validation}')
"""

"\nfrom sklearn.preprocessing import MultiLabelBinarizer\nfrom sklearn.metrics import f1_score\n\n\nmlb = MultiLabelBinarizer(classes=['0','1','2','3','4','5','6','7','8','9','10','11'])\nmlb.fit(map(str.split, validation_true_labels_df['labels'].values))\n\nmacro_f1_validation = f1_score(mlb.transform(map(str.split, validation_true_labels_df['labels'].values)), mlb.transform(map(str.split, validation_prediction_df['labels'].values)), average='macro')\nprint(f'macro f1 score on validation set: {macro_f1_validation}')\n"

In [24]:

test_set = sorted(read_files(test_dir, is_training=False), key=lambda sample:sample[0])
num_test = len(test_set)
print(f'Number of test samples: {num_test}')


Number of test samples: 7389


In [25]:
model.eval()

test_prediction_df = pd.DataFrame(columns=['labels'])
test_prediction_df.index.name = 'id'

with torch.no_grad():
    for idx in range(len(test_set)):
        test_sample = test_set[idx]
        _, _, _, recording = test_sample
        
        sample_prediction = model(torch.tensor(recording).unsqueeze(0).to(device)) > 0.5
        try :
            if (sample_prediction.cpu()==True) :
                rlt = 1
            else :
                rlt = 0
        except : 
            print(sample_prediction)
            rlt=0
        test_prediction_df.loc[idx] = [rlt]
print(test_prediction_df[:10])

   labels
id       
0       0
1       0
2       0
3       0
4       1
5       1
6       1
7       1
8       1
9       1


In [26]:
unnormal_idx = []
for i in range(len(test_set)) :
    if (test_prediction_df["labels"].iloc[i] == 0) :
        unnormal_idx.append(i)
        
print(unnormal_idx[:5])

[0, 1, 2, 3, 10]


In [27]:
test_set_unnormal = []

for i in unnormal_idx :
    test_set_unnormal.append(test_set[i])


print(test_set_unnormal[0])
print(test_set_unnormal[1])
print(test_set_unnormal[2])


(0, 85.0, 'F', array([[ 0.015,  0.015,  0.015, ..., -0.03 , -0.03 , -0.03 ],
       [ 0.025,  0.025,  0.025, ..., -0.15 , -0.15 , -0.15 ]],
      dtype=float32))
(1, 51.0, 'M', array([[-0.025, -0.025, -0.025, ..., -0.035, -0.035, -0.035],
       [ 0.025,  0.025,  0.025, ..., -0.095, -0.095, -0.095]],
      dtype=float32))
(2, 68.0, 'F', array([[-0.039, -0.039, -0.039, ...,  0.039,  0.039,  0.043],
       [-0.029, -0.029, -0.029, ..., -0.019, -0.019, -0.014]],
      dtype=float32))


In [28]:
model3.eval()

test_prediction_df_3 = pd.DataFrame(columns=['labels'])
test_prediction_df_3.index.name = 'id'

with torch.no_grad():
    for idx in range(len(test_set)):
        test_sample = test_set[idx]
        _, _, _, recording = test_sample
        out = model3(torch.tensor(recording).unsqueeze(0).to(device)) # unsqueeze는 batch dimension을 추가해주기 위함
        sample_prediction = torch.nn.functional.sigmoid(out).squeeze() > 0.5 # Use 0.5 as a threshold / squeeze는 batch dimension을 제거해주기 위함
        indices_of_1s = np.where(sample_prediction.cpu())[0]
        str_indices_of_1s = ' '.join(map(str, indices_of_1s))
        test_prediction_df_3.loc[idx] = [str_indices_of_1s]
        




In [29]:
model2.eval()

test_prediction_df_2 = pd.DataFrame(columns=['labels'])
test_prediction_df_2.index.name = 'id'

with torch.no_grad():
    for idx in range(len(test_set_unnormal)):
        test_sample = test_set_unnormal[idx]
        _, _, _, recording = test_sample
        out = model2(torch.tensor(recording).unsqueeze(0).to(device)) # unsqueeze는 batch dimension을 추가해주기 위함
        sample_prediction = torch.nn.functional.sigmoid(out).squeeze() > 0.5 # Use 0.5 as a threshold / squeeze는 batch dimension을 제거해주기 위함
        indices_of_1s = np.where(sample_prediction.cpu())[0]
        str_indices_of_1s = ' '.join(map(str, indices_of_1s))
        test_prediction_df_2.loc[idx] = [str_indices_of_1s]
        


In [30]:
n=0
for i in range(len(test_prediction_df)) :
    if (test_prediction_df["labels"].iloc[i] == 1) :
        test_prediction_df["labels"].iloc[i] = "8"
    else :
        test_prediction_df["labels"].iloc[i] = test_prediction_df_2["labels"].iloc[n]
        n+=1
        


In [31]:

for i in range(len(test_prediction_df)) :
    
    tmp = test_prediction_df['labels'].iloc[i]
    
    if ((tmp == "") & (test_prediction_df_3 != "")) :
        test_prediction_df["labels"].iloc[i] = test_prediction_df_3["labels"].iloc[i]
    
    if ((tmp == "8") & ("8" in test_prediction_df_3["labels"].iloc[i])) :
        test_prediction_df["labels"].iloc[i] = test_prediction_df_3["labels"].iloc[i]
    
    if ("2" in tmp) :
        if ("3" not in tmp) :
             test_prediction_df['labels'].iloc[i] = tmp.replace("2", "2 3", 1)
    
    if ((tmp == "") & (test_prediction_df_3 == "")) :
        test_prediction_df["labels"].iloc[i] = "6"
        
    
    

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [32]:
test_prediction_df.to_csv('my_submission.csv')