# Imports

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!unzip -qq "/content/drive/MyDrive/Colab Notebooks/open.zip"

In [None]:
!pip install torchmetrics
!pip install audiomentations

Collecting torchmetrics
  Downloading torchmetrics-1.4.0.post0-py3-none-any.whl (868 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m53.4 kB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.3.post0-py3-none-any.whl (26 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.11.3.post0 torchmetrics-1.4.0.post0
Collecting audiomentations
  Downloading audiomentations-0.36.0-py3-none-any.whl (80 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.3/80.3 kB[0m [31m786.1 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: audiomentations
Successfully installed audiomentations-0.36.0


In [None]:
import librosa
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import random
import IPython.display as ipd
from tqdm.notebook import tqdm
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from tqdm.notebook import tqdm  # !!

import torch
import torchmetrics
import os
from torchvision import models

import warnings
warnings.filterwarnings('ignore')

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Config

In [None]:
class Config:
    SR = 32000
    N_MFCC = 13  # 수정됨!!!: N_MFCC를 원래 값으로 되돌림
    N_MELS = 128  # Number of Mel features
    # Dataset
    ROOT_FOLDER = './'
    # Training
    N_CLASSES = 2
    BATCH_SIZE = 96
    N_EPOCHS = 100
    LR = 3e-4
    NUM_HEADS = 4
    NUM_LAYERS = 2
    EMBED_DIM = 64
    EARLY_STOPPING_PATIENCE = 20  # Early stopping patience 설정
    Scheduler_patience=8
    # Others
    SEED = 42

CONFIG = Config()

In [None]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CONFIG.SEED) # Seed 고정

In [None]:
df = pd.read_csv('./train.csv')
train_df, val_df, _, _ = train_test_split(df, df['label'], test_size=0.2, random_state=CONFIG.SEED)

##오디오 로딩

In [97]:
def load_audio_data(df):
    audio_data = []
    labels = []
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        y, sr = librosa.load(row['path'], sr=CONFIG.SR)
        audio_data.append(y)
        label = row['label']
        label_vector = np.zeros(CONFIG.N_CLASSES, dtype=float)
        label_vector[0 if label == 'fake' else 1] = 1
        labels.append(label_vector)
    return audio_data, labels

In [None]:
train_audio_data, train_labels = load_audio_data(train_df)
val_audio_data, val_labels = load_audio_data(val_df)

  0%|          | 0/44350 [00:00<?, ?it/s]

  0%|          | 0/11088 [00:00<?, ?it/s]

In [None]:
print (train_labels)

[array([1., 0.]), array([1., 0.]), array([0., 1.]), array([1., 0.]), array([0., 1.]), array([0., 1.]), array([1., 0.]), array([1., 0.]), array([1., 0.]), array([1., 0.]), array([0., 1.]), array([1., 0.]), array([0., 1.]), array([1., 0.]), array([0., 1.]), array([0., 1.]), array([0., 1.]), array([1., 0.]), array([0., 1.]), array([0., 1.]), array([0., 1.]), array([0., 1.]), array([1., 0.]), array([0., 1.]), array([1., 0.]), array([1., 0.]), array([0., 1.]), array([1., 0.]), array([0., 1.]), array([0., 1.]), array([0., 1.]), array([0., 1.]), array([1., 0.]), array([1., 0.]), array([0., 1.]), array([0., 1.]), array([1., 0.]), array([0., 1.]), array([0., 1.]), array([1., 0.]), array([1., 0.]), array([1., 0.]), array([1., 0.]), array([0., 1.]), array([0., 1.]), array([1., 0.]), array([1., 0.]), array([1., 0.]), array([0., 1.]), array([0., 1.]), array([1., 0.]), array([0., 1.]), array([0., 1.]), array([0., 1.]), array([1., 0.]), array([0., 1.]), array([1., 0.]), array([0., 1.]), array([1., 0.

##오디오 합치기

In [None]:
def combine_audios(audio_data, labels, n_samples, silent_ratio=0.1):
    combined_audios = []
    combined_labels = []

    # 오디오와 라벨을 묶어서 리스트로 관리
    audio_label_pairs = list(zip(audio_data, labels))

    # 빈 오디오와 라벨 추가
    num_silent_samples = int(len(audio_data) * silent_ratio)
    for _ in range(num_silent_samples):
        random_audio_length = len(random.choice(audio_data))
        silent_audio = np.zeros(random_audio_length)  # 무작위 길이의 빈 오디오 데이터
        silent_label = np.array([0, 0])
        audio_label_pairs.append((silent_audio, silent_label))

    print("Combining audios randomly, including silent samples")
    for _ in tqdm(range(n_samples), desc="Processing"):
        (audio1, label1), (audio2, label2) = random.sample(audio_label_pairs, 2)

        max_length = max(len(audio1), len(audio2))
        padded_audio1 = np.pad(audio1, (0, max_length - len(audio1)), 'constant')
        padded_audio2 = np.pad(audio2, (0, max_length - len(audio2)), 'constant')
        combined_audio = padded_audio1 + padded_audio2
        combined_audios.append(combined_audio)

        # Combine labels by OR operation
        combined_label = np.logical_or(label1, label2).astype(float)
        combined_labels.append(combined_label)

    return combined_audios, combined_labels

In [None]:
combined_test_audios, combined_test_labels = combine_audios(train_audio_data, train_labels, n_samples= 40000, silent_ratio=0.05)
combined_val_audios, combined_val_labels = combine_audios(val_audio_data, val_labels, n_samples=10000, silent_ratio=0.05)


Combining audios randomly, including silent samples


Processing:   0%|          | 0/40000 [00:00<?, ?it/s]

Combining audios randomly, including silent samples


Processing:   0%|          | 0/10000 [00:00<?, ?it/s]

In [None]:
print(combined_test_labels)

[array([1., 1.]), array([1., 1.]), array([0., 1.]), array([0., 1.]), array([1., 0.]), array([1., 1.]), array([1., 1.]), array([0., 1.]), array([0., 1.]), array([1., 0.]), array([1., 0.]), array([1., 1.]), array([1., 0.]), array([1., 1.]), array([1., 1.]), array([1., 1.]), array([1., 1.]), array([1., 1.]), array([1., 0.]), array([0., 1.]), array([0., 1.]), array([0., 1.]), array([1., 1.]), array([0., 1.]), array([1., 1.]), array([1., 1.]), array([1., 1.]), array([0., 1.]), array([1., 1.]), array([0., 1.]), array([1., 1.]), array([0., 1.]), array([0., 1.]), array([0., 1.]), array([0., 1.]), array([1., 1.]), array([1., 1.]), array([1., 1.]), array([1., 1.]), array([0., 1.]), array([0., 1.]), array([1., 1.]), array([0., 1.]), array([1., 1.]), array([1., 1.]), array([0., 1.]), array([1., 1.]), array([1., 1.]), array([1., 0.]), array([1., 1.]), array([1., 0.]), array([0., 1.]), array([0., 1.]), array([1., 1.]), array([1., 1.]), array([0., 1.]), array([1., 1.]), array([1., 0.]), array([1., 1.

만들어진 소리 들어보기

In [24]:
import IPython.display as ipd
import soundfile as sf
import os

# 오디오 파일 저장 디렉토리 설정
augmented_audio_dir = "./augmented_audios"
os.makedirs(augmented_audio_dir, exist_ok=True)

# 몇 개의 오디오 샘플을 저장하고 재생합니다
num_samples_to_listen = min(3, len(combined_test_audios))  # 최대 3개의 오디오 샘플 재생

for i in range(num_samples_to_listen):
    # 오디오 저장 및 재생
    combined_audio_path = os.path.join(augmented_audio_dir, f"combined_audio_{i}.wav")
    sf.write(combined_audio_path, combined_test_audios[i], CONFIG.SR)
    print(f"Playing combined audio {i}")
    ipd.display(ipd.Audio(combined_audio_path))

Playing combined audio 0


Playing combined audio 1


Playing combined audio 2


augumented data랑 합치기

In [25]:
train_audio_data, train_labels = train_audio_data+combined_test_audios, train_labels+combined_test_labels
val_audio_data, val_labels = val_audio_data+combined_val_audios, val_labels+combined_val_labels

unlabeled data 에서 노이즈 학습

In [26]:
def load_audio_files_from_directory(directory, sr=32000):
    audio_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.ogg'):  # 필요한 경우 확장자 필터를 추가할 수 있습니다.
                file_path = os.path.join(root, file)
                y, _ = librosa.load(file_path, sr=sr)
                audio_files.append(y)
    return audio_files

In [27]:
unlabeled_data_directory = './unlabeled_data'
unlabeled_audio_files = load_audio_files_from_directory(unlabeled_data_directory)
print(f"Number of unlabeled audio files: {len(unlabeled_audio_files)}")

Number of unlabeled audio files: 1264


In [28]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

def extract_noise_segments(unlabeled_audio_files, sr=32000):
    noise_segments = []

    # 각 파일에서 노이즈 추출
    for y in tqdm(unlabeled_audio_files, desc="Extracting noise segments"):
        # STFT를 사용하여 시간-주파수 표현으로 변환
        S = np.abs(librosa.stft(y))
        # HPSS를 사용하여 퍼커시브 성분(노이즈) 분리
        _, noise = librosa.decompose.hpss(S)
        noise_segments.append(noise)

    return noise_segments

def cluster_noise_segments(noise_segments, n_components=10, n_clusters=5):
    features = []

    for noise in tqdm(noise_segments, desc="Extracting features from noise segments"):
        S = np.abs(noise)
        S_flat = S.T.reshape(-1)
        features.append(S_flat)

    features = np.array(features)

    # PCA 변환
    print("Performing PCA transformation...")
    pca = PCA(n_components=n_components)
    features_pca = pca.fit_transform(features)

    # KMeans 클러스터링
    print("Performing KMeans clustering...")
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(features_pca)

    noise_clusters = {i: [] for i in range(n_clusters)}
    for idx, label in tqdm(enumerate(kmeans.labels_), desc="Classifying noises", total=len(kmeans.labels_)):
        noise_clusters[label].append(noise_segments[idx])

    return noise_clusters

In [29]:
# 노이즈 세그먼트 추출
noise_segments = extract_noise_segments(unlabeled_audio_files)

# 노이즈 클러스터링
noise_clusters = cluster_noise_segments(noise_segments)

Extracting noise segments:   0%|          | 0/1264 [00:00<?, ?it/s]

Extracting features from noise segments:   0%|          | 0/1264 [00:00<?, ?it/s]

Performing PCA transformation...
Performing KMeans clustering...


Classifying noises:   0%|          | 0/1264 [00:00<?, ?it/s]

추출된 노이즈 들어보기

In [30]:
import librosa
import soundfile as sf
import IPython.display as ipd
import numpy as np

# 노이즈를 음성 파일에 적용하는 함수
def add_noise_to_audio(original_audio, noise, noise_factor=1):
    noise = np.tile(noise, int(np.ceil(len(original_audio) / len(noise))))[:len(original_audio)]
    noisy_audio = original_audio + noise_factor * noise
    noisy_audio = np.clip(noisy_audio, -1.0, 1.0)
    return noisy_audio

# 예제 음성 파일 경로 (실제 경로로 변경해야 합니다)
original_audio_path = '/content/train/AAACWKPZ.ogg'
y_original, sr = librosa.load(original_audio_path, sr=32000)

def apply_and_play_noise(noise_clusters, y_original, sr=32000, noise_factor=1):
    # 원본 오디오 재생 (한 번만)
    print(f"Playing original audio")
    ipd.display(ipd.Audio(y_original, rate=sr))

    for cluster_id, noises in noise_clusters.items():
        if len(noises) > 0:
            # 노이즈 샘플 선택 및 변환
            S = noises[0]
            noise_sample = librosa.istft(S)

            # 노이즈 추가
            noisy_audio = add_noise_to_audio(y_original, noise_sample, noise_factor)

            # 노이즈가 추가된 오디오를 오디오 파일로 저장
            noisy_audio_path = f'noisy_audio_cluster_{cluster_id}.wav'
            sf.write(noisy_audio_path, noisy_audio, sr)

            # 노이즈가 추가된 오디오 재생
            print(f"Playing noisy audio for cluster {cluster_id}")
            ipd.display(ipd.Audio(noisy_audio, rate=sr))

# 노이즈 클러스터 재생 호출
apply_and_play_noise(noise_clusters, y_original, sr=32000)


Playing original audio


Playing noisy audio for cluster 0


Playing noisy audio for cluster 1


Playing noisy audio for cluster 2


Playing noisy audio for cluster 3


Playing noisy audio for cluster 4


추출된 노이즈로 augumentation 적용

In [31]:
import numpy as np

def generate_white_noise(duration, sr):
    return np.random.normal(0, 1, int(duration * sr))

def generate_pink_noise(duration, sr):
    num_samples = int(duration * sr)
    rows = 16
    array = np.random.randn(rows, num_samples)
    array = np.cumsum(array, axis=-1)
    array = array / np.arange(1, rows + 1)[:, None]
    pink_noise = np.sum(array, axis=0)
    return pink_noise / np.max(np.abs(pink_noise))

def generate_brown_noise(duration, sr):
    num_samples = int(duration * sr)
    white_noise = np.random.normal(0, 1, num_samples)
    brown_noise = np.cumsum(white_noise)
    brown_noise = brown_noise / np.max(np.abs(brown_noise))
    return brown_noise

20%:추출된노이즈, 20% 일반적인 노이즈, 30% 추출+일반적인 노이즈 30%: 기존데이터

In [32]:
# 노이즈를 음성 데이터에 적용하는 함수
def add_noise_to_audio(original_audio, noise, noise_factor=0.6):
    noise = np.tile(noise, int(np.ceil(len(original_audio) / len(noise))))[:len(original_audio)]
    noisy_audio = original_audio + noise_factor * noise
    noisy_audio = np.clip(noisy_audio, -1.0, 1.0)
    return noisy_audio

# 노이즈 클러스터에서 노이즈 샘플을 적용하고 트레인 데이터를 증강
def augment_train_data_with_noise(train_audio_data, noise_clusters, sr=32000, noise_factor=0.6):
    num_files = len(train_audio_data)
    num_extract_noise = int(num_files * 0.2)
    num_general_noise = int(num_files * 0.2)
    num_both_noise = int(num_files * 0.3)

    indices_extract_noise = random.sample(range(num_files), num_extract_noise)
    remaining_indices = list(set(range(num_files)) - set(indices_extract_noise))
    indices_general_noise = random.sample(remaining_indices, num_general_noise)
    remaining_indices = list(set(remaining_indices) - set(indices_general_noise))
    indices_both_noise = random.sample(remaining_indices, num_both_noise)

    augmented_data = []

    for i, y_original in enumerate(tqdm(train_audio_data, desc="Augmenting train data with noise")):
        if i in indices_extract_noise:
            # 추출된 노이즈
            cluster_id = np.random.choice(list(noise_clusters.keys()))
            S = noise_clusters[cluster_id][0]
            noise_sample = librosa.istft(S)
            # 노이즈 추가
            noisy_audio = add_noise_to_audio(y_original, noise_sample, noise_factor)
            augmented_data.append(noisy_audio)

        elif i in indices_general_noise:
            # 일반적인 노이즈 생성
            duration = len(y_original) / sr
            noise_type = random.choice(['white', 'pink', 'brown'])
            if noise_type == 'white':
                noise_sample = generate_white_noise(duration, sr)
            elif noise_type == 'pink':
                noise_sample = generate_pink_noise(duration, sr)
            elif noise_type == 'brown':
                noise_sample = generate_brown_noise(duration, sr)
            # 노이즈 추가
            noisy_audio = add_noise_to_audio(y_original, noise_sample, noise_factor)
            augmented_data.append(noisy_audio)

        elif i in indices_both_noise:
            # 추출된 노이즈
            cluster_id = np.random.choice(list(noise_clusters.keys()))
            S = noise_clusters[cluster_id][0]
            extract_noise_sample = librosa.istft(S)
            # 일반적인 노이즈 생성
            duration = len(y_original) / sr
            noise_type = random.choice(['white', 'pink', 'brown'])
            if noise_type == 'white':
                general_noise_sample = generate_white_noise(duration, sr)
            elif noise_type == 'pink':
                general_noise_sample = generate_pink_noise(duration, sr)
            elif noise_type == 'brown':
                general_noise_sample = generate_brown_noise(duration, sr)
            # 두 노이즈 합성
            combined_noise = add_noise_to_audio(extract_noise_sample, general_noise_sample, noise_factor)
            # 노이즈 추가
            noisy_audio = add_noise_to_audio(y_original, combined_noise, noise_factor)
            augmented_data.append(noisy_audio)

        else:
            augmented_data.append(y_original)

    return augmented_data

In [33]:
augmented_train_audio = augment_train_data_with_noise(train_audio_data, noise_clusters)

Augmenting train data with noise:   0%|          | 0/84350 [00:00<?, ?it/s]

In [34]:
# 첫 번째 원본 및 증강된 오디오 비교
for i in range(3):
  print("Playing original audio")
  ipd.display(ipd.Audio(train_audio_data[i], rate=32000))

  print("Playing augmented audio")
  ipd.display(ipd.Audio(augmented_train_audio[i], rate=32000))

Playing original audio


Playing augmented audio


Playing original audio


Playing augmented audio


Playing original audio


Playing augmented audio


## Data Pre-processing : MFCC+ Spectogram

In [35]:
def extract_features(audio, sr=32000, n_mfcc=13, n_mels=128):
    # MFCC 추출
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    mfcc = np.mean(mfcc.T, axis=0)

    # 스펙트로그램 추출
    mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels)
    mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    mel_spec = np.mean(mel_spec.T, axis=0)

    # 특징 병합
    features = np.concatenate((mfcc, mel_spec))

    return features

# 여러 오디오 데이터에 대해 특징 추출
def extract_features_from_dataset(audio_dataset, sr=32000, n_mfcc=13, n_mels=128):
    feature_list = []
    for audio in tqdm(audio_dataset, desc="Extracting features"):
        features = extract_features(audio, sr, n_mfcc, n_mels)
        feature_list.append(features)
    return np.array(feature_list)

In [36]:
train_features = extract_features_from_dataset(augmented_train_audio, sr=CONFIG.SR, n_mfcc=CONFIG.N_MFCC, n_mels=CONFIG.N_MELS)
val_features = extract_features_from_dataset(val_audio_data, sr=CONFIG.SR, n_mfcc=CONFIG.N_MFCC, n_mels=CONFIG.N_MELS)

Extracting features:   0%|          | 0/84350 [00:00<?, ?it/s]

Extracting features:   0%|          | 0/21088 [00:00<?, ?it/s]

# Dataset

In [38]:
#feature_vector + label을 가지고 CustomDataset 생성
class CustomDataset(Dataset):
    def __init__(self, mfcc, label):
        self.mfcc = mfcc
        self.label = label

    def __len__(self):
        return len(self.mfcc)

    def __getitem__(self, index):
        if self.label is not None:
            return self.mfcc[index], self.label[index]
        return self.mfcc[index]

In [39]:
train_dataset = CustomDataset(train_features, train_labels)
val_dataset = CustomDataset(val_features, val_labels)

In [40]:
train_loader = DataLoader(
    train_dataset,
    batch_size=CONFIG.BATCH_SIZE,
    shuffle=True
)
val_loader = DataLoader(
    val_dataset,
    batch_size=CONFIG.BATCH_SIZE,
    shuffle=False
)

# Define Model

In [93]:
import torch
import torch.nn as nn
import torch.optim as optim

class CNNLSTMModel(nn.Module):
    def __init__(self, input_dim, cnn_output_dim, lstm_hidden_dim, num_classes):
        super(CNNLSTMModel, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv1d(input_dim, cnn_output_dim, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2)
        )
        self.lstm = nn.LSTM(cnn_output_dim, lstm_hidden_dim, batch_first=True)
        self.fc = nn.Linear(lstm_hidden_dim, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        if x.dim() == 2:
            x = x.unsqueeze(1)  # (batch_size, 1, sequence_length)
        elif x.dim() == 3:
            x = x.permute(0, 2, 1)  # (batch_size, input_dim, sequence_length)

        x = self.cnn(x)  # (batch_size, cnn_output_dim, new_seq_len)
        x = x.permute(0, 2, 1)  # (batch_size, cnn_output_dim, new_seq_len) -> (batch_size, new_seq_len, cnn_output_dim)
        x, _ = self.lstm(x)  # (batch_size, new_seq_len, lstm_hidden_dim)
        x = self.fc(x[:, -1, :])  # 마지막 시퀀스 출력 사용
        x = self.sigmoid(x)
        return x

# 모델 초기화 예시
input_dim = 1  # Conv1d에서의 input_dim은 실제로 채널 수를 의미합니다.
cnn_output_dim = 64
lstm_hidden_dim = 128
num_classes = 2

In [42]:
class EarlyStopping:
    def __init__(self, patience=7, verbose=False, delta=0):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta

    def __call__(self, val_loss, model):
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        self.val_loss_min = val_loss

# Train & Validation

In [94]:
from sklearn.metrics import roc_auc_score

def train(model, optimizer, scheduler, train_loader, val_loader, device):
    model.to(device)
    criterion = nn.BCELoss().to(device)  # Binary Cross-Entropy Loss 사용
    early_stopping = EarlyStopping(patience=CONFIG.EARLY_STOPPING_PATIENCE, verbose=True)

    best_val_score = 0
    best_model = None

    for epoch in range(1, CONFIG.N_EPOCHS+1):
        model.train()
        train_loss = []
        for features, labels in tqdm(iter(train_loader)):
            features = features.float().to(device)
            labels = labels.float().to(device)  # 라벨을 float 타입으로 변환

            optimizer.zero_grad()

            output = model(features)
            loss = criterion(output, labels)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        _val_loss, _val_score = validate(model, criterion, val_loader, device)
        _train_loss = np.mean(train_loss)
        print(f'Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Val Loss : [{_val_loss:.5f}] Val AUC : [{_val_score:.5f}]')

        if best_val_score < _val_score:
            best_val_score = _val_score
            best_model = model

        scheduler.step(_val_loss)

        early_stopping(_val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    return best_model


def validate(model, criterion, val_loader, device):
    model.eval()
    val_loss = []
    val_true = []
    val_pred = []
    with torch.no_grad():
        for features, labels in tqdm(iter(val_loader)):
            features = features.float().to(device)
            labels = labels.float().to(device)  # 라벨을 float 타입으로 변환

            output = model(features)
            loss = criterion(output, labels)

            val_loss.append(loss.item())

            val_true.extend(labels.cpu().numpy())
            val_pred.extend(output.cpu().numpy())

    val_loss = np.mean(val_loss)
    val_score = roc_auc_score(val_true, val_pred, average='macro')
    return val_loss, val_score

## Run

In [95]:
model = CNNLSTMModel(input_dim, cnn_output_dim, lstm_hidden_dim, num_classes)
optimizer = optim.Adam(params=model.parameters(), lr=CONFIG.LR)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=CONFIG.Scheduler_patience, verbose=True)
infer_model = train(model, optimizer, scheduler, train_loader, val_loader, device)

  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [1], Train Loss : [0.53913] Val Loss : [0.41960] Val AUC : [0.87977]
Validation loss decreased (inf --> 0.419596).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [2], Train Loss : [0.45216] Val Loss : [0.37636] Val AUC : [0.90195]
Validation loss decreased (0.419596 --> 0.376362).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [3], Train Loss : [0.41594] Val Loss : [0.34679] Val AUC : [0.91283]
Validation loss decreased (0.376362 --> 0.346786).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [4], Train Loss : [0.39079] Val Loss : [0.32396] Val AUC : [0.92003]
Validation loss decreased (0.346786 --> 0.323955).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [5], Train Loss : [0.37883] Val Loss : [0.31209] Val AUC : [0.92502]
Validation loss decreased (0.323955 --> 0.312094).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [6], Train Loss : [0.36552] Val Loss : [0.30109] Val AUC : [0.92796]
Validation loss decreased (0.312094 --> 0.301086).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [7], Train Loss : [0.35607] Val Loss : [0.30668] Val AUC : [0.92962]
EarlyStopping counter: 1 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [8], Train Loss : [0.34743] Val Loss : [0.29421] Val AUC : [0.93203]
Validation loss decreased (0.301086 --> 0.294213).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [9], Train Loss : [0.33910] Val Loss : [0.28884] Val AUC : [0.93318]
Validation loss decreased (0.294213 --> 0.288844).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [10], Train Loss : [0.33273] Val Loss : [0.28427] Val AUC : [0.93453]
Validation loss decreased (0.288844 --> 0.284272).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [11], Train Loss : [0.32694] Val Loss : [0.27911] Val AUC : [0.93716]
Validation loss decreased (0.284272 --> 0.279108).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [12], Train Loss : [0.32135] Val Loss : [0.27280] Val AUC : [0.93953]
Validation loss decreased (0.279108 --> 0.272801).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [13], Train Loss : [0.31811] Val Loss : [0.29687] Val AUC : [0.93620]
EarlyStopping counter: 1 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [14], Train Loss : [0.31327] Val Loss : [0.27882] Val AUC : [0.93958]
EarlyStopping counter: 2 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [15], Train Loss : [0.30831] Val Loss : [0.26291] Val AUC : [0.94283]
Validation loss decreased (0.272801 --> 0.262914).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [16], Train Loss : [0.30429] Val Loss : [0.26245] Val AUC : [0.94340]
Validation loss decreased (0.262914 --> 0.262445).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [17], Train Loss : [0.30046] Val Loss : [0.26206] Val AUC : [0.94412]
Validation loss decreased (0.262445 --> 0.262061).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [18], Train Loss : [0.29550] Val Loss : [0.26640] Val AUC : [0.94493]
EarlyStopping counter: 1 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [19], Train Loss : [0.29261] Val Loss : [0.25947] Val AUC : [0.94624]
Validation loss decreased (0.262061 --> 0.259467).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [20], Train Loss : [0.29090] Val Loss : [0.26517] Val AUC : [0.94490]
EarlyStopping counter: 1 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [21], Train Loss : [0.28737] Val Loss : [0.26663] Val AUC : [0.94452]
EarlyStopping counter: 2 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [22], Train Loss : [0.28439] Val Loss : [0.25238] Val AUC : [0.94768]
Validation loss decreased (0.259467 --> 0.252377).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [23], Train Loss : [0.28207] Val Loss : [0.25091] Val AUC : [0.94771]
Validation loss decreased (0.252377 --> 0.250909).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [24], Train Loss : [0.27826] Val Loss : [0.26024] Val AUC : [0.94929]
EarlyStopping counter: 1 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [25], Train Loss : [0.27580] Val Loss : [0.24710] Val AUC : [0.95034]
Validation loss decreased (0.250909 --> 0.247104).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [26], Train Loss : [0.27458] Val Loss : [0.24395] Val AUC : [0.94992]
Validation loss decreased (0.247104 --> 0.243948).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [27], Train Loss : [0.27108] Val Loss : [0.24538] Val AUC : [0.95040]
EarlyStopping counter: 1 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [28], Train Loss : [0.26971] Val Loss : [0.25520] Val AUC : [0.95036]
EarlyStopping counter: 2 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [29], Train Loss : [0.26661] Val Loss : [0.26885] Val AUC : [0.95049]
EarlyStopping counter: 3 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [30], Train Loss : [0.26530] Val Loss : [0.24019] Val AUC : [0.95171]
Validation loss decreased (0.243948 --> 0.240185).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [31], Train Loss : [0.26293] Val Loss : [0.24694] Val AUC : [0.95000]
EarlyStopping counter: 1 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [32], Train Loss : [0.25973] Val Loss : [0.23750] Val AUC : [0.95338]
Validation loss decreased (0.240185 --> 0.237501).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [33], Train Loss : [0.26087] Val Loss : [0.23792] Val AUC : [0.95181]
EarlyStopping counter: 1 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [34], Train Loss : [0.25706] Val Loss : [0.25096] Val AUC : [0.95108]
EarlyStopping counter: 2 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [35], Train Loss : [0.25566] Val Loss : [0.24364] Val AUC : [0.95197]
EarlyStopping counter: 3 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [36], Train Loss : [0.25304] Val Loss : [0.24074] Val AUC : [0.95327]
EarlyStopping counter: 4 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [37], Train Loss : [0.25249] Val Loss : [0.25218] Val AUC : [0.95366]
EarlyStopping counter: 5 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [38], Train Loss : [0.24990] Val Loss : [0.24399] Val AUC : [0.95270]
EarlyStopping counter: 6 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [39], Train Loss : [0.24803] Val Loss : [0.24098] Val AUC : [0.95311]
EarlyStopping counter: 7 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [40], Train Loss : [0.24692] Val Loss : [0.27205] Val AUC : [0.95316]
EarlyStopping counter: 8 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [41], Train Loss : [0.24546] Val Loss : [0.23883] Val AUC : [0.95284]
EarlyStopping counter: 9 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [42], Train Loss : [0.22657] Val Loss : [0.23128] Val AUC : [0.95612]
Validation loss decreased (0.237501 --> 0.231276).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [43], Train Loss : [0.22124] Val Loss : [0.23314] Val AUC : [0.95614]
EarlyStopping counter: 1 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [44], Train Loss : [0.21994] Val Loss : [0.23093] Val AUC : [0.95658]
Validation loss decreased (0.231276 --> 0.230933).  Saving model ...


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [45], Train Loss : [0.21879] Val Loss : [0.23228] Val AUC : [0.95635]
EarlyStopping counter: 1 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [46], Train Loss : [0.21806] Val Loss : [0.23150] Val AUC : [0.95639]
EarlyStopping counter: 2 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [47], Train Loss : [0.21717] Val Loss : [0.23200] Val AUC : [0.95628]
EarlyStopping counter: 3 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [48], Train Loss : [0.21658] Val Loss : [0.23286] Val AUC : [0.95651]
EarlyStopping counter: 4 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [49], Train Loss : [0.21600] Val Loss : [0.23389] Val AUC : [0.95636]
EarlyStopping counter: 5 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [50], Train Loss : [0.21555] Val Loss : [0.23517] Val AUC : [0.95649]
EarlyStopping counter: 6 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [51], Train Loss : [0.21491] Val Loss : [0.23343] Val AUC : [0.95660]
EarlyStopping counter: 7 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [52], Train Loss : [0.21455] Val Loss : [0.23265] Val AUC : [0.95637]
EarlyStopping counter: 8 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [53], Train Loss : [0.21398] Val Loss : [0.23403] Val AUC : [0.95644]
EarlyStopping counter: 9 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [54], Train Loss : [0.21081] Val Loss : [0.23367] Val AUC : [0.95656]
EarlyStopping counter: 10 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [55], Train Loss : [0.21040] Val Loss : [0.23427] Val AUC : [0.95659]
EarlyStopping counter: 11 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [56], Train Loss : [0.21029] Val Loss : [0.23381] Val AUC : [0.95660]
EarlyStopping counter: 12 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [57], Train Loss : [0.21021] Val Loss : [0.23446] Val AUC : [0.95657]
EarlyStopping counter: 13 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [58], Train Loss : [0.21009] Val Loss : [0.23429] Val AUC : [0.95659]
EarlyStopping counter: 14 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [59], Train Loss : [0.21002] Val Loss : [0.23448] Val AUC : [0.95656]
EarlyStopping counter: 15 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [60], Train Loss : [0.20993] Val Loss : [0.23440] Val AUC : [0.95657]
EarlyStopping counter: 16 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [61], Train Loss : [0.20989] Val Loss : [0.23448] Val AUC : [0.95658]
EarlyStopping counter: 17 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [62], Train Loss : [0.20983] Val Loss : [0.23507] Val AUC : [0.95659]
EarlyStopping counter: 18 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [63], Train Loss : [0.20941] Val Loss : [0.23486] Val AUC : [0.95658]
EarlyStopping counter: 19 out of 20


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

Epoch [64], Train Loss : [0.20937] Val Loss : [0.23474] Val AUC : [0.95658]
EarlyStopping counter: 20 out of 20
Early stopping


## Inference

In [99]:
test_td = pd.read_csv('./test.csv')

# 테스트 데이터셋의 오디오 파일 로드
def load_test_audio_data(test_df):
    audio_data = []
    for _, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
        y, sr = librosa.load(row['path'], sr=CONFIG.SR)
        audio_data.append(y)
    return audio_data

test_audio_data= load_test_audio_data(test_td)

  0%|          | 0/50000 [00:00<?, ?it/s]

In [100]:
test_features = extract_features_from_dataset(test_audio_data, sr=CONFIG.SR, n_mfcc=CONFIG.N_MFCC, n_mels=CONFIG.N_MELS)

# 테스트 데이터셋 생성
test_dataset = CustomDataset(test_features, None)

# DataLoader 설정
test_loader = DataLoader(
    test_dataset,
    batch_size=CONFIG.BATCH_SIZE,
    shuffle=False
)

Extracting features:   0%|          | 0/50000 [00:00<?, ?it/s]

In [101]:
def inference(model, test_loader, device):
    model.to(device)
    model.eval()
    predictions = []
    with torch.no_grad():
        for features in tqdm(iter(test_loader)):
            features = features.float().to(device)

            probs = model(features)

            probs  = probs.cpu().detach().numpy()
            predictions += probs.tolist()
    return predictions

In [102]:
preds = inference(infer_model, test_loader, device)

  0%|          | 0/521 [00:00<?, ?it/s]

## Submission

In [103]:
submit = pd.read_csv('./sample_submission.csv')
submit.iloc[:, 1:] = preds
submit.head()

Unnamed: 0,id,fake,real
0,TEST_00000,0.99984,0.000284
1,TEST_00001,0.918741,0.064937
2,TEST_00002,0.998594,0.006905
3,TEST_00003,0.982384,0.016924
4,TEST_00004,0.866201,0.820884


In [104]:
submit.to_csv('./baseline_submit.csv', index=False)

In [105]:
from google.colab import files
files.download('./baseline_submit.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>