In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import librosa
import numpy as np
import whisper
import pandas as pd
import soundfile as sf
import random
import os
import torch.optim as optim
from sklearn.model_selection import train_test_split
from scipy.signal import resample
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split
from src.models import (
    lcnn,
    specrnet,
    whisper_specrnet,
    rawnet3,
    whisper_lcnn,
    meso_net,
    whisper_meso_net
)
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def update_paths(data, base_path):
    """Rebase the ``'./'``-relative entries of ``data['path']`` onto ``base_path``.

    Only a *leading* ``'./'`` is substituted. A plain ``str.replace('./', ...)``
    would also match the ``'./'`` hidden inside ``'../'``, so running the
    function twice (for example by re-executing a notebook cell) would corrupt
    paths that had already been rebased. With the prefix check the operation
    is idempotent.

    Args:
        data: DataFrame with a string column named ``path``. The column is
            overwritten in place.
        base_path: Text that replaces the leading ``'./'`` of each path.

    Returns:
        The same ``data`` object, with its ``path`` column updated.
    """
    def _rebase(path):
        # Paths that do not start with './' are left exactly as they are.
        return f'{base_path}{path[2:]}' if path.startswith('./') else path

    data['path'] = data['path'].apply(_rebase)
    return data

def load_and_split_data(csv_file, base_path='../SW/', seed=None):
    """Read the metadata CSV and build three frames of paired clips.

    The CSV must provide ``path`` and ``label`` columns. Rows labelled
    ``'real'`` and ``'fake'`` are each down-sampled to the size of the smaller
    class, then combined side by side into real+real, real+fake and fake+fake
    pairs. In every returned frame the columns of the second clip carry a
    ``'.1'`` suffix (for example ``path`` and ``path.1``).

    Args:
        csv_file: Path of the CSV file to read.
        base_path: Prefix handed to ``update_paths`` to rebase the ``path``
            column.
        seed: Optional integer seed. When given, the sampling and both
            shuffles are drawn from a private ``numpy.random.RandomState`` so
            the pairing is reproducible. When ``None`` (the default) the
            global NumPy random state is used.

    Returns:
        Tuple ``(real_real, real_fake, fake_fake)`` of DataFrames, each with
        ``min(n_real, n_fake)`` rows.
    """
    data = pd.read_csv(csv_file)
    data = update_paths(data, base_path)

    # Split the rows by class.
    real_data = data[data['label'] == 'real']
    fake_data = data[data['label'] == 'fake']

    # Balance the two classes by truncating both to the smaller size.
    min_length = min(len(real_data), len(fake_data))

    # One private generator for every random draw when a seed is supplied;
    # otherwise fall back to the global NumPy state.
    rng = np.random.RandomState(seed) if seed is not None else None
    permute = rng.permutation if rng is not None else np.random.permutation

    real_data = real_data.sample(min_length, random_state=rng).reset_index(drop=True)
    fake_data = fake_data.sample(min_length, random_state=rng).reset_index(drop=True)

    # Two independent row orders: one for the first clip of each pair and one
    # for the second clip.
    first_order = permute(min_length)
    second_order = permute(min_length)

    real_first = real_data.iloc[first_order].reset_index(drop=True)
    fake_first = fake_data.iloc[first_order].reset_index(drop=True)
    real_second = real_data.iloc[second_order].reset_index(drop=True).add_suffix('.1')
    fake_second = fake_data.iloc[second_order].reset_index(drop=True).add_suffix('.1')

    # NOTE: wherever the two orders happen to agree at the same position, the
    # real+real and fake+fake frames pair a clip with itself.
    real_real = pd.concat([real_first, real_second], axis=1)
    real_fake = pd.concat([real_first, fake_second], axis=1)
    fake_fake = pd.concat([fake_first, fake_second], axis=1)

    return real_real, real_fake, fake_fake

In [6]:
# Build the real+real, real+fake and fake+fake pair frames from the training CSV.
real_real, real_fake, fake_fake = load_and_split_data('../SW/train.csv')

In [7]:
# Preview the first rows of the real+real and real+fake pair frames.
print(real_real.head())
print(real_fake.head())

         id                      path label      id.1  \
0  LAIUZNEC  ../SW/train/LAIUZNEC.ogg  real  HBUZAZXG   
1  TQXMDGGR  ../SW/train/TQXMDGGR.ogg  real  UUKHJKCW   
2  RYPKYFPA  ../SW/train/RYPKYFPA.ogg  real  NXRMXSXE   
3  NPCLVTZZ  ../SW/train/NPCLVTZZ.ogg  real  DFVUEIAC   
4  RIJGVOPW  ../SW/train/RIJGVOPW.ogg  real  QWRMCZOT   

                     path.1 label.1  
0  ../SW/train/HBUZAZXG.ogg    real  
1  ../SW/train/UUKHJKCW.ogg    real  
2  ../SW/train/NXRMXSXE.ogg    real  
3  ../SW/train/DFVUEIAC.ogg    real  
4  ../SW/train/QWRMCZOT.ogg    real  
         id                      path label      id.1  \
0  LAIUZNEC  ../SW/train/LAIUZNEC.ogg  real  OYLDHFON   
1  TQXMDGGR  ../SW/train/TQXMDGGR.ogg  real  NLXQBKQV   
2  RYPKYFPA  ../SW/train/RYPKYFPA.ogg  real  LBJYEANR   
3  NPCLVTZZ  ../SW/train/NPCLVTZZ.ogg  real  GGXVWDDW   
4  RIJGVOPW  ../SW/train/RIJGVOPW.ogg  real  GZTVGYWA   

                     path.1 label.1  
0  ../SW/train/OYLDHFON.ogg    fake  
1  ../SW/t

In [11]:
# Show the column names of each pair frame (second-clip columns end in '.1').
print("real_real columns:", real_real.columns)
print("real_fake columns:", real_fake.columns)
print("fake_fake columns:", fake_fake.columns)

real_real columns: Index(['id', 'path', 'label', 'id.1', 'path.1', 'label.1'], dtype='object')
real_fake columns: Index(['id', 'path', 'label', 'id.1', 'path.1', 'label.1'], dtype='object')
fake_fake columns: Index(['id', 'path', 'label', 'id.1', 'path.1', 'label.1'], dtype='object')


In [2]:
def resample_audio(data, original_rate, target_rate):
    """Resample ``data`` from ``original_rate`` to ``target_rate``.

    The output length is the input length scaled by the rate ratio and
    rounded to the nearest integer; the samples come from
    ``scipy.signal.resample``.

    Args:
        data: Sequence of samples (its ``len`` is the number of frames).
        original_rate: Sample rate the data currently has.
        target_rate: Sample rate to convert to.

    Returns:
        The resampled samples as returned by ``scipy.signal.resample``.
    """
    source_length = len(data)
    target_length = round(source_length * float(target_rate) / original_rate)
    return resample(data, target_length)

def concatenate_audios_overlap(file_path1, file_path2, output_path):
    """Overlay two audio files sample-by-sample and write the mix to disk.

    Despite the name, the clips are not placed one after the other: they are
    brought to a common sample rate, channel layout and length, and then
    added together.

    Args:
        file_path1: Path of the first audio file.
        file_path2: Path of the second audio file.
        output_path: Destination file. Its parent directory is created when
            it does not exist yet.

    Returns:
        None. The mixed signal is written to ``output_path``.
    """
    data1, samplerate1 = sf.read(file_path1)
    data2, samplerate2 = sf.read(file_path2)

    # Bring the lower-rate clip up to the higher of the two sample rates.
    if samplerate1 != samplerate2:
        if samplerate1 > samplerate2:
            data2 = resample_audio(data2, samplerate2, samplerate1)
            samplerate2 = samplerate1
        else:
            data1 = resample_audio(data1, samplerate1, samplerate2)
            samplerate1 = samplerate2

    # sf.read yields 1-D arrays for mono and (frames, channels) otherwise.
    # Arrays with different channel layouts cannot be added, so average the
    # channels of any multi-channel clip whenever the layouts disagree.
    if data1.shape[1:] != data2.shape[1:]:
        if data1.ndim > 1:
            data1 = data1.mean(axis=1)
        if data2.ndim > 1:
            data2 = data2.mean(axis=1)

    # Truncate both clips to the shorter one before summing.
    common_length = min(len(data1), len(data2))
    combined = data1[:common_length] + data2[:common_length]
    # NOTE(review): the plain sum can exceed the amplitude range of either
    # input; confirm whether the writer clips and whether that matters here.

    # Make sure the destination folder exists before writing.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    sf.write(output_path, combined, samplerate1)
def create_dir_if_not_exists(directory):
    """Create ``directory`` (and any missing parents) unless it already exists.

    ``exist_ok=True`` removes the gap between checking for the directory and
    creating it, so a directory that appears in between is not an error.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)

def create_audio_set(real_real, real_fake, fake_fake):
    """Write one mixed ``.ogg`` file for every row of the three pair frames.

    Files go to ``combined_audio/real_real``, ``combined_audio/real_fake`` and
    ``combined_audio/fake_fake`` and are named after the row index. Each
    destination folder is created before anything is written into it.

    Args:
        real_real: Frame of real+real pairs with ``path`` and ``path.1`` columns.
        real_fake: Frame of real+fake pairs with ``path`` and ``path.1`` columns.
        fake_fake: Frame of fake+fake pairs with ``path`` and ``path.1`` columns.
    """
    pair_sets = (
        ('real_real', 'real-real', real_real),
        ('real_fake', 'real-fake', real_fake),
        ('fake_fake', 'fake-fake', fake_fake),
    )

    for folder_name, pair_name, pairs in pair_sets:
        out_dir = f'combined_audio/{folder_name}'
        # Creates 'combined_audio' as well as the per-pair-type subfolder.
        create_dir_if_not_exists(out_dir)

        for i, row in tqdm(pairs.iterrows(), total=pairs.shape[0], desc=f"Processing {pair_name} pairs"):
            concatenate_audios_overlap(row['path'], row['path.1'], f'{out_dir}/{i}.ogg')

def main(csv_file):
    """Build the three pair frames from ``csv_file`` and write their mixed audio.

    Args:
        csv_file: Path of the metadata CSV passed to ``load_and_split_data``.
    """
    pair_frames = load_and_split_data(csv_file)
    create_audio_set(*pair_frames)

# Script-style entry point: generate the mixed audio set from the training CSV.
if __name__ == '__main__':
    csv_file = '../SW/train.csv'  # path of the actual CSV file
    main(csv_file)

NameError: name 'load_and_split_data' is not defined

In [3]:
# Sample rate (Hz) that apply_preprocessing resamples every waveform to.
SAMPLING_RATE = 16_000
# Passed as ``normalize=`` to ``torchaudio.load`` when a file is read.
APPLY_NORMALIZATION = True
# Toggles the SoX silence trimming step of apply_preprocessing.
APPLY_TRIMMING = True
# Toggles the fixed-length crop/tile step of apply_preprocessing.
APPLY_PADDING = True
# Length (in samples) of every preprocessed waveform: 30 s at SAMPLING_RATE.
FRAMES_NUMBER = 480_000
# Not referenced by the visible cells; presumably the analysis window and hop
# sizes in samples -- TODO confirm against the model frontend.
win_length = 400
hop_length = 160

# Effect chain handed to torchaudio's SoX bindings by apply_trim. The SoX
# ``silence`` effect is configured with a 0.2 s duration and a 1% threshold
# for both the leading rule ("1") and the repeated rule ("-1").
SOX_SILENCE = [
    ["silence", "1", "0.2", "1%", "-1", "0.2", "1%"],
]

def resample_audio(data, original_rate, target_rate):
    """Return ``data`` resampled from ``original_rate`` to ``target_rate``.

    The number of output samples is ``len(data)`` scaled by
    ``target_rate / original_rate`` and rounded; ``scipy.signal.resample``
    does the conversion. A function with this name and behavior is also
    defined in an earlier cell of this notebook.
    """
    return resample(
        data,
        round(len(data) * float(target_rate) / original_rate),
    )

def apply_preprocessing(waveform, sample_rate):
    """Run the waveform through the resample / mono / trim / pad pipeline.

    Steps, in order:
      1. resample to ``SAMPLING_RATE`` when the incoming rate differs;
      2. keep only the first row when the tensor has more than one row along
         its first dimension;
      3. trim silence when ``APPLY_TRIMMING`` is set;
      4. crop or tile to ``FRAMES_NUMBER`` samples when ``APPLY_PADDING`` is set.

    Args:
        waveform: Audio tensor as returned by ``torchaudio.load``.
        sample_rate: Sample rate of ``waveform``.

    Returns:
        Tuple ``(waveform, sample_rate)`` after preprocessing.
    """
    needs_resampling = sample_rate != SAMPLING_RATE
    if needs_resampling:
        resampled = resample_wave(waveform, sample_rate, SAMPLING_RATE)
        waveform, sample_rate = resampled

    has_extra_rows = waveform.dim() > 1 and waveform.shape[0] > 1
    if has_extra_rows:
        # Slice with :1 so the leading dimension is kept.
        waveform = waveform[:1, ...]

    if APPLY_TRIMMING:
        trimmed = apply_trim(waveform, sample_rate)
        waveform, sample_rate = trimmed

    if APPLY_PADDING:
        waveform = apply_pad(waveform, FRAMES_NUMBER)

    return waveform, sample_rate


def resample_wave(waveform, sample_rate, target_sample_rate):
    """Resample ``waveform`` to ``target_sample_rate`` with the SoX ``rate`` effect.

    Args:
        waveform: Audio tensor to resample.
        sample_rate: Current sample rate of ``waveform``.
        target_sample_rate: Sample rate requested from SoX.

    Returns:
        Tuple ``(waveform, sample_rate)`` produced by
        ``torchaudio.sox_effects.apply_effects_tensor``.
    """
    rate_effect = ["rate", f"{target_sample_rate}"]
    resampled, new_rate = torchaudio.sox_effects.apply_effects_tensor(
        waveform, sample_rate, [rate_effect]
    )
    return resampled, new_rate


def apply_trim(waveform, sample_rate):
    """Apply the ``SOX_SILENCE`` effect chain, unless it would remove everything.

    Args:
        waveform: Audio tensor to trim.
        sample_rate: Sample rate of ``waveform``.

    Returns:
        The trimmed ``(waveform, sample_rate)`` when the trimmed tensor still
        has samples along dimension 1; otherwise the inputs unchanged.
    """
    trimmed, trimmed_rate = torchaudio.sox_effects.apply_effects_tensor(
        waveform, sample_rate, SOX_SILENCE
    )

    # Fall back to the untrimmed audio when trimming left nothing behind.
    kept_samples = trimmed.size()[1]
    if not kept_samples > 0:
        return waveform, sample_rate

    return trimmed, trimmed_rate


def apply_pad(waveform, cut):
    """Crop or tile ``waveform`` so that it holds exactly ``cut`` samples.

    A leading dimension of size one is squeezed away first. Longer signals
    are cropped to their first ``cut`` samples; shorter ones are repeated end
    to end and then cropped. A signal with no samples has nothing to repeat,
    so it yields ``cut`` zeros instead of failing with a division by zero.

    Args:
        waveform: Audio tensor, e.g. of shape ``(1, num_samples)``.
        cut: Number of samples in the result.

    Returns:
        Tensor with ``cut`` samples.
    """
    waveform = waveform.squeeze(0)
    waveform_len = waveform.shape[0]

    if waveform_len >= cut:
        return waveform[:cut]

    if waveform_len == 0:
        # Nothing to tile: return silence with the input's dtype and device.
        return waveform.new_zeros(cut)

    # One repeat more than the floor ratio always covers ``cut`` samples.
    num_repeats = int(cut / waveform_len) + 1
    padded_waveform = torch.tile(waveform, (1, num_repeats))[:, :cut][0]

    return padded_waveform

In [4]:
class SimpleAudioDataset(Dataset):
    """Dataset over the mixed-audio folders plus the clips listed in a CSV.

    Every sample carries a two-element label ``[fake, real]``:
    real+real mixes are ``[0, 1]``, real+fake mixes ``[1, 1]``, fake+fake
    mixes ``[1, 0]``; CSV rows are ``[1, 0]`` when their label is ``'fake'``
    and ``[0, 1]`` otherwise.

    Folder contents are listed in sorted order so that the mapping from index
    to file is the same on every run and on every file system.
    """

    def __init__(self, real_real_dir, real_fake_dir, fake_fake_dir, csv_file,
                 transform=None, return_meta=False, base_path='../SW/'):
        """Collect the file paths and labels of all samples.

        Args:
            real_real_dir: Folder holding the real+real ``.ogg`` mixes.
            real_fake_dir: Folder holding the real+fake ``.ogg`` mixes.
            fake_fake_dir: Folder holding the fake+fake ``.ogg`` mixes.
            csv_file: CSV with ``path`` and ``label`` columns for single clips.
            transform: Stored on the instance; ``__getitem__`` does not apply it.
            return_meta: When true, ``__getitem__`` appends a metadata tuple.
            base_path: Prefix put in front of every ``path`` read from the CSV.
        """
        self.real_real_dir = real_real_dir
        self.real_fake_dir = real_fake_dir
        self.fake_fake_dir = fake_fake_dir
        self.samples = pd.read_csv(csv_file)
        self.transform = transform
        self.return_meta = return_meta
        self.base_path = base_path

        self.data_files = []
        self.labels = []

        # Mixed clips read from the three folders.
        self.load_folder_data(self.real_real_dir, [0, 1])
        self.load_folder_data(self.real_fake_dir, [1, 1])
        self.load_folder_data(self.fake_fake_dir, [1, 0])

        # Single clips listed in the CSV.
        for _, row in self.samples.iterrows():
            self.data_files.append(base_path + row['path'])
            self.labels.append([1, 0] if row['label'] == 'fake' else [0, 1])

    def load_folder_data(self, folder_path, label):
        """Register every ``.ogg`` file of ``folder_path`` with the given label.

        Args:
            folder_path: Folder to scan (non-recursive).
            label: Two-element label assigned to each file that is found.
        """
        for filename in sorted(os.listdir(folder_path)):
            if filename.endswith('.ogg'):
                self.data_files.append(os.path.join(folder_path, filename))
                # Store a copy so samples never share one mutable label list.
                self.labels.append(list(label))

    def __len__(self):
        """Return the number of registered samples."""
        return len(self.data_files)

    def __getitem__(self, index):
        """Load and preprocess the sample at ``index``.

        Returns:
            List ``[waveform, sample_rate, label]``; when ``return_meta`` is
            set, a tuple ``(file_id, path, real_sec_length)`` is appended,
            where ``real_sec_length`` is the clip duration in seconds before
            preprocessing.
        """
        path = self.data_files[index]
        label = self.labels[index]

        waveform, sample_rate = torchaudio.load(path, normalize=APPLY_NORMALIZATION)
        # Duration of the raw clip, measured before any resampling or trimming.
        real_sec_length = len(waveform[0]) / sample_rate

        waveform, sample_rate = apply_preprocessing(waveform, sample_rate)
        label = torch.tensor(label, dtype=torch.float)
        return_data = [waveform, sample_rate, label]
        if self.return_meta:
            file_id = os.path.basename(path).split('.')[0]
            return_data.append((file_id, path, real_sec_length))

        return return_data

# Build the dataset, split it 80/20 and wrap both parts in DataLoaders.
if __name__ == '__main__':
    real_real_dir = './combined_audio/real_real'
    real_fake_dir = './combined_audio/real_fake'
    fake_fake_dir = './combined_audio/fake_fake'
    csv_file = '../SW/train.csv'

    dataset = SimpleAudioDataset(real_real_dir, real_fake_dir, fake_fake_dir, csv_file, return_meta=True)
    train_size = int(0.8 * len(dataset))  # 80% of data for training
    val_size = len(dataset) - train_size  # Remaining 20% for validation

    # Seed the split so the same samples land in train/validation on every
    # run; without a generator the partition changes with each execution.
    # NOTE(review): the mixes under combined_audio are built from train.csv
    # clips, so a random split can put a clip and a mix containing it on
    # opposite sides -- confirm whether a source-level split is needed.
    split_generator = torch.Generator().manual_seed(42)
    train_dataset, val_dataset = random_split(
        dataset, [train_size, val_size], generator=split_generator
    )

    # Create DataLoader for train and validation sets
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)

In [5]:
# Keyword settings read when the FrontendMesoInception4 model is constructed.
model_config = dict(
    fc1_dim=1024,
    frontend_algorithm=["mfcc"],
    input_channels=1,
)

In [6]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Instantiate the model from the values held in model_config.
model = meso_net.FrontendMesoInception4(
    fc1_dim=model_config['fc1_dim'],
    frontend_algorithm=model_config['frontend_algorithm'],
    input_channels=model_config['input_channels'],
    device=device,
)

Using ['mfcc'] frontend


In [7]:
# Print the module tree of the instantiated model.
print(model)

FrontendMesoInception4(
  (Incption1_conv1): Conv2d(1, 1, kernel_size=(1, 1), stride=(1, 1), bias=False)
  (Incption1_conv2_1): Conv2d(1, 4, kernel_size=(1, 1), stride=(1, 1), bias=False)
  (Incption1_conv2_2): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (Incption1_conv3_1): Conv2d(1, 4, kernel_size=(1, 1), stride=(1, 1), bias=False)
  (Incption1_conv3_2): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)
  (Incption1_conv4_1): Conv2d(1, 2, kernel_size=(1, 1), stride=(1, 1), bias=False)
  (Incption1_conv4_2): Conv2d(2, 2, kernel_size=(3, 3), stride=(1, 1), padding=(3, 3), dilation=(3, 3), bias=False)
  (Incption1_bn): BatchNorm2d(11, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (Incption2_conv1): Conv2d(11, 2, kernel_size=(1, 1), stride=(1, 1), bias=False)
  (Incption2_conv2_1): Conv2d(11, 4, kernel_size=(1, 1), stride=(1, 1), bias=False)
  (Incption2_conv2_2): Conv2d(4, 4, kernel_size=(3,

In [18]:
from tqdm import tqdm

def train(model, train_loader, val_loader, criterion, optimizer, device, epochs, save_path):
    """Train ``model`` and keep the weights with the lowest validation loss.

    Each epoch runs one optimisation pass over ``train_loader``, prints the
    mean of the per-batch training losses, evaluates on ``val_loader`` through
    ``validate`` and writes ``model.state_dict()`` to ``save_path`` whenever
    the validation loss improves on the best value seen so far.

    Args:
        model: Network to optimise; called as ``model(waveforms)``.
        train_loader: Iterable of batches ``(waveforms, sample_rates, labels, *meta)``.
        val_loader: Loader forwarded to ``validate``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates the model parameters.
        device: Device the waveforms and labels are moved to.
        epochs: Number of epochs to run.
        save_path: File that receives the best ``state_dict``.
    """
    lowest_val_loss = float('inf')

    for epoch_number in range(1, epochs + 1):
        model.train()  # switch to training mode
        loss_sum = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch_number}/{epochs} Training")

        for waveforms, _sample_rates, labels, *_meta in progress_bar:
            waveforms = waveforms.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()  # clear gradients left over from the last step
            batch_loss = criterion(model(waveforms), labels)
            batch_loss.backward()  # back-propagate
            optimizer.step()  # update the weights

            batch_loss_value = batch_loss.item()
            loss_sum += batch_loss_value
            progress_bar.set_postfix({'loss': batch_loss_value})

        mean_train_loss = loss_sum / len(train_loader)
        print(f"Epoch {epoch_number}/{epochs}, Training Loss: {mean_train_loss:.4f}")

        # Validation pass.
        val_loss = validate(model, val_loader, criterion, device)
        print(f"Epoch {epoch_number}/{epochs}, Validation Loss: {val_loss:.4f}")

        # Checkpoint only when the validation loss improved.
        if val_loss < lowest_val_loss:
            lowest_val_loss = val_loss
            torch.save(model.state_dict(), save_path)
            print(f"Model saved at {save_path}")

def validate(model, val_loader, criterion, device):
    """Return the mean per-batch loss of ``model`` over ``val_loader``.

    The model is switched to evaluation mode and gradients are disabled for
    the whole pass.

    Args:
        model: Network to evaluate; called as ``model(waveforms)``.
        val_loader: Iterable of batches ``(waveforms, sample_rates, labels, *meta)``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        device: Device the waveforms and labels are moved to.

    Returns:
        Sum of the batch losses divided by ``len(val_loader)``.
    """
    model.eval()  # switch to evaluation mode
    loss_sum = 0.0
    progress_bar = tqdm(val_loader, desc="Validating")

    with torch.no_grad():
        for waveforms, _sample_rates, labels, *_meta in progress_bar:
            waveforms = waveforms.to(device)
            labels = labels.to(device)

            batch_loss_value = criterion(model(waveforms), labels).item()
            loss_sum += batch_loss_value
            progress_bar.set_postfix({'loss': batch_loss_value})

    return loss_sum / len(val_loader)


In [None]:
# Move the model to the selected device and launch training.
model = model.to(device)
# NOTE(review): SimpleAudioDataset emits two-element float targets, including
# [1, 1] for real+fake mixes; CrossEntropyLoss presumably treats float targets
# as per-class probabilities -- confirm whether a multi-label loss such as
# BCEWithLogitsLoss was intended.
criterion = nn.CrossEntropyLoss()  # binary-classification loss function
optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.0001)  # optimizer setup
epochs = 100  # number of epochs
train(model, train_loader, val_loader, criterion, optimizer, device, epochs, 'model.pth')

Epoch 1/100 Training: 100%|█████████████████████████████████████████████| 3458/3458 [29:46<00:00,  1.94it/s, loss=0.505]


Epoch 1/100, Training Loss: 0.4902


Validating: 100%|█████████████████████████████████████████████████████████| 865/865 [01:15<00:00, 11.49it/s, loss=0.406]


Epoch 1/100, Validation Loss: 0.5853
Model saved at model.pth


Epoch 2/100 Training: 100%|██████████████████████████████████████████████| 3458/3458 [29:43<00:00,  1.94it/s, loss=0.44]


Epoch 2/100, Training Loss: 0.4191


Validating: 100%|█████████████████████████████████████████████████████████| 865/865 [01:16<00:00, 11.35it/s, loss=0.386]


Epoch 2/100, Validation Loss: 0.5590
Model saved at model.pth


Epoch 3/100 Training: 100%|██████████████████████████████████████████████| 3458/3458 [29:41<00:00,  1.94it/s, loss=0.25]


Epoch 3/100, Training Loss: 0.4011


Validating: 100%|█████████████████████████████████████████████████████████| 865/865 [01:16<00:00, 11.33it/s, loss=0.462]


Epoch 3/100, Validation Loss: 0.6995


Epoch 4/100 Training: 100%|█████████████████████████████████████████████| 3458/3458 [29:42<00:00,  1.94it/s, loss=0.641]


Epoch 4/100, Training Loss: 0.3907


Validating: 100%|█████████████████████████████████████████████████████████| 865/865 [01:18<00:00, 10.96it/s, loss=0.401]


Epoch 4/100, Validation Loss: 0.6225


Epoch 5/100 Training: 100%|█████████████████████████████████████████████| 3458/3458 [29:42<00:00,  1.94it/s, loss=0.083]


Epoch 5/100, Training Loss: 0.3833


Validating: 100%|█████████████████████████████████████████████████████████| 865/865 [01:16<00:00, 11.35it/s, loss=0.278]


Epoch 5/100, Validation Loss: 0.4453
Model saved at model.pth


Epoch 6/100 Training: 100%|█████████████████████████████████████████████| 3458/3458 [29:43<00:00,  1.94it/s, loss=0.207]


Epoch 6/100, Training Loss: 0.3785


Validating: 100%|█████████████████████████████████████████████████████████| 865/865 [01:15<00:00, 11.42it/s, loss=0.273]


Epoch 6/100, Validation Loss: 0.4614


Epoch 7/100 Training: 100%|██████████████████████████████████████████████| 3458/3458 [29:43<00:00,  1.94it/s, loss=0.58]


Epoch 7/100, Training Loss: 0.3741


Validating: 100%|█████████████████████████████████████████████████████████| 865/865 [01:22<00:00, 10.51it/s, loss=0.215]


Epoch 7/100, Validation Loss: 0.4422
Model saved at model.pth


Epoch 8/100 Training: 100%|██████████████████████████████████████████████| 3458/3458 [29:43<00:00,  1.94it/s, loss=0.33]


Epoch 8/100, Training Loss: 0.3712


Validating: 100%|█████████████████████████████████████████████████████████| 865/865 [01:16<00:00, 11.35it/s, loss=0.256]


Epoch 8/100, Validation Loss: 0.4461


Epoch 9/100 Training: 100%|█████████████████████████████████████████████| 3458/3458 [29:41<00:00,  1.94it/s, loss=0.293]


Epoch 9/100, Training Loss: 0.3677


Validating: 100%|█████████████████████████████████████████████████████████| 865/865 [01:16<00:00, 11.28it/s, loss=0.238]


Epoch 9/100, Validation Loss: 0.4669


Epoch 10/100 Training: 100%|████████████████████████████████████████████| 3458/3458 [29:42<00:00,  1.94it/s, loss=0.341]


Epoch 10/100, Training Loss: 0.3644


Validating: 100%|█████████████████████████████████████████████████████████| 865/865 [01:20<00:00, 10.80it/s, loss=0.246]


Epoch 10/100, Validation Loss: 0.4599


Epoch 11/100 Training: 100%|████████████████████████████████████████████| 3458/3458 [29:42<00:00,  1.94it/s, loss=0.314]


Epoch 11/100, Training Loss: 0.3629


Validating: 100%|█████████████████████████████████████████████████████████| 865/865 [01:17<00:00, 11.11it/s, loss=0.235]


Epoch 11/100, Validation Loss: 0.5200


Epoch 12/100 Training: 100%|████████████████████████████████████████████| 3458/3458 [29:42<00:00,  1.94it/s, loss=0.337]


Epoch 12/100, Training Loss: 0.3602


Validating: 100%|█████████████████████████████████████████████████████████| 865/865 [01:16<00:00, 11.26it/s, loss=0.234]


Epoch 12/100, Validation Loss: 0.4344
Model saved at model.pth


Epoch 13/100 Training: 100%|████████████████████████████████████████████| 3458/3458 [29:42<00:00,  1.94it/s, loss=0.432]


Epoch 13/100, Training Loss: 0.3578


Validating: 100%|█████████████████████████████████████████████████████████| 865/865 [01:18<00:00, 11.07it/s, loss=0.186]


Epoch 13/100, Validation Loss: 0.4154
Model saved at model.pth


Epoch 14/100 Training: 100%|████████████████████████████████████████████| 3458/3458 [29:42<00:00,  1.94it/s, loss=0.581]


Epoch 14/100, Training Loss: 0.3558


Validating: 100%|█████████████████████████████████████████████████████████| 865/865 [01:16<00:00, 11.36it/s, loss=0.289]


Epoch 14/100, Validation Loss: 0.6039


Epoch 15/100 Training: 100%|█████████████████████████████████████████████| 3458/3458 [29:53<00:00,  1.93it/s, loss=0.16]


Epoch 15/100, Training Loss: 0.3542


Validating: 100%|█████████████████████████████████████████████████████████| 865/865 [01:16<00:00, 11.27it/s, loss=0.208]


Epoch 15/100, Validation Loss: 0.4580


Epoch 16/100 Training: 100%|████████████████████████████████████████████| 3458/3458 [29:43<00:00,  1.94it/s, loss=0.257]


Epoch 16/100, Training Loss: 0.3531


Validating: 100%|█████████████████████████████████████████████████████████| 865/865 [01:17<00:00, 11.12it/s, loss=0.252]


Epoch 16/100, Validation Loss: 0.5375


Epoch 17/100 Training: 100%|████████████████████████████████████████████| 3458/3458 [29:41<00:00,  1.94it/s, loss=0.579]


Epoch 17/100, Training Loss: 0.3517


Validating:  97%|███████████████████████████████████████████████████████▍ | 841/865 [01:14<00:02, 11.49it/s, loss=0.971]