In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio

from torch.utils.data import DataLoader, Dataset, ConcatDataset

import numpy as np
import matplotlib.pyplot as plt

import os

class NoiseDataset(Dataset):
    """Dataset of fixed-length, high-energy ("noisy") frames cut from one audio file.

    The file is loaded with torchaudio, resampled to ``sample_rate`` and scanned
    in non-overlapping windows of ``frame_size`` samples. Only windows whose mean
    energy is strictly greater than ``threshold`` are exposed as items.
    """

    def __init__(self, audio_file, sample_rate=16000, frame_size=1024, threshold=0.05):
        """Load ``audio_file``, resample it and index its noisy frames.

        Parameters:
        - audio_file: path passed to ``torchaudio.load``.
        - sample_rate (int): target sampling rate after resampling.
        - frame_size (int): number of samples per frame.
        - threshold (float): minimum mean energy for a frame to be kept.
        """
        waveform, orig_sample_rate = torchaudio.load(audio_file)
        resampler = torchaudio.transforms.Resample(orig_freq=orig_sample_rate, new_freq=sample_rate)
        self.waveform = resampler(waveform)
        self.sample_rate = sample_rate
        self.frame_size = frame_size
        self.threshold = threshold
        self.noise_indices = self._detect_noise(self.waveform)

    def _detect_noise(self, waveform):
        """Return the start offsets of every full frame whose mean energy exceeds the threshold.

        ``waveform`` is indexed as (dim 0, time); energy is the squared amplitude
        averaged over dim 0.
        """
        energy = waveform.pow(2).mean(dim=0)
        # "+ 1" keeps the final frame when the length is an exact multiple of
        # frame_size; without it that last full frame was silently dropped.
        last_start = waveform.size(1) - self.frame_size
        return [
            start
            for start in range(0, last_start + 1, self.frame_size)
            if energy[start:start + self.frame_size].mean().item() > self.threshold
        ]

    def __len__(self):
        """Number of noisy frames found in the file."""
        return len(self.noise_indices)

    def __getitem__(self, idx):
        """Return ``(frame, frame)`` for the ``idx``-th noisy frame.

        The frame is the waveform slice with a trailing singleton axis appended.
        The same tensor is returned as both input and target.
        """
        start_idx = self.noise_indices[idx]
        end_idx = start_idx + self.frame_size

        noisy_segment = self.waveform[:, start_idx:end_idx].unsqueeze(-1)

        return noisy_segment, noisy_segment

    @staticmethod
    def extract_highest_noise_segment(waveform, sample_rate=16000, segment_duration=10):
        """Extract the non-overlapping segment with the highest total energy.

        Parameters:
        - waveform (torch.Tensor): audio waveform, indexed as (dim 0, time).
        - sample_rate (int): sampling rate.
        - segment_duration (int): length of the extracted segment, in seconds.

        Returns:
        - torch.Tensor: the slice of ``waveform`` covering the loudest segment.
          If no segment has positive energy (or the waveform is shorter than one
          segment) the slice starting at sample 0 is returned.
        """
        energy = waveform.pow(2).mean(dim=0)

        # int() lets callers pass fractional durations without breaking range().
        segment_samples = int(segment_duration * sample_rate)

        max_energy = 0
        max_index = 0

        # "+ 1" so the last full segment is also a candidate when the length is
        # an exact multiple of the segment size.
        for i in range(0, waveform.size(1) - segment_samples + 1, segment_samples):
            segment_energy = energy[i:i + segment_samples].sum().item()
            if segment_energy > max_energy:
                max_energy = segment_energy
                max_index = i

        return waveform[:, max_index:max_index + segment_samples]
    
class AntiNoiseLSTM(nn.Module):
    """Stacked LSTM followed by a linear projection applied at every time step."""

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        """Build the recurrent stack (batch-first) and the output projection."""
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Run ``x`` through the LSTM and project each hidden state to ``output_size``."""
        hidden_states, _ = self.lstm(x)
        return self.fc(hidden_states)
    
def _resolve_device(requested=None):
    """Pick the torch device: explicit argument, else a module-level ``device``, else MPS/CPU."""
    if requested is not None:
        return torch.device(requested)
    # Honour a notebook-level `device` variable when one has been defined.
    notebook_device = globals().get("device")
    if notebook_device is not None:
        return torch.device(notebook_device)
    return torch.device('mps:0' if torch.backends.mps.is_available() else 'cpu')


def _prepare_inputs(inputs, target_device=None):
    """Cast a batch to float32, move it to ``target_device`` and normalise it to 3-D."""
    # Cast before moving so the tensor is already float32 when it reaches the device.
    inputs = inputs.float()
    if target_device is not None:
        inputs = inputs.to(target_device)

    if inputs.dim() == 2:
        inputs = inputs.unsqueeze(1)   # add a length-1 middle axis
    elif inputs.dim() == 4:
        inputs = inputs.squeeze(-1)    # drop the trailing singleton axis
    elif inputs.dim() != 3:
        raise ValueError(f"Unexpected input dimension {inputs.dim()} with shape {inputs.shape}")
    return inputs


def train_model(dataloader, model, criterion, optimizer, num_epochs = 20, validation_loader = None, device = None):
    """Train ``model`` to output the negated input (anti-noise) and return loss histories.

    Parameters:
    - dataloader: iterable of ``(inputs, targets)`` batches; the provided targets
      are ignored because the training target is always ``-inputs``.
    - model: module called as ``model(inputs)``.
    - criterion: loss called as ``criterion(outputs, targets)``.
    - optimizer: optimizer over ``model``'s parameters.
    - num_epochs (int): number of passes over ``dataloader``.
    - validation_loader: optional loader scored with ``evaluate_model`` each epoch.
    - device: optional device; defaults to the module-level ``device`` if defined,
      otherwise MPS when available, else CPU.

    Returns:
    - (train_losses, val_losses): per-epoch mean losses; ``val_losses`` is empty
      when no validation loader is given.

    Raises:
    - ValueError: if ``dataloader`` yields no batches, or a batch is not 2-, 3- or 4-D.
    """
    train_losses = []
    val_losses = []

    target_device = _resolve_device(device)
    model.to(target_device)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0

        for i, (inputs, _) in enumerate(dataloader):
            inputs = _prepare_inputs(inputs, target_device)
            targets = -inputs  # anti-noise: the phase-inverted input

            outputs = model(inputs)
            loss = criterion(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

            if (i + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(dataloader)}], Loss: {loss.item():.4f}')

        if num_batches == 0:
            raise ValueError("dataloader yielded no batches; nothing to train on")

        epoch_loss = running_loss / num_batches
        train_losses.append(epoch_loss)

        if validation_loader is not None:
            val_loss = evaluate_model(validation_loader, model, criterion)
            val_losses.append(val_loss)
            print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {epoch_loss:.4f}, Validation Loss: {val_loss:.4f}')
        else:
            print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {epoch_loss:.4f}')

    return train_losses, val_losses

def evaluate_model(dataloader, model, criterion):
    """Return the mean loss of ``model`` over ``dataloader`` against the anti-noise target.

    Batches are prepared exactly as in ``train_model`` (same shape handling, same
    ``-inputs`` target) and are moved to the device the model's parameters live on.

    Raises:
    - ValueError: if ``dataloader`` yields no batches, or a batch is not 2-, 3- or 4-D.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for inputs, _ in dataloader:
            inputs = _prepare_inputs(inputs, model_device)
            outputs = model(inputs)
            total_loss += criterion(outputs, -inputs).item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("dataloader yielded no batches; nothing to evaluate")

    return total_loss / num_batches

# Model and optimiser hyper-parameters.
# input_size / output_size must equal the dataset frame size (1024): after
# train_model drops the trailing axis, batches are (batch, channels, 1024), so
# the LSTM sees 1024 features per step and must emit 1024 values to match the
# `-inputs` target.
val = {
    'input_size' : 1024,
    'hidden_size' : 512,
    'num_layers' : 2,
    'output_size' : 1024,
    'learning_rate' : 0.01
}

def load_datasets_from_folder(folder_path, sample_rate=16000, frame_size=1024, threshold=0.05):
    """
    Build one NoiseDataset per .wav file in a folder and concatenate them.

    Parameters:
    - folder_path (str): path of the folder containing the .wav files
    - sample_rate (int): sampling rate
    - frame_size (int): frame size
    - threshold (float): noise detection threshold

    Returns:
    - ConcatDataset: the combined dataset

    Raises:
    - ValueError: if the folder contains no .wav files
    """
    # Sorted so the dataset order does not depend on the file system's listing
    # order; the extension check is case-insensitive so ".WAV" is also picked up.
    wav_names = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith(".wav")
    )
    if not wav_names:
        raise ValueError(f"No .wav files found in {folder_path!r}")

    datasets = [
        NoiseDataset(os.path.join(folder_path, name), sample_rate, frame_size, threshold)
        for name in wav_names
    ]
    return ConcatDataset(datasets)

# Training data directory. Set the NOISE_TRAIN_DIR environment variable to run
# on another machine; the default is the original author's local path.
folder_path = os.environ.get(
    'NOISE_TRAIN_DIR',
    '/Users/junggwonhee/Desktop/programing/오아시스_해커톤/project/data/극한_소리_데이터/Training/Sound',
)
combined_dataset = load_datasets_from_folder(folder_path)

# pin_memory speeds up host-to-CUDA copies; enable it only when CUDA is present.
dataloader = DataLoader(combined_dataset, batch_size = 64, shuffle = True, pin_memory = torch.cuda.is_available())

anti_noise_model = AntiNoiseLSTM(val['input_size'], val['hidden_size'], val['num_layers'], val['output_size'])
criterion = nn.MSELoss()
optimizer = optim.Adam(anti_noise_model.parameters(), lr = val['learning_rate'])

train_model(dataloader, anti_noise_model, criterion, optimizer)

def evaluate_and_plot_with_dataloader(model, dataloader, max_batches=None):
    """Plot the first item of each batch next to the model's prediction for it.

    Parameters:
    - model: module called as ``model(inputs)``; put into eval mode.
    - dataloader: iterable of ``(inputs, targets)`` batches; targets are ignored.
    - max_batches (int | None): stop after this many batches; ``None`` (default)
      plots every batch, as before.

    Inputs are moved to the device the model's parameters live on, so the
    function works wherever the model currently is (e.g. after it has been
    moved to the CPU for saving).
    """
    model.eval()
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    for batch_index, (inputs, _) in enumerate(dataloader):
        if max_batches is not None and batch_index >= max_batches:
            break

        # Drop the trailing singleton axis and cast before moving to the device.
        inputs = inputs.squeeze(-1).float()
        if model_device is not None:
            inputs = inputs.to(model_device)

        with torch.no_grad():
            outputs = model(inputs)

        input_waveform = inputs[0].cpu().numpy().flatten()
        output_waveform = outputs[0].cpu().numpy().flatten()

        print(f"Input waveform shape: {input_waveform.shape}")
        print(f"Output waveform shape: {output_waveform.shape}")
        print(f"Input waveform first 10 values: {input_waveform[:10]}")
        print(f"Output waveform first 10 values: {output_waveform[:10]}")

        plt.figure(figsize=(14, 6))
        plt.plot(input_waveform, label='Original Noise')
        plt.plot(output_waveform, label='Predicted Anti-Noise', color='orange', alpha=0.7)
        plt.title('Original Noise vs Predicted Anti-Noise')
        plt.xlabel('Sample index')
        plt.ylabel('Amplitude')
        plt.legend()
        plt.show()

# Validation data directory. Set the NOISE_VAL_DIR environment variable to run
# on another machine; the default is the original author's local path.
test_folder_path = os.environ.get(
    'NOISE_VAL_DIR',
    '/Users/junggwonhee/Desktop/programing/오아시스_해커톤/project/data/극한_소리_데이터/Validation/sound',
)
test_combined_dataset = load_datasets_from_folder(test_folder_path)

test_dataloader = DataLoader(test_combined_dataset, batch_size = 64, shuffle=True)

evaluate_and_plot_with_dataloader(anti_noise_model, test_dataloader)

# 선언

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio

from torch.utils.data import DataLoader, Dataset, ConcatDataset

import numpy as np
import matplotlib.pyplot as plt

import os

In [2]:
# Prefer the Apple-silicon GPU (MPS) when this PyTorch build can use it; otherwise run on the CPU.
if torch.backends.mps.is_available():
    device = torch.device('mps:0')
else:
    device = torch.device('cpu')

In [3]:
# Report the PyTorch build and platform. The platform string is read in-process
# so it describes the kernel's own interpreter, not whichever `python` is on PATH.
import platform

print(f"PyTorch version:{torch.__version__}")
print(f"MPS 장치를 지원하도록 build 되었는지: {torch.backends.mps.is_built()}")
print(f"MPS 장치가 사용 가능한지: {torch.backends.mps.is_available()}")
print(platform.platform())

PyTorch version:2.4.0
MPS 장치를 지원하도록 build 되었는지: True
MPS 장치가 사용 가능한지: True
macOS-14.6.1-arm64-arm-64bit


# 데이터 전처리

1. 소음 구간 태깅

2. 데이터 분할

In [7]:
class NoiseDataset(Dataset):
    """Dataset of fixed-length, high-energy ("noisy") frames cut from one audio file.

    The file is loaded with torchaudio, resampled to ``sample_rate`` and scanned
    in non-overlapping windows of ``frame_size`` samples. Only windows whose mean
    energy is strictly greater than ``threshold`` are exposed as items.
    """

    def __init__(self, audio_file, sample_rate=16000, frame_size=1024, threshold=0.05):
        """Load ``audio_file``, resample it and index its noisy frames.

        Parameters:
        - audio_file: path passed to ``torchaudio.load``.
        - sample_rate (int): target sampling rate after resampling.
        - frame_size (int): number of samples per frame.
        - threshold (float): minimum mean energy for a frame to be kept.
        """
        waveform, orig_sample_rate = torchaudio.load(audio_file)
        resampler = torchaudio.transforms.Resample(orig_freq=orig_sample_rate, new_freq=sample_rate)
        self.waveform = resampler(waveform)
        self.sample_rate = sample_rate
        self.frame_size = frame_size
        self.threshold = threshold
        self.noise_indices = self._detect_noise(self.waveform)

    def _detect_noise(self, waveform):
        """Return the start offsets of every full frame whose mean energy exceeds the threshold.

        ``waveform`` is indexed as (dim 0, time); energy is the squared amplitude
        averaged over dim 0.
        """
        energy = waveform.pow(2).mean(dim=0)
        # "+ 1" keeps the final frame when the length is an exact multiple of
        # frame_size; without it that last full frame was silently dropped.
        last_start = waveform.size(1) - self.frame_size
        return [
            start
            for start in range(0, last_start + 1, self.frame_size)
            if energy[start:start + self.frame_size].mean().item() > self.threshold
        ]

    def __len__(self):
        """Number of noisy frames found in the file."""
        return len(self.noise_indices)

    def __getitem__(self, idx):
        """Return ``(frame, frame)`` for the ``idx``-th noisy frame.

        The frame is the waveform slice with a trailing singleton axis appended.
        The same tensor is returned as both input and target.
        """
        start_idx = self.noise_indices[idx]
        end_idx = start_idx + self.frame_size

        noisy_segment = self.waveform[:, start_idx:end_idx].unsqueeze(-1)

        return noisy_segment, noisy_segment

    @staticmethod
    def extract_highest_noise_segment(waveform, sample_rate=16000, segment_duration=10):
        """Extract the non-overlapping segment with the highest total energy.

        Parameters:
        - waveform (torch.Tensor): audio waveform, indexed as (dim 0, time).
        - sample_rate (int): sampling rate.
        - segment_duration (int): length of the extracted segment, in seconds.

        Returns:
        - torch.Tensor: the slice of ``waveform`` covering the loudest segment.
          If no segment has positive energy (or the waveform is shorter than one
          segment) the slice starting at sample 0 is returned.
        """
        energy = waveform.pow(2).mean(dim=0)

        # int() lets callers pass fractional durations without breaking range().
        segment_samples = int(segment_duration * sample_rate)

        max_energy = 0
        max_index = 0

        # "+ 1" so the last full segment is also a candidate when the length is
        # an exact multiple of the segment size.
        for i in range(0, waveform.size(1) - segment_samples + 1, segment_samples):
            segment_energy = energy[i:i + segment_samples].sum().item()
            if segment_energy > max_energy:
                max_energy = segment_energy
                max_index = i

        return waveform[:, max_index:max_index + segment_samples]

# LSTM 모델

In [8]:
class AntiNoiseLSTM(nn.Module):
    """Stacked LSTM followed by a linear projection applied at every time step."""

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        """Build the recurrent stack (batch-first) and the output projection."""
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Run ``x`` through the LSTM and project each hidden state to ``output_size``."""
        hidden_states, _ = self.lstm(x)
        return self.fc(hidden_states)

# 학습 및 평가

In [9]:
def _resolve_device(requested=None):
    """Pick the torch device: explicit argument, else a module-level ``device``, else MPS/CPU."""
    if requested is not None:
        return torch.device(requested)
    # Honour a notebook-level `device` variable when one has been defined.
    notebook_device = globals().get("device")
    if notebook_device is not None:
        return torch.device(notebook_device)
    return torch.device('mps:0' if torch.backends.mps.is_available() else 'cpu')


def _prepare_inputs(inputs, target_device=None):
    """Cast a batch to float32, move it to ``target_device`` and normalise it to 3-D."""
    # Cast before moving so the tensor is already float32 when it reaches the device.
    inputs = inputs.float()
    if target_device is not None:
        inputs = inputs.to(target_device)

    if inputs.dim() == 2:
        inputs = inputs.unsqueeze(1)   # add a length-1 middle axis
    elif inputs.dim() == 4:
        inputs = inputs.squeeze(-1)    # drop the trailing singleton axis
    elif inputs.dim() != 3:
        raise ValueError(f"Unexpected input dimension {inputs.dim()} with shape {inputs.shape}")
    return inputs


def train_model(dataloader, model, criterion, optimizer, num_epochs = 20, validation_loader = None, device = None):
    """Train ``model`` to output the negated input (anti-noise) and return loss histories.

    Parameters:
    - dataloader: iterable of ``(inputs, targets)`` batches; the provided targets
      are ignored because the training target is always ``-inputs``.
    - model: module called as ``model(inputs)``.
    - criterion: loss called as ``criterion(outputs, targets)``.
    - optimizer: optimizer over ``model``'s parameters.
    - num_epochs (int): number of passes over ``dataloader``.
    - validation_loader: optional loader scored with ``evaluate_model`` each epoch.
    - device: optional device; defaults to the module-level ``device`` if defined,
      otherwise MPS when available, else CPU.

    Returns:
    - (train_losses, val_losses): per-epoch mean losses; ``val_losses`` is empty
      when no validation loader is given.

    Raises:
    - ValueError: if ``dataloader`` yields no batches, or a batch is not 2-, 3- or 4-D.
    """
    train_losses = []
    val_losses = []

    target_device = _resolve_device(device)
    model.to(target_device)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0

        for i, (inputs, _) in enumerate(dataloader):
            inputs = _prepare_inputs(inputs, target_device)
            targets = -inputs  # anti-noise: the phase-inverted input

            outputs = model(inputs)
            loss = criterion(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

            if (i + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(dataloader)}], Loss: {loss.item():.4f}')

        if num_batches == 0:
            raise ValueError("dataloader yielded no batches; nothing to train on")

        epoch_loss = running_loss / num_batches
        train_losses.append(epoch_loss)

        if validation_loader is not None:
            val_loss = evaluate_model(validation_loader, model, criterion)
            val_losses.append(val_loss)
            print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {epoch_loss:.4f}, Validation Loss: {val_loss:.4f}')
        else:
            print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {epoch_loss:.4f}')

    return train_losses, val_losses

def evaluate_model(dataloader, model, criterion):
    """Return the mean loss of ``model`` over ``dataloader`` against the anti-noise target.

    Batches are prepared exactly as in ``train_model`` (same shape handling, same
    ``-inputs`` target) and are moved to the device the model's parameters live on.

    Raises:
    - ValueError: if ``dataloader`` yields no batches, or a batch is not 2-, 3- or 4-D.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for inputs, _ in dataloader:
            inputs = _prepare_inputs(inputs, model_device)
            outputs = model(inputs)
            total_loss += criterion(outputs, -inputs).item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("dataloader yielded no batches; nothing to evaluate")

    return total_loss / num_batches

# 파라미터 선언

In [10]:
# Model and optimiser hyper-parameters, looked up by key when the model and
# optimiser are constructed.
val = dict(
    input_size=1024,
    hidden_size=512,
    num_layers=2,
    output_size=1024,
    learning_rate=0.01,
)

# 데이터 불러오기

In [1]:
def load_datasets_from_folder(folder_path, sample_rate=16000, frame_size=1024, threshold=0.05):
    """
    Build one NoiseDataset per .wav file in a folder and concatenate them.

    Parameters:
    - folder_path (str): path of the folder containing the .wav files
    - sample_rate (int): sampling rate
    - frame_size (int): frame size
    - threshold (float): noise detection threshold

    Returns:
    - ConcatDataset: the combined dataset

    Raises:
    - ValueError: if the folder contains no .wav files
    """
    # Sorted so the dataset order does not depend on the file system's listing
    # order; the extension check is case-insensitive so ".WAV" is also picked up.
    wav_names = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith(".wav")
    )
    if not wav_names:
        raise ValueError(f"No .wav files found in {folder_path!r}")

    datasets = [
        NoiseDataset(os.path.join(folder_path, name), sample_rate, frame_size, threshold)
        for name in wav_names
    ]
    return ConcatDataset(datasets)

# Training data directory. Set the NOISE_TRAIN_DIR environment variable to run
# on another machine; the default is the original author's local path.
folder_path = os.environ.get(
    'NOISE_TRAIN_DIR',
    '/Users/junggwonhee/Desktop/programing/오아시스_해커톤/project/data/극한_소리_데이터/Training/Sound',
)
combined_dataset = load_datasets_from_folder(folder_path)

# pin_memory speeds up host-to-CUDA copies; enable it only when CUDA is present.
dataloader = DataLoader(combined_dataset, batch_size = 64, shuffle = True, pin_memory = torch.cuda.is_available())

SyntaxError: unterminated triple-quoted string literal (detected at line 24) (413871890.py, line 11)

# 모델, 손실 함수, 옵티마이저 초기화

In [12]:
# Build the network from the hyper-parameter dict, with an MSE loss and an Adam
# optimiser over all of the network's parameters.
anti_noise_model = AntiNoiseLSTM(
    input_size=val['input_size'],
    hidden_size=val['hidden_size'],
    num_layers=val['num_layers'],
    output_size=val['output_size'],
)
criterion = nn.MSELoss()
optimizer = optim.Adam(params=anti_noise_model.parameters(), lr=val['learning_rate'])

# 학습

In [None]:
# Run training with the default epoch count and no validation loader. The
# returned (train_losses, val_losses) tuple is echoed as the cell's output.
train_model(
    dataloader=dataloader,
    model=anti_noise_model,
    criterion=criterion,
    optimizer=optimizer,
)

# 모델 저장

In [14]:
import pickle

# Move the weights to the CPU before serialising. This changes the model in
# place, so it stays on the CPU for any later cell.
anti_noise_model.cpu()

# NOTE(review): this pickles the whole module object, so loading it requires the
# AntiNoiseLSTM class to be importable, and unpickling executes code - only load
# this file from a trusted source.
with open('Noise_Lower_LSTM_vol_1.pkl', 'wb') as model_file:
    pickle.dump(anti_noise_model, model_file)

# 테스트

In [32]:
def evaluate_and_plot_with_dataloader(model, dataloader, max_batches=None):
    """Plot the first item of each batch next to the model's prediction for it.

    Parameters:
    - model: module called as ``model(inputs)``; put into eval mode.
    - dataloader: iterable of ``(inputs, targets)`` batches; targets are ignored.
    - max_batches (int | None): stop after this many batches; ``None`` (default)
      plots every batch, as before.

    Inputs are moved to the device the model's parameters live on, so the
    function works wherever the model currently is (e.g. after it has been
    moved to the CPU for saving).
    """
    model.eval()
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    for batch_index, (inputs, _) in enumerate(dataloader):
        if max_batches is not None and batch_index >= max_batches:
            break

        # Drop the trailing singleton axis and cast before moving to the device.
        inputs = inputs.squeeze(-1).float()
        if model_device is not None:
            inputs = inputs.to(model_device)

        with torch.no_grad():
            outputs = model(inputs)

        input_waveform = inputs[0].cpu().numpy().flatten()
        output_waveform = outputs[0].cpu().numpy().flatten()

        print(f"Input waveform shape: {input_waveform.shape}")
        print(f"Output waveform shape: {output_waveform.shape}")
        print(f"Input waveform first 10 values: {input_waveform[:10]}")
        print(f"Output waveform first 10 values: {output_waveform[:10]}")

        plt.figure(figsize=(14, 6))
        plt.plot(input_waveform, label='Original Noise')
        plt.plot(output_waveform, label='Predicted Anti-Noise', color='orange', alpha=0.7)
        plt.title('Original Noise vs Predicted Anti-Noise')
        plt.xlabel('Sample index')
        plt.ylabel('Amplitude')
        plt.legend()
        plt.show()


# 테스트 실행

In [None]:
# Validation data directory. Set the NOISE_VAL_DIR environment variable to run
# on another machine; the default is the original author's local path.
test_folder_path = os.environ.get(
    'NOISE_VAL_DIR',
    '/Users/junggwonhee/Desktop/programing/오아시스_해커톤/project/data/극한_소리_데이터/Validation/sound',
)
test_combined_dataset = load_datasets_from_folder(test_folder_path)

test_dataloader = DataLoader(test_combined_dataset, batch_size = 64, shuffle=True)

evaluate_and_plot_with_dataloader(anti_noise_model, test_dataloader)

# 실시간 오디오 테스트