<a href="https://colab.research.google.com/github/knggu/EUANGGG/blob/haikoo/maincode/data/experiment/ASTFeatureExtractor_exp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so the dataset folder used below is
# reachable. This only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from transformers import ASTFeatureExtractor, AutoProcessor
import librosa
import zipfile
import torch
from transformers import AutoProcessor
import os
import glob
from tqdm import tqdm
from transformers import ASTConfig, ASTModel
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, random_split
from transformers import ASTForAudioClassification, Trainer, TrainingArguments

In [3]:
import torch

# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise,
# and report which one was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print({"cuda": "PyTorch is using GPU 🟢", "cpu": "PyTorch is using CPU 🟡"}[device.type])

PyTorch is using GPU 🟢


In [4]:
# Root folder of the dataset on the mounted Drive; its sub-folders are used as
# class folders by the cells below.
dir_path = '/content/drive/MyDrive/Colab Notebooks/euanggg/dataset/set 2.2'
# Shown as the cell output: True when the folder is reachable.
os.path.exists(dir_path)

True

In [5]:
# Every immediate sub-directory of dir_path is treated as one class folder.
# Hidden directories (such as the '.ipynb_checkpoints' folder Jupyter may
# create) are filtered out rather than removed with list.remove(), which
# raised ValueError whenever that folder did not exist.
# The order returned by os.walk is kept as-is because later cells index the
# derived audio_dir list by position.
class_name = [d for d in next(os.walk(dir_path))[1] if not d.startswith('.')]
class_name

['hungry', 'discomfort', 'bpain', 'tired']

In [6]:
# Full path of each class folder, in the same order as class_name.
audio_dir = list(map(lambda folder: os.path.join(dir_path, folder), class_name))
audio_dir

['/content/drive/MyDrive/Colab Notebooks/euanggg/dataset/set 2.2/hungry',
 '/content/drive/MyDrive/Colab Notebooks/euanggg/dataset/set 2.2/discomfort',
 '/content/drive/MyDrive/Colab Notebooks/euanggg/dataset/set 2.2/bpain',
 '/content/drive/MyDrive/Colab Notebooks/euanggg/dataset/set 2.2/tired']

In [7]:
# Resolve each class folder by its NAME instead of by its position in
# audio_dir: the order of directories returned by os.walk is not guaranteed,
# so positional indexing could silently attach the wrong label to a folder.
# A missing folder now fails loudly with KeyError.
_dir_of = {os.path.basename(path): path for path in audio_dir}

# sorted() makes the file order independent of the filesystem listing order.
bpain_audio = sorted(glob.glob(os.path.join(_dir_of['bpain'], '*.wav')))
discomf_audio = sorted(glob.glob(os.path.join(_dir_of['discomfort'], '*.wav')))
hungry_audio = sorted(glob.glob(os.path.join(_dir_of['hungry'], '*.wav')))
tired_audio = sorted(glob.glob(os.path.join(_dir_of['tired'], '*.wav')))

In [8]:
# Sanity check: number of .wav files found for the 'hungry' class.
len(hungry_audio)

100

In [9]:
# Class label -> list of .wav paths for that class.
# The insertion order (bpain, discomf, hungry, tired) matters: the labelling
# cell further down assigns integer labels by enumerating keys in this order.
audio_path_class = dict(
    bpain=bpain_audio,
    discomf=discomf_audio,
    hungry=hungry_audio,
    tired=tired_audio,
)

## **Librosa로 오디오 불러오기**

In [10]:
def convert_audio(pathdict, sr=16000):
    """Load every audio file listed in ``pathdict`` as a waveform.

    Args:
        pathdict: Mapping from class label to an iterable of audio file paths.
        sr: Target sampling rate handed to ``librosa.load``. Defaults to
            16000, the value that used to be hard-coded here.

    Returns:
        dict: Maps each class label to a list with one loaded waveform per
        file, in the same order as the input paths.
    """
    audio_load = {}

    for label, paths in tqdm(pathdict.items()):
        # librosa.load returns a (waveform, sampling_rate) pair; only the
        # waveform is kept because the rate is fixed by `sr`.
        audio_load[label] = [librosa.load(file, sr=sr)[0] for file in paths]
    return audio_load

In [11]:
# Load all waveforms, grouped by class label (keys follow audio_path_class).
loaded_audio = convert_audio(audio_path_class)

100%|██████████| 4/4 [00:03<00:00,  1.19it/s]


## **AutoProcessor 불러오기**

In [12]:
# Load the preprocessing object published with the pretrained AST checkpoint.
# It is called below with a waveform and a sampling rate to produce
# 'input_values' for the model.
processor = AutoProcessor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [13]:
# Convert every waveform into model input features, grouped by class label.
#
# Fixes over the previous version:
#   * the keyword is `return_tensors` (plural); the misspelled `return_tensor`
#     was silently ignored. 'np' is requested so each result is a NumPy array
#     that can be stacked directly below.
#   * the loop variables no longer shadow the builtin `input` nor rebind the
#     notebook-level `class_name` list to a string.
input_byclass = {}
for label, waveforms in loaded_audio.items():
    per_file_features = []
    for waveform in waveforms:
        features = processor(waveform, sampling_rate=16000, return_tensors='np')
        # Each entry keeps a leading size-1 batch axis; it is dropped later
        # with squeeze(1) after all classes are concatenated.
        per_file_features.append(features['input_values'])
    input_byclass[label] = np.array(per_file_features)

In [14]:
# One feature array per class, in the key order of input_byclass.
all_data = list(input_byclass.values())
# Matching label vectors: the i-th class gets integer label i for every one of
# its samples.
all_labels = [np.full(len(class_features), label)
              for label, class_features in enumerate(all_data)]

In [15]:
# Join the per-class feature arrays and label vectors along axis 0 so that
# there is one array of samples and one aligned array of labels.
# NOTE(review): this rebinds all_data / all_labels from lists to ndarrays, so
# the cell should not be re-run on its own; re-run the list-building cell
# first.
all_data = np.concatenate(all_data, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

In [16]:
# Convert to PyTorch tensors
all_data_tensor = torch.tensor(all_data, dtype=torch.float32)
all_labels_tensor = torch.tensor(all_labels, dtype=torch.long)

In [17]:
# Remove the size-1 axis at position 1 (the per-clip batch axis kept during
# feature extraction), leaving one 2-D feature map per sample.
squeezed_data = all_data_tensor.squeeze(1)

In [18]:
# Sanity check: one label per sample.
all_labels_tensor.shape

torch.Size([340])

In [19]:
# Pair every feature map with its integer class label.
dataset = TensorDataset(squeezed_data, all_labels_tensor)

# 80 % of the samples go to training, the remainder to validation.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# A seeded generator makes the partition identical on every run, so the
# accuracy reported at the end of the notebook can be reproduced.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# Validation order does not influence the accuracy, so no shuffling is needed.
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

## **모델 불러오기**

In [20]:
# Load the pretrained AST encoder and move it to the selected device.
# NOTE(review): `model` is rebound to a CustomASTClassifier in a later cell,
# so this instance is only used by the cells in between.
model = ASTModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593").to(device)

In [21]:
# Turn off gradient tracking for every parameter of the `model` bound here.
# NOTE(review): a later cell rebinds `model` to a new CustomASTClassifier that
# loads its own encoder, so this freezing does not carry over to that
# instance.
for param in model.parameters():
    param.requires_grad = False

In [22]:
# Width of the encoder's hidden states; the classifier head defined below
# uses the same config value as its input size.
model.config.hidden_size

768

In [23]:
import torch.nn as nn

class CustomASTClassifier(nn.Module):
    """Pretrained AST encoder followed by one linear classification layer.

    The encoder's ``last_hidden_state`` is averaged over dim 1 and the pooled
    vector is projected to ``num_labels`` logits.
    """

    def __init__(self, ast_model_name, num_labels):
        """Load the encoder and build the linear head.

        Args:
            ast_model_name: Identifier passed to ``ASTModel.from_pretrained``.
            num_labels: Number of output classes (size of the logit vector).
        """
        super().__init__()
        self.ast = ASTModel.from_pretrained(ast_model_name)
        # Linear head: encoder hidden size -> one logit per class.
        self.classifier = nn.Linear(self.ast.config.hidden_size, num_labels)
        # Kept as an attribute because forward() reshapes logits with it.
        self.num_labels = num_labels

    def forward(self, input_values, labels=None):
        """Compute class logits and, when labels are given, the loss.

        Args:
            input_values: Batch of input features forwarded to the encoder.
            labels: Optional tensor of target class indices.

        Returns:
            ``logits`` alone when ``labels`` is None, otherwise the tuple
            ``(loss, logits)`` where ``loss`` is the cross-entropy loss.
        """
        hidden_states = self.ast(input_values).last_hidden_state
        pooled = hidden_states.mean(dim=1)
        logits = self.classifier(pooled)

        if labels is None:
            return logits

        criterion = nn.CrossEntropyLoss()
        loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
        return loss, logits

In [24]:
import torch.optim as optim

# Build the classifier: pretrained AST encoder plus a 4-way linear head.
num_labels = 4
ast_model_name = "MIT/ast-finetuned-audioset-10-10-0.4593"
model = CustomASTClassifier(ast_model_name, num_labels).to(device)

# Adam over every parameter of the classifier (encoder and linear head).
# NOTE(review): the requires_grad=False loop earlier in the notebook was
# applied to a different ASTModel instance, so nothing is frozen here.
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Ten passes over the training loader, one optimizer step per mini-batch.
for epoch in range(10):
    for input_values, labels in train_loader:
        input_values = input_values.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        # forward() returns (loss, logits) when labels are supplied.
        loss, logits = model(input_values, labels)
        loss.backward()
        optimizer.step()

In [25]:
# Shape of the feature tensor before the size-1 axis was squeezed out.
all_data_tensor.shape

torch.Size([340, 1, 1024, 128])

In [26]:
# Measure plain accuracy on the validation loader.
model.eval()  # switch layers such as dropout to inference behaviour

correct = 0
total = 0
with torch.no_grad():  # no gradients are needed for evaluation
    for inputs, labels in val_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Without labels the classifier returns only the logits.
        outputs = model(inputs)
        # Predicted class = index of the largest logit in each row.
        predicted = torch.max(outputs, dim=1).indices
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 75.00%
