In [1]:
import torch

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

True


In [3]:
import torch.nn as nn
from transformers import ASTModel,ASTConfig
import torch.nn.functional as F
import torch.onnx
from transformers import AutoProcessor
import librosa
import io
import numpy as np

In [8]:
model_path = r'c:/Users/Desk_Kang/Desktop/Aiffel/workplace/lib/Aiffelthon/AST_pipe3_epoch50_acc74.pth'
config_path_prep = r'c:/Users/Desk_Kang/Desktop/Aiffel/workplace/lib/Aiffelthon/ast-finetuned-audioset-10-10-0.4593'

In [9]:
class CustomASTClassifier(nn.Module):
    def __init__(self, ast_model_name, num_labels):
        super().__init__()
        self.astconfig = ASTConfig()
        self.ast = ASTModel.from_pretrained(ast_model_name)
        self.num_labels = num_labels
        for param in self.ast.parameters():
            param.requires_grad = False



        self.conv1d1 = nn.Sequential(
            nn.Conv1d(in_channels=768,
                      out_channels = 512,
                      kernel_size = 4,
                      stride = 1),
            nn.BatchNorm1d(512),
            nn.ReLU())


        self.adapted = nn.AdaptiveAvgPool1d(1)



        self.layer_fc3 = nn.Sequential(
            nn.Linear(512,128),
            nn.BatchNorm1d(128),
            nn.GELU(),
            nn.Dropout(0.3))


        self.add_layer_01 = nn.Linear(128, num_labels)
        self.num_labels = num_labels


    def forward(self, input_values, labels=None):
        input_values = input_values.float()
        outputs = self.ast(input_values)
        embeddings = outputs.last_hidden_state


        embeddings_t = embeddings.transpose(1,2)

        conv_output1 = self.conv1d1(embeddings_t)

        adap_out = self.adapted(conv_output1)

        output_sq = torch.squeeze(adap_out, dim=1)

        flattened_output = adap_out.view(output_sq.size(0), -1)


        logits = self.layer_fc3(flattened_output)
        logits = torch.tanh(logits)
        logits = self.add_layer_01(logits)


        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1).long())
            return loss, logits
        return logits

In [10]:
model = CustomASTClassifier(config_path_prep, 4)

In [11]:
model.load_state_dict(torch.load(model_path, map_location='cuda'))

<All keys matched successfully>

In [12]:
model.eval()

CustomASTClassifier(
  (ast): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (dense): Linear(in_features=768, ou

In [17]:
example_input = torch.randn(1, 1024, 128)

In [16]:
input_names = [param for param in model.forward.__code__.co_varnames if param != 'self']
print("모델의 입력 이름:", input_names)

모델의 입력 이름: ['input_values', 'labels', 'outputs', 'embeddings', 'embeddings_t', 'conv_output1', 'adap_out', 'output_sq', 'flattened_output', 'logits', 'loss_fct', 'loss']


In [18]:
torch.onnx.export(model,         # model being run
      example_input,       # model input (or a tuple for multiple inputs)
      'c:/Users/Desk_Kang/Desktop/Aiffel/workplace/lib/Aiffelthon/ASTClassifier_fin.onnx',       # where to save the model
      export_params=True,  # store the trained parameter weights inside the model file
      opset_version=10,    # the ONNX version to export the model to
      do_constant_folding=True,  # whether to execute constant folding for optimization
      input_names = ['modelInput'],   # the model's input names
      output_names = ['modelOutput'], # the model's output names
      dynamic_axes={'modelInput' : {0 : 'batch_size'},    # variable length axes
                            'modelOutput' : {0 : 'batch_size'}})



verbose: False, log level: Level.ERROR



In [19]:
import onnxruntime as ort

In [20]:
dummy_input = torch.randn(1, 1024, 128)

In [21]:
input_array = dummy_input.detach().cpu().numpy()

In [22]:
torch_output = model(dummy_input)

In [23]:
ort_session = ort.InferenceSession('c:/Users/Desk_Kang/Desktop/Aiffel/workplace/lib/Aiffelthon/ASTClassifier_fin.onnx')

In [24]:
outputs = ort_session.run(None, {"modelInput":input_array})

In [25]:
print(outputs)

[array([[-0.09938867,  0.66113806, -0.81626517,  0.64111876]],
      dtype=float32)]


In [26]:
np.testing.assert_allclose(torch_output.detach().numpy(), outputs[0], rtol=1e-03, atol=1e-05)

In [27]:
print(torch_output)

tensor([[-0.0994,  0.6611, -0.8163,  0.6411]], grad_fn=<AddmmBackward0>)
