In [1]:
from transformers import AutoProcessor, AutoModelForCTC
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Load the pretrained CTC model identified by "STT" (local directory or hub id — verify)
# and print its module tree so the layer layout can be inspected.
model = AutoModelForCTC.from_pretrained("STT")
print(model)

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

## 신경망 구조

#### 입력값은 (음성 개수, 1, 음성 길이) 형태의 3차원 데이터입니다. (가운데 차원의 크기 1은 CNN 연산을 위해 추가한 채널 차원입니다.)

##### Convolution
1. CNN
2. transpose (layernorm 연산을 위해서)
3. layernorm
4. transpose (다음 CNN 연산을 위해서)
5. gelu (활성화 함수)

##### Projection
1. transpose (layernorm 연산을 위해서)
2. layernorm
3. Linear
4. dropout

##### Pos_Embedding
1. transpose (CNN 연산을 위해서)
2. CNN
3. 텐서 자르기 (projection과 텐서 크기 맞춰주기 위해서)
4. gelu (활성화 함수)
5. transpose (projection과 합쳐주기 위해서)

##### Transformer
1. transformer 연산
2. layernorm

##### 단어 사전(vocabulary) 크기에 맞춰 가공
1. Linear

In [None]:
# The building blocks for each stage are implemented below; assemble them to build the full network.

class Conv_Layers(nn.Module):
    """One convolutional stage: Conv1d -> LayerNorm over channels -> GELU.

    Args:
        in_channels: Number of channels of the incoming tensor.
        out_channels: Number of channels produced by the convolution; also
            the size of the axis normalized by the LayerNorm.
        kernel_size: Kernel width of the 1-D convolution.
        stride: Stride of the 1-D convolution.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride):
        super().__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride)
        self.norm = nn.LayerNorm(out_channels)
        # Must be an instance: storing the bare ``nn.GELU`` class would make
        # ``self.gelu(x)`` construct a new module instead of activating ``x``.
        self.gelu = nn.GELU()

    def forward(self, x):
        """Apply the stage to a (batch, in_channels, length) tensor.

        Returns:
            Tensor of shape (batch, out_channels, new_length), where
            ``new_length`` is determined by the convolution.
        """
        x = self.conv(x)
        # LayerNorm normalizes the last axis, so move channels last ...
        x = x.permute(0, 2, 1)
        x = self.norm(x)
        # ... and back to channels-first for the next Conv1d.
        x = x.permute(0, 2, 1)
        x = self.gelu(x)
        return x

class Projection(nn.Module):
    """Feature projection: LayerNorm -> Linear -> Dropout.

    Args:
        input_size: Channel count of the incoming features (normalized axis).
        output_size: Feature size produced by the linear layer.
        dropout_p: Dropout probability applied after the linear layer.
    """

    def __init__(self,input_size,output_size,dropout_p=0.1):
        super().__init__()
        self.norm = nn.LayerNorm(input_size)
        self.f = nn.Linear(input_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Map (batch, input_size, length) to (batch, length, output_size)."""
        # Channels go last so LayerNorm and Linear act on the feature axis.
        channels_last = x.permute(0, 2, 1)
        projected = self.f(self.norm(channels_last))
        return self.dropout(projected)

class Pos_Embed(nn.Module):
    """Convolutional positional embedding: Conv1d -> trim -> GELU.

    Args:
        in_channels: Feature size of the incoming tensor.
        out_channels: Feature size produced by the convolution.
        kernel_size: Kernel width of the 1-D convolution.
        stride: Stride of the 1-D convolution.
        padding: Zero padding added to both ends of the time axis.
        groups: Number of convolution groups.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, groups):
        super().__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, groups=groups)
        self.gelu = nn.GELU()

    def forward(self, x):
        """Compute the embedding for a (batch, length, in_channels) tensor.

        Returns:
            Tensor of shape (batch, conv_length - 1, out_channels), where
            ``conv_length`` is the time length produced by the convolution.
        """
        # Conv1d expects channels-first input.
        x = x.permute(0, 2, 1)
        x = self.conv(x)
        # Drop the trailing frame on the TIME axis (last axis here), not the
        # channel axis, so the result can be added to the projection output.
        # NOTE(review): this restores the input length when kernel_size is
        # even, stride == 1 and padding == kernel_size // 2 — confirm at the
        # call site.
        x = x[:, :, :-1]
        x = self.gelu(x)
        # Back to (batch, length, channels) to match the projection layout.
        x = x.permute(0, 2, 1)
        return x

class Transformer(nn.Module):
    """Stack of GELU Transformer encoder layers followed by a LayerNorm.

    Args:
        d_model: Feature size of the input and output.
        nhead: Number of attention heads per layer.
        dim_feedforward: Hidden size of each layer's feed-forward network.
        dropout_p: Dropout probability used inside each encoder layer.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self,d_model,nhead,dim_feedforward=2048,dropout_p=0.1,num_layers=1):
        super().__init__()
        # batch_first=True: tensors are laid out as (batch, length, d_model).
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout_p,
            activation="gelu",
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        """Encode ``x`` and normalize the result over the feature axis."""
        encoded = self.transformer(x)
        return self.norm(encoded)

class Output_F(nn.Module):
    """Final linear head mapping features to ``output_size`` values.

    Args:
        input_size: Size of the last axis of the incoming tensor.
        output_size: Size of the last axis of the result.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.f = nn.Linear(input_size, output_size)

    def forward(self, x):
        """Apply the linear layer to the last axis of ``x``."""
        return self.f(x)