In [None]:
import os
import re
import math
import time
import json
import torch
import random
import os.path
import torchtext
import matplotlib
import numpy as np
import pandas as pd
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.nn.functional as F


from math import pi
from math import sqrt
from glob import glob
from torch import optim
from tqdm import tqdm
from torchtext import vocab
from konlpy.tag import Mecab
from torchinfo import summary
from torch.nn import Transformer
from torchtext.vocab import vocab
from math import factorial as fact
from torch.utils.data import DataLoader
from timeit import default_timer as timer
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.metrics import bleu_score
from nltk.translate.bleu_score import sentence_bleu
from torchtext.vocab import build_vocab_from_iterator


%matplotlib inline


In [None]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the default generator so weight initialisation and dropout are repeatable.
torch.manual_seed(777)

# `device` is a torch.device and never compares equal to the string 'cuda',
# so the check has to look at its `.type` for the CUDA seeding to run at all.
if device.type == 'cuda':
    torch.cuda.manual_seed_all(777)


## 사전 임베딩 된 단어 벡터 가져오기

In [None]:
# Paths of the pre-trained embedding files whose names contain "44", in sorted order.
# load_embedding() selects a file from this list by position.
question = sorted(glob('../embeddings/*44*'))


In [None]:
def load_embedding(name):
    '''
        Load one pre-trained word-embedding file and make room for the special tokens.

        Args:
            name (str) : 'fast', 'glove' or 'cbow'; any other value selects the
                file at index 2 of the sorted `question` list.

        Returns:
            vocab_dic (dict) : {'que': vocabulary} with the four specials placed first and
                index 0 (<unk>) used for unknown tokens.
            vector_dic (dict) : {'que': Vectors} whose `vectors` gained four leading zero rows
                and whose `stoi` indices were shifted by four accordingly.
    '''
    specials = ["<unk>", "<pad>", "<sos>", "<eos>"]

    # Position of each embedding inside the sorted glob result.
    # NOTE(review): this depends on how the file names sort -- confirm against ../embeddings.
    file_index = {'fast': 1, 'glove': 3, 'cbow': 0}
    que_emb = torchtext.vocab.Vectors(question[file_index.get(name, 2)])

    # Built from the *unshifted* stoi; the specials are prepended by torchtext.
    que_vocab = torchtext.vocab.vocab(que_emb.stoi, min_freq= 0, specials = specials)

    vector_dic = {'que': que_emb}   # token vectors per language
    vocab_dic = {'que': que_vocab}  # token -> index vocabulary per language

    for key in ['que']:
        vectors = vector_dic[key].vectors
        # Zero rows for the special tokens; the width follows the loaded vectors
        # instead of assuming 256-dimensional embeddings.
        special_rows = torch.zeros(len(specials), vectors.size(1), requires_grad = True)
        vector_dic[key].vectors = torch.cat([special_rows, vectors], dim=0)
        # The pre-trained tokens move back by the number of specials inserted in front.
        vector_dic[key].stoi = {token: index + len(specials)
                                for token, index in vector_dic[key].stoi.items()}
        # Pre-trained files carry no special tokens, so register them by hand.
        for index, token in enumerate(specials):
            vector_dic[key].stoi[token] = index
        vocab_dic[key].set_default_index(0)  # index 0 is <unk>

    return vocab_dic, vector_dic

In [None]:
# Load the CBOW embedding: pre_vocab holds the question vocabulary, pre_emb the matching vectors.
pre_vocab ,pre_emb = load_embedding('cbow')

## 데이터 전처리

In [None]:
# Training split: the JSON maps a string index to a record; only the record's values are kept,
# stored at the list position given by that index.
with open('../data/train_44_type_150000.json', 'r', encoding = 'utf-8-sig') as json_:
    json_data = json.load(json_)

train_json = [[] for _ in json_data]
for sample_key, record in json_data.items():
    train_json[int(sample_key)].extend(record.values())


# Validation split, same layout.
with open('../data/val_44_type_30000.json', 'r', encoding = 'utf-8-sig') as json_:
    json_data = json.load(json_)

valid_json = [[] for _ in json_data]
for sample_key, record in json_data.items():
    valid_json[int(sample_key)].extend(record.values())


In [None]:
# Test split: string index -> record; keep the record values at the indexed position.
with open('../data/test_44_type_30000.json', 'r', encoding = 'utf-8-sig') as json_:
    json_data = json.load(json_)

test_json = [[] for _ in json_data]
for sample_key, record in json_data.items():
    test_json[int(sample_key)].extend(record.values())

In [None]:
def max_length(df):
    '''
        Length of the longest tokenized question in a dataset.

        Args:
            df (sequence) : samples indexable as df[i][0], where element 0 is the
                question string (the equation in element 1 is not inspected).

        Returns:
            que_length (int) : largest number of question tokens plus 2; the extra two
                leave room for the <sos> and <eos> markers. An empty `df` yields 2.
    '''
    token_counts = [len(que_tokenizer(df[i][0])) for i in range(len(df))]
    # default=0 keeps an empty dataset from raising inside max().
    return max(token_counts, default=0) + 2


## 토크나이저

In [None]:
def que_tokenizer(sentence, is_stopwords = True):
    '''
        Split a question into morphemes with Mecab.

        Args:
            sentence (str) : input sentence
            is_stopwords (bool) : when True (default) every morpheme listed in
                `stopwords` is removed from the result.

        Returns:
            (list[str]) : morphemes of `sentence`, in their original order.
    '''
    # NOTE(review): ',인지' looks like a typo for two separate entries; '인지' is already listed.
    stopwords = ['은','?','일까요','십시오','입니까','인가요',
             ',인지','한다면','가','다','에는','에서','기록', '순위',
             '앉아', '줄로','.','시','오', '습니다','인지','한다','여라']

    # This function runs once per sample in the data pipeline, so the tagger is
    # created on first use and reused afterwards instead of being rebuilt every call.
    mecab = getattr(que_tokenizer, '_mecab', None)
    if mecab is None:
        mecab = Mecab()
        que_tokenizer._mecab = mecab

    morphemes = mecab.morphs(sentence)  # morpheme-level split
    if is_stopwords:
        morphemes = [word for word in morphemes if word not in stopwords]
    return morphemes

def yield_que_tokenizer(sentence, make_vocab = True):
    '''
        Lazily tokenize the question of every sample.

        Args:
            sentence (iterable) : samples whose element 0 is the question string
            make_vocab (bool) : accepted but not used by this function

        Yields:
            (list[str]) : morphemes of one question with the stopwords removed
    '''
    tagger = Mecab()
    ignored = ['은','?','일까요','십시오','입니까','인가요',
             ',인지','한다면','가','다','에는','에서','기록', '순위',
             '앉아', '줄로', '.','시','오', '습니다','인지','한다','여라']

    for sample in sentence:
        yield [morph for morph in tagger.morphs(sample[0]) if morph not in ignored]
      
                 

def yield_equ_tokenizer(sentence):
    '''
        Lazily tokenize the equation of every sample.

        Args:
            sentence (iterable) : samples whose element 1 is the equation string

        Yields:
            (list[str]) : equation tokens; each of ( ) + - * % / ' [ ] , > < becomes
                its own token, everything else is split on whitespace.
    '''
    for sample in sentence:
        # Raw string: the earlier non-raw pattern relied on invalid escape sequences
        # such as "\+" and "\%", which Python reports as deprecated.
        equ = re.sub(r"([()+\-*%/'\[\],><])", r' \1 ', sample[1])
        yield equ.split()

def equ_tokenizer(sentence):
    '''
        Tokenize one equation string.

        Args:
            sentence (str) : equation text

        Returns:
            (list[str]) : equation tokens; each of ( ) + - * % / ' [ ] , > < becomes
                its own token, everything else is split on whitespace.
    '''
    # Raw string: the earlier non-raw pattern relied on invalid escape sequences
    # such as "\+" and "\%", which Python reports as deprecated.
    equ = re.sub(r"([()+\-*%/'\[\],><])", r' \1 ', sentence)
    return equ.split()

In [None]:
# que_len= max_length(train_json)
# valid_len = max_length(valid_json)
# test_len = 0
# que_len =max(que_len, valid_len, test_len)


In [None]:
# Fixed maximum sequence length handed to the model as `max_len`.
# NOTE(review): presumably the value max_length() produced for the train/valid sets
# (see the commented-out cell above) -- confirm before changing the data.
que_len = 99

## 단어 사전 생성 

In [None]:
# Equation (target) vocabulary built from the training equations; every token seen at least once is kept.
# The rest of the notebook assumes <pad> = 1, <sos> = 2 and <eos> = 3.
equ_vocab = build_vocab_from_iterator(yield_equ_tokenizer(train_json), specials = ["<unk>", "<pad>","<sos>", "<eos>"], min_freq = 1)
# Tokens missing from the vocabulary map to <unk>.
equ_vocab.set_default_index(equ_vocab['<unk>'])

In [None]:
from typing import Optional, Tuple

import torch


__all__ = ["Conformer"]


def _lengths_to_padding_mask(lengths: torch.Tensor) -> torch.Tensor:
    batch_size = lengths.shape[0]
    max_length = int(torch.max(lengths).item())
    padding_mask = torch.arange(max_length, device=lengths.device, dtype=lengths.dtype).expand(
        batch_size, max_length
    ) >= lengths.unsqueeze(1)
    return padding_mask


class _ConvolutionModule(torch.nn.Module):
    r"""Conformer convolution module.

    Args:
        input_dim (int): input dimension.
        num_channels (int): number of depthwise convolution layer input channels.
        depthwise_kernel_size (int): kernel size of depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        bias (bool, optional): indicates whether to add bias term to each convolution layer. (Default: ``False``)
        use_group_norm (bool, optional): use GroupNorm rather than BatchNorm. (Default: ``False``)
    """

    def __init__(
        self,
        input_dim: int,
        num_channels: int,
        depthwise_kernel_size: int,
        dropout: float = 0.0,
        bias: bool = False,
        use_group_norm: bool = False,
    ) -> None:
        super().__init__()
        if (depthwise_kernel_size - 1) % 2 != 0:
            raise ValueError("depthwise_kernel_size must be odd to achieve 'SAME' padding.")
        self.layer_norm = torch.nn.LayerNorm(input_dim)
        self.sequential = torch.nn.Sequential(
            torch.nn.Conv1d(
                input_dim,
                2 * num_channels,
                1,
                stride=1,
                padding=0,
                bias=bias,
            ),
            torch.nn.GLU(dim=1),
            torch.nn.Conv1d(
                num_channels,
                num_channels,
                depthwise_kernel_size,
                stride=1,
                padding=(depthwise_kernel_size - 1) // 2,
                groups=num_channels,
                bias=bias,
            ),
            torch.nn.GroupNorm(num_groups=1, num_channels=num_channels)
            if use_group_norm
            else torch.nn.BatchNorm1d(num_channels),
            torch.nn.SiLU(),
            torch.nn.Conv1d(
                num_channels,
                input_dim,
                kernel_size=1,
                stride=1,
                padding=0,
                bias=bias,
            ),
            torch.nn.Dropout(dropout),
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        r"""
        Args:
            input (torch.Tensor): with shape `(B, T, D)`.

        Returns:
            torch.Tensor: output, with shape `(B, T, D)`.
        """
        x = self.layer_norm(input)
        x = x.transpose(1, 2)
        x = self.sequential(x)
        return x.transpose(1, 2)


class _FeedForwardModule(torch.nn.Module):
    r"""Positionwise feed forward layer.

    Args:
        input_dim (int): input dimension.
        hidden_dim (int): hidden dimension.
        dropout (float, optional): dropout probability. (Default: 0.0)
    """

    def __init__(self, input_dim: int, hidden_dim: int, dropout: float = 0.0) -> None:
        super().__init__()
        self.sequential = torch.nn.Sequential(
            torch.nn.LayerNorm(input_dim),
            torch.nn.Linear(input_dim, hidden_dim, bias=True),
            torch.nn.SiLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_dim, input_dim, bias=True),
            torch.nn.Dropout(dropout),
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        r"""
        Args:
            input (torch.Tensor): with shape `(*, D)`.

        Returns:
            torch.Tensor: output, with shape `(*, D)`.
        """
        return self.sequential(input)


class ConformerLayer(torch.nn.Module):
    r"""One Conformer layer: half-step feed-forward, self-attention and convolution
    (in configurable order), a second half-step feed-forward, then a final LayerNorm.

    Args:
        input_dim (int): input dimension.
        ffn_dim (int): hidden layer dimension of feedforward network.
        num_attention_heads (int): number of attention heads.
        depthwise_conv_kernel_size (int): kernel size of depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d``
            in the convolution module. (Default: ``False``)
        convolution_first (bool, optional): apply the convolution module ahead of
            the attention module. (Default: ``False``)
    """

    def __init__(
        self,
        input_dim: int,
        ffn_dim: int,
        num_attention_heads: int,
        depthwise_conv_kernel_size: int,
        dropout: float = 0.0,
        use_group_norm: bool = False,
        convolution_first: bool = False,
    ) -> None:
        super().__init__()

        # Sub-modules are registered in the same order as before so state_dict keys
        # and parameter initialisation stay the same.
        self.ffn1 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout)

        self.self_attn_layer_norm = torch.nn.LayerNorm(input_dim)
        self.self_attn = torch.nn.MultiheadAttention(input_dim, num_attention_heads, dropout=dropout)
        self.self_attn_dropout = torch.nn.Dropout(dropout)

        conv_settings = dict(
            input_dim=input_dim,
            num_channels=input_dim,
            depthwise_kernel_size=depthwise_conv_kernel_size,
            dropout=dropout,
            bias=True,
            use_group_norm=use_group_norm,
        )
        self.conv_module = _ConvolutionModule(**conv_settings)

        self.ffn2 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout)
        self.final_layer_norm = torch.nn.LayerNorm(input_dim)
        self.convolution_first = convolution_first

    def _apply_convolution(self, input: torch.Tensor) -> torch.Tensor:
        # The convolution module is batch-first while this layer is time-first.
        conv_out = self.conv_module(input.transpose(0, 1)).transpose(0, 1)
        return input + conv_out

    def forward(self, input: torch.Tensor, key_padding_mask: Optional[torch.Tensor]) -> torch.Tensor:
        r"""
        Args:
            input (torch.Tensor): input, with shape `(T, B, D)`.
            key_padding_mask (torch.Tensor or None): key padding mask to use in self attention layer.

        Returns:
            torch.Tensor: output, with shape `(T, B, D)`.
        """
        # First feed-forward contributes with weight 0.5.
        x = input + 0.5 * self.ffn1(input)

        if self.convolution_first:
            x = self._apply_convolution(x)

        # Pre-norm self-attention with a residual connection.
        normed = self.self_attn_layer_norm(x)
        attended, _ = self.self_attn(
            query=normed,
            key=normed,
            value=normed,
            key_padding_mask=key_padding_mask,
            need_weights=False,
        )
        x = x + self.self_attn_dropout(attended)

        if not self.convolution_first:
            x = self._apply_convolution(x)

        # Second feed-forward, again with weight 0.5, then the closing normalisation.
        x = x + 0.5 * self.ffn2(x)
        return self.final_layer_norm(x)


class Conformer(torch.nn.Module):
    r"""Conformer architecture introduced in
    *Conformer: Convolution-augmented Transformer for Speech Recognition*
    :cite:`gulati2020conformer`.

    Unlike a plain stack of layers, ``forward`` also returns the padding mask it
    derived from ``lengths`` so a decoder can reuse it.

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads in each Conformer layer.
        ffn_dim (int): hidden layer dimension of feedforward networks.
        num_layers (int): number of Conformer layers to instantiate.
        depthwise_conv_kernel_size (int): kernel size of each Conformer layer's depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d``
            in the convolution module. (Default: ``False``)
        convolution_first (bool, optional): apply the convolution module ahead of
            the attention module. (Default: ``False``)

    Examples:
        >>> conformer = Conformer(
        >>>     input_dim=80,
        >>>     num_heads=4,
        >>>     ffn_dim=128,
        >>>     num_layers=4,
        >>>     depthwise_conv_kernel_size=31,
        >>> )
        >>> lengths = torch.randint(1, 400, (10,))  # (batch,)
        >>> input = torch.rand(10, int(lengths.max()), 80)  # (batch, num_frames, input_dim)
        >>> output, out_lengths, padding_mask = conformer(input, lengths)
    """

    def __init__(
        self,
        input_dim: int,
        num_heads: int,
        ffn_dim: int,
        num_layers: int,
        depthwise_conv_kernel_size: int,
        dropout: float = 0.0,
        use_group_norm: bool = False,
        convolution_first: bool = False,
    ):
        super().__init__()

        self.conformer_layers = torch.nn.ModuleList(
            [
                ConformerLayer(
                    input_dim,
                    ffn_dim,
                    num_heads,
                    depthwise_conv_kernel_size,
                    dropout=dropout,
                    use_group_norm=use_group_norm,
                    convolution_first=convolution_first,
                )
                for _ in range(num_layers)
            ]
        )

    def forward(
        self, input: torch.Tensor, lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        r"""
        Args:
            input (torch.Tensor): with shape `(B, T, input_dim)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.

        Returns:
            (torch.Tensor, torch.Tensor, torch.Tensor)
                torch.Tensor
                    output frames, with shape `(B, T, input_dim)`
                torch.Tensor
                    output lengths, with shape `(B,)`; the ``lengths`` argument passed through unchanged.
                torch.Tensor
                    boolean padding mask with shape `(B, max(lengths))`, True at padded positions.
        """
        encoder_padding_mask = _lengths_to_padding_mask(lengths)

        # The layers work time-first, the public interface is batch-first.
        x = input.transpose(0, 1)
        for layer in self.conformer_layers:
            x = layer(x, encoder_padding_mask)
        return x.transpose(0, 1), lengths, encoder_padding_mask

## 포지셔널 인코딩 

In [None]:
## 위치 인코딩을 위한 포지셔널 인코딩

class PositionalEncoding(nn.Module):
    '''
        Sinusoidal positional encoding added to time-first token embeddings.

        Args:
            emb_size (int) : embedding dimension (even or odd)
            max_len (int) : number of positions to precompute; longer inputs cannot be encoded
            dropout (float) : dropout probability applied after the addition
    '''
    def __init__(self, emb_size, max_len, dropout ):
        super(PositionalEncoding, self).__init__()

        # One frequency per sine column: shape (ceil(emb_size / 2),)
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, max_len).reshape(max_len, 1) ## shape (max_len, 1)
        angles = pos * den ## shape (max_len, ceil(emb_size / 2))

        pos_embedding = torch.zeros((max_len, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # An odd emb_size has one cosine column fewer than sine columns, so trim to fit.
        pos_embedding[:, 1::2] = torch.cos(angles[:, :emb_size // 2])
        pos_embedding = pos_embedding.unsqueeze(-2) ## (max_len, 1, emb_size), broadcast over the batch axis

        self.dropout = nn.Dropout(dropout)
        # A buffer is saved and moved with the module but is never updated by the optimizer.
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding):
        '''
            Args:
                token_embedding (Tensor) : embeddings with the sequence axis first

            Returns:
                (Tensor) : same shape, with the encoding of each position added and dropout applied
        '''
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
    

class TokenEmbedding(nn.Module):
    '''
        Trainable token embedding scaled by sqrt(emb_size); index 1 is the padding index.
    '''
    def __init__(self, vocab_size, emb_size):
        super().__init__()
        self.emb_size = emb_size
        self.embedding = nn.Embedding(num_embeddings = vocab_size, embedding_dim = emb_size, padding_idx = 1)

    def forward(self, tokens):
        scale = math.sqrt(self.emb_size)
        return scale * self.embedding(tokens.long())
    
    
class pre_TokenEmbedding(nn.Module):
    '''
        Embedding initialised from pre-trained vectors (`vocab.vectors`) and left trainable;
        the output is scaled by sqrt(emb_size). Index 1 is the padding index.
    '''
    def __init__(self, vocab, emb_size):
        super().__init__()
        self.emb_size = emb_size
        pretrained = vocab.vectors
        self.embedding = nn.Embedding.from_pretrained(pretrained, freeze = False, padding_idx = 1)

    def forward(self, tokens):
        scale = math.sqrt(self.emb_size)
        return scale * self.embedding(tokens.long())
    

class Seq2SeqTransformer(nn.Module):
    '''
        Sequence-to-sequence model: Conformer encoder followed by a Transformer decoder.

        Args:
            num_encoder_layers (int) : number of Conformer layers
            num_decoder_layers (int) : number of Transformer decoder layers
            emb_size (int) : embedding / model dimension
            nhead (int) : attention heads in encoder and decoder
            src_vocab_size (int) : accepted but not used; the source embedding always
                comes from the global `pre_emb['que']`
            tgt_vocab_size (int) : size of the equation vocabulary
            max_len (int) : number of positions available in the positional encoding
            dim_feedforward (int) : hidden size of the feed-forward blocks
            dropout (float) : dropout probability
            pretrain (bool) : only stored and reported by ck_pretrain(); it does not
                change which embedding is built
    '''
    def __init__(self,
                 num_encoder_layers,
                 num_decoder_layers,
                 emb_size,
                 nhead,
                 src_vocab_size,
                 tgt_vocab_size,
                 max_len,
                 dim_feedforward = 512,
                 dropout = 0.4,
                 pretrain = False):

        # Initialise nn.Module before assigning any attribute.
        super(Seq2SeqTransformer,self).__init__()
        self.pretrain = pretrain

        self.conformer = Conformer(input_dim = emb_size,
                                   num_heads=nhead,
                                   ffn_dim = dim_feedforward,
                                   num_layers = num_encoder_layers,
                                   depthwise_conv_kernel_size = 31,
                                   dropout = dropout)

        self.decoder_layer = nn.TransformerDecoderLayer(d_model = emb_size,
                                                        dim_feedforward = dim_feedforward,
                                                        dropout= dropout,
                                                        nhead = nhead,
                                                        device = device)

        self.transformer_decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=num_decoder_layers)

        # Projects decoder states to target-vocabulary logits.
        self.generator = nn.Linear(emb_size, tgt_vocab_size)

        # Source side: pre-trained question vectors; target side: embedding trained from scratch.
        self.src_tok_emb = pre_TokenEmbedding(pre_emb['que'], emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)

        # One positional encoding shared by source and target.
        self.positional_encoding = PositionalEncoding(emb_size, max_len, dropout = dropout)

    def forward(self, src, tgt, tgt_mask, tgt_padding_mask, lengths):
        '''
            Args:
                src (Tensor) : source token indices, sequence axis first
                tgt (Tensor) : target input token indices, sequence axis first
                tgt_mask (Tensor) : causal mask for decoder self-attention
                tgt_padding_mask (Tensor) : padding mask of the target batch
                lengths (Tensor) : number of valid source tokens per batch element

            Returns:
                (Tensor) : target-vocabulary logits for every target position
        '''
        src_emb = self.positional_encoding(self.src_tok_emb(src)) # seq, batch, dim
        src_emb = src_emb.permute(1,0,2) ## batch, seq, dim
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        memory = self.conformer(src_emb, lengths) ## (frames, lengths, padding mask)
        encoder_padding_mask = memory[2]
        memory = memory[0].permute(1,0,2) ## seq, batch, dim
        outs = self.transformer_decoder(tgt_emb, memory, tgt_mask = tgt_mask,
                                        tgt_key_padding_mask = tgt_padding_mask,
                                        memory_key_padding_mask = encoder_padding_mask)
        return self.generator(outs)

    def encode(self, src, lengths):
        '''
            Run the encoder only.

            Returns:
                (tuple) : the Conformer output -- batch-first frames, lengths and padding mask
        '''
        return self.conformer(self.positional_encoding(self.src_tok_emb(src)).permute(1,0,2), lengths)

    def decoder(self,tgt, memory, tgt_mask, memory_key_padding_mask = None):
        '''
            Run the decoder only.

            Args:
                tgt (Tensor) : target token indices decoded so far, sequence axis first
                memory (Tensor) : encoder output, sequence axis first
                tgt_mask (Tensor) : causal mask for decoder self-attention
                memory_key_padding_mask (Tensor or None) : encoder padding mask; pass the mask
                    returned by encode() so padded source positions are ignored, as in forward()

            Returns:
                (Tensor) : decoder states, before the generator projection
        '''
        return self.transformer_decoder(self.positional_encoding(self.tgt_tok_emb(tgt)),
                                        memory, tgt_mask,
                                        memory_key_padding_mask = memory_key_padding_mask)

    def ck_pretrain(self):
        '''Return the `pretrain` flag given at construction.'''
        return self.pretrain
    

## 마스킹

In [None]:
def generate_square_subsequent_mask(sz):
    '''
        Causal attention mask of shape (sz, sz): 0.0 on and below the diagonal,
        -inf above it, so a position cannot attend to later positions.
    '''
    blocked = torch.full((sz, sz), float('-inf'), device = device)
    # triu(diagonal=1) keeps the strictly-upper part and zeroes everything else.
    return torch.triu(blocked, diagonal = 1)

def create_mask(src,tgt):
    '''
        Build the attention and padding masks for one batch.

        Args:
            src (Tensor) : source token indices, sequence axis first
            tgt (Tensor) : target token indices, sequence axis first

        Returns:
            src_mask (Tensor) : all-False (src_len, src_len) mask, nothing is hidden
            tgt_mask (Tensor) : causal mask from generate_square_subsequent_mask
            src_padding_mask (Tensor) : transposed `src == 1`, True at <pad>
            tgt_padding_mask (Tensor) : transposed `tgt == 1`, True at <pad>
    '''
    pad_index = 1
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    src_mask = torch.zeros((src_len, src_len), device = device).type(torch.bool)
    # Keeps decoder self-attention from looking at later positions.
    tgt_mask = generate_square_subsequent_mask(tgt_len)

    # Transposed so the batch axis comes first, as the attention layers expect for padding masks.
    src_padding_mask = (src == pad_index).T
    tgt_padding_mask = (tgt == pad_index).T
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [None]:
# Model hyper-parameters.
SRC_VOCAB_SIZE = len(pre_vocab['que'])  # question vocabulary size
TGT_VOCAB_SIZE = len(equ_vocab)         # equation vocabulary size
EMB_SIZE = 256
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 256
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE, NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, que_len, FFN_HID_DIM,
                                 pretrain = False)

# Xavier-initialise the weight matrices, but leave the pre-trained question embedding alone:
# it is a trainable 2-D parameter, so a blanket loop would overwrite the loaded vectors.
keeps_pretrained = isinstance(transformer.src_tok_emb, pre_TokenEmbedding)
for name, p in transformer.named_parameters():
    if keeps_pretrained and name.startswith('src_tok_emb.'):
        continue
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(device)

# Index 1 is <pad>; padded target positions do not contribute to the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index = 1)

optimizer = torch.optim.Adam(transformer.parameters(), lr = 5e-4, betas = (0.9,0.98), eps = 1e-9)

## Transformer collate_fn

In [None]:
def sequential_transforms(*transforms):
    '''
        Chain callables: the returned function feeds its argument through every
        transform in the given order and returns the final value.
    '''
    def func(txt_input):
        result = txt_input
        for step in transforms:
            result = step(result)
        return result
    return func

def tensor_transform(token_ids):
    '''
        Wrap token indices with <sos> (index 2) and <eos> (index 3).

        Args:
            token_ids (list[int]) : token indices, possibly empty

        Returns:
            (Tensor) : 1-D int64 tensor [2, *token_ids, 3]
    '''
    # An explicit dtype keeps an empty list from becoming a float tensor.
    return torch.cat((torch.tensor([2]),
                     torch.tensor(token_ids, dtype = torch.long),
                     torch.tensor([3])))
# Preprocessing pipelines keyed by side ('que' = question, 'equ' = equation):
# raw string -> tokens -> vocabulary indices -> tensor wrapped in <sos>/<eos>.
text_transform = {}


text_transform['que'] = sequential_transforms(que_tokenizer, pre_vocab['que'], tensor_transform)
text_transform['equ'] = sequential_transforms(equ_tokenizer, equ_vocab, tensor_transform)


In [None]:
def collate_fn(batch):
    '''
        Turn a list of (question, equation, answer) samples into padded tensors.

        Returns:
            src_batch (Tensor) : padded question indices, sequence axis first, pad value 1
            tgt_batch (Tensor) : padded equation indices, sequence axis first, pad value 1
            length (LongTensor) : unpadded length of every question (including <sos>/<eos>)
            ans_list (list) : the answers, untouched
    '''
    questions, equations, ans_list = [], [], []
    for question, equation, answer in batch:
        questions.append(text_transform['que'](question))
        equations.append(text_transform['equ'](equation))
        ans_list.append(answer)

    # Lengths are taken before padding.
    length = [len(encoded) for encoded in questions]

    src_batch = pad_sequence(questions, padding_value = 1.0)
    tgt_batch = pad_sequence(equations, padding_value = 1.0)

    return src_batch, tgt_batch, torch.LongTensor(length), ans_list

In [None]:
# Batch size shared by the three loaders (same value as in the hyper-parameter cell).
BATCH_SIZE = 256
# NOTE(review): drop_last = True discards the final incomplete batch for validation and test too,
# so those samples are never scored -- confirm this is intended.
train_loader = DataLoader(train_json, batch_size = BATCH_SIZE, shuffle = True, collate_fn = collate_fn, drop_last = True,num_workers = 3)
valid_loader = DataLoader(valid_json, batch_size = BATCH_SIZE, shuffle = True, collate_fn = collate_fn, drop_last = True,num_workers = 3)
test_loader = DataLoader(test_json, batch_size = BATCH_SIZE, collate_fn = collate_fn, drop_last = True, num_workers = 3)

In [None]:
def train_epoch(model, optimizer, train_loader, valid_loader):
    '''
        Train for one pass over `train_loader`, then measure the loss on `valid_loader`.

        Args:
            model (Module)
            optimizer (Optimizer)
            train_loader (Dataloader)
            valid_loader (Dataloader)

        Returns:
            (float, float) : mean training loss per batch, mean validation loss per batch
    '''
    def batch_loss(src, tgt, length):
        # Teacher forcing: the decoder sees tgt[:-1] and must predict tgt[1:].
        src, tgt, length = src.to(device), tgt.to(device), length.to(device)
        decoder_input, expected = tgt[:-1, :], tgt[1:, :]
        _, tgt_mask, _, tgt_padding_mask = create_mask(src, decoder_input)
        logits = model(src, decoder_input, tgt_mask, tgt_padding_mask, length)
        return loss_fn(logits.reshape(-1, logits.shape[-1]), expected.reshape(-1))

    model.train()
    train_total = 0
    for src, tgt, length, _ in tqdm(train_loader):
        loss = batch_loss(src, tgt, length)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_total += loss.item()

    valid_total = 0
    for src, tgt, length, _ in tqdm(valid_loader):
        model.eval()
        with torch.no_grad():
            valid_total += batch_loss(src, tgt, length).item()

    return train_total / len(train_loader), valid_total / len(valid_loader)


In [None]:
def new_greedy_decode(model, src, src_mask, length, max_len, start_symbol):
    '''
        Greedy decoding for a whole batch.

        Args:
            model (Module) : Seq2SeqTransformer
            src (Tensor) : source token indices, sequence axis first
            src_mask (Tensor) : moved to the device for interface compatibility; not used
            length (Tensor) : number of valid source tokens per batch element
            max_len (int) : number of decoding steps
            start_symbol (int) : index of <sos>

        Returns:
            (Tensor) : (max_len + 1, batch) token indices starting with `start_symbol`;
                decoding does not stop at <eos>, the caller truncates.
    '''
    src = src.to(device)
    src_mask = src_mask.to(device)
    length = length.to(device)

    encoded = model.encode(src, length)              # (frames, lengths, padding mask)
    memory = encoded[0].permute(1,0,2).to(device)    # seq, batch, dim
    # Training masks padded encoder positions in the decoder; do the same at inference,
    # otherwise the result depends on how much padding the batch happens to contain.
    memory_padding_mask = encoded[2].to(device)

    # Every sequence starts with the <sos> token.
    ys = torch.ones(1,src.size(1)).fill_(start_symbol).type(torch.long).to(device)
    for _ in range(max_len):
        tgt_mask = generate_square_subsequent_mask(ys.size(0))
        tgt_emb = model.positional_encoding(model.tgt_tok_emb(ys))
        out = model.transformer_decoder(tgt_emb, memory, tgt_mask = tgt_mask,
                                        memory_key_padding_mask = memory_padding_mask)
        prob = model.generator(out[-1])              # logits of the last position: (batch, vocab)
        _, next_word = torch.max(prob, dim = 1)
        ys = torch.cat([ys, next_word.unsqueeze(0)], dim = 0)
    return ys

In [None]:
def translate_bleu(model,testset):
    '''
        Greedy-decode every batch of `testset` and score the predicted equations.

        A prediction that cannot be evaluated to a number contributes neither
        to the accuracy nor to the BLEU sum.

        Args:
            model (Module) : trained Seq2SeqTransformer
            testset (DataLoader) : yields (src, tgt, length, ans) batches from collate_fn

        Returns:
            (float, float) : mean sentence BLEU and answer accuracy, both averaged over
                the samples actually decoded (batches dropped by the loader are not counted)
    '''
    specials = ("<sos>", "<eos>", "<pad>")
    eos_index = 3

    def to_text(tokens):
        # Join the tokens and blank out the special markers.
        text = " ".join(tokens)
        for special in specials:
            text = text.replace(special, "")
        return text

    total_bleu = 0
    acc = 0
    n_samples = 0
    model.eval()
    with torch.no_grad():
        for src, tgt, length, ans in testset:
            src = src.to(device)
            tgt = tgt.to(device)
            length = length.to(device)
            ans = list(map(float, np.array(ans)))
            num_tokens = src.shape[0] ## source sequence length, also used as the decoding budget

            src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
            tgt_tokens = new_greedy_decode(model, src, src_mask, length, max_len = num_tokens, start_symbol=2)

            # Plain Python ints, one list per sample.
            predicted_ids = tgt_tokens.T.tolist()
            reference_ids = tgt.T.tolist()
            n_samples += len(predicted_ids)

            for pred_idx, ids in enumerate(predicted_ids):
                # Cut the prediction at its first <eos>, if it produced one.
                if eos_index in ids:
                    ids = ids[:ids.index(eos_index)]
                predict_equ = to_text(equ_vocab.lookup_tokens(ids))
                real_equ = to_text(equ_vocab.lookup_tokens(reference_ids[pred_idx]))

                try:
                    # NOTE(security): eval() executes model-generated text. Acceptable only because
                    # the tokens come from the closed equation vocabulary; never feed it external input.
                    predict_ans = eval(predict_equ.replace('@', '//'))
                    predict_ans = float('%.02f' % predict_ans)
                except (SyntaxError, TypeError, ValueError, ArithmeticError,
                        NameError, LookupError, AttributeError):
                    # A malformed prediction counts as wrong; it must not abort a training run.
                    continue

                if predict_ans == ans[pred_idx]:
                    acc +=1

                hypothesis = predict_equ.split() + ["<eos>"]
                reference = real_equ.split() + ["<eos>"]

                total_bleu += sentence_bleu([reference], hypothesis, smoothing_function =None)

    # Average over what was evaluated; with drop_last = True this is fewer than len(testset.dataset).
    evaluated = max(n_samples, 1)
    return total_bleu / evaluated, acc / evaluated

In [None]:
# Start every metric log from an empty file ('w' truncates), then release the handles.
t, b, v, a = [open(log_path, 'w') for log_path in (
    './transformer_loss/Conformer_FT_train_losses',
    './transformer_loss/Conformer_FT_bleu_score',
    './transformer_loss/Conformer_FT_valid_losses',
    './transformer_loss/Conformer_FT_accuracy',
)]
for handle in (t, b, v, a):
    handle.close()


In [None]:
import os
# Upper bound on epochs. The loop used to hard-code 400 while this constant said 300;
# the constant now carries the value that was actually in effect.
NUM_EPOCHS = 400
PATIENCE = 20          # epochs without a better validation loss before stopping
best_val_loss = None
before_loss = 999999   # validation loss of the previous epoch (recorded, not used for decisions)
count = 0              # consecutive epochs without improvement

for epoch in range(1, NUM_EPOCHS + 1):
    start_time = timer()
    train_loss, val_loss= train_epoch(transformer, optimizer, train_loader, valid_loader)
    end_time = timer()
    # NOTE(review): `bleu_score` rebinds the function imported from torchtext.data.metrics.
    bleu_score, acc = translate_bleu(transformer, valid_loader)
    valid_end_time = timer()

    # Append one line per epoch to each metric log; `with` closes the file even if a write fails.
    for log_path, value in (
        ('./transformer_loss/Conformer_FT_train_losses', train_loss),
        ('./transformer_loss/Conformer_FT_bleu_score', bleu_score),
        ('./transformer_loss/Conformer_FT_valid_losses', val_loss),
        ('./transformer_loss/Conformer_FT_accuracy', acc),
    ):
        with open(log_path, 'a') as log_file:
            log_file.write(str(value) + '\n')

    print(f"EPOCH : {epoch}, Train Loss : {train_loss}, Val Loss: {val_loss}, Val_Bleu_Score :{bleu_score}, Val_acc: {acc} \
train_time : {(end_time - start_time):.3f}s, valid_time :{(valid_end_time -end_time)}s")

    before_loss = val_loss

    # `is None` rather than truthiness, so a legitimate loss of 0.0 is not treated as "unset".
    if best_val_loss is None or val_loss < best_val_loss:
        print(f"<<<<model saved>>>>")
        torch.save(transformer.state_dict(), '../models/conformer_FT.pt')
        best_val_loss = val_loss
        count = 0
    else:
        count += 1
        if count >= PATIENCE:
            print("overfitting")
            break
            
    
    

## 모델 평가 Validation

In [None]:
# Fresh model with the training hyper-parameters, used to hold the saved weights.
transformer_T = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE, NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, que_len, FFN_HID_DIM,
                                 pretrain = False)
transformer_T = transformer_T.to(device)
# map_location lets a checkpoint written on a GPU load on a CPU-only machine as well.
# NOTE(review): this path differs from where the training loop saves
# ('../models/conformer_FT.pt') -- confirm which checkpoint is meant.
transformer_T.load_state_dict(torch.load('../saved_weights/conformer_fasttext.pt', map_location = device))

In [None]:
# Evaluation file read without a header row, so its columns are addressed as 0, 1, 2.
test_csv = pd.read_csv("../data/math_test_1000" , header=None)


def data_format(df, is_stopwords = False):
    '''
        Collect columns 0, 1 and 2 of a data frame into a list of row records.

        Args:
            df (DataFrame) : frame with columns labelled 0, 1 and 2; rows are looked
                             up by the labels 0 .. len(df[0]) - 1
            is_stopwords (bool) : accepted but not used by this function

        Returns:
            list_ (list): one [df[0][i], df[1][i], df[2][i]] list per row, in row order
                          (presumably [question, equation, answer] — confirm against
                          the collate function that consumes these rows)

            ex) list_ = [['<word problem text>', '((48 / 2) / (2 + 1)) * 2', <answer>], ...]
    '''
    row_count = len(df[0])
    return [[df[column][row] for column in (0, 1, 2)] for row in range(row_count)]

# Row records for the test split, served in file order; drop_last=True discards
# a trailing batch that has fewer than 10 rows.
test_pairs = data_format(df=test_csv)

real_loader = DataLoader(
    dataset=test_pairs,
    batch_size=10,
    shuffle=False,
    collate_fn=collate_fn,
    drop_last=True,
)


In [None]:
from torchmetrics.text import CharErrorRate
from torchmetrics.text import WordErrorRate
from torchmetrics.text.rouge import ROUGEScore

# Metric objects shared by the evaluation cells below.
wer = WordErrorRate()
cer = CharErrorRate()
# Indexing the metric is meant to make `rouge(preds, target)` return only the
# 'rougeL_fmeasure' entry of the ROUGE result; this relies on torchmetrics'
# metric-composition indexing — confirm against the installed version.
rouge = ROUGEScore(rouge_keys="rougeL")["rougeL_fmeasure"]

In [None]:
def eval_translate_bleu(model,testset):
    '''
        Greedy-decode every batch of `testset` and average equation-level metrics.

        Args:
            model : network handed to `new_greedy_decode`; switched to eval mode here
            testset : sized iterable of (src, tgt, length, ans) batches. `src` and `tgt`
                      are indexed as (sequence, batch); `ans` holds one numeric answer
                      per sample

        Returns:
            tuple : (bleu, accuracy, bleu_1gram, bleu_2gram, bleu_3gram, cer, rouge).
                    Each entry is a sum over samples divided by the number of samples
                    seen; all zeros when `testset` yields no samples

        Notes:
            * A predicted equation that cannot be evaluated to a number is skipped: it
              adds nothing to any sum (including CER) but still counts in the divisor.
            * The decoded text is run through `eval`, which executes arbitrary Python
              expressions; only use this with a trusted model and vocabulary.
            * Token id 2 is passed as the decoder start symbol and id 3 ends a decoded
              sequence (presumably <sos>/<eos> of `equ_vocab` — confirm).
    '''
    total_bleu = 0
    bleu_1gram = 0
    bleu_2gram = 0
    bleu_3gram = 0
    total_cer = 0
    total_rouge = 0
    acc = 0
    num_samples = 0                      # samples actually seen, used as the divisor
    last_loader_idx = len(testset) - 1

    model.eval()
    with torch.no_grad():
        for loader_idx ,(src, tgt, length, ans) in enumerate(tqdm(testset)):
            src = src.to(device)
            tgt = tgt.to(device)
            length = length.to(device)
            ans = list(map(float, np.array(ans)))
            num_tokens = src.shape[0]    # sequence length
            batch_size = src.size(1)
            num_samples += batch_size

            src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
            tgt_tokens = new_greedy_decode(model, src, src_mask, length, max_len = num_tokens, start_symbol=2)

            # Turn each decoded column into text, stopping at the first id 3 or at
            # the end of the column when no such id was produced.
            decoded_words = []
            for batch_idx in range(batch_size):
                pieces = []
                for token_id in tgt_tokens[:, batch_idx]:
                    if token_id == 3:
                        break
                    pieces.append(equ_vocab.lookup_token(token_id))   # index to word
                decoded_words.append((" ".join(pieces)).replace("<sos>", "").replace("<eos>","").replace("<pad>", ""))

            tgt_rows = tgt.T.tolist()    # one list of token ids per sample

            for pred_idx in range(batch_size):
                predict_equ = decoded_words[pred_idx]
                try:
                    # '@' is rewritten to floor division before evaluation.
                    predict_ans = eval(predict_equ.replace('@', '//'))
                    predict_ans = float('%.02f' % predict_ans)   # keep two decimals
                except (SyntaxError, TypeError, ZeroDivisionError, ValueError, OverflowError, NameError):
                    # Not a computable expression: skip this sample's metrics.
                    continue

                if predict_ans == ans[pred_idx]:
                    acc += 1

                # Both token lists get a closing "<eos>" before scoring.
                predicted = predict_equ.split() + ["<eos>"]
                real_equ = (" ".join(equ_vocab.lookup_tokens(tgt_rows[pred_idx]))).replace("<sos>", "").replace("<eos>","").replace("<pad>", "")
                real = real_equ.split() + ["<eos>"]
                predicted_text = " ".join(predicted)
                real_text = ' '.join(real)

                # Plain weight tuples are accepted by every nltk release.
                total_bleu += sentence_bleu([real], predicted)
                bleu_1gram += sentence_bleu([real], predicted, weights = (1.0,))
                bleu_2gram += sentence_bleu([real], predicted, weights = (0.5, 0.5))
                bleu_3gram += sentence_bleu([real], predicted, weights = (0.333, 0.333, 0.334))

                total_cer += cer([predicted_text], [real_text]).item()
                total_rouge += rouge([predicted_text], [real_text]).item()

                # Show every scored sample of the final batch.
                if loader_idx == last_loader_idx:
                    print(f"sentence {pred_idx}" )
                    print("문장형 문제 =", (' '.join(pre_vocab['que'].lookup_tokens(src.T.tolist()[pred_idx]))).replace("<pad>", "").replace("<eos>",""))
                    print(f"실제 수식  = ", real_text)
                    print("실제 정답", ans[pred_idx])
                    print(f"예측 수식  = ", predicted_text)
                    print("예측 정답", predict_ans)

    # Divide by the samples seen instead of a global batch-size constant; the
    # floor of 1 avoids a division by zero for an empty loader.
    divisor = max(num_samples, 1)
    return total_bleu / divisor, acc / divisor, bleu_1gram / divisor, bleu_2gram / divisor, bleu_3gram / divisor, total_cer / divisor, total_rouge / divisor

In [None]:
# Corpus-level metrics for the restored model on `test_loader`.
(bleu_score, acc, bleu_1, bleu_2, bleu_3, all_cer, all_rouge) = eval_translate_bleu(
    model=transformer_T,
    testset=test_loader,
)
print(bleu_score, acc, bleu_1, bleu_2, bleu_3, all_cer, all_rouge)

In [None]:
def new_translate_bleu(model,testset, max_error_prints = 175):
    '''
        Greedy-decode `testset`, print each scored sample, then print summary metrics.

        Args:
            model : network handed to `new_greedy_decode`; switched to eval mode here
            testset : iterable of (src, tgt, length, ans) batches. `src` and `tgt` are
                      indexed as (sequence, batch); `ans` holds one numeric answer per
                      sample
            max_error_prints (int) : how many wrong predictions are printed in detail;
                      later ones are still counted and scored

        Returns:
            None : all results are printed

        Notes:
            * Summary values are sums divided by the number of samples seen in the
              loader, so they follow whatever dataset and batch size is passed in.
            * A predicted equation that cannot be evaluated to a number is skipped: it
              is neither printed nor scored, but still counts in the divisor.
            * The decoded text is run through `eval`, which executes arbitrary Python
              expressions; only use this with a trusted model and vocabulary.
            * Token id 2 is passed as the decoder start symbol and id 3 ends a decoded
              sequence (presumably <sos>/<eos> of `equ_vocab` — confirm).
    '''
    bleu_1gram = 0
    bleu_2gram = 0
    bleu_3gram = 0
    total_cer = 0
    total_rouge = 0
    new_bleu_acc = 0

    correct = 0
    error_count = 0      # every evaluated-but-wrong prediction, printed or not
    num_samples = 0      # samples seen so far; also the 0-based offset of a batch
    data_index = []      # 1-based positions of the correctly answered samples

    model.eval()
    with torch.no_grad():
        for src, tgt, length, ans in testset:
            src = src.to(device)
            tgt = tgt.to(device)
            length = length.to(device)

            ans = list(map(float, np.array(ans)))
            num_tokens = src.shape[0]    # sequence length
            batch_size = src.size(1)
            batch_offset = num_samples
            num_samples += batch_size

            src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
            tgt_tokens = new_greedy_decode(model, src, src_mask, length, max_len = num_tokens, start_symbol=2)

            # Turn each decoded column into text, stopping at the first id 3 or at
            # the end of the column when no such id was produced.
            decoded_words = []
            for batch_idx in range(batch_size):
                pieces = []
                for token_id in tgt_tokens[:, batch_idx]:
                    if token_id == 3:
                        break
                    pieces.append(equ_vocab.lookup_token(token_id))   # index to word
                decoded_words.append((" ".join(pieces)).replace("<sos>", "").replace("<eos>","").replace("<pad>", ""))

            src_rows = src.T.tolist()    # one list of token ids per sample
            tgt_rows = tgt.T.tolist()

            for pred_idx in range(batch_size):
                predict_equ = decoded_words[pred_idx]
                try:
                    # '@' is rewritten to floor division before evaluation.
                    predict_ans = eval(predict_equ.replace('@', '//'))
                    predict_ans = float('%.02f' % predict_ans)   # keep two decimals
                except (SyntaxError, TypeError, ZeroDivisionError, ValueError, OverflowError, NameError):
                    # Not a computable expression: skip this sample entirely.
                    continue

                # Both token lists get a closing "<eos>" before scoring.
                predicted = predict_equ.split() + ["<eos>"]
                real_equ = (" ".join(equ_vocab.lookup_tokens(tgt_rows[pred_idx]))).replace("<sos>", "").replace("<eos>","").replace("<pad>", "")
                real = real_equ.split() + ["<eos>"]
                predicted_text = " ".join(predicted)
                real_text = ' '.join(real)

                is_correct = ans[pred_idx] == predict_ans
                if is_correct:
                    correct += 1
                    data_number = batch_offset + pred_idx + 1
                    data_index.append(data_number)
                    show_sample = True
                    print("***********정답******************")
                    print(f"데이터 몇번째 :{data_number} ")
                else:
                    error_count += 1
                    # The cap limits printing only; scoring below is unconditional.
                    show_sample = error_count <= max_error_prints
                    if show_sample:
                        print("^^^^^^^^^^오답^^^^^^^^^^^^^^^^^^^^^^^^^")

                if show_sample:
                    print("문장형 문제 =", (' '.join(pre_vocab['que'].lookup_tokens(src_rows[pred_idx]))).replace("<pad>", "").replace("<eos>",""))
                    print(f"실제 수식  = ", real_text)
                    print(f"예측 수식  = ", predicted_text)
                    print("실제 정답", ans[pred_idx])
                    print("예측 정답", predict_ans)

                # Plain weight tuples are accepted by every nltk release.
                new_bleu_acc += sentence_bleu([real], predicted, smoothing_function = None)
                bleu_1gram += sentence_bleu([real], predicted, weights = (1.0,))
                bleu_2gram += sentence_bleu([real], predicted, weights = (0.5, 0.5))
                bleu_3gram += sentence_bleu([real], predicted, weights = (0.333, 0.333, 0.334))
                total_cer += cer([predicted_text], [real_text]).item()
                total_rouge += rouge([predicted_text], [real_text]).item()

                if is_correct:
                    print("*********************************")

    # The floor of 1 avoids a division by zero for an empty loader.
    divisor = max(num_samples, 1)
    print(f"total {num_samples}/ correct:{correct/divisor} ")
    print(f"correct : {correct}")
    print(f"data index",data_index)
    print(f"total {num_samples} bleu :{new_bleu_acc/ divisor}")
    print(f"total {num_samples} bleu_1 :{bleu_1gram/ divisor}")
    print(f"total {num_samples} bleu_2 :{bleu_2gram/ divisor}")
    print(f"total {num_samples} bleu_3:{bleu_3gram/ divisor}")
    print(f"total cer :{total_cer}")
    print(f"total {num_samples} cer :{total_cer/ divisor}")
    print(f"total {num_samples} rouge :{total_rouge/ divisor}")

    print(f"error count :{error_count}")

    return

In [None]:
# Wall-clock timestamps taken just before and just after the detailed evaluation pass.
s_score = timer()
new_translate_bleu(model=transformer_T, testset=real_loader)
e_score = timer()