## Padding
- 자연어 처리에서 각 문장(문서)의 길이는 다를 수 있음
- 그러나 언어모델은 고정된 길이의 데이터를 효율적으로 처리함 -> 모든 문장의 길이를 동일하게 맞춰주는 작업이 필요함 == 패딩

**패딩 이점**
1. 일관된 입력 형식
2. 병렬 연산 최적화
3. 유연한 데이터 처리

In [1]:
preprocessed_sentences = [['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'],
                          ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'],
                          ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'],
                          ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
                          ['barber', 'went', 'huge', 'mountain']]

### 직접 구현

In [2]:
import torch
from collections import Counter

class TokenizerForPadding:
    def __init__(self, num_worlds=None, oov_token='<oov'):
        self.num_words = num_worlds
        self.oov_token = oov_token
        self.word_index = {}
        self.index_word = {}
        self.word_counts = Counter()
        
    def fit_on_texts(self, texts):
        # 빈도수 세기
        for sentence in texts:
            self.word_counts.update(word for word in sentence if word)
            
        # 빈도수 기반  vocabulary 생성 (num_words 만큼만)    
        vocab = [self.oov_token] + [word for word, _ in self.word_counts.most_common(self.num_words - 2 if self.num_words else None)]
        
        self.word_index = {word: i + 1 for i, word in enumerate(vocab)}
        self.index_word = {i + 1: word for i, word in enumerate(vocab)}
        
    def texts_to_sequences(self, texts):
        return [[self.word_index.get(word, self.word_index[self.oov_token]) for word in sentence] for sentence in texts]


In [3]:
def pad_sequences(sequenes, maxlen=None, padding='pre', truncating='pre', value=0):
    if maxlen is None:
        maxlen = max(len(seq) for seq in sequenes)
        
    padded_sequences = []
    for seq in sequenes:
        if len(seq) > maxlen:
            if truncating == 'pre':
                seq = seq[-maxlen:]
            else:
                seq = seq[:maxlen]
        else:
            pad_length = maxlen - len(seq)
            if padding == 'pre':
                seq = [value] * pad_length + seq
            else:
                seq = seq + value * pad_length
        padded_sequences.append(seq)
        
    return torch.tensor(padded_sequences, dtype=torch.long)

In [4]:
tokenizer = TokenizerForPadding(num_worlds=15)
tokenizer.fit_on_texts(preprocessed_sentences)
sequenes = tokenizer.texts_to_sequences(preprocessed_sentences)
sequenes

[[2, 6],
 [2, 9, 6],
 [2, 4, 6],
 [10, 3],
 [3, 5, 4, 3],
 [4, 3],
 [2, 5, 7],
 [2, 5, 7],
 [2, 5, 3],
 [8, 8, 4, 3, 11, 2, 12],
 [2, 13, 4, 14]]

In [5]:
padded = pad_sequences(sequenes, padding='post', maxlen=5, truncating='post', value=[0])
padded

tensor([[ 2,  6,  0,  0,  0],
        [ 2,  9,  6,  0,  0],
        [ 2,  4,  6,  0,  0],
        [10,  3,  0,  0,  0],
        [ 3,  5,  4,  3,  0],
        [ 4,  3,  0,  0,  0],
        [ 2,  5,  7,  0,  0],
        [ 2,  5,  7,  0,  0],
        [ 2,  5,  3,  0,  0],
        [ 8,  8,  4,  3, 11],
        [ 2, 13,  4, 14,  0]])

### keras Tokenizer 이용

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_sentences)
sequenes = tokenizer.texts_to_sequences(preprocessed_sentences)
sequenes

[[1, 5],
 [1, 8, 5],
 [1, 3, 5],
 [9, 2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4, 6],
 [1, 4, 6],
 [1, 4, 2],
 [7, 7, 3, 2, 10, 1, 11],
 [1, 12, 3, 13]]

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

padded = pad_sequences(sequenes, padding='post', maxlen=3, truncating='post', value=[0])
padded

array([[ 1,  5,  0],
       [ 1,  8,  5],
       [ 1,  3,  5],
       [ 9,  2,  0],
       [ 2,  4,  3],
       [ 3,  2,  0],
       [ 1,  4,  6],
       [ 1,  4,  6],
       [ 1,  4,  2],
       [ 7,  7,  3],
       [ 1, 12,  3]], dtype=int32)

##### 어린왕자 데이터 샘플 패딩처리

1. 텍스트 전처리 (토큰화/불용어처리/정제/정규화)
2. 정수 인코딩 Tokenizer (tensorflow.keras)
3. 패딩 처리 pad_sequences (tensorflow.keras)

In [8]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

In [9]:
raw_text = """The Little Prince, written by Antoine de Saint-Exupéry, is a poetic tale about a young prince who travels from his home planet to Earth. The story begins with a pilot stranded in the Sahara Desert after his plane crashes. While trying to fix his plane, he meets a mysterious young boy, the Little Prince.

The Little Prince comes from a small asteroid called B-612, where he lives alone with a rose that he loves deeply. He recounts his journey to the pilot, describing his visits to several other planets. Each planet is inhabited by a different character, such as a king, a vain man, a drunkard, a businessman, a geographer, and a fox. Through these encounters, the Prince learns valuable lessons about love, responsibility, and the nature of adult behavior.

On Earth, the Little Prince meets various creatures, including a fox, who teaches him about relationships and the importance of taming, which means building ties with others. The fox's famous line, "You become responsible, forever, for what you have tamed," resonates with the Prince's feelings for his rose.

Ultimately, the Little Prince realizes that the essence of life is often invisible and can only be seen with the heart. After sharing his wisdom with the pilot, he prepares to return to his asteroid and his beloved rose. The story concludes with the pilot reflecting on the lessons learned from the Little Prince and the enduring impact of their friendship.

The narrative is a beautifully simple yet profound exploration of love, loss, and the importance of seeing beyond the surface of things."""

In [10]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# 문장 토큰화
sentences = sent_tokenize(raw_text)

# 영문 불용어 처리
en_stopwords = stopwords.words('english')

# 단어사전
vocab = {}

# 토큰화/정제/정규화 처리 결과
preprocessed_sentences = []

for sentence in sentences:
    sentence = sentence.lower()
    tokens = word_tokenize(sentence)
    token = [token for token in tokens if token not in en_stopwords]        # 불용어 제거
    token = [token for token in token if len(token) > 2]                   # 단어 길이가 2 이하면 제거
    
    for token in tokens:
        if token not in vocab:
            vocab[token] = len(vocab) + 1
        else:
            vocab[token] += 1
            
        preprocessed_sentences.append(token)
        

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=15, oov_token='<oov>')
tokenizer.fit_on_texts(preprocessed_sentences)
sequences = tokenizer.texts_to_sequences(preprocessed_sentences)

sequences

[[2],
 [9],
 [4],
 [],
 [1],
 [1],
 [1],
 [1],
 [1, 1],
 [],
 [12],
 [3],
 [1],
 [1],
 [14],
 [3],
 [1],
 [4],
 [1],
 [1],
 [1],
 [5],
 [1],
 [1],
 [10],
 [1],
 [],
 [2],
 [1],
 [1],
 [6],
 [3],
 [13],
 [1],
 [1],
 [2],
 [1],
 [1],
 [1],
 [5],
 [1],
 [1],
 [],
 [1],
 [1],
 [10],
 [1],
 [5],
 [1],
 [],
 [11],
 [1],
 [3],
 [1],
 [1],
 [1],
 [],
 [2],
 [9],
 [4],
 [],
 [2],
 [9],
 [4],
 [1],
 [1],
 [3],
 [1],
 [1],
 [1],
 [1, 1],
 [],
 [1],
 [11],
 [1],
 [1],
 [6],
 [3],
 [1],
 [1],
 [11],
 [1],
 [1],
 [],
 [11],
 [1],
 [5],
 [1],
 [10],
 [2],
 [13],
 [],
 [1],
 [5],
 [1],
 [10],
 [1],
 [1],
 [1],
 [],
 [1],
 [1],
 [12],
 [1],
 [1],
 [3],
 [1],
 [1],
 [],
 [1],
 [1],
 [3],
 [1],
 [],
 [3],
 [1],
 [1],
 [],
 [3],
 [1],
 [],
 [3],
 [1],
 [],
 [3],
 [1],
 [],
 [7],
 [3],
 [1],
 [],
 [1],
 [1],
 [1],
 [],
 [2],
 [4],
 [1],
 [1],
 [1],
 [14],
 [1],
 [],
 [1],
 [],
 [7],
 [2],
 [1],
 [8],
 [1],
 [1],
 [],
 [1],
 [1],
 [],
 [2],
 [9],
 [4],
 [1],
 [1],
 [1],
 [],
 [1],
 [3],
 [1],
 [],
 [1],
 [1

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

padded = pad_sequences(sequences, padding='post', maxlen=5, truncating='post', value=[0])
padded

array([[2, 0, 0, 0, 0],
       [9, 0, 0, 0, 0],
       [4, 0, 0, 0, 0],
       ...,
       [8, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]], dtype=int32)