# GPT-1

### 1. gpt1: 디코더만 사용하는 아키텍처: gpt-1은 디코더만 사용하며, 오로지 이전 토큰들을 기반으로 다음 토큰을 예측

### 2. 인코더-디코더 어텐션 없음: GPT-1은 인코더를 사용하지 않으며, 이전 토큰들만을 기반으로 예측을 수행

### 3. Look-Ahead Mask: GPT-1은 미래의 토큰을 보지 못하게 하는 마스크를 적용

In [68]:
import pandas as pd
import urllib.request
import tensorflow_datasets as tfds
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt
import re

urllib.request.urlretrieve("https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv", filename="ChatBotData.csv")

train_data = pd.read_csv('ChatBotData.csv')
train_data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [69]:
questions = []
for sentence in train_data['Q']:
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = sentence.strip()
    questions.append(sentence)

In [70]:
answers = []
for sentence in train_data['A']:
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = sentence.strip()
    answers.append(sentence)

In [71]:
from transformers import GPT2Tokenizer

# GPT-1 모델에 사용할 토크나이저 로드 (GPT2 토크나이저가 하위 호환 가능)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# 임의의 입력 문장을 sample_string에 저장
sample_string = questions[19]

# encode() : 텍스트 시퀀스 --> 정수 시퀀스
tokenized_string = tokenizer.encode(sample_string, add_special_tokens=False)
print(f'정수 인코딩 후의 문장: {tokenized_string}')

# decode() : 정수 시퀀스 --> 텍스트 시퀀스
original_string = tokenizer.decode(tokenized_string)
print(f'기존 문장: {original_string}')

정수 인코딩 후의 문장: [166, 108, 222, 168, 232, 97, 167, 117, 226, 31619, 226, 230, 167, 105, 112, 31619, 100, 236, 35975, 112, 31619, 224, 246, 168, 247, 242, 46695, 97, 764]
기존 문장: 가스비 너무 많이 나왔다.


In [72]:
#tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Special tokens
START_TOKEN = [tokenizer.bos_token_id]  # Beginning of sequence token
END_TOKEN = [tokenizer.eos_token_id]    # End of sequence token

MAX_LENGTH = 8

# 토큰화 / 정수 인코딩 / 시작 토큰과 종료 토큰 추가 / 패딩
def tokenize_and_filter(inputs, outputs):
    tokenized_inputs, tokenized_outputs = [], []
  
    for (sentence1, sentence2) in zip(inputs, outputs):
        # encode(토큰화 + 정수 인코딩), 시작 토큰과 종료 토큰 추가
        sentence1 = START_TOKEN + tokenizer.encode(sentence1, add_special_tokens=False) + END_TOKEN
        sentence2 = START_TOKEN + tokenizer.encode(sentence2, add_special_tokens=False) + END_TOKEN

        tokenized_inputs.append(sentence1)
        tokenized_outputs.append(sentence2)
    
    # 패딩 (시퀀스 길이를 MAX_LENGTH로 맞춤)
    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_inputs, maxlen=MAX_LENGTH, padding='post')
    tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_outputs, maxlen=MAX_LENGTH, padding='post')
    
    return tokenized_inputs, tokenized_outputs

# 예시 호출
inputs, outputs = tokenize_and_filter(questions, answers)

In [73]:
questions, answers = tokenize_and_filter(questions, answers)

In [88]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

# 입력과 출력 시퀀스에서 동일한 데이터를 사용하며, 출력은 예측할 토큰입니다.
# GPT-1은 단방향 모델이므로 입력 시퀀스로부터 다음 토큰을 예측합니다.
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'inputs': inputs,  # 토크나이즈된 입력 문장들
    },
    {
        'outputs': outputs[:, 1:]  # 출력에서 첫 번째 토큰(시작 토큰)을 제거한 것. 즉, 다음 토큰 예측을 위해 사용
    },
))

# 데이터셋을 캐시하고 섞은 뒤 배치 처리
dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

# dataset을 출력해 확인 가능
#for batch in dataset.take(1):
#    print(batch)

In [89]:
questions = list(questions)  # Ensure this is a list of strings
answers = list(answers)      # Ensure this is a list of strings

# Combine questions and answers to form a complete corpus
corpus = questions + answers

# Create the vocabulary using SubwordTextEncoder
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    corpus, target_vocab_size=2**13
)

# Assign integer values to start and end tokens
START_TOKEN = [tokenizer.vocab_size]         # Start token ID
END_TOKEN = [tokenizer.vocab_size + 1]       # End token ID

# Update the vocabulary size to account for the special tokens
VOCAB_SIZE = tokenizer.vocab_size + 2

print("Vocabulary Size:", VOCAB_SIZE)

TypeError: Expected binary or unicode string, got array([  168,   233,   250, 31619,   243,    94,  5145, 50256],
      dtype=int32)

In [90]:
print(type(questions), type(answers))  # Should be <class 'list'> for both
print(type(questions[0]), type(answers[0]))  # Should be <class 'str'> for both

<class 'list'> <class 'list'>
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [67]:
# 서브워드텍스트인코더를 사용하여 질문과 답변을 모두 포함한 단어 집합(Vocabulary) 생성
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    questions + answers, target_vocab_size=2**13)

# 시작 토큰과 종료 토큰에 대한 정수 부여.
START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]

# 시작 토큰과 종료 토큰을 고려하여 단어 집합의 크기를 + 2
VOCAB_SIZE = tokenizer.vocab_size + 2

TypeError: Expected binary or unicode string, got array([   335,    459,    347,  31787,    491,    336,   5909, 100512],
      dtype=int32)

In [91]:
class GPT1(tf.keras.Model):
    def __init__(self, vocab_size, num_layers, units, d_model, num_heads, dropout):
        super(GPT1, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        # Token embedding and positional encoding
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.pos_encoding = self.positional_encoding(10000, d_model)
        
        self.dec_layers = [self.decoder_layer(d_model, num_heads, units, dropout) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout)

    def get_config(self):
        pass

    def positional_encoding(self, position, d_model):
        angle_rads = self.get_angles(np.arange(position)[:, np.newaxis], np.arange(d_model)[np.newaxis, :], d_model)

        # Apply sine to even indices and cosine to odd indices
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

        pos_encoding = angle_rads[np.newaxis, ...]

        return tf.cast(pos_encoding, dtype=tf.float32)

    def get_angles(self, pos, i, d_model):
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        return pos * angle_rates

    def decoder_layer(self, d_model, num_heads, dff, dropout_rate):
        inputs = tf.keras.layers.Input(shape=(None, d_model))

        # Masked multi-head attention (causal masking)
        attention_output = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(inputs, inputs, attention_mask=self.causal_mask(inputs))
        attention_output = tf.keras.layers.Dropout(dropout_rate)(attention_output)
        attention_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention_output + inputs)

        # Feed Forward Network
        ffn_output = tf.keras.layers.Dense(dff, activation='relu')(attention_output)
        ffn_output = tf.keras.layers.Dense(d_model)(ffn_output)
        ffn_output = tf.keras.layers.Dropout(dropout_rate)(ffn_output)
        output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(ffn_output + attention_output)

        return tf.keras.Model(inputs=inputs, outputs=output)

    def causal_mask(self, inputs):
        seq_len = tf.shape(inputs)[1]
        causal_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        return causal_mask

    def call(self, inputs, training):
        seq_len = tf.shape(inputs)[1]

        # Embed tokens and apply positional encoding
        embeddings = self.embedding(inputs)
        embeddings *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        embeddings += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(embeddings, training=training)

        # Apply decoder layers
        for i in range(self.num_layers):
            x = self.dec_layers[i](x, training=training)

        return x

In [92]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [93]:
learning_rate = CustomSchedule(D_MODEL)

In [94]:
# 옵티마이저 설정 (Adam)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

# 손실 함수 설정
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

# 정확도 메트릭 정의
def accuracy(y_true, y_pred):
    y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))
    return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)

In [95]:
NUM_LAYERS = 2
D_MODEL = 256
NUM_HEADS = 8
DFF = 512
DROPOUT = 0.1

gpt1_model = GPT1(vocab_size=VOCAB_SIZE, num_layers=NUM_LAYERS, units=DFF, d_model=D_MODEL, num_heads=NUM_HEADS, dropout=DROPOUT)

In [96]:
gpt1_model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])

In [98]:
#outputs = tokenized_answers  # Ensure this is your tokenized answer data

# Create dataset from tensor slices, ensuring outputs are shifted appropriately
dataset = tf.data.Dataset.from_tensor_slices((inputs, outputs[:, 1:]))  # Remove the first token from outputs

# Cache, shuffle, batch, and prefetch the dataset
dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [99]:
EPOCHS = 100
gpt1_model.fit(dataset, epochs=EPOCHS)

Epoch 1/100


ValueError: in user code:

    /opt/conda/lib/python3.9/site-packages/keras/engine/training.py:853 train_function  *
        return step_function(self, iterator)
    /tmp/ipykernel_262/277250556.py:10 accuracy  *
        return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
    /opt/conda/lib/python3.9/site-packages/tensorflow/python/util/dispatch.py:206 wrapper  **
        return target(*args, **kwargs)
    /opt/conda/lib/python3.9/site-packages/keras/metrics.py:3537 sparse_categorical_accuracy
        return tf.cast(tf.equal(y_true, y_pred), backend.floatx())
    /opt/conda/lib/python3.9/site-packages/tensorflow/python/util/dispatch.py:206 wrapper
        return target(*args, **kwargs)
    /opt/conda/lib/python3.9/site-packages/tensorflow/python/ops/math_ops.py:1864 equal
        return gen_math_ops.equal(x, y, name=name)
    /opt/conda/lib/python3.9/site-packages/tensorflow/python/ops/gen_math_ops.py:3217 equal
        _, _, _op, _outputs = _op_def_library._apply_op_helper(
    /opt/conda/lib/python3.9/site-packages/tensorflow/python/framework/op_def_library.py:748 _apply_op_helper
        op = g._create_op_internal(op_type_name, inputs, dtypes=None,
    /opt/conda/lib/python3.9/site-packages/tensorflow/python/framework/func_graph.py:599 _create_op_internal
        return super(FuncGraph, self)._create_op_internal(  # pylint: disable=protected-access
    /opt/conda/lib/python3.9/site-packages/tensorflow/python/framework/ops.py:3561 _create_op_internal
        ret = Operation(
    /opt/conda/lib/python3.9/site-packages/tensorflow/python/framework/ops.py:2041 __init__
        self._c_op = _create_c_op(self._graph, node_def, inputs,
    /opt/conda/lib/python3.9/site-packages/tensorflow/python/framework/ops.py:1883 _create_c_op
        raise ValueError(str(e))

    ValueError: Dimensions must be equal, but are 7 and 8 for '{{node Equal}} = Equal[T=DT_FLOAT, incompatible_shape_error=true](Reshape, Cast_2)' with input shapes: [?,7], [?,8].
