# 언어 모델의 기초적 이해 2

### 지난 시간 복습

지난 시간에 우리는 텍스트 데이터를 분석해서, 각 단어 다음에 등장할 확률을 직접 세어서 구한 다음, 계산한 확률을 토대로 문장을 생성해 나갔습니다.

여기에서, 우리는 수많은 문제점을 체험할 수 있었습니다:

1. 우리가 가지고 있는 텍스트 데이터에 시작 단어가 존재하지 않는 경우, 문장을 생성하지 못합니다.
2. 문장을 생성하더라도, 문맥의 흐름을 파악하지 못하고 자연스럽지 않은 문장이 생성되기도 합니다.

### 언어 모델 토큰화 다시 복습하기

In [1]:
from transformers import AutoTokenizer

sample_text = "선생님 너무 잘생겼어요!"
tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2")

tokens = tokenizer.tokenize(sample_text)
print(tokens)

actual_tokens = tokenizer(sample_text, return_tensors="pt").input_ids
print(actual_tokens)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

['▁선생', '님', '▁너무', '▁잘', '생', '겼', '어', '요', '!']
tensor([[22662,  7177, 12371,  9443,  7777,  6883,  8006,  8084,   376]])


### 지난 시간의 코드

In [2]:
!pip install wikipedia-api

Collecting wikipedia-api
  Downloading wikipedia_api-0.8.1.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia-api
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia-api: filename=Wikipedia_API-0.8.1-py3-none-any.whl size=15383 sha256=e9df0d21fb2335a1c2cc8449c789a819a90ce0e0eb7ade97183be6d18d7c78a4
  Stored in directory: /root/.cache/pip/wheels/0b/0f/39/e8214ec038ccd5aeb8c82b957289f2f3ab2251febeae5c2860
Successfully built wikipedia-api
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.8.1


In [3]:
import re
import random
from tqdm import tqdm
from collections import defaultdict, Counter
from transformers import AutoTokenizer

def split_text_to_sentences(text):
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    return sentences

def remove_text_from_start_end_marker(text, start_marker='(', end_marker=')'):
    return re.sub(r'\{}.*?\{}'.format(re.escape(start_marker), re.escape(end_marker)), '', text).strip()

def clean_text_data(text):
    print("Cleaning text data...")
    sentences = split_text_to_sentences(text)
    sentences = [i.lower() for i in sentences] # make sentence lower cased. e.g. "Hello World" -> "hello world"
    sentences = [remove_text_from_start_end_marker(i) for i in sentences] # remove parentheses and their content. e.g. "hello world (test)" -> "hello world"
    to_replace = ["!", ";", '\n', '</p>', '<a', 'id=', "href=", 'title=', 'class=', '</a>', '(', ')', '}', '{',
                  '</sup>', '<p>', '</b>', '<sup', '>', '<', '\\', '-']
    replace_with = ''
    cleaned_sentences = []
    for i in sentences:
        word_array = i.split()
        word_array_new = []
        for word in word_array:
            for to_replace_val in to_replace:
                word = word.replace(to_replace_val, replace_with)
            word_array_new.append(word)
        cleaned_sentence = ' '.join(word_array_new).strip()
        cleaned_sentence = re.sub(r'\s+', ' ', cleaned_sentence) # Remove extra whitespaces
        cleaned_sentences.append(cleaned_sentence)
    print("Cleaning complete.")
    return cleaned_sentences

def compute_next_token_probabilities(sentences, given_token_text, tokenizer=None):
    if tokenizer is None:
        print("No tokenizer provided. Creating a new tokenizer.")
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    tokens = []
    for sentence in sentences:
        token_ids = tokenizer.encode(sentence, add_special_tokens=False)
        tokens.extend(token_ids)
    given_token_id = tokenizer.convert_tokens_to_ids(given_token_text)
    next_token_counts = defaultdict(Counter)
    for current_token, next_token in zip(tokens[:-1], tokens[1:]):
        next_token_counts[current_token][next_token] += 1
    total_next = sum(next_token_counts[given_token_id].values())
    if total_next == 0:
        return {}
    probabilities = {
        tokenizer.convert_ids_to_tokens(token_id): count / total_next
        for token_id, count in next_token_counts[given_token_id].items()
    }
    return probabilities

def compute_next_token_counts(tokens):
    next_token_counts = defaultdict(Counter)
    for current_token, next_token in zip(tokens[:-1], tokens[1:]):
        next_token_counts[current_token][next_token] += 1
    return next_token_counts

def prepare_token_data(sentences, tokenizer=None):
    if tokenizer is None:
        print("No tokenizer provided. Creating a new tokenizer.")
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    tokens = []
    for sentence in tqdm(sentences, desc="Tokenizing..."):
        token_ids = tokenizer.encode(sentence, add_special_tokens=False)
        tokens.extend(token_ids)
    return tokens

def sample_next_token(next_counts):
    tokens, counts = zip(*next_counts.items())
    total = sum(counts)
    probabilities = [count / total for count in counts]
    return random.choices(tokens, weights=probabilities, k=1)[0]

def random_sample_generate_sentence(next_token_counts, start_token_text, tokenizer=None, max_length=20):
    if tokenizer is None:
        print("No tokenizer provided. Creating a new tokenizer.")
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    current_token_id = tokenizer.convert_tokens_to_ids(start_token_text.lower())
    generated_tokens = [current_token_id]
    print("Generating a sentence with random sampling...")
    for _ in tqdm(range(max_length)):
        next_counts = next_token_counts.get(current_token_id, None)
        if not next_counts:
            break  # No next token found
        next_token_id = sample_next_token(next_counts)
        generated_tokens.append(next_token_id)
        current_token_id = next_token_id
        token_text = tokenizer.convert_ids_to_tokens(current_token_id)
        if token_text in ['.', '!', '?', tokenizer.sep_token, tokenizer.pad_token]:
            break
    generated_text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(generated_tokens))
    return generated_text


In [4]:
# Example usage:
import wikipediaapi

wiki_wiki = wikipediaapi.Wikipedia('MyProjectName', 'en',
        extract_format=wikipediaapi.ExtractFormat.WIKI
)

p_wiki = wiki_wiki.page("Breakfast")
text = p_wiki.text
sentences = clean_text_data(text)

start_token_text = 'breakfast'
tokens = prepare_token_data(sentences, tokenizer=tokenizer)
next_token_counts = compute_next_token_counts(tokens)
generated_sentence = random_sample_generate_sentence(next_token_counts, start_token_text)
print("Generated sentence (random sample):")
print(generated_sentence)

Cleaning text data...
Cleaning complete.


Tokenizing...: 100%|██████████| 146/146 [00:00<00:00, 8793.09it/s]

No tokenizer provided. Creating a new tokenizer.





tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Generating a sentence with random sampling...


  0%|          | 0/20 [00:00<?, ?it/s]

Generated sentence (random sample):
breakfast





### 더 많은 데이터!

간단한 해결책은, 더 많은 텍스트 데이터를 사용하는 것입니다! 데이터는 많을 수록, 실제 언어 모델과 비슷해집니다.


In [5]:
!pip install mwparserfromhell
!pip install datasets

Collecting mwparserfromhell
  Downloading mwparserfromhell-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Downloading mwparserfromhell-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (196 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/196.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m102.4/196.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.3/196.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mwparserfromhell
Successfully installed mwparserfromhell-0.6.6
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.m

In [6]:
from datasets import load_dataset

wikipedia_dataset = load_dataset('wikipedia', '20220301.en')
print(wikipedia_dataset)

README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

wikipedia.py:   0%|          | 0.00/36.7k [00:00<?, ?B/s]

The repository for wikipedia contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/wikipedia.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0/41 [00:00<?, ?files/s]

train-00000-of-00041.parquet:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

train-00001-of-00041.parquet:   0%|          | 0.00/705M [00:00<?, ?B/s]

train-00002-of-00041.parquet:   0%|          | 0.00/558M [00:00<?, ?B/s]

train-00003-of-00041.parquet:   0%|          | 0.00/491M [00:00<?, ?B/s]

train-00004-of-00041.parquet:   0%|          | 0.00/431M [00:00<?, ?B/s]

train-00005-of-00041.parquet:   0%|          | 0.00/391M [00:00<?, ?B/s]

train-00006-of-00041.parquet:   0%|          | 0.00/366M [00:00<?, ?B/s]

train-00007-of-00041.parquet:   0%|          | 0.00/326M [00:00<?, ?B/s]

train-00008-of-00041.parquet:   0%|          | 0.00/329M [00:00<?, ?B/s]

train-00009-of-00041.parquet:   0%|          | 0.00/312M [00:00<?, ?B/s]

train-00010-of-00041.parquet:   0%|          | 0.00/267M [00:00<?, ?B/s]

train-00011-of-00041.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

train-00012-of-00041.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

train-00013-of-00041.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

train-00014-of-00041.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

train-00015-of-00041.parquet:   0%|          | 0.00/236M [00:00<?, ?B/s]

train-00016-of-00041.parquet:   0%|          | 0.00/215M [00:00<?, ?B/s]

train-00017-of-00041.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

train-00018-of-00041.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

train-00019-of-00041.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

train-00020-of-00041.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

train-00021-of-00041.parquet:   0%|          | 0.00/255M [00:00<?, ?B/s]

train-00022-of-00041.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

train-00023-of-00041.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

train-00024-of-00041.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00025-of-00041.parquet:   0%|          | 0.00/218M [00:00<?, ?B/s]

train-00026-of-00041.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

train-00027-of-00041.parquet:   0%|          | 0.00/206M [00:00<?, ?B/s]

train-00028-of-00041.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00029-of-00041.parquet:   0%|          | 0.00/219M [00:00<?, ?B/s]

train-00030-of-00041.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

train-00031-of-00041.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

train-00032-of-00041.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

train-00033-of-00041.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

train-00034-of-00041.parquet:   0%|          | 0.00/201M [00:00<?, ?B/s]

train-00035-of-00041.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00036-of-00041.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00037-of-00041.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00038-of-00041.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

train-00039-of-00041.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00040-of-00041.parquet:   0%|          | 0.00/185M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6458670 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 6458670
    })
})


In [7]:
from tqdm import tqdm

num_total_data = 10000
wikipedia_text_combined = ''
for i, data in enumerate(tqdm(wikipedia_dataset['train'])):
    wikipedia_text_combined += data['text']
    if i > num_total_data:
        break

  0%|          | 3795/6458670 [07:14<205:26:23,  8.73it/s]


KeyboardInterrupt: 

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
wikipedia_sentences = clean_text_data(wikipedia_text_combined)
tokens = prepare_token_data(wikipedia_sentences, tokenizer=tokenizer)
next_token_counts = compute_next_token_counts(tokens)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Cleaning text data...
Cleaning complete.


Tokenizing...:   4%|▍         | 24986/578005 [00:03<01:08, 8053.14it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1235 > 1024). Running this sequence through the model will result in indexing errors
Tokenizing...: 100%|██████████| 578005/578005 [01:20<00:00, 7200.83it/s]


In [9]:
start_token_text = 'the'
generated_sentence = random_sample_generate_sentence(
    next_token_counts, start_token_text,
    tokenizer=tokenizer
)
print("Generated sentence (random sample):")
print(generated_sentence)

Generating a sentence with random sampling...


 10%|█         | 2/20 [00:00<00:00, 108.54it/s]

Generated sentence (random sample):
the two milliseconds.





### 학습으로 넘어가기

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


# Prepare dataset
class NextTokenDataset(Dataset):
    def __init__(self, sentences, tokenizer):
        self.inputs = []
        self.targets = []
        for sentence in sentences:
            tokens = tokenizer.encode(sentence, add_special_tokens=True)
            for i in range(len(tokens) - 1):
                self.inputs.append(tokens[i])
                self.targets.append(tokens[i+1])

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return torch.tensor(self.inputs[idx]), torch.tensor(self.targets[idx])

# Define simple MLP model
class SimpleMLP(nn.Module):
    def __init__(self, vocab_size, hidden_dim=128):
        super(SimpleMLP, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, vocab_size)
        )

    def forward(self, x):
        x = self.embedding(x)
        return self.mlp(x)

In [None]:
# Hyperparameters
batch_size = 64
epochs = 10
learning_rate = 1e-3

# Dataset and DataLoader
dataset = NextTokenDataset(wikipedia_sentences, tokenizer)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Model, Loss, Optimizer
model = SimpleMLP(vocab_size=tokenizer.vocab_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    total_loss = 0
    for prev_token, next_token in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
        optimizer.zero_grad()
        logits = model(prev_token)
        loss = criterion(logits, next_token)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f}")

Epoch 1:  34%|███▍      | 91948/270059 [5:22:28<13:42:18,  3.61it/s]

In [None]:
# Inference example
def generate_sequence(model, tokenizer, start_token, max_len=10):
    model.eval()
    token = tokenizer.encode(start_token, add_special_tokens=False)[-1]
    output_tokens = [token]

    for _ in range(max_len):
        input_tensor = torch.tensor([token])
        with torch.no_grad():
            logits = model(input_tensor)
            next_token = torch.argmax(logits, dim=-1).item()
            output_tokens.append(next_token)
            token = next_token

    return tokenizer.decode(output_tokens)

# Example generation
print(generate_sequence(model, tokenizer, "The"))

### (선택) 문맥의 흐름을 파악하여 자연스러운 문장 만들기

In [None]:
# Prepare dataset
class NextTokenDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_seq_len=32):
        self.inputs = []
        self.targets = []
        for sentence in sentences:
            tokens = tokenizer.encode(sentence, add_special_tokens=True)
            for i in range(1, len(tokens)):
                input_seq = tokens[:i]
                target = tokens[i]
                if len(input_seq) > max_seq_len:
                    input_seq = input_seq[-max_seq_len:]
                self.inputs.append(input_seq)
                self.targets.append(target)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_seq = self.inputs[idx]
        target = self.targets[idx]
        return torch.tensor(input_seq), torch.tensor(target)

def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=tokenizer.pad_token_id)
    targets = torch.stack(targets)
    return inputs, targets

# Define LSTM model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, hidden_dim=128, num_layers=1):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        output, _ = self.lstm(x)
        output = output[:, -1, :]  # take the output of the last token
        return self.fc(output)

In [None]:
# Hyperparameters
batch_size = 64
epochs = 10
learning_rate = 1e-3

# Dataset and DataLoader
dataset = NextTokenDataset(sentences, tokenizer)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Model, Loss, Optimizer
lstm = LSTMModel(vocab_size=tokenizer.vocab_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    total_loss = 0
    for input_seq, next_token in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
        optimizer.zero_grad()
        logits = lstm(input_seq)
        loss = criterion(logits, next_token)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f}")

In [None]:
# Inference example
def generate_sequence(model, tokenizer, start_text, max_len=10):
    model.eval()
    tokens = tokenizer.encode(start_text, add_special_tokens=False)
    tokens = torch.tensor(tokens).unsqueeze(0)

    for _ in range(max_len):
        with torch.no_grad():
            logits = model(tokens)
            next_token = torch.argmax(logits, dim=-1).item()
            tokens = torch.cat([tokens, torch.tensor([[next_token]])], dim=1)

    return tokenizer.decode(tokens.squeeze().tolist())

# Example generation
print(generate_sequence(lstm, tokenizer, "The"))

### 문제점 논의하기