# 언어 모델의 기초적 이해 2

### 지난 시간 복습

지난 시간에 우리는 텍스트 데이터를 분석하여 각 단어 다음에 어떤 단어가 등장하는지를 직접 세어 확률을 구하고, 그 확률을 토대로 문장을 생성해 보았습니다.

이 과정에서 우리는 다음과 같은 문제점을 확인할 수 있었습니다:

1. 우리가 가지고 있는 텍스트 데이터에 시작 단어가 존재하지 않는 경우, 문장을 생성하지 못합니다.
2. 문장을 생성하더라도, 문맥의 흐름을 파악하지 못해 부자연스러운 문장이 만들어지기도 합니다.

### 지난 시간의 코드

In [None]:
!pip install wikipedia-api

In [None]:
import re
import random
from collections import defaultdict, Counter
from transformers import AutoTokenizer

def split_text_to_sentences(text):
    """Split ``text`` into sentences at whitespace that follows '.', '!' or '?'.

    Leading and trailing whitespace is stripped before splitting, and the
    terminating punctuation stays attached to its sentence. Text with no
    such boundary (including the empty string) is returned as a
    one-element list.
    """
    # The lookbehind keeps the punctuation; only the whitespace run is consumed.
    sentence_boundary = re.compile(r'(?<=[.!?])\s+')
    trimmed = text.strip()
    return sentence_boundary.split(trimmed)

def remove_text_from_start_end_marker(text, start_marker='(', end_marker=')'):
    """Delete every span that runs from ``start_marker`` to the next ``end_marker``.

    Both markers are matched literally and are removed together with the
    text between them. Matching is non-greedy, so each start marker pairs
    with the nearest following end marker. Because '.' does not match a
    newline here, a span that crosses a line break is left untouched.

    Args:
        text: The string to clean.
        start_marker: Literal string that opens a span (default '(').
        end_marker: Literal string that closes a span (default ')').

    Returns:
        ``text`` with all marked spans removed and leading/trailing
        whitespace stripped. Whitespace left in the middle of the string
        by a removed span is not collapsed.
    """
    # re.escape already neutralises regex metacharacters. Prefixing another
    # backslash would turn '\(' into an escaped backslash followed by a bare
    # group opener, so the escaped markers are used exactly as returned.
    pattern = re.escape(start_marker) + r'.*?' + re.escape(end_marker)
    return re.sub(pattern, '', text).strip()

def clean_text_data(text):
    """Turn raw article text into a list of cleaned, lower-cased fragments.

    Steps, in order:
      1. Split the text into sentences with ``split_text_to_sentences``.
      2. Lower-case each sentence and pass it through
         ``remove_text_from_start_end_marker`` using its default markers.
      3. Break every sentence at commas and strip each piece.
      4. Within each whitespace-delimited word, delete a fixed list of
         punctuation / HTML-remnant substrings.
      5. Collapse runs of whitespace to a single space.
      6. Keep only fragments that contain more than 10 words.

    Returns:
        A list of cleaned fragment strings.
    """
    # Substrings removed from every word; they are applied in this order.
    unwanted = ["!", ";", '\n', '</p>', '<a', 'id=', "href=", 'title=', 'class=', '</a>', '(', ')', '}', '{',
                '</sup>', '<p>', '</b>', '<sup', '>', '<', '\\', '-']

    def scrub(word):
        # Remove each unwanted substring from a single word.
        for fragment in unwanted:
            word = word.replace(fragment, '')
        return word

    normalized = [
        remove_text_from_start_end_marker(sentence.lower())
        for sentence in split_text_to_sentences(text)
    ]
    clauses = [
        piece.strip()
        for sentence in normalized
        for piece in sentence.split(',')
    ]

    kept = []
    for clause in clauses:
        rebuilt = ' '.join(scrub(word) for word in clause.split()).strip()
        # Words emptied by scrubbing leave double spaces behind; squeeze them.
        rebuilt = re.sub(r'\s+', ' ', rebuilt)
        if len(rebuilt.split()) > 10:
            kept.append(rebuilt)
    return kept

def compute_next_token_probabilities(sentences, given_token_text, tokenizer=None):
    """Estimate which tokens follow ``given_token_text`` and how often.

    All sentences are encoded (without special tokens) and concatenated
    into one token stream, so the last token of one sentence is counted as
    being followed by the first token of the next.

    Args:
        sentences: Iterable of strings to tokenize.
        given_token_text: Token whose successors are wanted; it is mapped to
            an id with ``tokenizer.convert_tokens_to_ids``.
        tokenizer: Tokenizer to use. When ``None``, "bert-base-uncased" is
            loaded via ``AutoTokenizer``.

    Returns:
        A dict mapping each successor token (as text) to its relative
        frequency, or an empty dict if the given token never has a successor.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Flatten every sentence into one continuous stream of token ids.
    stream = [
        token_id
        for sentence in sentences
        for token_id in tokenizer.encode(sentence, add_special_tokens=False)
    ]
    target_id = tokenizer.convert_tokens_to_ids(given_token_text)

    # Bigram table: table[current][following] -> number of occurrences.
    table = defaultdict(Counter)
    for before, after in zip(stream, stream[1:]):
        table[before][after] += 1

    followers = table[target_id]
    denominator = sum(followers.values())
    if denominator == 0:
        return {}

    distribution = {}
    for token_id, count in followers.items():
        distribution[tokenizer.convert_ids_to_tokens(token_id)] = count / denominator
    return distribution

def compute_next_token_counts(tokens):
    """Count, for every token, how often each other token directly follows it.

    Args:
        tokens: A sliceable sequence of hashable tokens.

    Returns:
        A ``defaultdict(Counter)`` where ``result[current][following]`` is
        the number of times ``following`` came immediately after ``current``.
    """
    transitions = defaultdict(Counter)
    # Pair each token with its successor; zip stops at the shorter sequence.
    for previous, following in zip(tokens, tokens[1:]):
        successors = transitions[previous]
        successors[following] += 1
    return transitions

def prepare_token_data(sentences, tokenizer=None):
    """Encode every sentence and concatenate the ids into one flat list.

    Args:
        sentences: Iterable of strings to tokenize.
        tokenizer: Tokenizer to use. When ``None``, "bert-base-uncased" is
            loaded via ``AutoTokenizer``.

    Returns:
        A single list of token ids, in sentence order, encoded without
        special tokens.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    return [
        token_id
        for sentence in sentences
        for token_id in tokenizer.encode(sentence, add_special_tokens=False)
    ]

def sample_next_token(next_counts):
    """Randomly pick one token, weighted by its observed count.

    Args:
        next_counts: Non-empty mapping of candidate token -> count.

    Returns:
        One key of ``next_counts``, drawn with probability proportional to
        its count.
    """
    candidates, frequencies = zip(*next_counts.items())
    denominator = sum(frequencies)
    # Normalise the counts so the weights sum to one.
    weights = [frequency / denominator for frequency in frequencies]
    (chosen,) = random.choices(candidates, weights=weights, k=1)
    return chosen

def random_sample_generate_sentence(sentences, start_token_text, tokenizer=None, max_length=20):
    """Generate text by repeatedly sampling the next token from bigram counts.

    The sentences are tokenized and turned into a successor-count table.
    Starting from ``start_token_text``, a successor is sampled in proportion
    to its count and appended, at most ``max_length`` times.

    Generation stops early when the current token has no recorded
    successor, or when the sampled token is '.', '!', '?', or the
    tokenizer's ``sep_token`` / ``pad_token``.

    Args:
        sentences: Iterable of strings used as the corpus.
        start_token_text: Token to start from; it is mapped to an id with
            ``tokenizer.convert_tokens_to_ids``.
        tokenizer: Tokenizer to use. When ``None``, "bert-base-uncased" is
            loaded via ``AutoTokenizer``.
        max_length: Maximum number of tokens sampled after the start token.

    Returns:
        The generated tokens, start token included, joined into a string
        by the tokenizer.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    transitions = compute_next_token_counts(
        prepare_token_data(sentences, tokenizer=tokenizer)
    )
    sequence = [tokenizer.convert_tokens_to_ids(start_token_text)]

    for _ in range(max_length):
        candidates = transitions.get(sequence[-1], None)
        if not candidates:
            # Dead end: nothing was ever observed after this token.
            break
        sequence.append(sample_next_token(candidates))
        latest_text = tokenizer.convert_ids_to_tokens(sequence[-1])
        if latest_text in ['.', '!', '?', tokenizer.sep_token, tokenizer.pad_token]:
            break

    pieces = tokenizer.convert_ids_to_tokens(sequence)
    return tokenizer.convert_tokens_to_string(pieces)


In [None]:
# Example usage: fetch one Wikipedia article, clean it, and sample a
# sentence from the resulting bigram counts.
import wikipediaapi

# NOTE(review): the first positional argument is presumably the user-agent
# string and 'en' the language edition — confirm against the installed
# wikipedia-api version.
wiki_wiki = wikipediaapi.Wikipedia('MyProjectName', 'en',
        extract_format=wikipediaapi.ExtractFormat.WIKI
)

# Pull the text of the "Breakfast" page and run it through the cleaner.
p_wiki = wiki_wiki.page("Breakfast")
text = p_wiki.text
sentences = clean_text_data(text)

# No tokenizer is passed, so random_sample_generate_sentence loads
# "bert-base-uncased" itself.
start_token_text = 'breakfast'
generated_sentence = random_sample_generate_sentence(sentences, start_token_text)
print("Generated sentence (random sample):")
print(generated_sentence)

### 더 많은 데이터!


### 학습으로 넘어가기

### 문맥의 흐름을 파악하여 자연스러운 문장 만들기

### 문제점 논의하기