<a href="https://colab.research.google.com/github/hail-members/llm-based-services/blob/main/chapter2_%E1%84%8B%E1%85%A5%E1%86%AB%E1%84%8B%E1%85%A5%E1%84%86%E1%85%A9%E1%84%83%E1%85%A6%E1%86%AF%E1%84%8B%E1%85%A6_%E1%84%83%E1%85%A2%E1%84%92%E1%85%A1%E1%86%AB_%E1%84%8B%E1%85%B5%E1%84%92%E1%85%A2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bag-of-Words 예제 코드

다음 코드를 Colab의 셀에 복사하여 실행하면 Bag‑of‑Words 모델의 기본 개념을 실습할 수 있습니다.


In [None]:
!pip install nltk



In [None]:
# Display the ASCII punctuation characters that the preprocessing step removes.
# NOTE: relies on `string` having been imported by the setup cell further down.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
# Walk through the preprocessing steps on the first corpus sentence.
# NOTE: needs `corpus`, `string`, and `nltk` from the setup cell to exist first.
text = corpus[0].lower()  # lowercase
print(text)

# Delete every ASCII punctuation character (each one maps to None).
text = text.translate(str.maketrans(dict.fromkeys(string.punctuation)))
print(text)

# Split the cleaned sentence into word tokens.
tokens = nltk.word_tokenize(text)
print(tokens)



llm has transformed natural language processing.
llm has transformed natural language processing
['llm', 'has', 'transformed', 'natural', 'language', 'processing']


In [None]:
# 필요한 라이브러리 import 및 NLTK 데이터 다운로드
import nltk # 자연어 처리를 위한 라이브러리
import string # 문자열 처리를 위한 기본 라이브러리
import numpy as np
import pandas as pd

# Download NLTK's punkt tokenizer data
# (resource used for the string/text processing below)
nltk.download('punkt_tab')

# Example sentences used as the corpus
corpus = [
    "LLM has transformed natural language processing.",
    "LLM such as GPT-4 and ChatGPT demonstrate remarkable understanding and creative capabilities.",
    "Prompt engineering plays a key role in enhancing LLM performance.",
    "Bag-of-words models, despite their simplicity, offer a baseline for feature extraction in text analysis."
]

## 1. Bag-of-Words 구현
# Text preprocessing: lowercase, strip punctuation, tokenize
def preprocess(text):
    """Return the word tokens of *text* after lowercasing and punctuation removal.

    Punctuation characters are deleted outright (not replaced by spaces), so
    hyphenated words are fused, e.g. "GPT-4" becomes the single token "gpt4".
    """
    strip_punctuation = str.maketrans("", "", string.punctuation)
    cleaned = text.lower().translate(strip_punctuation)
    return nltk.word_tokenize(cleaned)

# Run every corpus sentence through preprocess() to get one token list per sentence
processed_corpus = list(map(preprocess, corpus))
print("Tokenized Corpus:")
for tokens in processed_corpus:
    print(tokens)


Tokenized Corpus:
['llm', 'has', 'transformed', 'natural', 'language', 'processing']
['llm', 'such', 'as', 'gpt4', 'and', 'chatgpt', 'demonstrate', 'remarkable', 'understanding', 'and', 'creative', 'capabilities']
['prompt', 'engineering', 'plays', 'a', 'key', 'role', 'in', 'enhancing', 'llm', 'performance']
['bagofwords', 'models', 'despite', 'their', 'simplicity', 'offer', 'a', 'baseline', 'for', 'feature', 'extraction', 'in', 'text', 'analysis']


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:

# Collect the distinct tokens of the whole corpus into a sorted vocabulary
vocabulary = sorted(set().union(*processed_corpus))
print("\nVocabulary:")
print(vocabulary)

# Build the Bag-of-Words count vector for one tokenized sentence
def create_bow_vector(tokens, vocabulary):
    """Count how often each vocabulary word occurs in *tokens*.

    Args:
        tokens: Iterable of hashable tokens (strings in this notebook).
        vocabulary: Sequence of words; entry ``i`` of the result holds the
            count for ``vocabulary[i]``.

    Returns:
        A list of ints with ``len(vocabulary)`` entries. Tokens that are not
        in the vocabulary are ignored.
    """
    # Resolve word -> position once up front instead of scanning the
    # vocabulary list twice per token (`in` followed by `.index`).
    # setdefault keeps the FIRST position of a repeated word, matching
    # what list.index() would return.
    position_of = {}
    for position, word in enumerate(vocabulary):
        position_of.setdefault(word, position)

    vector = [0] * len(vocabulary)
    for token in tokens:
        position = position_of.get(token)
        if position is not None:
            vector[position] += 1
    return vector

# Vectorize every sentence and collect the rows in a DataFrame (one column per vocabulary word)
bow_matrix = [
    create_bow_vector(sentence_tokens, vocabulary)
    for sentence_tokens in processed_corpus
]
bow_df_manual = pd.DataFrame(data=bow_matrix, columns=vocabulary)
print("\nBag-of-Words Matrix:")
# What is printed below is the per-word total over all sentences (column sums),
# not the sentence-by-word matrix itself.
bow_df_manual_ = bow_df_manual.sum(axis=0)
print(bow_df_manual_)



Vocabulary:
['a', 'analysis', 'and', 'as', 'bagofwords', 'baseline', 'capabilities', 'chatgpt', 'creative', 'demonstrate', 'despite', 'engineering', 'enhancing', 'extraction', 'feature', 'for', 'gpt4', 'has', 'in', 'key', 'language', 'llm', 'models', 'natural', 'offer', 'performance', 'plays', 'processing', 'prompt', 'remarkable', 'role', 'simplicity', 'such', 'text', 'their', 'transformed', 'understanding']

Bag-of-Words Matrix:
a                2
analysis         1
and              2
as               1
bagofwords       1
baseline         1
capabilities     1
chatgpt          1
creative         1
demonstrate      1
despite          1
engineering      1
enhancing        1
extraction       1
feature          1
for              1
gpt4             1
has              1
in               2
key              1
language         1
llm              3
models           1
natural          1
offer            1
performance      1
plays            1
processing       1
prompt           1
remarkable    

In [None]:
# Divide each word total by the grand total so the values sum to 1
bow_df_manual_ = bow_df_manual_.div(bow_df_manual_.sum())
print(bow_df_manual_)

a                0.047619
analysis         0.023810
and              0.047619
as               0.023810
bagofwords       0.023810
baseline         0.023810
capabilities     0.023810
chatgpt          0.023810
creative         0.023810
demonstrate      0.023810
despite          0.023810
engineering      0.023810
enhancing        0.023810
extraction       0.023810
feature          0.023810
for              0.023810
gpt4             0.023810
has              0.023810
in               0.047619
key              0.023810
language         0.023810
llm              0.071429
models           0.023810
natural          0.023810
offer            0.023810
performance      0.023810
plays            0.023810
processing       0.023810
prompt           0.023810
remarkable       0.023810
role             0.023810
simplicity       0.023810
such             0.023810
text             0.023810
their            0.023810
transformed      0.023810
understanding    0.023810
dtype: float64


In [None]:
# Draw 10 words, each one sampled according to its relative frequency
for i in range(10):
    print(
        np.random.choice(
            a=bow_df_manual_.index,
            p=bow_df_manual_.to_numpy(),
        )
    )


key
their
and
chatgpt
offer
transformed
simplicity
as
engineering
has



코드 설명

전처리 및 토큰화:
각 문장을 소문자로 변환하고 구두점을 제거한 후 NLTK의 word_tokenize 함수를 사용하여 토큰화합니다.

수동 Bag-of-Words 구현:
전체 코퍼스에서 고유 단어 목록을 생성한 다음, 각 문장에 대해 해당 단어가 몇 번 등장하는지를 세어 벡터로 만듭니다.

그다음, 전체 코퍼스에서 단어별 등장 횟수를 합산한 뒤 전체 단어 수로 나누어 각 단어의 출현 확률(상대 빈도)로 변환했습니다.


# N-gram 예제 코드


In [None]:
# n-gram generation (bigrams, n=2, unless another n is given)
def generate_ngrams(tokens, n=2):
    """Return the space-joined n-grams of a token list, in order of appearance.

    Args:
        tokens: List of string tokens.
        n: Window size; must be at least 1. Defaults to 2 (bigrams).

    Returns:
        A list of strings, each made of ``n`` consecutive tokens joined by a
        single space. Empty when ``tokens`` has fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # A non-positive window would otherwise silently produce
        # empty-string "n-grams" instead of failing.
        raise ValueError(f"n must be >= 1, got {n}")
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

n = 2  # use bigrams
# One n-gram list per tokenized sentence
corpus_ngrams = [
    generate_ngrams(sentence_tokens, n)
    for sentence_tokens in processed_corpus
]
print("\nGenerated n-grams (Bigrams):")
for ngrams in corpus_ngrams:
    print(ngrams)


Generated n-grams (Bigrams):
['llm has', 'has transformed', 'transformed natural', 'natural language', 'language processing']
['llm such', 'such as', 'as gpt4', 'gpt4 and', 'and chatgpt', 'chatgpt demonstrate', 'demonstrate remarkable', 'remarkable understanding', 'understanding and', 'and creative', 'creative capabilities']
['prompt engineering', 'engineering plays', 'plays a', 'a key', 'key role', 'role in', 'in enhancing', 'enhancing llm', 'llm performance']
['bagofwords models', 'models despite', 'despite their', 'their simplicity', 'simplicity offer', 'offer a', 'a baseline', 'baseline for', 'for feature', 'feature extraction', 'extraction in', 'in text', 'text analysis']


In [None]:
# Collect the distinct n-grams of the whole corpus into a sorted vocabulary
vocabulary_ngrams = sorted(set().union(*corpus_ngrams))
print("\nVocabulary (Bigrams):")
print(vocabulary_ngrams)


Vocabulary (Bigrams):
['a baseline', 'a key', 'and chatgpt', 'and creative', 'as gpt4', 'bagofwords models', 'baseline for', 'chatgpt demonstrate', 'creative capabilities', 'demonstrate remarkable', 'despite their', 'engineering plays', 'enhancing llm', 'extraction in', 'feature extraction', 'for feature', 'gpt4 and', 'has transformed', 'in enhancing', 'in text', 'key role', 'language processing', 'llm has', 'llm performance', 'llm such', 'models despite', 'natural language', 'offer a', 'plays a', 'prompt engineering', 'remarkable understanding', 'role in', 'simplicity offer', 'such as', 'text analysis', 'their simplicity', 'transformed natural', 'understanding and']


In [None]:
# Build the bag-of-n-grams count vector for one sentence (manual implementation)
def create_ngram_vector(ngrams, vocabulary):
    """Count how often each vocabulary n-gram occurs in *ngrams*.

    Args:
        ngrams: Iterable of hashable n-grams (space-joined strings in this
            notebook).
        vocabulary: Sequence of n-grams; entry ``i`` of the result holds the
            count for ``vocabulary[i]``.

    Returns:
        A list of ints with ``len(vocabulary)`` entries. N-grams that are not
        in the vocabulary are ignored.
    """
    # Resolve n-gram -> position once up front instead of scanning the
    # vocabulary list twice per n-gram (`in` followed by `.index`).
    # setdefault keeps the FIRST position of a repeated entry, matching
    # what list.index() would return.
    position_of = {}
    for position, entry in enumerate(vocabulary):
        position_of.setdefault(entry, position)

    vector = [0] * len(vocabulary)
    for gram in ngrams:
        position = position_of.get(gram)
        if position is not None:
            vector[position] += 1
    return vector

# Vectorize the n-grams of every sentence and collect the rows in a DataFrame
ngram_matrix = [
    create_ngram_vector(sentence_ngrams, vocabulary_ngrams)
    for sentence_ngrams in corpus_ngrams
]
ngram_df_manual = pd.DataFrame(data=ngram_matrix, columns=vocabulary_ngrams)
print("\nN-gram Matrix (Manual Implementation):")
print(ngram_df_manual)


N-gram Matrix (Manual Implementation):
   a baseline  a key  and chatgpt  and creative  as gpt4  bagofwords models  \
0           0      0            0             0        0                  0   
1           0      0            1             1        1                  0   
2           0      1            0             0        0                  0   
3           1      0            0             0        0                  1   

   baseline for  chatgpt demonstrate  creative capabilities  \
0             0                    0                      0   
1             0                    1                      1   
2             0                    0                      0   
3             1                    0                      0   

   demonstrate remarkable  ...  plays a  prompt engineering  \
0                       0  ...        0                   0   
1                       1  ...        0                   0   
2                       0  ...        1                   1

In [None]:
# Corpus-wide n-gram counts (column sums of the sentence-by-n-gram matrix)
ngram_total = ngram_df_manual.sum(axis=0)
print("\nTotal N-gram Frequencies:")
print(ngram_total)

# Relative frequency of each n-gram: its count divided by the number of all n-grams
total_ngrams = ngram_total.sum()
ngram_probabilities = ngram_total.div(total_ngrams)
print("\nN-gram Probabilities:")
print(ngram_probabilities)


Total N-gram Frequencies:
a baseline                  1
a key                       1
and chatgpt                 1
and creative                1
as gpt4                     1
bagofwords models           1
baseline for                1
chatgpt demonstrate         1
creative capabilities       1
demonstrate remarkable      1
despite their               1
engineering plays           1
enhancing llm               1
extraction in               1
feature extraction          1
for feature                 1
gpt4 and                    1
has transformed             1
in enhancing                1
in text                     1
key role                    1
language processing         1
llm has                     1
llm performance             1
llm such                    1
models despite              1
natural language            1
offer a                     1
plays a                     1
prompt engineering          1
remarkable understanding    1
role in                     1
simplicity of

In [None]:
# Randomly draw n-grams weighted by their probability (10 draws here)
print("\nSampled n-grams based on probability:")
for i in range(10):
    print(
        np.random.choice(
            a=ngram_probabilities.index,
            p=ngram_probabilities.to_numpy(),
        )
    )


Sampled n-grams based on probability:
bagofwords models
their simplicity
llm has
bagofwords models
a key
gpt4 and
prompt engineering
and creative
prompt engineering
engineering plays


n을 더 크게 한다면? (예: 3-gram)

In [None]:
n = 3  # use 3-grams (trigrams)
# One n-gram list per tokenized sentence
corpus_ngrams = [generate_ngrams(tokens, n) for tokens in processed_corpus]
# Derive the label from n; it previously said "Bigrams" although n is 3.
print(f"\nGenerated n-grams ({n}-grams):")
for ngrams in corpus_ngrams:
    print(ngrams)


Generated n-grams (Bigrams):
['llm has transformed', 'has transformed natural', 'transformed natural language', 'natural language processing']
['llm such as', 'such as gpt4', 'as gpt4 and', 'gpt4 and chatgpt', 'and chatgpt demonstrate', 'chatgpt demonstrate remarkable', 'demonstrate remarkable understanding', 'remarkable understanding and', 'understanding and creative', 'and creative capabilities']
['prompt engineering plays', 'engineering plays a', 'plays a key', 'a key role', 'key role in', 'role in enhancing', 'in enhancing llm', 'enhancing llm performance']
['bagofwords models despite', 'models despite their', 'despite their simplicity', 'their simplicity offer', 'simplicity offer a', 'offer a baseline', 'a baseline for', 'baseline for feature', 'for feature extraction', 'feature extraction in', 'extraction in text', 'in text analysis']


In [None]:
# Collect the distinct n-grams of the whole corpus into a sorted vocabulary
vocabulary_ngrams = sorted({ngram for ngrams in corpus_ngrams for ngram in ngrams})
# Derive the label from n; it previously said "Bigrams" even for the 3-gram run.
print(f"\nVocabulary ({n}-grams):")
print(vocabulary_ngrams)

# Vectorize the n-grams of every sentence and collect the rows in a DataFrame
ngram_matrix = [create_ngram_vector(ngrams, vocabulary_ngrams) for ngrams in corpus_ngrams]
ngram_df_manual = pd.DataFrame(ngram_matrix, columns=vocabulary_ngrams)
print("\nN-gram Matrix (Manual Implementation):")
print(ngram_df_manual)

# Corpus-wide n-gram counts (column sums) and their relative frequencies
ngram_total = ngram_df_manual.sum()
print("\nTotal N-gram Frequencies:")
print(ngram_total)

total_ngrams = ngram_total.sum()
ngram_probabilities = ngram_total / total_ngrams
print("\nN-gram Probabilities:")
print(ngram_probabilities)


Vocabulary (Bigrams):
['a baseline for', 'a key role', 'and chatgpt demonstrate', 'and creative capabilities', 'as gpt4 and', 'bagofwords models despite', 'baseline for feature', 'chatgpt demonstrate remarkable', 'demonstrate remarkable understanding', 'despite their simplicity', 'engineering plays a', 'enhancing llm performance', 'extraction in text', 'feature extraction in', 'for feature extraction', 'gpt4 and chatgpt', 'has transformed natural', 'in enhancing llm', 'in text analysis', 'key role in', 'llm has transformed', 'llm such as', 'models despite their', 'natural language processing', 'offer a baseline', 'plays a key', 'prompt engineering plays', 'remarkable understanding and', 'role in enhancing', 'simplicity offer a', 'such as gpt4', 'their simplicity offer', 'transformed natural language', 'understanding and creative']

N-gram Matrix (Manual Implementation):
   a baseline for  a key role  and chatgpt demonstrate  \
0               0           0                        0   


In [None]:
# Randomly draw n-grams weighted by their probability (10 draws here)
print("\nSampled n-grams based on probability:")
for i in range(10):
    print(
        np.random.choice(
            a=ngram_probabilities.index,
            p=ngram_probabilities.to_numpy(),
        )
    )


Sampled n-grams based on probability:
such as gpt4
llm has transformed
key role in
feature extraction in
and chatgpt demonstrate
demonstrate remarkable understanding
transformed natural language
enhancing llm performance
offer a baseline
despite their simplicity
