In [1]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0


In [2]:
from collections import Counter

def tokenize_text(text):
    # 텍스트를 공백 기준으로 토큰화
    tokens = text.split()
    # 각 토큰의 빈도수 계산
    token_counts = Counter(tokens)
    return token_counts

# 예제 문장
example_text = "안녕하세요. 쿠글 9기 이혜승, 이태형입니다. 파이썬으로 텍스트를 토큰화해보세요. 좋은 하루 보내세요!"
print(tokenize_text(example_text))

Counter({'안녕하세요.': 1, '쿠글': 1, '9기': 1, '이혜승,': 1, '이태형입니다.': 1, '파이썬으로': 1, '텍스트를': 1, '토큰화해보세요.': 1, '좋은': 1, '하루': 1, '보내세요!': 1})


In [3]:
def create_vocabulary(text):
    # 문장을 공백을 기준으로 분리하여 단어 집합 생성
    words = text.split()
    # 중복 제거를 위해 집합으로 변환 후 다시 리스트로 변환
    vocabulary = list(set(words))
    # 단어집합을 알파벳순으로 정렬
    vocabulary.sort()
    return vocabulary

# 예제 문장
example_text = "안녕하세요. 쿠글 9기 이혜승, 이태형입니다. 파이썬으로 텍스트를 토큰화해보세요. 좋은 하루 보내세요!"

# 단어집합 생성
vocabulary = create_vocabulary(example_text)

# 단어집합 출력
print("단어집합:", vocabulary)

단어집합: ['9기', '보내세요!', '안녕하세요.', '이태형입니다.', '이혜승,', '좋은', '쿠글', '텍스트를', '토큰화해보세요.', '파이썬으로', '하루']


In [4]:
import numpy as np

def onehot_encoding(word, vocabulary):
    # 단어의 인덱스 찾기
    index = vocabulary.index(word)
    # 원핫 인코딩 수행
    onehot = np.zeros(len(vocabulary))
    onehot[index] = 1
    return onehot

# 단어집합과 예제 단어
vocabulary = ['안녕', '하세요', '쿠글', '9기', '이혜승', '이태형입니다', '파이썬', '텍스트', '토큰화해보세요', '좋은', '하루', '보내세요']
example_word = '쿠글'

# 원핫 인코딩 수행
onehot_vector = onehot_encoding(example_word, vocabulary)
print(f'"{example_word}"의 원핫 인코딩 벡터:', onehot_vector)

"쿠글"의 원핫 인코딩 벡터: [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [5]:
from konlpy.tag import Okt
from collections import Counter

def bow_representation(text):
    # 형태소 분석기 초기화
    okt = Okt()
    # 텍스트를 형태소 단위로 분리
    tokens = okt.morphs(text)
    # 빈도수 계산
    vocab = Counter(tokens)
    # BoW 표현 생성
    bow = [vocab[token] for token in tokens]
    return vocab, bow

# 예제 문장
example_text = "안녕하세요. 쿠글 9기 이혜승, 이태형입니다. 파이썬으로 텍스트를 토큰화해보세요. 좋은 하루 보내세요!"

# 단어집합과 BoW 표현 생성
vocab, bow = bow_representation(example_text)

# 결과 출력
print("단어집합 (vocabulary):")
for word, frequency in vocab.items():
    print(word + ":", frequency)

print("\nBag of Words (BoW) 벡터:")
print(bow)

단어집합 (vocabulary):
안녕하세요: 1
.: 3
쿠글: 1
9: 1
기: 1
이혜승: 1
,: 1
이태형: 1
입니다: 1
파이썬: 1
으로: 1
텍스트: 1
를: 1
토큰: 1
화: 1
해보세요: 1
좋은: 1
하루: 1
보내세요: 1
!: 1

Bag of Words (BoW) 벡터:
[1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1]


In [7]:
from konlpy.tag import Okt

def analyze_morphology(text):
    okt = Okt()
    # 형태소 분석과 품사 태깅
    morphs = okt.pos(text)
    return morphs

# (과제) 문장 넣기
example_text = " 저는 임재성입니다 "
print(analyze_morphology(example_text))

[('저', 'Noun'), ('는', 'Josa'), ('임재성', 'Noun'), ('입니다', 'Adjective')]


In [8]:
!pip install nltk



In [10]:
from nltk.util import ngrams
from collections import defaultdict, Counter

def generate_ngrams(text, N=2):
    tokens = text.split()
    n_grams = list(ngrams(tokens, N))
    n_gram_freq = Counter(n_grams)
    return n_gram_freq

# (과제) 문장 넣기
example_text = "아이패드는 깨끗해요"
print(generate_ngrams(example_text, 2))

Counter({('아이패드는', '깨끗해요'): 1})


In [11]:
!pip install glove-python3  #glove 라이브러리
from glove import Corpus, Glove
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

Collecting glove-python3
  Downloading glove_python3-0.1.0.tar.gz (326 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.0/327.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: glove-python3
  Building wheel for glove-python3 (setup.py) ... [?25l[?25hdone
  Created wheel for glove-python3: filename=glove_python3-0.1.0-cp310-cp310-linux_x86_64.whl size=1065514 sha256=34fc2eb76a59fe51e91c9d8b2d5778b520297151a439426aadcae2509b571055
  Stored in directory: /root/.cache/pip/wheels/fe/2f/79/34314d44a0907e90e323c8c182ec23f126eb460829e02d98cf
Successfully built glove-python3
Installing collected packages: glove-python3
Successfully installed glove-python3-0.1.0


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [12]:
# 샘플 텍스트 데이터
texts = [
    "GloVe is an unsupervised learning algorithm for obtaining vector representations for words.",
    "Training is performed on aggregated global word-word co-occurrence statistics from a corpus.",
    "The result is a set of word vectors that are interesting linear substructures of the word vector space.",
]

# 텍스트를 토큰화합니다.
tokenized_texts = [word_tokenize(text.lower()) for text in texts]

# Corpus 객체를 생성합니다.
corpus = Corpus()

# Corpus를 학습 데이터로 사용하여 GloVe 모델을 훈련합니다.
corpus.fit(tokenized_texts, window=5)

# GloVe 모델을 생성합니다.
glove = Glove(no_components=100, learning_rate=0.05)

# 사전 훈련된 Corpus를 사용하여 모델을 학습시킵니다.
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [13]:
#GloVe로 워드 임베딩한 벡터
word = 'glove' #원하는 단어를 입력 ex) 'glove' -> '  '
vector = glove.word_vectors[glove.dictionary[word]]
print(f"Vector for '{word}': {vector}")

Vector for 'glove': [ 3.58743959e-03 -1.78938141e-03  3.04164614e-03  4.35052457e-03
  2.67252265e-03  2.64462009e-03 -1.56603322e-04 -1.93544355e-03
 -1.90219079e-03  2.87773638e-03  7.42132146e-04 -3.56197544e-03
 -4.22602035e-03 -9.14165758e-04  2.32300428e-03  3.57994550e-03
 -3.68763953e-03 -1.05680249e-03  6.02323858e-04  4.60210989e-03
 -4.07979860e-04 -4.70781310e-03 -4.40407507e-03 -3.80523769e-03
  4.49402513e-03 -1.24771736e-03 -3.91190953e-03 -2.38792042e-03
 -3.82413722e-03 -6.97239569e-04  3.66907361e-03 -1.80342845e-03
 -1.73493776e-03 -3.87196687e-04 -2.75455976e-03 -2.10016652e-03
  3.51355353e-03 -2.37529464e-03 -2.83808506e-03  4.06870578e-03
  1.78667069e-03  3.11565557e-03 -1.25066462e-03  4.56073046e-03
 -1.93958108e-03 -3.02516547e-03 -8.30621832e-04  2.14949307e-05
  8.37455432e-04 -3.75624749e-03  2.17206096e-03 -3.92221062e-03
  4.84593093e-03 -4.31556697e-03 -3.27412874e-03 -2.78361121e-03
 -3.03056536e-03  4.95070542e-03  1.16704802e-03 -4.20633105e-03
 -3.1

In [14]:
print(glove.most_similar('glove'))

[('obtaining', 0.20804855898998215), ('word', 0.15160989120886495), ('word-word', 0.1350117959766783), ('representations', 0.12255771450558453)]


In [15]:

print(glove.most_similar('word'))

[('vector', 0.242800761770688), ('the', 0.17749865589145597), ('aggregated', 0.16651246220844743), ('global', 0.15753650925260645)]


In [16]:

!pip install transformers



In [17]:
from transformers import pipeline

def sentiment_analysis(text):
    # Hugging Face 파이프라인 사용
    classifier = pipeline('sentiment-analysis')
    results = classifier(text)
    return results

# (과제) 문장 넣기
example_review = " 아침에 사과를 먹었다 "
print(sentiment_analysis(example_review))

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[{'label': 'POSITIVE', 'score': 0.8604418635368347}]
