## Library 설치

In [2]:
!pip install konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0
  Downloading JPype1-1.4.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 KB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0


# TF-IDF 예시

In [4]:
from konlpy.tag import Okt
import pandas as pd

# 입력받은 문장 리스트
sentences = ['배가 부르다.',
             '배의 가격이 비싸다.',
             '진짜 사과가 진짜 좋다.',
             '아침엔 사과가 좋다.']

# 문장 리스트를 전처리하여 토큰화
tokenizer = Okt()
sentences = [tokenizer.morphs(sentence) for sentence in sentences]

sentences = [' '.join(sentence) for sentence in sentences]
sentences

['배 가 부르다 .', '배 의 가격 이 비싸다 .', '진짜 사과 가 진짜 좋다 .', '아침 엔 사과 가 좋다 .']

## TF 예시

In [7]:
from sklearn.feature_extraction.text import CountVectorizer


# CountVectorizer 객체 생성
vectorizer = CountVectorizer()

# 문장에서 단어의 출현 빈도 계산
word_count_matrix = vectorizer.fit_transform(sentences)

# 각 단어의 이름 저장
feature_names = vectorizer.get_feature_names_out()

# 출현 빈도를 데이터프레임으로 변환
word_count_df = pd.DataFrame(word_count_matrix.toarray(), columns=feature_names)
word_count_df

Unnamed: 0,가격,부르다,비싸다,사과,아침,좋다,진짜
0,0,1,0,0,0,0,0
1,1,0,1,0,0,0,0
2,0,0,0,1,0,1,2
3,0,0,0,1,1,1,0


## DF 예시

In [8]:
# CountVectorizer 객체 생성
vectorizer = CountVectorizer()

# 문장에서 단어가 출현한 문장의 수 계산
vectorizer.fit(sentences)
feature_names = vectorizer.get_feature_names_out()

# 각 단어가 출현한 문장의 수 저장
document_frequency = vectorizer.transform(sentences).sum(axis=0)
df_df = pd.DataFrame(document_frequency, columns=feature_names)
df_df

Unnamed: 0,가격,부르다,비싸다,사과,아침,좋다,진짜
0,1,1,1,2,1,2,2


In [9]:
import numpy as np

# IDF 계산식
print(np.log(4/(1+df_df)))

         가격       부르다       비싸다        사과        아침        좋다        진짜
0  0.693147  0.693147  0.693147  0.287682  0.693147  0.287682  0.287682


## TF-IDF 예시

In [11]:

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(sentences)


tf_idf_df = pd.DataFrame(tf_idf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
tf_idf_df

Unnamed: 0,가격,부르다,비싸다,사과,아침,좋다,진짜
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.707107,0.0,0.707107,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.344315,0.0,0.344315,0.873439
3,0.0,0.0,0.0,0.526405,0.667679,0.526405,0.0
