<a href="https://colab.research.google.com/github/jhkr1/Practical-Al-Natural-Language-Processing/blob/main/Word_Representation_TF_IDF%2C_nGram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 단어의 표현 (Word Representation)

기계는 문자를 그대로 인식할 수 없기 때문에, 텍스트를 숫자(벡터)로 변환해야 한다.

# 1. TF-IDF를 활용한 단어 벡터

## 1-1) 직접 구현하기

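$$\mathrm{tf}(t, d) = \frac{f_{t,d}}{|d|}$$

$$\mathrm{idf}(t, D) = \log \frac{|D|}{|\{\, d \in D : t \in d \,\}|}$$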


In [1]:
# Two toy documents used throughout the notebook; they share most words and
# differ only in cat/face/hate versus dog/bed/love.
d1, d2 = (
    "The cat sat on my face I hate a cat",
    "The dog sat on my bed I love a dog",
)

In [15]:
import numpy as np

def f(t, d):
    """Return the raw number of occurrences of term ``t`` in document ``d``.

    The count is delegated to ``d.count``, so ``d`` can be any object that
    provides that method (a list of tokens in this notebook).
    """
    occurrences = d.count(t)
    return occurrences

def tf(t, d):
    """Compute the term frequency (tf) of term ``t`` in document ``d``.

    Args:
        t: The term to look up.
        d: The document; its length is the normaliser (a token list in this
            notebook).

    Returns:
        ``f(t, d) / len(d)``, or ``0.0`` when the document is empty.
    """
    doc_len = len(d)
    if doc_len == 0:
        # An empty document contains no terms; without this guard the
        # division below raises ZeroDivisionError.
        return 0.0
    return f(t, d) / doc_len

def idf(t, D):
    """Compute the inverse document frequency (idf) of term ``t`` over corpus ``D``.

    Args:
        t: The term to look up.
        D: A collection of documents; a document counts as containing the
            term when ``t in d`` is true.

    Returns:
        ``log(len(D) / n)`` where ``n`` is the number of documents containing
        ``t``. Returns 0.0 when no document contains the term (including an
        empty corpus) instead of dividing by zero.
    """
    # Count matching documents without building a throwaway list.
    doc_freq = sum(1 for d in D if t in d)
    if doc_freq == 0:
        # The term is absent from every document, so its tf is zero as well
        # and a zero weight keeps tf * idf well defined.
        return np.float64(0.0)
    return np.log(len(D) / doc_freq)

def tfidf(t, d, D):
    """Compute the tf-idf weight of term ``t`` for document ``d`` in corpus ``D``.

    The weight is the product of ``tf(t, d)`` and ``idf(t, D)``.
    """
    term_frequency = tf(t, d)
    inverse_doc_frequency = idf(t, D)
    return term_frequency * inverse_doc_frequency

def tokenizer(d):
    """Tokenise document ``d`` by splitting it on whitespace.

    Returns the list produced by ``d.split()``.
    """
    tokens = d.split()
    return tokens


def tfidfScorer(D):
    """Compute tf-idf scores for every token of every document in ``D``.

    Each document is tokenised with ``tokenizer``; the tokenised documents
    together form the corpus passed to ``tfidf``.

    Returns:
        One list per document, holding a ``(token, score)`` tuple for each
        token in document order (a repeated token yields a repeated entry).
    """
    tokenized_docs = [tokenizer(raw_doc) for raw_doc in D]
    return [
        [(token, tfidf(token, tokens, tokenized_docs)) for token in tokens]
        for tokens in tokenized_docs
    ]

corpus = [d1, d2]

# Print a numbered banner followed by the (token, tf-idf) pairs of each document.
for i, doc in enumerate(tfidfScorer(corpus)):
    print('====== document[%d] ======' % i, doc, sep='\n')

[('The', np.float64(0.0)), ('cat', np.float64(0.13862943611198905)), ('sat', np.float64(0.0)), ('on', np.float64(0.0)), ('my', np.float64(0.0)), ('face', np.float64(0.06931471805599453)), ('I', np.float64(0.0)), ('hate', np.float64(0.06931471805599453)), ('a', np.float64(0.0)), ('cat', np.float64(0.13862943611198905))]
[('The', np.float64(0.0)), ('dog', np.float64(0.13862943611198905)), ('sat', np.float64(0.0)), ('on', np.float64(0.0)), ('my', np.float64(0.0)), ('bed', np.float64(0.06931471805599453)), ('I', np.float64(0.0)), ('love', np.float64(0.06931471805599453)), ('a', np.float64(0.0)), ('dog', np.float64(0.13862943611198905))]


## 1-2) sklearn 활용

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

d1 = "The cat sat on my face I hate a cat"
d2 = "The dog sat on my bed I love a dog"
corpus = [d1, d2]

# Fit the vectorizer on the corpus and build the document-term count matrix.
count_vect = CountVectorizer()
countv = count_vect.fit_transform(corpus)

# Show the dense count matrix, then the token -> column-index mapping.
# In the recorded output the vocabulary is lower-cased and has no entries
# for "I" and "a".
print(countv.toarray(), count_vect.vocabulary_, sep="\n")

[[0 2 0 1 1 0 1 1 1 1]
 [1 0 2 0 0 1 1 1 1 1]]
{'the': 9, 'cat': 1, 'sat': 8, 'on': 7, 'my': 6, 'face': 3, 'hate': 4, 'dog': 2, 'bed': 0, 'love': 5}


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

d1 = "The cat sat on my face I hate a cat"
d2 = "The dog sat on my bed I love a dog"
corpus = [d1, d2]

# Fit on the corpus and transform the same corpus in a single call;
# `tfidf_vect` remains the fitted vectorizer.
tfidf_vect = TfidfVectorizer()
tfidfv = tfidf_vect.fit_transform(corpus)

# Show the dense TF-IDF matrix, then the token -> column-index mapping.
print(tfidfv.toarray(), tfidf_vect.vocabulary_, sep="\n")

[[0.         0.70600557 0.         0.35300279 0.35300279 0.
  0.25116439 0.25116439 0.25116439 0.25116439]
 [0.35300279 0.         0.70600557 0.         0.         0.35300279
  0.25116439 0.25116439 0.25116439 0.25116439]]
{'the': 9, 'cat': 1, 'sat': 8, 'on': 7, 'my': 6, 'face': 3, 'hate': 4, 'dog': 2, 'bed': 0, 'love': 5}


## 1-3) gensim 활용

In [22]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [4]:
from gensim.models import TfidfModel
from gensim import corpora

d1 = "The cat sat on my face I hate a cat"
d2 = "The dog sat on my bed I love a dog"
corpus = [d1, d2]

# Whitespace-tokenise every document, then fit the gensim Dictionary on the tokens.
doc_ls = list(map(str.split, corpus))
id2word = corpora.Dictionary(doc_ls)

# NOTE: `corpus` is rebound here from the raw strings to their bag-of-words
# form; the following cells display this BoW version.
corpus = list(map(id2word.doc2bow, doc_ls))

# NOTE: this rebinds `tfidf`, which earlier in the notebook is the name of the
# hand-written tfidf() function.
tfidf = TfidfModel(corpus)
vector = tfidf[corpus[0]]  # TF-IDF representation of the first document

In [6]:
# Display the bag-of-words corpus built by doc2bow: one list of (id, count) pairs per document.
corpus

[[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(0, 1), (1, 1), (2, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 1)]]

In [7]:
# Display the TF-IDF model's output for the first document as (id, weight) pairs.
vector

[(3, 0.8164965809277261), (4, 0.4082482904638631), (5, 0.4082482904638631)]