In [19]:
# 워드 임베딩

In [20]:
## TF

# Toy two-line document used for the term-frequency (TF) walkthrough.
# NOTE(review): "lines" in the second line looks like a typo for "likes";
# it is part of the data, so it is reproduced unchanged here — confirm.
text = (
    "John likes to watch movies. Mary likes movies too. \n"
    "Mary also lines to watch football games.\n"
)

In [21]:
# Strip periods and commas in a single pass, then tokenize on whitespace.
words = text.translate(str.maketrans('', '', '.,')).split()
print(words)


['John', 'likes', 'to', 'watch', 'movies', 'Mary', 'likes', 'movies', 'too', 'Mary', 'also', 'lines', 'to', 'watch', 'football', 'games']


In [22]:
import numpy as np
# Count how often each distinct token occurs. With return_counts=True,
# np.unique returns a pair of arrays: the unique tokens and, aligned with
# them, the number of occurrences of each.
word_counts = np.unique(words, return_counts=True)
print(word_counts)

(array(['John', 'Mary', 'also', 'football', 'games', 'likes', 'lines',
       'movies', 'to', 'too', 'watch'], dtype='<U8'), array([1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 2]))


In [23]:
# Build a token -> frequency lookup from the (tokens, counts) pair.
# The arrays yield NumPy scalars (np.str_ / np.int64); convert them to native
# str / int so the dict prints as plain {'John': 1, ...} regardless of the
# NumPy version and can be serialized (e.g. with json) without extra handling.
word_to_cnt = {str(token): int(freq) for token, freq in zip(*word_counts)}
print(word_to_cnt)

{'John': 1, 'Mary': 2, 'also': 1, 'football': 1, 'games': 1, 'likes': 2, 'lines': 1, 'movies': 2, 'to': 2, 'too': 1, 'watch': 2}


In [24]:
# Frequency of the token "John"
print(word_to_cnt['John'])

1


In [25]:
## TDM

In [26]:
# Toy corpus: a list with one string per document.
# NOTE(review): "lines" in the second document looks like a typo for "likes" — confirm.
corpus = [
    "John likes to watch movies. Mary likes movies too.",
    "Mary also lines to watch football games."
]

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
# Two-step alternative, kept for reference:
# vectorizer.fit(corpus) # fit: builds the transformer and its vocabulary
# X = vectorizer.transform(corpus).toarray() # transform
X = vectorizer.fit_transform(corpus).toarray()  # fit and transform in a single call

print(X)

[[0 0 0 1 2 0 1 2 1 1 1]
 [1 1 1 0 0 1 1 0 1 0 1]]


In [28]:
# vocabulary_ is the fitted mapping from each term to its feature (column) index.
tf_dic = vectorizer.vocabulary_
print(tf_dic)

{'john': 3, 'likes': 4, 'to': 8, 'watch': 10, 'movies': 7, 'mary': 6, 'too': 9, 'also': 0, 'lines': 5, 'football': 1, 'games': 2}


In [29]:
import pandas as pd

# Re-order the vocabulary by column index so the labels line up with the
# columns of X, then show the counts as a labelled table.
tf_dic_sorted = {term: tf_dic[term] for term in sorted(tf_dic, key=tf_dic.get)}
print(tf_dic_sorted)
print(tf_dic_sorted.keys())

tdm = pd.DataFrame(X, columns=tf_dic_sorted.keys())
print(tdm)

{'also': 0, 'football': 1, 'games': 2, 'john': 3, 'likes': 4, 'lines': 5, 'mary': 6, 'movies': 7, 'to': 8, 'too': 9, 'watch': 10}
dict_keys(['also', 'football', 'games', 'john', 'likes', 'lines', 'mary', 'movies', 'to', 'too', 'watch'])
   also  football  games  john  likes  lines  mary  movies  to  too  watch
0     0         0      0     1      2      0     1       2   1    1      1
1     1         1      1     0      0      1     1       0   1    0      1


In [30]:
## TF-IDF

In [31]:
# Toy corpus for the TF-IDF example: a list with one string per document
# (the same two documents as in the TDM section).
# NOTE(review): "lines" in the second document looks like a typo for "likes" — confirm.
corpus = [
    "John likes to watch movies. Mary likes movies too.",
    "Mary also lines to watch football games."
]

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): this rebinds `vectorizer` and `X`, replacing the CountVectorizer
# objects from the TDM section; cells from that section re-run after this one
# would pick up the TF-IDF versions.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus).toarray()  # fit and transform in one call, as a dense array
print(X)

[[0.         0.         0.         0.29464404 0.58928809 0.
  0.20964166 0.58928809 0.20964166 0.29464404 0.20964166]
 [0.42567716 0.42567716 0.42567716 0.         0.         0.42567716
  0.30287281 0.         0.30287281 0.         0.30287281]]


In [33]:
# Term -> feature (column) index mapping learned by the TF-IDF vectorizer.
tfidf_dic = vectorizer.vocabulary_
print(tfidf_dic)

{'john': 3, 'likes': 4, 'to': 8, 'watch': 10, 'movies': 7, 'mary': 6, 'too': 9, 'also': 0, 'lines': 5, 'football': 1, 'games': 2}


In [34]:
# Order the TF-IDF vocabulary by column index and use the terms as column
# labels for the weight matrix.
tfidf_dic_sorted = {
    term: tfidf_dic[term] for term in sorted(tfidf_dic, key=tfidf_dic.get)
}
tdm = pd.DataFrame(X, columns=tfidf_dic_sorted.keys())
print(tdm)

       also  football     games      john     likes     lines      mary  \
0  0.000000  0.000000  0.000000  0.294644  0.589288  0.000000  0.209642   
1  0.425677  0.425677  0.425677  0.000000  0.000000  0.425677  0.302873   

     movies        to       too     watch  
0  0.589288  0.209642  0.294644  0.209642  
1  0.000000  0.302873  0.000000  0.302873  
