# 단어의 임베딩
- 빈도수 계산: 빈도기반 - TF 상대빈도
- TDM(Term-Document Matrix): TF를 행렬로 만든 것, 사전을 이용한 단순빈도
- TF-IDF: TF*IDF
- IDF: 역문서빈도

In [None]:
# Build the sample text from adjacent string literals. A backslash
# continuation inside a single literal would pull the next line's
# indentation into the string itself.
text = (
    'John likes to watch movies. Mary likes movies too. '
    'Mary also likes to watch football games.'
)

# Drop the periods, then split on whitespace to get the word tokens.
words = text.replace('.', '').split()
words

['John',
 'likes',
 'to',
 'watch',
 'moives',
 'Mary',
 'likes',
 'movies',
 'too',
 'Mary',
 'also',
 'likes',
 'to',
 'watch',
 'football',
 'games']

In [37]:
# Count how many times each distinct value occurs in the word list.
import numpy as np

# With return_counts=True, np.unique yields the distinct values together
# with the number of occurrences of each one.
unique_words, occurrences = np.unique(words, return_counts=True)
word_count = (unique_words, occurrences)
word_count

(array(['John', 'Mary', 'also', 'football', 'games', 'likes', 'moives',
        'movies', 'to', 'too', 'watch'], dtype='<U8'),
 array([1, 2, 1, 1, 1, 3, 1, 1, 2, 1, 2]))

In [38]:
# Build the TF dictionary: word -> number of occurrences.
# word_count holds two parallel sequences (words, counts); unpacking it
# into zip pairs each word with its count.
word_to_cnt = dict(zip(*word_count))

word_to_cnt

{'John': 1,
 'Mary': 2,
 'also': 1,
 'football': 1,
 'games': 1,
 'likes': 3,
 'moives': 1,
 'movies': 1,
 'to': 2,
 'too': 1,
 'watch': 2}

In [39]:
# Look up the count stored for the word 'movies'.
word_to_cnt['movies']

1

In [None]:
# Two-document corpus; each string is treated as one document.
corpus = [
    'John likes to watch movies. Mary likes movies too.',
    'Mary also likes to watch football games.',
]

# TDM

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the count vectorizer on the corpus, then convert the transformed
# result to a dense array so it can be displayed directly.
vector = CountVectorizer()
dtm_sparse = vector.fit_transform(corpus)
dtm_array = dtm_sparse.toarray()
dtm_array

array([[0, 0, 0, 1, 2, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1]])

In [42]:
# vocabulary_ maps each term to an integer; the next cell sorts by that
# integer to label the columns of the count matrix.
tf_dic = vector.vocabulary_
print(tf_dic)

{'john': 3, 'likes': 4, 'to': 8, 'watch': 10, 'moives': 6, 'mary': 5, 'movies': 7, 'too': 9, 'also': 0, 'football': 1, 'games': 2}


In [43]:
import pandas as pd
# Order the terms by their vocabulary index so the column labels line up
# with the columns of dtm_array.
ordered_terms = sorted(tf_dic, key=tf_dic.get)
tf_dic_sorted = {term: tf_dic[term] for term in ordered_terms}
df = pd.DataFrame(dtm_array, columns=list(tf_dic_sorted))
df

Unnamed: 0,also,football,games,john,likes,mary,moives,movies,to,too,watch
0,0,0,0,1,2,1,1,1,1,1,1
1,1,1,1,0,1,1,0,0,1,0,1


# TF-IDF 계산 -> 매트릭스

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the TF-IDF vectorizer on the corpus, then convert the transformed
# result to a dense array so it can be displayed directly.
tfidf_vec = TfidfVectorizer()
tfidf_sparse = tfidf_vec.fit_transform(corpus)
tfidf_array = tfidf_sparse.toarray()
tfidf_array

array([[0.        , 0.        , 0.        , 0.36408901, 0.51810466,
        0.25905233, 0.36408901, 0.36408901, 0.25905233, 0.36408901,
        0.25905233],
       [0.44610081, 0.44610081, 0.44610081, 0.        , 0.3174044 ,
        0.3174044 , 0.        , 0.        , 0.3174044 , 0.        ,
        0.3174044 ]])

In [45]:
# Order the terms by their vocabulary index so the column labels line up
# with the columns of tfidf_array.
tfidf_dic = tfidf_vec.vocabulary_
tfidf_terms = sorted(tfidf_dic, key=tfidf_dic.get)
tfidf_dic_sorted = {term: tfidf_dic[term] for term in tfidf_terms}
tfidf_dtm = pd.DataFrame(tfidf_array, columns=list(tfidf_dic_sorted))
tfidf_dtm

Unnamed: 0,also,football,games,john,likes,mary,moives,movies,to,too,watch
0,0.0,0.0,0.0,0.364089,0.518105,0.259052,0.364089,0.364089,0.259052,0.364089,0.259052
1,0.446101,0.446101,0.446101,0.0,0.317404,0.317404,0.0,0.0,0.317404,0.0,0.317404


In [9]:
from gensim.models import Word2Vec
# Two-document corpus used as the Word2Vec training data.
corpus = [
    'John likes to watch movies. Mary likes movies too.',
    'Mary also likes to watch football games.',
]

# Holds one token list per document; filled in by the next cell.
word_list = []

In [10]:
# Tokenize every document: strip the periods, then split on whitespace.
# The list is rebuilt from scratch rather than appended to, so running
# this cell more than once does not duplicate the tokenized documents.
word_list = [sentence.replace('.', '').split() for sentence in corpus]

word_list

[['John', 'likes', 'to', 'watch', 'movies', 'Mary', 'likes', 'movies', 'too'],
 ['Mary', 'also', 'likes', 'to', 'watch', 'football', 'games']]

In [11]:
# Train Word2Vec with sg=0 (CBOW per the gensim documentation).
model = Word2Vec(
    sentences=word_list,
    vector_size=100,
    window=3,
    min_count=1,
    sg=0,
)
# The first two positional parameters of most_similar are `positive` and
# `negative`, so this query scores words against 'likes' minus 'movies'.
# NOTE(review): if the goal was "similar to both words", pass
# positive=['likes', 'movies'] instead; confirm the intent.
model.wv.most_similar(positive=['likes'], negative=['movies'])

[('John', 0.17164471745491028),
 ('also', 0.06594578176736832),
 ('Mary', 0.008838453330099583),
 ('watch', -0.06765829026699066),
 ('games', -0.08544928580522537),
 ('football', -0.08948154747486115),
 ('too', -0.11860241740942001),
 ('to', -0.13643866777420044)]

In [12]:
# Train Word2Vec with sg=1 (skip-gram per the gensim documentation).
model = Word2Vec(
    sentences=word_list,
    vector_size=100,
    window=3,
    min_count=1,
    sg=1,
)
# The first two positional parameters of most_similar are `positive` and
# `negative`, so this query scores words against 'likes' minus 'movies'.
# NOTE(review): if the goal was "similar to both words", pass
# positive=['likes', 'movies'] instead; confirm the intent.
model.wv.most_similar(positive=['likes'], negative=['movies'])

[('John', 0.17164471745491028),
 ('also', 0.06594578176736832),
 ('Mary', 0.008853347972035408),
 ('watch', -0.06765829026699066),
 ('games', -0.08544928580522537),
 ('football', -0.08948154747486115),
 ('too', -0.11860241740942001),
 ('to', -0.13643862307071686)]

In [13]:
# Retrain the sg=1 (skip-gram) model and list the words whose vectors
# are closest to 'games'.
model = Word2Vec(
    sentences=word_list,
    vector_size=100,
    window=3,
    min_count=1,
    sg=1,
)
model.wv.most_similar(positive=['games'])

[('to', 0.13887979090213776),
 ('watch', 0.13149002194404602),
 ('movies', 0.06408978253602982),
 ('too', 0.06059185042977333),
 ('football', 0.019152285531163216),
 ('Mary', 0.009383062832057476),
 ('also', -0.05774582177400589),
 ('likes', -0.05987628549337387),
 ('John', -0.10513809323310852)]