<a href="https://colab.research.google.com/github/kiyoungkim1/LMkor/blob/main/notebooks/word_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bag of words

In [None]:
# Toy corpus of three short Korean documents; the vectorizer cells below are fitted on it.
corpus = [
    "학교에 가서 수업을 들었다. 학교에 간건 오랜만이다.",  # contains "학교에" twice
    "학교에 가서 친구 얘기를 들었다.",
    "내일 가서 뭐 먹지?",
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the token -> column-index mapping from the corpus.
# fit() returns the vectorizer itself, so construction and fitting can be chained.
vect = CountVectorizer().fit(corpus)

# Last expression of the cell: display the learned vocabulary.
vect.vocabulary_

{'가서': 0,
 '간건': 1,
 '내일': 2,
 '들었다': 3,
 '먹지': 4,
 '수업을': 5,
 '얘기를': 6,
 '오랜만이다': 7,
 '친구': 8,
 '학교에': 9}

In [None]:
# Encode every corpus document as a row of token counts (one column per vocabulary entry),
# then convert the sparse result to a dense array for display.
bow_counts = vect.transform(corpus)
bow_counts.toarray()

array([[1, 1, 0, 1, 0, 1, 0, 1, 0, 2],
       [1, 0, 0, 1, 0, 0, 1, 0, 1, 1],
       [1, 0, 1, 0, 1, 0, 0, 0, 0, 0]])

In [None]:
# Encode a sentence that was not part of the fitted corpus.
# '수업은' and '재미있다' are not in the learned vocabulary, so they have no column and are not counted.
unseen_doc = ['수업을 들었다. 수업은 재미있다.']
vect.transform(unseen_doc).toarray()

array([[0, 0, 0, 1, 0, 1, 0, 0, 0, 0]])

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the same toy corpus used for the bag-of-words example.
tfidv = TfidfVectorizer()
tfidv.fit(corpus)

# Display the TF-IDF weights as a dense array (one row per document).
tfidv.transform(corpus).toarray()

array([[0.23642005, 0.40029393, 0.        , 0.30443385, 0.        ,
        0.40029393, 0.        , 0.40029393, 0.        , 0.60886771],
       [0.31544415, 0.        , 0.        , 0.40619178, 0.        ,
        0.        , 0.53409337, 0.        , 0.53409337, 0.40619178],
       [0.38537163, 0.        , 0.65249088, 0.        , 0.65249088,
        0.        , 0.        , 0.        , 0.        , 0.        ]])

# Word2vec


In [None]:
# make datasets (.txt file)
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt

import pandas as pd

df = pd.read_csv('ratings_train.txt', sep='\t')
doc = list(df['document'])

with open('ratings_train_text_only.txt', 'w') as f:
  for text in doc:
    f.write(str(text) + '\n')

--2021-02-10 02:03:26--  https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14628807 (14M) [text/plain]
Saving to: ‘ratings_train.txt’


2021-02-10 02:03:26 (37.2 MB/s) - ‘ratings_train.txt’ saved [14628807/14628807]



In [None]:
# read text file
# Keep only lines that are at least 10 characters long (newline excluded).
# The lines are taken from the opened file handle `f`; iterating the in-memory `doc` list here
# would leave the file opened but never read.
# UTF-8 is requested explicitly so decoding Korean text does not depend on the platform default encoding.
with open('ratings_train_text_only.txt', 'r', encoding='utf-8') as f:
  lines = (line.rstrip('\n') for line in f)
  texts = [line for line in lines if len(line) >= 10]

In [None]:
# word2vec training

import os
from gensim.models import Word2Vec

def word2vec(texts):
    """Train a Word2Vec model on `texts` and save it to the file 'word2vec'.

    Args:
        texts: iterable of strings; each string is split on single spaces into tokens.

    Returns:
        None. The trained model is written to disk with ``model.save('word2vec')``.
    """
    inputs = [tt.split(' ') for tt in texts]
    print('number of text = ', len(inputs))

    print('word2vec training...')
    # Keyword names (size / iter) follow the gensim 3.x API; sg=1 selects skip-gram training.
    model = Word2Vec(inputs, size=50, window=3, min_count=3, negative=5, workers=os.cpu_count(), iter=10, sg=1)
    # init_sims must be *called*; a bare `model.init_sims` only references the method and does nothing.
    model.init_sims()

    model.save('word2vec')

word2vec(texts)

number of text =  136748
word2vec training...


In [None]:
# Load the Word2Vec model from the 'word2vec' file written by the training cell.
w2v = Word2Vec.load('word2vec')

In [None]:
# Word vector: look up the learned embedding for the token '감동'.
w2v.wv['감동']

array([-0.2877369 , -0.3411937 ,  0.98615265,  0.40388942, -0.2993487 ,
       -0.8082169 , -0.05968314, -0.14713229, -0.7971726 , -0.2910246 ,
        0.20702799,  0.1501431 ,  0.62876755,  0.38210574, -0.12299415,
        0.5009918 , -0.2925843 , -1.0950974 ,  0.01489488, -0.16576152,
       -0.05468882,  0.17707469, -0.72506976,  0.2971289 ,  0.10010708,
        1.0921265 , -0.94679564, -0.01515222,  0.03146487,  0.23118128,
        0.1426021 , -0.19204514, -0.27978763, -0.26251298, -0.68439114,
       -0.40017757,  0.9187491 ,  0.42633244,  0.85304224,  0.3333111 ,
        0.07864343,  0.2232901 ,  0.28953448,  0.3314806 , -0.99922657,
        0.13744463,  0.31774592,  0.87070864, -0.29360154,  0.22819856],
      dtype=float32)

In [None]:
# Similar words: list the nearest neighbours of '이제까지' with their similarity scores.
w2v.wv.most_similar('이제까지')

[('10년동안', 0.9651472568511963),
 ('여태것', 0.9527884125709534),
 ('재난영화중', 0.952028751373291),
 ('여태껏', 0.9492594003677368),
 ('수천편의', 0.9491361379623413),
 ('3년간', 0.9450587630271912),
 ('이제껏', 0.9443521499633789),
 ('정신병', 0.9437797665596008),
 ('원망스럽다', 0.9426261782646179),
 ('2006년', 0.9412530064582825)]

# FastText

In [None]:
# fasttext training

import os
from gensim.models import FastText

def fasttext(texts):
    """Train a FastText model on `texts` and save it to the file 'fasttext'.

    Args:
        texts: iterable of strings; each string is split on single spaces into tokens.

    Returns:
        None. The trained model is written to disk with ``save('fasttext')``.
    """
    tokenized = []
    for sentence in texts:
        tokenized.append(sentence.split(' '))
    print('number of text = ', len(tokenized))

    # Same hyperparameters as the Word2Vec training cell (gensim 3.x keyword names).
    hyperparams = dict(
        size=50,
        window=3,
        min_count=3,
        negative=5,
        workers=os.cpu_count(),
        iter=10,
        sg=1,
    )
    ft_model = FastText(tokenized, **hyperparams)
    ft_model.init_sims()

    ft_model.save('fasttext')
    print('fasttext is trained')

fasttext(texts)

number of text =  136748
fasttext is trained


In [None]:
# Load both saved models so they can be compared on the same query.
# `w2v` was already loaded from the same file in an earlier cell; this reload repeats it.
w2v = Word2Vec.load('word2vec')
# NOTE: this rebinds the name `fasttext`, which the training cell defined as a function.
fasttext = FastText.load('fasttext')

In [None]:
# Query the Word2Vec model (`w2v`) with '고능학교', which looks like a misspelling of '고등학교'.
# The word is presumably absent from the trained vocabulary, in which case gensim raises KeyError;
# the error is caught and printed so the notebook still runs from top to bottom.
try:
    similar_words = w2v.wv.most_similar('고능학교')
except KeyError as err:
    print('KeyError:', err)
else:
    print(similar_words)

NameError: ignored

In [None]:
# Same query against the FastText model; the recorded output lists neighbours such as '고등학교'
# for this spelling.
fasttext.wv.most_similar('고능학교')

[('학교', 0.9648439884185791),
 ('중학교', 0.9563544988632202),
 ('고등학교', 0.9352776408195496),
 ('초등학교', 0.9316931962966919),
 ('국민학교', 0.9257780909538269),
 ('대학교', 0.9132347106933594),
 ('2학년', 0.8956236839294434),
 ('다닐', 0.8853365778923035),
 ('친구집에', 0.8824955821037292),
 ('1학년', 0.8821584582328796)]