In [1]:
import re
import urllib.request
import zipfile

from gensim.models import Word2Vec, KeyedVectors
from lxml import etree
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
# Training data (XML): fetch the ted_en-20160408.xml dataset into the working directory.
dataset_url = "https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/09.%20Word%20Embedding/dataset/ted_en-20160408.xml"
urllib.request.urlretrieve(dataset_url, filename="ted_en-20160408.xml")

('ted_en-20160408.xml', <http.client.HTTPMessage at 0x15f99a190>)

In [9]:
# Preprocessing: turn the downloaded XML into a list of tokenized sentences (`res`).

# Hand lxml the file path rather than a text-mode file object. open() without an
# explicit encoding decodes with the platform's locale default, which can fail or
# garble the text on systems whose default is not UTF-8; letting the XML parser
# read the file itself avoids that dependency and does not keep a handle open
# while the (slow) tokenization below runs.
target_text = etree.parse("ted_en-20160408.xml")

# Keep only the text found directly inside <content> elements.
parse_text = "\n".join(target_text.xpath("//content/text()"))

# Drop parenthesised background-sound markers such as (Audio) or (Laughter).
content_text = re.sub(r"\([^)]*\)", "", parse_text)
sent_text = sent_tokenize(content_text)

# Lowercase each sentence and replace every run of characters other than
# a-z / 0-9 (punctuation, whitespace, ...) with a single space.
normalized_text = [
    re.sub(r"[^a-z0-9]+", " ", sentence.lower())
    for sentence in sent_text
]

# One list of word tokens per sentence.
res = [word_tokenize(sentence) for sentence in normalized_text]

print(len(res))

273424


In [11]:
# Sanity check: show the first three tokenized sentences.
first_sentences = res[:3]
print(first_sentences)

[['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new'], ['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation'], ['both', 'are', 'necessary', 'but', 'it', 'can', 'be', 'too', 'much', 'of', 'a', 'good', 'thing']]


In [14]:
# Training: fit a Word2Vec model on the tokenized sentences in `res`.
model = Word2Vec(
    sentences=res,
    vector_size=100,  # dimensionality of the embedding vectors
    window=5,         # context window size
    min_count=5,      # minimum word frequency; rarer words are ignored
    workers=4,        # number of worker threads used for training
    sg=0,             # 0 = CBOW, 1 = Skip-gram
)

In [15]:
# Words the trained model considers most similar to the token "man",
# returned as (word, similarity score) pairs.
query_token = "man"
model_res = model.wv.most_similar(query_token)

model_res

[('woman', 0.8486449122428894),
 ('guy', 0.806094229221344),
 ('lady', 0.7764718532562256),
 ('boy', 0.771442174911499),
 ('girl', 0.7330987453460693),
 ('soldier', 0.7150641679763794),
 ('gentleman', 0.7043351531028748),
 ('kid', 0.6938618421554565),
 ('poet', 0.6463817954063416),
 ('rabbi', 0.6455687284469604)]

In [18]:
# Save and reload: write the trained word vectors to disk in word2vec format,
# then read them back as standalone KeyedVectors.
vectors_path = "eng_w2v"
model.wv.save_word2vec_format(vectors_path)
loaded_model = KeyedVectors.load_word2vec_format(vectors_path)

# Run the same query against the reloaded vectors.
loaded_model.most_similar("man")

[('woman', 0.8486449122428894),
 ('guy', 0.806094229221344),
 ('lady', 0.7764718532562256),
 ('boy', 0.771442174911499),
 ('girl', 0.7330987453460693),
 ('soldier', 0.7150641679763794),
 ('gentleman', 0.7043351531028748),
 ('kid', 0.6938618421554565),
 ('poet', 0.6463817954063416),
 ('rabbi', 0.6455687284469604)]