In [1]:
import re
from lxml import etree
import urllib.request
import zipfile
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
# Fetch the TED talk transcript archive (WIT3 XML release, 2016-04-08).
# zipfile.is_zipfile() returns False for a missing, truncated or non-zip file,
# so the download only runs when no usable local copy exists yet. This keeps
# "Restart & Run All" from re-downloading the archive on every execution.
ted_zip = 'ted_en-20160408.zip'
if not zipfile.is_zipfile(ted_zip):
    urllib.request.urlretrieve('https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip', filename=ted_zip)

('ted_en-20160408.zip', <http.client.HTTPMessage at 0x2c24b2ae748>)

In [3]:
# Read the XML document straight out of the archive and join the text of every
# <content> element (one per talk) into a single newline-separated string.
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    # Open the archive member in its own `with` so the handle is closed
    # deterministically instead of lingering until garbage collection.
    with z.open('ted_en-20160408.xml', 'r') as xml_file:
        target_text = etree.parse(xml_file)
    parse_text = '\n'.join(target_text.xpath('//content/text()'))

# Preview the first 500 characters of the extracted transcript text.
parse_text[:500]

"Here are two reasons companies fail: they only do more of the same, or they only do what's new.\nTo me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation. Both are necessary, but it can be too much of a good thing.\nConsider Facit. I'm actually old enough to remember them. Facit was a fantastic company. They were born deep in the Swedish forest, and they made the best mechanical calculators in the world. Everybody used them. A"

In [4]:
# Drop every parenthesized span, e.g. "(Laughter)" or "(Applause)", from the transcript.
parenthetical = re.compile(r'\([^)]*\)')
content_text = parenthetical.sub('', parse_text)

In [5]:
import nltk

# sent_tokenize relies on the Punkt sentence-boundary model. Older NLTK
# releases load it from the 'punkt' package, while newer ones (3.9 and later)
# look for 'punkt_tab' instead; fetch both so the cell runs on either.
# nltk.download() skips packages that are already up to date.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Split the cleaned transcript into a list of sentence strings.
sent_text = sent_tokenize(content_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Lower-case each sentence and collapse every run of characters other than
# a-z / 0-9 into a single space, yielding one normalized string per sentence.
normalized_text = [
    re.sub(r'[^a-z0-9]+', ' ', sentence.lower())
    for sentence in sent_text
]

In [7]:
# Tokenize each normalized sentence into a list of words, then report how
# many sentences (training samples) were produced.
result = list(map(word_tokenize, normalized_text))
sample_count = len(result)
print('총 샘플의 개수: {}'.format(sample_count))

총 샘플의 개수: 273424


In [8]:
# Show the first three tokenized sentences as a sanity check.
preview = result[:3]
for token_list in preview:
    print(token_list)

['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new']
['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation']
['both', 'are', 'necessary', 'but', 'it', 'can', 'be', 'too', 'much', 'of', 'a', 'good', 'thing']


In [9]:
import gensim
from gensim.models import Word2Vec

# gensim 4.0 renamed Word2Vec's `size` keyword to `vector_size`. Pick the
# spelling the installed release accepts so this cell runs on 3.x and 4.x alike.
dim_keyword = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

# Train a Word2Vec model on the tokenized sentences.
#   size / vector_size: dimensionality of the word embedding vectors
#   window:    context window size
#   min_count: minimum word frequency (rarer words are not trained)
#   workers:   number of worker threads used for training
#   sg:        0 selects CBOW, 1 selects Skip-gram
model = Word2Vec(sentences=result, window=5, min_count=5, workers=4, sg=0, **{dim_keyword: 100})

# Words whose vectors are closest to 'man' as (word, cosine similarity) pairs.
model_result = model.wv.most_similar('man')
print(model_result)

[('woman', 0.8503380417823792), ('guy', 0.8077146410942078), ('boy', 0.7764290571212769), ('lady', 0.7629806995391846), ('girl', 0.7617174983024597), ('gentleman', 0.7438566088676453), ('soldier', 0.731104850769043), ('kid', 0.6897828578948975), ('poet', 0.6780432462692261), ('friend', 0.6685655117034912)]


In [10]:
from gensim.models import KeyedVectors

# Persist the trained vectors in word2vec text format, then load them back
# to confirm the round trip preserves the embeddings.
w2v_path = 'eng_w2v'
model.wv.save_word2vec_format(w2v_path)  # save the vectors
loaded_model = KeyedVectors.load_word2vec_format(w2v_path)  # load the vectors

# The reloaded vectors should give the same neighbours for 'man' as the trained model.
model_result = loaded_model.most_similar('man')
print(model_result)

[('woman', 0.8503380417823792), ('guy', 0.8077146410942078), ('boy', 0.7764290571212769), ('lady', 0.7629806995391846), ('girl', 0.7617174983024597), ('gentleman', 0.7438566088676453), ('soldier', 0.731104850769043), ('kid', 0.6897828578948975), ('poet', 0.6780432462692261), ('friend', 0.6685655117034912)]
