# Word2Vec

In [12]:
import numpy as np
import os
from random import shuffle
import re
import urllib.request
import zipfile
import lxml.etree

# extract subtitle
doc = lxml.etree.parse(open('ted_en-20160408.xml', 'r'))

input_text = '\n'.join(doc.xpath('//content/text()'))		

# remove parenthesis
input_text_noparens = re.sub(r'\([^)]*\)', '', input_text)

# store as list of sentences
sentences_strings_ted = []

for line in input_text_noparens.split('\n'):
    m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
    sentences_strings_ted.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)

# store as list of lists of words
sentences_ted = []
for sent_str in sentences_strings_ted:
    tokens = re.sub(r"[^a-z0-9]+", " ", sent_str.lower()).split()
    sentences_ted.append(tokens)

from gensim.models import Word2Vec

model_ted = Word2Vec(sentences=sentences_ted, vector_size=100, window=5, min_count=5, workers=4, sg=0)


In [14]:
model_ted.wv.most_similar("man")

[('woman', 0.8602633476257324),
 ('guy', 0.8204045295715332),
 ('lady', 0.7615416646003723),
 ('soldier', 0.7612980008125305),
 ('girl', 0.7477767467498779),
 ('boy', 0.7187429666519165),
 ('gentleman', 0.7056071758270264),
 ('kid', 0.6818613409996033),
 ('poet', 0.6656844615936279),
 ('person', 0.6572151184082031)]

# FastText

In [16]:
from gensim.models import FastText

model_ted = FastText(sentences_ted, vector_size=100, window=5, min_count=5, workers=4,sg=1)

In [17]:
model_ted.wv.most_similar("Gastroenteritis")

[('arthritis', 0.8389511108398438),
 ('anthropocene', 0.8389402031898499),
 ('kp', 0.8350369334220886),
 ('pseudonym', 0.8325868248939514),
 ('curitiba', 0.8308171629905701),
 ('anagnorisis', 0.8261967301368713),
 ('karnataka', 0.823052167892456),
 ('pseudo', 0.8183250427246094),
 ('hirshhorn', 0.8172647953033447),
 ('aeronautics', 0.8166674971580505)]

# Glove

In [2]:
import gensim.downloader as api
model = api.load("glove-wiki-gigaword-50")



In [7]:
model.most_similar("ball")

[('kick', 0.864284873008728),
 ('catch', 0.8190028667449951),
 ('off', 0.8133060336112976),
 ('kicking', 0.8079286813735962),
 ('got', 0.8033515214920044),
 ('throw', 0.7966356873512268),
 ('missed', 0.7893549799919128),
 ('back', 0.7857473492622375),
 ('throws', 0.7807802557945251),
 ('caught', 0.7794879674911499)]

In [1]:
lines= ["Hello this is a tutorial to convert word to integer" , "It is a beautiful day" , "Jack is going to office"]

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

word_tokens=[]
i = 0
for line in lines:
    words = word_tokenize(line)
    word_tokens.insert(i,words)
    i=i+1

print (word_tokens)

[['Hello', 'this', 'is', 'a', 'tutorial', 'to', 'convert', 'word', 'to', 'integer'], ['It', 'is', 'a', 'beautiful', 'day'], ['Jack', 'is', 'going', 'to', 'office']]


In [3]:
from nltk.corpus import stopwords
stop_words=stopwords.words('english')

lines_without_stopwords=[]
for line in lines:
    stop_removed=[]
    for line in word_tokens:
        for word in line:
            if word not in stop_words:
                stop_removed.append(word)

print(stop_removed)

['Hello', 'tutorial', 'convert', 'word', 'integer', 'It', 'beautiful', 'day', 'Jack', 'going', 'office']


In [26]:
from nltk import WordNetLemmatizer  
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer() 

lines_with_lemmas=[]
for line in lines:
    lem_line=[]
    s = line.split() 
    for word in s:
        if s not in stop_words: 
            lem_line.append(wordnet_lemmatizer.lemmatize(word)) 
    lines_with_lemmas.append(lem_line)

print(lines_with_lemmas)

[['Hello', 'this', 'is', 'a', 'tutorial', 'to', 'convert', 'word', 'to', 'integer'], ['It', 'is', 'a', 'beautiful', 'day'], ['Jack', 'is', 'going', 'to', 'office']]


In [27]:
from glove import Corpus, Glove
corpus = Corpus()

corpus.fit(lines_with_lemmas, window=10)
glove = Glove(no_components=5, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)

glove.save('glove.model')

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [29]:
print(glove.word_vectors[glove.dictionary['tutorial']])

[ 0.08139673  0.03268511  0.07912026  0.05500959 -0.03326362]


In [32]:
glove.most_similar('beautiful')

[('going', 0.7350365882817204),
 ('Hello', 0.693252670358993),
 ('It', 0.6343277175485825),
 ('Jack', 0.4510978346710577)]