# Text Representations: Words to Numbers

In [3]:
import gensim
import os
from urllib.request import urlretrieve

def get_data(url, filename):
    if not os.path.exists(filename):
        dirname = os.path.dirname(filename)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
    urlretrieve(url, filename)

In [4]:
embedding_url = 'http://nlp.stanford.edu/data/glove.6B.zip'
get_data(embedding_url, 'data/glove.6B.zip')

In [5]:
!unzip data/glove.6B.zip 
!mv glove.6B.300d.txt data/glove.6B.300d.txt 
!mv glove.6B.200d.txt data/glove.6B.200d.txt 
!mv glove.6B.100d.txt data/glove.6B.100d.txt 
!mv glove.6B.50d.txt data/glove.6B.50d.txt

Archive:  data/glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [9]:
#Use pretrained embeddings
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'data/glove.6B.300d.txt'

word2vec_output_file = 'data/glove.6B.300d.word2vec.txt'

if not os.path.exists(word2vec_output_file):
    glove2word2vec(glove_input_file,word2vec_output_file)

In [10]:
from gensim.models import KeyedVectors
filename = word2vec_output_file

pretrained_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [12]:
#Using the (king - man) + woman example
result = pretrained_model.wv.most_similar(positive = ['woman','king'], negative=['man'], topn=1)

print(result)

[('queen', 0.6713277101516724)]


  


In [15]:
pretrained_model.wv.most_similar('india')

  """Entry point for launching an IPython kernel.


[('indian', 0.7355823516845703),
 ('pakistan', 0.7285579442977905),
 ('delhi', 0.6846907138824463),
 ('bangladesh', 0.6203191876411438),
 ('lanka', 0.609517514705658),
 ('sri', 0.6011613607406616),
 ('kashmir', 0.5746493935585022),
 ('nepal', 0.5421023368835449),
 ('pradesh', 0.5405811071395874),
 ('maharashtra', 0.518537700176239)]

In [16]:
#Handling OOV words
#Using the TED dataset to compare Fasttext and Word2vec

ted_dataset = "https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip"
get_data(ted_dataset,"data/ted_en.zip")

In [17]:
import zipfile
import lxml.etree
with zipfile.ZipFile('data/ted_en.zip','r') as z:
    doc = lxml.etree.parse(z.open('ted_en-20160408.xml','r'))
input_text = '\n'.join(doc.xpath('//content/text()'))

In [18]:
input_text[:500]

"Here are two reasons companies fail: they only do more of the same, or they only do what's new.\nTo me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation. Both are necessary, but it can be too much of a good thing.\nConsider Facit. I'm actually old enough to remember them. Facit was a fantastic company. They were born deep in the Swedish forest, and they made the best mechanical calculators in the world. Everybody used them. A"

In [28]:
#Remove parenthesis
import re
# remove parenthesis 
input_text_noparens = re.sub(r'\([^)]*\)', '', input_text)

# store as list of sentences
sentences_strings_ted = []
for line in input_text_noparens.split('\n'):
    m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
    sentences_strings_ted.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)

# store as list of lists of words
sentences_ted = []
for sent_str in sentences_strings_ted:
    tokens = re.sub(r"[^a-z0-9]+", " ", sent_str.lower()).split()
    sentences_ted.append(tokens)

In [30]:
print(sentences_ted[:1])

[['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new']]


In [31]:
from gensim.models.fasttext import FastText

fasttext_ted_model = FastText(sentences_ted, size=100,window=5, min_count=5, workers=-1, sg=1)

In [32]:
fasttext_ted_model.wv.most_similar("india")

[('indians', 0.5911639928817749),
 ('indian', 0.5406097769737244),
 ('indiana', 0.4898717999458313),
 ('indicated', 0.4400438070297241),
 ('indicate', 0.4042605757713318),
 ('internal', 0.39166826009750366),
 ('interior', 0.3871103823184967),
 ('byproducts', 0.3752930164337158),
 ('princesses', 0.37265270948410034),
 ('indications', 0.369659960269928)]

In [33]:
from gensim.models.word2vec import Word2Vec

word2vec_ted_model = Word2Vec(sentences=sentences_ted, size=100, window=5, min_count=5, workers=-1, sg=1)

In [34]:
word2vec_ted_model.wv.most_similar("india")

[('crooked', 0.4316353499889374),
 ('neurosurgeon', 0.4018043875694275),
 ('compartment', 0.38469767570495605),
 ('band', 0.3766378164291382),
 ('torajans', 0.34789350628852844),
 ('chassis', 0.3418603539466858),
 ('district', 0.3298587501049042),
 ('split', 0.3265147805213928),
 ('misinterpreted', 0.32462072372436523),
 ('documenting', 0.32307395339012146)]