## English

In [1]:
!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2022-12-12 19:24:03--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-12-12 19:24:03--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-12-12 19:26:42 (5.18 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [3]:
import spacy
import numpy as np
from gensim.models import KeyedVectors



In [4]:
# Maps each word to its vector (a float32 numpy array).
embeddings_index = {}

# Every line of the GloVe text file is "<word> <v1> <v2> ...".
# The encoding is stated explicitly so the result does not depend on the platform default,
# and the `with` block closes the file even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
  for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
print('Found {} word vectors'.format(len(embeddings_index)))

Found 400000 word vectors


In [29]:
def check_glove_data(text, vocab=None):
  """Print, for each whitespace-separated word of `text`, whether it is in the vocabulary.

  Args:
    text: String to check. It is split on whitespace and every piece is looked up
      exactly as written (no lower-casing or punctuation stripping happens here).
    vocab: Any container that supports `word in vocab`. When omitted, the
      module-level `embeddings_index` is used.

  Returns:
    None. One line per word is printed, prefixed with an ANSI colour code
    (32 = green when found, 30 = black when not found).
  """
  if vocab is None:
    vocab = embeddings_index
  for word in text.split():
    # Membership is tested on the container itself; building `.keys()` is unnecessary.
    if word in vocab:
      print('\x1B[32m' '"{}" word found'.format(word))
    else:
      # NOTE(review): 30 is black, which is hard to see on dark themes; red (31) may have been intended.
      print('\x1B[30m' '"{}" word not found'.format(word))

In [30]:
# The sample sentence is lower-cased first because check_glove_data looks words up exactly as written.
sentence = 'This is just a sample text for the purpose of testing'.lower()
check_glove_data(sentence)

[32m"this" word found
[32m"is" word found
[32m"just" word found
[32m"a" word found
[32m"sample" word found
[32m"text" word found
[32m"for" word found
[32m"the" word found
[32m"purpose" word found
[32m"of" word found
[32m"testing" word found


### Lemmatization

In [10]:
def get_lemmatization(text, model):
  """Run `text` through `model` and return the tokens' lemmas joined by single spaces.

  Args:
    text: The string to lemmatize.
    model: A callable that takes the text and returns an iterable of tokens,
      each exposing a `lemma_` attribute (the spaCy pipelines loaded in this notebook are used).

  Returns:
    A single string with the `lemma_` of every token, in order, separated by " ".
  """
  lemmas = []
  for tok in model(text):
    lemmas.append(tok.lemma_)
  return " ".join(lemmas)

In [26]:
# English spaCy pipeline; passed below as the `model` argument of get_lemmatization.
nlp_en = spacy.load('en_core_web_sm')

In [35]:
# Lemmatize the (already lower-cased) sample sentence; the bare name on the last line displays the result.
sentence_lemma = get_lemmatization(sentence, nlp_en)
sentence_lemma

'this be just a sample text for the purpose of testing'

In [36]:
# Look up the lemmas, rather than the original word forms, in the vocabulary.
check_glove_data(sentence_lemma)

[32m"this" word found
[32m"be" word found
[32m"just" word found
[32m"a" word found
[32m"sample" word found
[32m"text" word found
[32m"for" word found
[32m"the" word found
[32m"purpose" word found
[32m"of" word found
[32m"testing" word found


In [41]:
# Second example. Here the text is lower-cased after lemmatization rather than before it.
sentence_2 = 'I am hanging out in a garden'
sentence_lemma = get_lemmatization(sentence_2, nlp_en).lower()
sentence_lemma

'i be hang out in a garden'

In [42]:
# Look up the lemmas of the second example sentence.
check_glove_data(sentence_lemma)

[32m"i" word found
[32m"be" word found
[32m"hang" word found
[32m"out" word found
[32m"in" word found
[32m"a" word found
[32m"garden" word found


## Spanish

In [49]:
!wget http://dcc.uchile.cl/~jperez/word-embeddings/glove-sbwc.i25.vec.gz
!gzip -dv glove-sbwc.i25.vec.gz

glove-sbwc.i25.vec.gz:	 61.2% -- replaced with glove-sbwc.i25.vec


In [4]:
# Load the Spanish vectors with gensim; `limit=50000` caps how many vectors are read from the file.
wordvectors = KeyedVectors.load_word2vec_format('glove-sbwc.i25.vec', limit=50000)

In [5]:
def check_glove_data(text, vocab=None):
  """Print, for each whitespace-separated word of `text`, whether it is in the vocabulary.

  This redefinition replaces the earlier `check_glove_data` that looked words up in
  `embeddings_index`; pass `vocab=embeddings_index` to check the English vectors again.

  Args:
    text: String to check. It is split on whitespace and every piece is looked up
      exactly as written.
    vocab: Any container that supports `word in vocab`. When omitted, the
      module-level `wordvectors` (the gensim KeyedVectors loaded above) is used.

  Returns:
    None. One line per word is printed, prefixed with an ANSI colour code
    (32 = green when found, 30 = black when not found).
  """
  if vocab is None:
    vocab = wordvectors
  for word in text.split():
    # `word in <KeyedVectors>` works on gensim 3.x and 4.x alike, whereas the
    # `.vocab` attribute used previously no longer exists in gensim 4.
    if word in vocab:
      print('\x1B[32m' '"{}" word found'.format(word))
    else:
      # NOTE(review): 30 is black, which is hard to see on dark themes; red (31) may have been intended.
      print('\x1B[30m' '"{}" word not found'.format(word))

In [6]:
# Lower-case a Spanish sample sentence and look each word up in the Spanish vectors.
sentence_es = 'Esta película fue genial'.lower()
check_glove_data(sentence_es)

[32m"esta" word found
[32m"película" word found
[32m"fue" word found
[32m"genial" word found


In [7]:
# A longer Spanish example; `sentence_es` is reassigned for each new sample.
sentence_es = 'yo creo que es la mejor actriz del mundo'.lower()
check_glove_data(sentence_es)

[32m"yo" word found
[32m"creo" word found
[32m"que" word found
[32m"es" word found
[32m"la" word found
[32m"mejor" word found
[32m"actriz" word found
[32m"del" word found
[32m"mundo" word found


### Lemmatization

In [16]:
!python -m spacy download es_core_news_md

2022-12-12 19:38:17.464660: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting es-core-news-md==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.4.0/es_core_news_md-3.4.0-py3-none-any.whl (42.3 MB)
[K     |████████████████████████████████| 42.3 MB 1.4 MB/s 
Installing collected packages: es-core-news-md
Successfully installed es-core-news-md-3.4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_md')


In [8]:
# Spanish spaCy pipeline; passed below as the `model` argument of get_lemmatization.
nlp_es = spacy.load('es_core_news_md')

In [11]:
# Lemmatize the first Spanish example.
# Note: 'pelicula' is written without the accent here, unlike the earlier 'película' sample.
sentence_es = 'Esta pelicula fue genial'.lower()
sentence_lemma = get_lemmatization(sentence_es, nlp_es)
sentence_lemma

'este pelicula ser genial'

In [12]:
# Look up the Spanish lemmas in the Spanish vectors.
check_glove_data(sentence_lemma)

[32m"este" word found
[32m"pelicula" word found
[32m"ser" word found
[32m"genial" word found


In [15]:
# Lemmatize the second Spanish example; the bare name on the last line displays the result.
sentence_es = 'yo creo que es la mejor actriz del mundo'.lower()
sentence_lemma = get_lemmatization(sentence_es, nlp_es)
sentence_lemma

'yo creer que ser el mejor actriz del mundo'

In [16]:
# Look up the lemmas of the second Spanish example.
check_glove_data(sentence_lemma)

[32m"yo" word found
[32m"creer" word found
[32m"que" word found
[32m"ser" word found
[32m"el" word found
[32m"mejor" word found
[32m"actriz" word found
[32m"del" word found
[32m"mundo" word found


In [17]:
# Third Spanish example, with plural and feminine forms for the lemmatizer to normalise.
sentence_es = 'la mayor posibilidad en todas las opciones'.lower()
sentence_lemma = get_lemmatization(sentence_es, nlp_es)
sentence_lemma

'el mayor posibilidad en todo el opción'

In [18]:
# Look up the lemmas of the third Spanish example.
check_glove_data(sentence_lemma)

[32m"el" word found
[32m"mayor" word found
[32m"posibilidad" word found
[32m"en" word found
[32m"todo" word found
[32m"el" word found
[32m"opción" word found


In [19]:
# Shape of a single Spanish word vector, i.e. the embedding dimensionality.
wordvectors.get_vector('posibilidad').shape

(300,)

In [49]:
# Analogy query: rey - hombre + mujer.
# `most_similar` is called on the KeyedVectors directly; going through `.wv` on a
# KeyedVectors is deprecated in gensim 3.x and no longer available in gensim 4.
wordvectors.most_similar(positive=['mujer', 'rey'], negative=['hombre'])

  wordvectors.wv.most_similar(positive=['mujer', 'rey'], negative=['hombre'])


[('reina', 0.6732203364372253),
 ('isabel', 0.5993215441703796),
 ('monarca', 0.5833542346954346),
 ('princesa', 0.5566387176513672),
 ('hija', 0.5369765162467957),
 ('infanta', 0.5317001938819885),
 ('esposa', 0.5256122350692749),
 ('alfonso', 0.5193145275115967),
 ('iv', 0.5174581408500671),
 ('ii', 0.5153015851974487)]

In [56]:
# Analogy query: tigre - perro + gato.
# `most_similar` is called on the KeyedVectors directly; going through `.wv` on a
# KeyedVectors is deprecated in gensim 3.x and no longer available in gensim 4.
wordvectors.most_similar(positive=['gato', 'tigre'], negative=['perro'])

  wordvectors.wv.most_similar(positive=['gato', 'tigre'], negative=['perro'])


[('venado', 0.4637227952480316),
 ('leopardo', 0.44934844970703125),
 ('puma', 0.4475456178188324),
 ('juniors', 0.4290238320827484),
 ('felinos', 0.42076990008354187),
 ('montés', 0.4189727306365967),
 ('boca', 0.39804211258888245),
 ('zorro', 0.3968369662761688),
 ('pez', 0.39571985602378845),
 ('elefante', 0.39226317405700684)]

In [44]:
# Load the English GloVe vectors through gensim. 'glove.6B.100d_2.txt' is the header-prefixed
# copy written by the "Reformat GloVe Original output" section further down, so that section
# has to be run before this cell.
# NOTE(review): ISO-8859-1 decoding is requested here, while the same vectors are read with the
# default encoding when `embeddings_index` is built — confirm this encoding is intended.
wordvectors_en = KeyedVectors.load_word2vec_format('glove.6B.100d_2.txt', binary=False, encoding="ISO-8859-1")

In [51]:
# Analogy query: king - man + woman.
# `most_similar` is called on the KeyedVectors directly; going through `.wv` on a
# KeyedVectors is deprecated in gensim 3.x and no longer available in gensim 4.
wordvectors_en.most_similar(positive=['woman', 'king'], negative=['man'])

  wordvectors_en.wv.most_similar(positive=['woman', 'king'], negative=['man'])


[('queen', 0.7698541283607483),
 ('monarch', 0.6843380928039551),
 ('throne', 0.6755735874176025),
 ('daughter', 0.6594556570053101),
 ('princess', 0.6520534753799438),
 ('prince', 0.6517034769058228),
 ('elizabeth', 0.6464517712593079),
 ('mother', 0.6311717629432678),
 ('emperor', 0.6106470823287964),
 ('wife', 0.6098655462265015)]

In [57]:
# Analogy query: tiger - dog + cat.
# `most_similar` is called on the KeyedVectors directly; going through `.wv` on a
# KeyedVectors is deprecated in gensim 3.x and no longer available in gensim 4.
wordvectors_en.most_similar(positive=['cat', 'tiger'], negative=['dog'])

  wordvectors_en.wv.most_similar(positive=['cat', 'tiger'], negative=['dog'])


[('leopard', 0.6474355459213257),
 ('tigers', 0.6170241832733154),
 ('crocodile', 0.5733538866043091),
 ('elephant', 0.5629477500915527),
 ('dragon', 0.5605452060699463),
 ('lion', 0.5505025386810303),
 ('rhino', 0.5312058925628662),
 ('tamil', 0.5310259461402893),
 ('elephants', 0.5188644528388977),
 ('turtle', 0.5155850648880005)]

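## Reformat the original GloVe output for gensim

This section writes `glove.6B.100d_2.txt`, the header-prefixed copy of the GloVe file that the `wordvectors_en` cell above loads, so it has to be run before that cell.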

In [45]:
import os
import shutil
import smart_open
from sys import platform

import gensim


def prepend_line(infile, outfile, line):
    """Create `outfile` containing `line` plus a newline, followed by the contents of `infile`.

    The copy is done in binary mode with `shutil.copyfileobj`, so the payload is
    transferred byte-for-byte and does not depend on the platform's default text
    encoding. No shell utilities are involved; the function works on any OS.
    (Approach based on http://stackoverflow.com/a/10850588/610569)

    Args:
        infile: Path of the existing file to copy from.
        outfile: Path of the file to create; it is overwritten if it exists.
        line: Header to write first. It is converted with `str()` and encoded as UTF-8.
    """
    with open(infile, 'rb') as old, open(outfile, 'wb') as new:
        new.write((str(line) + "\n").encode("utf-8"))
        shutil.copyfileobj(old, new)

def prepend_slow(infile, outfile, line):
    """Create `outfile` containing `line` plus a newline, followed by `infile` copied line by line.

    Produces the same output as `prepend_line` but copies one line at a time
    instead of using `shutil.copyfileobj`. Files are handled in binary mode so the
    payload is preserved byte-for-byte regardless of the platform's default encoding.

    Args:
        infile: Path of the existing file to copy from.
        outfile: Path of the file to create; it is overwritten if it exists.
        line: Header to write first. It is converted with `str()` (matching
            `prepend_line`) and encoded as UTF-8.
    """
    with open(infile, 'rb') as fin, open(outfile, 'wb') as fout:
        fout.write((str(line) + "\n").encode("utf-8"))
        # A separate loop name keeps the `line` parameter from being shadowed.
        for row in fin:
            fout.write(row)

def get_lines(glove_file_name):
    """Return the number of vectors and dimensions in a file in GloVe format.

    The file is read once: the first line gives the dimensionality (its token
    count minus one for the word itself) and every line, including the first,
    counts as one vector.

    Args:
        glove_file_name: Path (or anything else smart_open accepts) of the GloVe text file.

    Returns:
        A `(num_lines, num_dims)` tuple. An empty file yields `(0, -1)`.
    """
    # `smart_open.smart_open` is the deprecated spelling; prefer `smart_open.open`
    # and fall back only on old smart_open releases that lack it.
    opener = getattr(smart_open, "open", None) or smart_open.smart_open
    with opener(glove_file_name, 'r') as f:
        first_line = f.readline()
        num_dims = len(first_line.split()) - 1
        # readline() returns an empty value only at end of file, so a non-empty
        # first line is itself one vector; the generator counts the rest.
        num_lines = (1 if first_line else 0) + sum(1 for _ in f)
    return num_lines, num_dims
	
# Source: vectors in the original GloVe text layout (no header row).
# Other pretrained sets are listed at http://nlp.stanford.edu/projects/glove/
glove_file = "glove.6B.100d.txt"

# Destination: the same vectors in gensim's text format, i.e. preceded by a
# "<number of vectors> <dimensions>" header line.
gensim_file = 'glove.6B.100d_2.txt'

num_lines, dims = get_lines(glove_file)
gensim_first_line = f"{num_lines} {dims}"

# Write the header followed by the original content; the helper is picked by platform.
if platform in ("linux", "linux2"):
    prepend_line(glove_file, gensim_file, gensim_first_line)
else:
    prepend_slow(glove_file, gensim_file, gensim_first_line)