# Create Custom Word Embeddings


In [1]:
import gensim

In [2]:
from nltk.corpus import brown
model = gensim.models.Word2Vec(brown.sents())

In [3]:
model.save('brown.embedding')
new_model = gensim.models.Word2Vec.load('brown.embedding')

In [4]:
len(new_model['university'])

  """Entry point for launching an IPython kernel.


100

In [5]:
new_model.similarity('university','school') > 0.3

  """Entry point for launching an IPython kernel.


True

In [None]:
from nltk.data import find
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

In [None]:
len(model.vocab)

In [None]:
len(model['university'])

In [None]:
model.most_similar(positive=['university'], topn = 3)

In [None]:
model.doesnt_match('breakfast cereal dinner lunch'.split())

In [None]:
model.most_similar(positive=['woman','king'], negative=['man'], topn = 1)

In [None]:
model.most_similar(positive=['Paris','Germany'], negative=['Berlin'], topn = 1)

In [None]:
import numpy as np
labels = []
count = 0
max_count = 50
X = np.zeros(shape=(max_count,len(model['university'])))

In [None]:
import numpy as np
labels = []
count = 0
max_count = 50
X = np.zeros(shape=(max_count,len(model['university'])))

In [None]:
for term in model.vocab:
    X[count] = model[term]
    labels.append(term)
    count+= 1
    if count >= max_count: break

In [None]:
# It is recommended to use PCA first to reduce to ~50 dimensions
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
X_50 = pca.fit_transform(X)

In [None]:
# Using TSNE to further reduce to 2 dimensions
from sklearn.manifold import TSNE
model_tsne = TSNE(n_components=2, random_state=0)
Y = model_tsne.fit_transform(X_50)

In [None]:
# Show the scatter plot
import matplotlib.pyplot as plt
plt.scatter(Y[:,0], Y[:,1], 20)
#Add labels
for label, x, y in zip(labels, Y[:, 0], Y[:, 1]):
    plt.annotate(label, xy = (x,y), xytext = (0, 0), textcoords = 'offset points', size = 10)

plt.show()

# Use Of default word Embedding

In [2]:
import spacy
nlp=spacy.load('en_core_web_sm')

In [3]:
#importing from website
from urllib import request
url="http://www.gutenberg.org/files/62304/62304-0.txt"
response=request.urlopen(url)
rawtext=response.read().decode('utf8')
type(rawtext)
doc=nlp(rawtext)

In [4]:
# Get the vector for 'text':
doc[3].vector


array([-0.54711723, -1.9657634 ,  0.2217341 , -3.7481773 ,  1.1649952 ,
        0.82003343,  3.1833057 ,  1.3274641 ,  0.6739967 , -1.5965436 ,
       -1.6797376 , -0.29845873,  1.6415154 ,  1.8445041 , -1.990669  ,
        0.5988834 ,  0.36557356,  1.0014813 , -0.2872901 ,  1.5161064 ,
        3.033439  , -0.5433321 , -1.7090623 , -2.684139  , -1.3771006 ,
        4.3160543 , -0.39623195,  2.1399436 , -1.5976278 , -0.8109217 ,
       -0.01566228, -2.1897125 ,  1.8993993 , -2.2924256 ,  0.53279907,
        1.0346884 , -1.360132  ,  1.2321562 ,  2.3556318 , -2.3481326 ,
        0.6372895 ,  4.0386653 , -1.6353981 ,  1.3072901 , -0.4548158 ,
        0.50143105,  2.0086179 , -1.170733  , -0.02461302,  0.38845354,
        0.2930492 ,  2.2574885 ,  2.008726  ,  0.2767743 , -3.3642    ,
        3.158045  ,  0.4759503 , -3.3558373 , -2.28846   , -2.868757  ,
       -1.7053337 , -0.149629  , -1.3066813 , -2.9380534 ,  0.8086827 ,
        2.3146083 ,  3.981751  , -0.6188738 , -2.6224663 ,  4.15

In [5]:
# Get the mean vector for the entire sentence (useful for sentence classification etc.)
doc.vector

array([ 0.2334902 , -0.40256122, -0.78241116, -0.7215848 ,  0.8360339 ,
       -0.16409647,  1.2739755 ,  0.75103194, -0.05853712, -0.29435676,
       -0.12873717, -0.38099372, -0.15153512,  0.24945143, -0.90397596,
        0.01044266, -0.40441835, -0.13788924, -0.03983409,  0.56642383,
        0.25509375,  0.85622525, -0.217485  , -0.4397096 ,  0.12726183,
        0.7110893 ,  0.27750942,  0.24298477,  0.45374662, -0.17743309,
       -0.66717607, -0.0933699 , -0.5372219 , -0.65386796, -0.45200068,
       -0.10164119,  0.38375252,  0.18838185,  0.03894509,  0.7054388 ,
        0.35017535, -0.25892934,  0.41956583, -0.80153257,  0.9959928 ,
       -0.02972438, -0.18958187,  0.203635  , -0.00624284,  0.5241773 ,
       -0.17431964,  0.15089673,  0.17450866,  0.3888655 , -0.22213805,
       -0.04573665,  0.76046556,  0.13938342, -0.5810467 , -0.28338286,
        0.04311839, -0.75463355,  0.364408  , -0.33297136, -0.18057045,
       -0.2680869 ,  0.75112617, -0.3508736 , -0.02367671,  0.42

##### Once assigned, word embeddings in Spacy are accessed for words and sentences using the .vector attribute.

In [6]:
import gensim

In [12]:
from gensim.models import KeyedVectors

# Load vectors directly from the file
model = KeyedVectors.load_word2vec_format('C:/Users/JEV/Downloads/GoogleNews.bin', binary=True)

# Access vectors for specific words with a keyed lookup:
vector = model['easy']
# see the shape of the vector (300,)
vector.shape

vectors = [model[x] for x in doc]


TypeError: 'spacy.tokens.token.Token' object is not iterable

In [None]:
model.similarity('straightforward','easy')

In [None]:
model.similarity('simple','impossible')

In [None]:
model.most_similar('simple')

##### Gensim provides a number of helper functions to interact with word vector models. Similarity is determined using the cosine distance between two vectors