## Learning embeddings

### Using Word2Vec

In [8]:
from gensim.models import Word2Vec

In [7]:
# Sample sentences: a toy corpus where each sentence is a list of
# already-tokenized words (the input format Word2Vec consumes here).
sentences = [["the", "cat", "sat", "on", "the", "mat"],
             ["dog", "barked", "at", "the", "mailman"]]

# Train a Skip-Gram model (sg=1) with 50-dimensional vectors.
#   window=2    -> context is at most two words on either side of the target
#   min_count=1 -> keep every word; most words in this corpus occur only once
#   seed=1      -> gensim's default seed, spelled out so the run is pinned
#   workers=1   -> single training thread, removing thread-scheduling jitter
# NOTE: per the gensim docs, bit-for-bit reproducibility across interpreter
# launches may additionally require setting PYTHONHASHSEED.
word2vec_model = Word2Vec(
    sentences,
    vector_size=50,
    window=2,
    min_count=1,
    sg=1,
    seed=1,
    workers=1,
)

# Get word vector (a NumPy array of length vector_size)
print(word2vec_model.wv["cat"])

# Find similar words: (word, cosine similarity) pairs, most similar first.
# With a corpus this tiny the scores only demonstrate the API; they should not
# be read as meaningful semantic similarity.
print(word2vec_model.wv.most_similar("cat"))

[-0.01427803  0.00248206 -0.01435343 -0.00448924  0.00743861  0.01166625
  0.00239637  0.00420546 -0.00822078  0.01445067 -0.01261408  0.00929443
 -0.01643995  0.00407294 -0.0099541  -0.00849538 -0.00621797  0.01131042
  0.0115968  -0.0099493   0.00154666 -0.01699156  0.01561961  0.01851458
 -0.00548466  0.00160045  0.0014933   0.01095577 -0.01721216  0.00116891
  0.01373884  0.00446319  0.00224935 -0.01864431  0.01696473 -0.01252825
 -0.00598475  0.00698757 -0.00154526  0.00282258  0.00356398 -0.0136578
 -0.01944962  0.01808117  0.01239611 -0.01382586  0.00680696  0.00041213
  0.00950749 -0.01423989]
[('sat', 0.1845843642950058), ('at', 0.13940520584583282), ('dog', 0.10704141855239868), ('barked', -0.010146207176148891), ('the', -0.0560765378177166), ('on', -0.08931691199541092), ('mat', -0.10186848789453506), ('mailman', -0.11910455673933029)]


### Using PyTorch Embedding Layer

In [9]:
import torch
import torch.nn as nn

In [10]:
# Seed the global torch RNG so the randomly initialised embedding weights, and
# therefore the printed vector, are identical on every Restart & Run All.
# This affects every later cell that draws from the global torch RNG.
torch.manual_seed(0)

# Define vocabulary: maps each word to its row index in the embedding table
word_to_ix = {"hello": 0, "world": 1, "goodbye": 2}

# Create embedding layer: one trainable row of `embedding_dim` floats per word
embedding_dim = 5  # Size of word vectors
embedding_layer = nn.Embedding(num_embeddings=len(word_to_ix), embedding_dim=embedding_dim)

# Convert word index to tensor (the layer is indexed with integer tensors)
word_idx = torch.tensor([word_to_ix["hello"]], dtype=torch.long)

# Get embedding: result has shape (1, embedding_dim). The layer has not been
# trained, so these are just the (seeded) initial weights for "hello".
word_embedding = embedding_layer(word_idx)
print(word_embedding)


tensor([[-1.2225, -0.3311, -0.8140,  0.3835,  1.0128]],
       grad_fn=<EmbeddingBackward0>)
