# Word Embeddings in Natural Language Processing


### Example 1

In [1]:
# Define tokenized sentences as training data: each inner list is one
# sentence and each string is one token.
# NOTE(review): the last sentence ends with an empty-string token (''), which
# ends up as its own entry in the model vocabulary - confirm this is intended.
tokenized_sentences = [['Hello','This','is','python','training','by','Institute'],
             ['Hello','This','is','Java','training','by','Institute'],
             ['Hello','This','is','Data Science','training','for','graduates'],
             ['Hello','This','is','programming','training','']]

In [2]:
# Train a Word2Vec model on the toy corpus defined above
from gensim.models import Word2Vec
import warnings
# Installs a global "ignore" filter: every warning category is silenced from
# here on, not only warnings raised by gensim.
warnings.filterwarnings('ignore')
# min_count=1 so that tokens occurring only once are still kept in the vocabulary
mymodel = Word2Vec(tokenized_sentences, min_count=1)

In [3]:
# Print a one-line summary of the trained model (vocabulary size, vector size, alpha)
print(mymodel)

Word2Vec(vocab=13, vector_size=100, alpha=0.025)


In [4]:
# Collect the vocabulary keys into a list and show it as the cell result
words = list(mymodel.wv.key_to_index)
words

['training',
 'is',
 'This',
 'Hello',
 'Institute',
 'by',
 '',
 'programming',
 'graduates',
 'for',
 'Data Science',
 'Java',
 'python']

In [5]:
# Print the same vocabulary list compactly on a single line
print(words)

['training', 'is', 'This', 'Hello', 'Institute', 'by', '', 'programming', 'graduates', 'for', 'Data Science', 'Java', 'python']


In [7]:
# Look up the learned vector for the single word "training" and print it
vector = mymodel.wv.get_vector('training')
print(vector)

[-5.3622725e-04  2.3643016e-04  5.1033497e-03  9.0092728e-03
 -9.3029495e-03 -7.1168090e-03  6.4588715e-03  8.9729885e-03
 -5.0154282e-03 -3.7633730e-03  7.3805046e-03 -1.5334726e-03
 -4.5366143e-03  6.5540504e-03 -4.8601604e-03 -1.8160177e-03
  2.8765798e-03  9.9187379e-04 -8.2852151e-03 -9.4488189e-03
  7.3117660e-03  5.0702621e-03  6.7576934e-03  7.6286553e-04
  6.3508893e-03 -3.4053659e-03 -9.4640255e-04  5.7685734e-03
 -7.5216386e-03 -3.9361049e-03 -7.5115822e-03 -9.3004224e-04
  9.5381187e-03 -7.3191668e-03 -2.3337698e-03 -1.9377422e-03
  8.0774352e-03 -5.9308959e-03  4.5161247e-05 -4.7537349e-03
 -9.6035507e-03  5.0072931e-03 -8.7595871e-03 -4.3918253e-03
 -3.5099984e-05 -2.9618264e-04 -7.6612402e-03  9.6147414e-03
  4.9820566e-03  9.2331432e-03 -8.1579182e-03  4.4957972e-03
 -4.1370774e-03  8.2453492e-04  8.4986184e-03 -4.4621779e-03
  4.5175003e-03 -6.7869616e-03 -3.5484887e-03  9.3985079e-03
 -1.5776539e-03  3.2137157e-04 -4.1406299e-03 -7.6826881e-03
 -1.5080094e-03  2.46979

In [10]:
# Find the words most similar to "Java" and print each one with its similarity score
similar_words = mymodel.wv.most_similar("Java")
for word, score in similar_words:
    print(word, score)

This 0.17272791266441345
by 0.16694681346416473
programming 0.11117953062057495
Data Science 0.10942255705595016
training 0.07963485270738602
Hello 0.04131004959344864
graduates 0.03771485015749931
is 0.008315937593579292
Institute -0.005896786693483591
python -0.030302345752716064


In [16]:
# Find the word in the list that does not belong with the others.
# NOTE: despite its name, similar_words holds that single odd-one-out word here,
# not a list of similar words.
similar_words = mymodel.wv.doesnt_match(["Java","Institute","Hello","This"])
similar_words

'Hello'

### Example 2

In [37]:
# Importing necessary libraries
from gensim.models import Word2Vec


In [17]:
# Sample data: a toy corpus written as plain sentences and split on
# whitespace, giving one list of tokens per sentence.
sentences = [
    text.split()
    for text in (
        'I love natural language processing',
        'Word embeddings are useful in NLP',
        'I enjoy working with word vectors',
        'Machine learning is an important aspect of data science',
        'Word embeddings can capture semantic relationships',
        'Deep learning models often use word embeddings',
        'Word2Vec is a popular algorithm for word embedding',
        'I am learning Word2Vec for NLP',
        'Word embeddings are dense vector representations of words',
        'NLP tasks often benefit from pre-trained word embeddings',
    )
]

In [18]:

# Training the Word2Vec model
# vector_size=100: length of each learned word vector
# window=5: context window size passed to gensim
# min_count=1: keep tokens even if they occur only once in this tiny corpus
# workers=4: number of training threads; with more than one thread the
#            learned vectors are not guaranteed to be identical between runs
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

In [40]:

# Accessing word vectors: index model.wv by a word to get its learned vector
print("Vector for 'language':", model.wv['language'])

Vector for 'language': [ 0.00973708 -0.00978046 -0.00649382  0.00278359  0.00642799 -0.00537154
  0.00275221  0.00912417 -0.00681627 -0.00610591 -0.00498472 -0.00368031
  0.00185309  0.00968501  0.0064413   0.00040063  0.00247638  0.00843981
  0.00912612  0.00562099  0.00594713 -0.00761882 -0.00382212 -0.00568689
  0.00618007 -0.00225061 -0.00877997  0.0076169   0.00840086 -0.00331963
  0.00911852 -0.00074093 -0.00362374 -0.00038817  0.00019251 -0.00350342
  0.00281534  0.00573002  0.00686411 -0.00890567 -0.00218942 -0.00548028
  0.00752014  0.00650113 -0.00435958  0.00232067 -0.00596084  0.00023551
  0.0094599  -0.00260661 -0.00518894 -0.00739851 -0.00291376 -0.00086356
  0.00352524  0.00974464 -0.00338761  0.00190258  0.00968369  0.00153214
  0.00098226  0.00980093  0.00929687  0.00771189 -0.00617373  0.00999075
  0.00584348  0.00907265 -0.00199469  0.00335072  0.00683613 -0.00388727
  0.00664142  0.00256674  0.00931891 -0.00303695 -0.00310408  0.00621246
 -0.00908007 -0.00725452 -0.

In [41]:

# Finding most similar words: the result is a list of (word, similarity)
# pairs, printed here as-is with the highest score first
similar_words = model.wv.most_similar('language')
print("Most similar words to 'language':", similar_words)

Most similar words to 'language': [('word', 0.15937383472919464), ('representations', 0.15623889863491058), ('I', 0.15278607606887817), ('relationships', 0.14992523193359375), ('dense', 0.14473387598991394), ('useful', 0.14260095357894897), ('are', 0.13277468085289001), ('capture', 0.12229277193546295), ('with', 0.11944551765918732), ('benefit', 0.09594906121492386)]


In [24]:

# Similarity between two words that are both in the vocabulary.
# The label names the same pair that is passed to similarity(), so the
# printed text describes the number that is actually shown.
similarity_score = model.wv.similarity('language', 'word')
print("Similarity between 'language' and 'word':", similarity_score)

Similarity between 'language' and 'processing': 0.15937382


In [26]:

# Similarity between words that are not in the vocabulary:
# the lookup raises KeyError, which is reported instead of crashing the cell.
try:
    similarity_score = model.wv.similarity('language', 'English')
except KeyError:
    print("At least one of the words is not in the vocabulary.")
else:
    print("Similarity between these words:", similarity_score)

# Accessing vocabulary: list every word the model knows
vocabulary = list(model.wv.index_to_key)
print("Vocabulary:", vocabulary)


At least one of the words is not in the vocabulary.
Vocabulary: ['embeddings', 'word', 'I', 'learning', 'Word', 'NLP', 'for', 'is', 'often', 'Word2Vec', 'of', 'are', 'in', 'useful', 'aspect', 'working', 'processing', 'with', 'language', 'vectors', 'Machine', 'natural', 'love', 'an', 'important', 'enjoy', 'pre-trained', 'from', 'algorithm', 'benefit', 'tasks', 'words', 'representations', 'vector', 'dense', 'am', 'embedding', 'popular', 'science', 'a', 'use', 'models', 'Deep', 'relationships', 'semantic', 'capture', 'can', 'data']


### Example 3

In [28]:
from gensim.models import Word2Vec

# Sample data: a toy corpus written as plain sentences and split on
# whitespace, giving one list of tokens per sentence.
sentences = [
    text.split()
    for text in (
        'I love mango and apple fruits',
        'Word embeddings are useful in NLP',
        'I enjoy working with word vectors',
        'Machine learning is an important aspect of data science',
        'Word embeddings can capture semantic relationships',
        'Deep learning models often use word embeddings',
        'Word2Vec is a popular algorithm for word embedding',
        'I am learning Word2Vec for NLP',
        'Word embeddings are dense vector representations of words',
        'NLP tasks often benefit from pre-trained word embeddings',
    )
]

# Training the Word2Vec model
# vector_size=100: length of each learned word vector
# window=5: context window size passed to gensim
# min_count=1: keep tokens even if they occur only once in this tiny corpus
# workers=4: number of training threads; with more than one thread the
#            learned vectors are not guaranteed to be identical between runs
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Application: Finding similar words
def find_similar_words(word):
    """Print the words the trained model considers closest to ``word``.

    Queries ``model.wv.most_similar(word)`` and prints a header line followed
    by one ``neighbour: score`` line per result. If the lookup raises
    ``KeyError``, a not-found message is printed instead. Returns nothing.
    """
    try:
        neighbours = model.wv.most_similar(word)
    except KeyError:
        # Unknown word: report it and stop, there is nothing to list.
        print(f"'{word}' not found in vocabulary.")
        return

    print(f"Words similar to '{word}':")
    for neighbour, score in neighbours:
        print(f"{neighbour}: {score}")

# Application: Similarity between words
def calculate_similarity(word1, word2):
    """Print the model's similarity score for the pair ``word1``/``word2``.

    Calls ``model.wv.similarity(word1, word2)`` and prints the score. If the
    lookup raises ``KeyError``, a message saying that at least one word is
    missing from the vocabulary is printed instead. Returns nothing.
    """
    try:
        score = model.wv.similarity(word1, word2)
    except KeyError:
        print("At least one of the words is not found in vocabulary.")
    else:
        print(f"Similarity between '{word1}' and '{word2}': {score}")

# Showcase the application: a small text menu that asks which of the two
# helper functions to run and then prompts for the word(s) it needs.
print("Welcome to Word2Vec Showcase!")
print("1. Find similar words")
print("2. Calculate similarity between words")
# Strip surrounding whitespace so that an answer such as "1 " is still accepted.
choice = input("Enter your choice (1/2): ").strip()

if choice == '1':
    # Vocabulary tokens carry no surrounding whitespace, so a padded entry
    # would otherwise be reported as missing.
    word = input("Enter a word to find similar words: ").strip()
    find_similar_words(word)
elif choice == '2':
    word1 = input("Enter the first word: ").strip()
    word2 = input("Enter the second word: ").strip()
    calculate_similarity(word1, word2)
else:
    print("Invalid choice. Please enter '1' or '2'.")


Welcome to Word2Vec Showcase!
1. Find similar words
2. Calculate similarity between words
Enter your choice (1/2): 1
Enter a word to find similar words: Word
Words similar to 'Word':
important: 0.17818447947502136
embedding: 0.16392144560813904
capture: 0.14955472946166992
NLP: 0.13167978823184967
Deep: 0.07775183022022247
is: 0.07505468279123306
word: 0.06803048402070999
dense: 0.0677378922700882
an: 0.04822660610079765
benefit: 0.047279421240091324
