In [1]:
import numpy as np
from keras.preprocessing.text import Tokenizer

# Sample vocabulary. A list (not a set) is used so the words are always fed to
# the tokenizer in the same order: iteration order of a set of strings changes
# between interpreter runs (string hash randomization), which would make the
# word -> index mapping built below differ from run to run.
x = ['text', 'the', 'leader', 'prime', 'natural', 'language']

# Create the tokenizer and fit on texts
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x)

# Number of unique words in the dictionary
print("Number of unique words in dictionary=", len(tokenizer.word_index))
print("Dictionary is =", tokenizer.word_index)

# Function to load GloVe embeddings and create the embedding matrix
def embedding_for_vocab(glove_file, word_index, embedding_dim):
    """Build an embedding matrix for a vocabulary from a GloVe text file.

    Each non-blank line of the file is split on whitespace: the first token
    is the word and the remaining tokens are its vector components. The file
    is streamed line by line and only vectors of words present in
    ``word_index`` are parsed, so the full embedding table is never held in
    memory.

    Args:
        glove_file: Path to the UTF-8 encoded embeddings text file.
        word_index: Mapping from word to its row index in the result.
        embedding_dim: Number of components expected in each vector.

    Returns:
        A NumPy array of shape ``(len(word_index) + 1, embedding_dim)``.
        Rows of words that do not appear in the file are left as zeros.

    Raises:
        ValueError: If the vector of a vocabulary word does not have exactly
            ``embedding_dim`` components.
    """
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

    with open(glove_file, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank (or whitespace-only) line: nothing to parse.
                continue

            word = values[0]
            row = word_index.get(word)
            if row is None:
                # Word is not part of the vocabulary; skip the float parsing.
                continue

            coefs = np.asarray(values[1:], dtype='float32')
            # Check explicitly: a length-1 vector would otherwise be silently
            # broadcast across the whole row.
            if coefs.shape[0] != embedding_dim:
                raise ValueError(
                    f"Vector for '{word}' has {coefs.shape[0]} components, "
                    f"expected {embedding_dim}"
                )
            embedding_matrix[row] = coefs

    return embedding_matrix

# Location of the GloVe vectors and their dimensionality (the file must exist
# at this path for the script to run)
embedding_dim = 50
glove_file = '/content/sample_data/glove.6B.50d.txt'

# Build the embedding matrix for the tokenizer's vocabulary
embedding_matrix_vocab = embedding_for_vocab(
    glove_file=glove_file,
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

# Show the dense vector stored in row 1, i.e. the first word in the dictionary
first_word_vector = embedding_matrix_vocab[1]
print("Dense vector for first word is =>", first_word_vector)

# Function to calculate cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    The result is the dot product of the inputs divided by the product of
    their norms.

    Args:
        vec1: First vector (array-like).
        vec2: Second vector (array-like).

    Returns:
        The cosine similarity, or ``0.0`` when either vector has zero norm.
        The similarity is undefined in that case, and dividing would yield
        ``nan`` together with a NumPy runtime warning.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # At least one all-zero vector, e.g. an embedding-matrix row that was
        # never filled in.
        return 0.0
    return np.dot(vec1, vec2) / norm_product

# The pair of words whose embeddings are compared
word1 = "natural"
word2 = "language"

# Both words must be present in the tokenizer's word index; otherwise there
# is no matrix row to look up and only a notice is printed
vocab = tokenizer.word_index
if word1 not in vocab or word2 not in vocab:
    print(f"One or both words are not in the vocabulary: '{word1}', '{word2}'")
else:
    # Fetch each word's row from the embedding matrix
    vec1 = embedding_matrix_vocab[vocab[word1]]
    vec2 = embedding_matrix_vocab[vocab[word2]]

    # Compute the cosine similarity and report it
    similarity = cosine_similarity(vec1, vec2)
    print(f"Cosine similarity between '{word1}' and '{word2}': {similarity}")


ModuleNotFoundError: No module named 'keras.preprocessing.text'