# Word Embeddings and Semantic Analysis

This notebook explores various techniques for analyzing word relationships, similarities, and semantic meanings using WordNet and Word2Vec models.

## WordNet-based Analysis

First, we'll use WordNet, a lexical database for the English language, to explore word taxonomies and similarities.

In [1]:
from nltk.corpus import wordnet as wn
from numpy import dot
from numpy.linalg import norm
import numpy as np

# Make sure the WordNet data is available before its first use, so this cell
# also works on a fresh kernel/environment. Re-running the download is harmless.
import nltk
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Basis for the vectors built by get_vector(): every noun synset in WordNet,
# de-duplicated. Despite the name, this holds *all* noun synsets, not only the
# ones that are a hypernym of something. The element order is arbitrary (it
# comes from a set) but stays fixed for the lifetime of the kernel.
all_hypernyms = list(set(wn.all_synsets(pos='n')))

# Print the taxonomy (hypernym hierarchy) for a given word. Default is noun 'n'
def print_taxonomy(word, pos='n'):
    """Print the hypernym hierarchy of the first WordNet sense of ``word``.

    Args:
        word: The word to look up.
        pos: WordNet part-of-speech tag passed to ``wn.synsets`` ('n' by default).

    Prints a numbered list that starts with the most distant ancestor and ends
    with the word's own synset. If WordNet has no synset for ``word`` with the
    requested part of speech, a short message is printed instead of raising.
    """
    synsets = wn.synsets(word, pos=pos)
    if not synsets:
        # Previously this case surfaced as a bare IndexError from `[0]`.
        print(f"No WordNet entry found for '{word}' (pos='{pos}').")
        return

    # Only the first (most common) sense is shown.
    synset = synsets[0]
    # closure() walks the hypernym relation outwards from the synset, so the
    # nearest ancestors come first and the most general ones last.
    ancestors = list(synset.closure(lambda s: s.hypernyms()))

    print(f"Taxonomy for '{word}':")
    # Print each hypernym in reverse order (from most general to most specific)
    for i, hypernym in enumerate(reversed(ancestors), 1):
        print(f"{i}. {hypernym.name().split('.')[0]} ({hypernym.definition()})")
    print(f"{len(ancestors) + 1}. {synset.name().split('.')[0]} ({synset.definition()})")


ModuleNotFoundError: No module named 'nltk'

Let's see the taxonomy for "lion":

In [None]:
print_taxonomy("lion")

### Word Similarity using WordNet
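Now we'll define functions that compute the cosine similarity between two words. Each word is represented by a binary vector with one dimension per WordNet noun synset: a dimension is 1 when that synset is a hypernym (ancestor) of the word's first sense, and 0 otherwise.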

In [None]:
def get_vector(word):
    """Return the binary hypernym vector of the first WordNet sense of ``word``.

    The vector has one entry per element of ``all_hypernyms`` (same order):
    1 if that synset is among the transitive hypernyms of the word's first
    synset, 0 otherwise. The word's own synset is not part of its closure and
    therefore maps to 0.

    Raises:
        IndexError: if WordNet has no synset for ``word``.
    """
    synsets = wn.synsets(word)
    if not synsets:
        # Same exception type as the former `[0]` lookup, with a usable message.
        raise IndexError(f"No WordNet synsets found for '{word}'")

    # A set gives constant-time membership tests; the comprehension below
    # performs one test per entry of all_hypernyms.
    hypernyms = set(synsets[0].closure(lambda s: s.hypernyms()))
    return [1 if h in hypernyms else 0 for h in all_hypernyms]

# Compute dot product of the two vectors divided by the product of their norms
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    Args:
        vec1, vec2: Equal-length numeric sequences or arrays.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``, or ``0.0`` when either
        vector has zero length. A zero vector has no direction, and dividing
        by its norm would otherwise yield ``nan`` plus a runtime warning.
    """
    denominator = norm(vec1) * norm(vec2)
    if denominator == 0:
        return 0.0
    return dot(vec1, vec2) / denominator

def show_similarity(word1, word2):
    """Print the cosine similarity between the hypernym vectors of two words."""
    first_vec, second_vec = get_vector(word1), get_vector(word2)
    score = cosine_similarity(first_vec, second_vec)
    print(f"Cosine similarity between '{word1}' and '{word2}': {score}")

Let's compare some words:

In [None]:
show_similarity("lion", "cat")

In [None]:
show_similarity("lion", "dog")

In [None]:
show_similarity("lion", "paper") 

## Word2Vec Model Analysis

Now, we'll switch to using a pre-trained Word2Vec model for more advanced word relationship analysis.

### Loading the Word2Vec Model

In [None]:
import gensim.downloader as api
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora used by the lemmatizer (quietly; safe to re-run)
for _corpus in ('wordnet', 'omw-1.4'):
    nltk.download(_corpus, quiet=True)

# Lemmatizer shared by the word-filtering helpers defined further down
lemmatizer = WordNetLemmatizer()

# Pre-trained Word2Vec vectors, fetched through gensim's downloader.
# The first run downloads the model, which is why a notice is printed first.
print("Downloading and loading the model... This may take a few minutes.")
model = api.load('word2vec-google-news-300')
print("Model loaded!")

### Exploring Word Vectors
Let's look at the raw vector for "dog" and the vocabulary size of our model.

In [None]:
# Function to get the word vector regardless of case
def get_word_vector(word):
    """Return the embedding of ``word``, tolerating differences in letter case.

    Lookup order: lowercase, UPPERCASE, Capitalized, and finally the word
    exactly as given. The first form found in ``model.key_to_index`` wins.

    Raises:
        KeyError: if none of the forms is in the vocabulary.
    """
    # The exact spelling is tried last so every lookup that already succeeded
    # keeps returning the same vector, while mixed-case keys (e.g. "iPhone"),
    # which none of the three case transforms can produce, become reachable.
    for candidate in (word.lower(), word.upper(), word.capitalize(), word):
        if candidate in model.key_to_index:
            return model[candidate]
    raise KeyError(f"Word '{word}' not in vocabulary")

In [None]:
# 1. Print raw vector of a word
print("\nRaw vector for 'dog':")
print(get_word_vector("dog"))

In [None]:
# 2. Print vocabulary size
print("\nVocabulary size:", len(model.key_to_index))

### Finding Similar Words
We'll define functions to find words similar to a given word.

In [None]:
# Function to normalise a token: lowercase it and, if it is purely alphabetic, lemmatize it
def process_word(word):
    """Return the lemma of ``word`` in lowercase, or ``None`` if it is not alphabetic.

    "Alphabetic" means ``str.isalpha()`` holds for the lowercased token, which
    rejects empty strings and anything containing digits, punctuation, spaces
    or underscores (such as multi-word vocabulary keys like ``new_york``).
    """
    lowered = word.lower()
    # isalpha() is already False for any token with an underscore, so no
    # separate underscore checks are required.
    if not lowered.isalpha():
        return None
    return lemmatizer.lemmatize(lowered)

# Function to check if a word is in the vocabulary
def word_in_vocab(word):
    """Return True if the lowercase, uppercase or capitalized form of ``word`` is a model key."""
    vocab = model.key_to_index
    case_forms = (word.lower(), word.upper(), word.capitalize())
    return any(form in vocab for form in case_forms)

# Function to find similar words
def find_similar_words(word, topn=5):
    """Print the ``topn`` vocabulary entries most similar to ``word``.

    Args:
        word: Query word; matched case-insensitively via ``word_in_vocab`` /
            ``get_word_vector``.
        topn: Number of neighbours to print.

    Neighbours are lowercased and lemmatized with ``process_word`` and
    de-duplicated, and forms of the query word itself are dropped. If that
    leaves fewer than ``topn`` results, tokens that ``process_word`` rejects
    (e.g. multi-word keys containing underscores) are admitted as a fallback.
    Prints a message and returns if ``word`` is not in the vocabulary.
    """
    if not word_in_vocab(word):
        print(f"'{word}' is not in the vocabulary.")
        return
    print(f"\nWords similar to '{word}':")
    original_word = process_word(word)
    query_lower = word.lower()

    # The query is passed as a vector, so the query word may itself appear in
    # the results, and many neighbours are filtered out below. Over-fetch: at
    # least 50 candidates (the former fixed size), more for a large topn.
    candidates = model.most_similar(positive=[get_word_vector(word)], topn=max(50, 10 * topn))

    valid_words = []
    seen = set()  # words already accepted, for constant-time duplicate checks
    for similar_word, score in candidates:
        if len(valid_words) >= topn:
            break
        processed_word = process_word(similar_word)
        if processed_word and processed_word != original_word and processed_word not in seen:
            seen.add(processed_word)
            valid_words.append((processed_word, score))

    # If we don't have enough words, relax the filtering
    if len(valid_words) < topn:
        for similar_word, score in candidates:
            if len(valid_words) >= topn:
                break
            # Alphabetic tokens were already handled above: each is a kept
            # word, a duplicate, or a form of the query. Re-admitting them
            # here would list the query word itself, so only consider the
            # tokens that process_word rejected.
            if process_word(similar_word) is not None:
                continue
            if similar_word.lower() == query_lower or similar_word in seen:
                continue
            seen.add(similar_word)
            valid_words.append((similar_word, score))

    for similar_word, score in valid_words[:topn]:
        print(f"{similar_word}: {score:.2f}")

Let's find words similar to "dog", "computer", and "president":

In [None]:
# 3. Demonstrate similarity
for query_word in ("dog", "computer", "president"):
    find_similar_words(query_word)

### Finding Least Similar Words
Now, let's find words that are least similar to a given word.

In [None]:
import random
import numpy as np
from nltk.corpus import words as nltk_words
import nltk
nltk.download('words', quiet=True)

def find_least_similar_words(word, topn=5, min_count=1000, seed=None):
    """Print the ``topn`` sampled vocabulary words least similar to ``word``.

    Args:
        word: Query word; must be a key of ``model`` exactly as given.
        topn: Number of words to print.
        min_count: Minimum value of the model's per-word ``'count'`` attribute
            for a word to be considered. Halved repeatedly (down to at most
            100) while fewer than 1000 candidates remain.
        seed: Optional seed for the random sample. ``None`` (the default)
            draws from the global ``random`` state as before; any other value
            makes the sample, and therefore the output, reproducible.

    Candidates are vocabulary keys that also appear, lowercased, in the NLTK
    ``words`` corpus. At most 10000 of them are sampled and ranked by
    ascending similarity to ``word``.
    """
    if word not in model:
        print(f"'{word}' is not in the vocabulary.")
        return

    print(f"\nWords least similar to '{word}':")

    # Get common English words
    common_words = set(w.lower() for w in nltk_words.words())

    # Scan the vocabulary once and remember each candidate's count, so that
    # lowering the threshold below does not rescan the whole model.
    # NOTE(review): 'count' is whatever the loaded vectors carry; confirm it
    # is a real corpus frequency for this model rather than a rank placeholder.
    counted_candidates = [
        (w, model.get_vecattr(w, 'count'))
        for w in model.key_to_index
        if w in common_words
    ]

    def _with_count_at_least(threshold):
        """Candidates whose count reaches ``threshold``, in vocabulary order."""
        return [w for w, count in counted_candidates if count >= threshold]

    filtered_vocab = _with_count_at_least(min_count)

    # If we have too few words, reduce the min_count
    while len(filtered_vocab) < 1000 and min_count > 100:
        min_count //= 2
        filtered_vocab = _with_count_at_least(min_count)

    # Sample words from the filtered vocabulary
    sample_size = min(10000, len(filtered_vocab))
    rng = random if seed is None else random.Random(seed)
    sampled_words = rng.sample(filtered_vocab, sample_size)

    # Calculate similarities
    similarities = [(w, model.similarity(word, w)) for w in sampled_words if w != word]

    # Sort by similarity (ascending) and get the least similar words
    least_similar = sorted(similarities, key=lambda x: x[1])[:topn]

    for similar_word, score in least_similar:
        print(f"{similar_word}: {score:.2f}")



Let's find words least similar to "dog":

In [None]:
find_least_similar_words("dog", 10)

### Word Analogies
We'll explore word analogies using the Word2Vec model.

In [None]:
# Function to perform word analogy
def word_analogy(word1, word2, word3):
    """Print the completion of "word1 is to word2 as word3 is to ?".

    The answer is searched among the 10 nearest neighbours of
    ``vec(word2) + vec(word3) - vec(word1)``. The first neighbour that
    ``process_word`` accepts and that is not a form of one of the three inputs
    is printed. Prints a message instead if an input is missing from the
    vocabulary or no neighbour qualifies.
    """
    inputs = [word1, word2, word3]
    if not all(word_in_vocab(w) for w in inputs):
        print("One or more words are not in the vocabulary.")
        return
    results = model.most_similar(positive=[get_word_vector(word2), get_word_vector(word3)],
                                 negative=[get_word_vector(word1)], topn=10)

    # Candidates are lowercased and lemmatized by process_word, so the inputs
    # must be normalised the same way before comparing. Comparing against the
    # raw inputs lets e.g. "King" come back as its own answer "king".
    excluded = {w.lower() for w in inputs} | {process_word(w) for w in inputs}

    answer = None
    for result_word, _score in results:
        processed_word = process_word(result_word)
        if processed_word and processed_word not in excluded:
            answer = processed_word
            break

    if answer is not None:
        print(f"\nAnalogy: {word1} is to {word2} as {word3} is to {answer}")
    else:
        print("No valid analogy found.")

Let's try some analogies:

In [None]:
# 4. Demonstrate analogy
analogy_examples = [
    ("man", "king", "woman"),
    ("paris", "france", "rome"),
    ("good", "best", "bad"),
]
for first, second, third in analogy_examples:
    word_analogy(first, second, third)


### Visualizing Word Embeddings
Finally, let's visualize word embeddings in a 2D space using t-SNE.

In [None]:
# Function to visualize word embeddings
def visualize_embeddings(words):
    """Plot a 2-D t-SNE projection of the embeddings of ``words``.

    Args:
        words: Iterable of words. Entries that are missing from the vocabulary
            (``word_in_vocab``) or rejected by ``process_word`` are skipped.

    Shows a scatter plot with one labelled point per remaining word. Prints a
    message and returns without plotting when fewer than two words remain.
    """
    valid_words = [word for word in words if word_in_vocab(word) and process_word(word)]
    if not valid_words:
        print("None of the provided words are valid words in the vocabulary.")
        return
    if len(valid_words) < 2:
        # With one sample the perplexity computed below would be 0, which
        # t-SNE rejects.
        print("At least two valid words are needed to compute a t-SNE projection.")
        return

    # Stack the vectors into an (n_words, n_dimensions) array
    word_vectors_array = np.array([get_word_vector(word) for word in valid_words])

    # Adjust perplexity based on the number of samples
    n_samples = word_vectors_array.shape[0]
    perplexity = min(30, n_samples - 1)  # Default is 30, but we need it to be less than n_samples

    # Fixed random_state keeps the layout the same across runs
    tsne = TSNE(n_components=2, random_state=0, perplexity=perplexity)
    vectors_2d = tsne.fit_transform(word_vectors_array)

    fig, ax = plt.subplots(figsize=(16, 14))
    for (x, y), word in zip(vectors_2d, valid_words):
        # One scatter call per word, so each point takes the next cycle colour
        ax.scatter(x, y)
        ax.annotate(word, (x, y))
    ax.set_title("Word Embedding Visualization")
    ax.set_xlabel("t-SNE dimension 1")
    ax.set_ylabel("t-SNE dimension 2")
    plt.show()


Let's visualize a set of words:

In [None]:
# 5. Visualize embeddings
# Ten themed groups of four words each, concatenated in order; related words
# would be expected to land near each other in the projection.
words_to_plot = (
    ["love", "hate", "joy", "sadness"]                        # emotions
    + ["computer", "internet", "technology", "science"]       # technology
    + ["democracy", "freedom", "justice", "equality"]         # politics
    + ["earth", "moon", "sun", "galaxy"]                      # astronomy
    + ["music", "art", "dance", "literature"]                 # arts
    + ["health", "disease", "medicine", "wellness"]           # health
    + ["money", "wealth", "poverty", "economy"]               # economics
    + ["war", "peace", "conflict", "diplomacy"]               # conflict
    + ["education", "knowledge", "learning", "wisdom"]        # learning
    + ["time", "space", "reality", "imagination"]             # abstract concepts
)
visualize_embeddings(words_to_plot)