# Importing Libraries and Dependencies

In [1]:
import numpy as np
import pandas as pd
import pickle
import torch
import torch
import torch.nn.functional as F

from class_function import Skipgram, SkipgramNeg, Glove
from scipy import stats
from scipy.stats import spearmanr
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.metrics import mean_squared_error

## Import Data

In [2]:
# Importing training data
Data = pickle.load(open('../../models/Data.pkl', 'rb'))

corpus = Data['corpus']
vocab = Data['vocab']
word2index = Data['word2index']
voc_size = Data['voc_size']
embed_size = Data['embedding_size']
window_size = Data['window_size']

In [3]:
Data['embedding_size']

50

In [4]:
Data['voc_size']

7136

## Import Models

In [5]:
# Import the saved Skipgram model

skipgram = Skipgram(voc_size, embed_size)

# Load the state_dict into the model
skipgram.load_state_dict(torch.load('../../models/Word2Vec(Skipgram).pt'),  strict=False)

skipgram.eval()

  skipgram.load_state_dict(torch.load('../../models/Word2Vec(Skipgram).pt'),  strict=False)


Skipgram(
  (embedding_center): Embedding(7136, 50)
  (embedding_outside): Embedding(7136, 50)
)

In [6]:
# Import the saved negative Skipgram model
state_dict = torch.load('../../models/Word2Vec(Neg_Sampling).pt')

# Load the remapped state_dict into the model
skipgramNeg = SkipgramNeg(voc_size, embed_size)

skipgramNeg.load_state_dict(state_dict)

skipgramNeg.eval()


  state_dict = torch.load('../../models/Word2Vec(Neg_Sampling).pt')


SkipgramNeg(
  (embedding_center): Embedding(7136, 50)
  (embedding_outside): Embedding(7136, 50)
  (logsigmoid): LogSigmoid()
)

In [7]:
# Import the saved Glove from scratch model

glove = Glove(voc_size, embed_size)

glove.load_state_dict(torch.load('../../models/Glove_from_scratch.pt'),  strict=False)

glove.eval()

  glove.load_state_dict(torch.load('../../models/Glove_from_scratch.pt'),  strict=False)


Glove(
  (center_embedding): Embedding(7136, 50)
  (outside_embedding): Embedding(7136, 50)
  (center_bias): Embedding(7136, 1)
  (outside_bias): Embedding(7136, 1)
)

In [8]:
# Build Glove (Gensim) like example from Dr. Chaklam
glove_file = datapath(r'C:\Users\zaqih\Downloads\glove.6B\glove.6B.100d.txt')

gensim = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)


## Define Custom Functions

In [9]:
# Custom Function to compute vectors for all words in the vocabulary
def compute_all_word_vectors(vocab, model):
    word_vectors = []
    for word in vocab:
        word_vectors.append(model.get_vector(word))
    return torch.stack(word_vectors)

# Custom Function to caluates the performance of a word embedding model and return accuracy
def similarities(lines, model, vocab):
    all_word_vectors = compute_all_word_vectors(vocab, model)

    correct = 0
    skipped = 0

    for line in lines:
        words = line.split()

        # Skip lines with unknown words
        if any(word not in vocab for word in words):
            skipped += 1
            # print(f"Skipping analogy due to unknown words: {line}")
            continue

        # Retrieve vectors for analogy words
        vectors = [model.get_vector(word.lower()) for word in words]

        # Perform vector manipulation
        result_vector = vectors[1] - vectors[0] + vectors[2]
        result_vector = result_vector.unsqueeze(0)  # Add batch dimension

        # Calculate cosine similarities
        similarities = F.cosine_similarity(result_vector, all_word_vectors)

        # Get the closest word
        closest_word_index = torch.argmax(similarities).item()
        closest_word = vocab[closest_word_index]

        if closest_word == words[3]:  # Check if predicted word matches target
            correct += 1
        # else:
        #     # print(f"Mismatch: {line} -> Predicted: {closest_word}")

    # Calculate accuracy
    total = len(lines) - skipped
    accuracy = (correct / total) * 100 if total > 0 else 0
    # print('---------------------------------------------------------')
    # print(f'Total: {total} analogies')
    # print(f'Skipped: {skipped} analogies (unknown words)')
    print(f'Accuracy: {accuracy:.2f}%')
    
    return accuracy


In [10]:
# Custom Function to evaluate Gensim


def evaluate_glove(lines, model):
    
    correct = 0
    total = 0

    for line in lines:
        words = line.lower().split()  # Convert line to lowercase and split into words

        # Check if line is valid and all words exist in the model
        if len(words) != 4:
            print(f"Skipping malformed line: {line}")
            continue
        if any(word not in model for word in words):
            print(f"Skipping line due to OOV words: {line}")
            continue

        # Perform analogy
        try:
            result = model.most_similar(positive=[words[2], words[1]], negative=[words[0]], topn=1)
            closest_word = result[0][0]  # Get the most similar word
            total += 1

            if closest_word == words[3]:
                correct += 1
        except Exception as e:
            print(f"Error processing line {line}: {e}")
            continue

    accuracy = (correct / total) * 100 if total > 0 else 0
    # print('---------------------------------------------------------')
    # print(f'Total lines evaluated: {total}')
    print(f'Accuracy: {accuracy:.2f}%')
    return accuracy


In [11]:
# Compute and return cosine similarity between two vectors

def cosine_similarity(A, B):

    # Flatten vectors to ensure they're 1D
    A = A.flatten()
    B = B.flatten()

    # Compute dot product and norms
    dot_product = np.dot(A, B)
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)

    # Return cosine similarity
    similarity = dot_product / (norm_a * norm_b)
    return similarity

In [12]:
# Evaluate word similarity and Return the Spearman rank correlation 

def similar(lines, model):
    
    scores_real = []  # To store actual similarity scores (from the dataset)
    scores_pred = []  # To store predicted similarity scores (using cosine similarity)

    for line in lines:
        words = line.split()  # Split line into words
        vec = []  # List to store word vectors

        # Assuming the first two words need to be compared
        for word in words[:2]: 
            try:
                # Attempt to get the vector for the word
                vec.append(model.get_vector(word).detach().numpy())
            except:
                # If the word is not in the vocabulary, use the <UNK> token
                vec.append(model.get_vector('<UNK>').detach().numpy())

        # Store the actual similarity score from the dataset (third word)
        scores_real.append(float(words[2]))  
        
        # Calculate the cosine similarity between the two words and store the predicted score
        scores_pred.append(cosine_similarity(np.array(vec[0]), np.array(vec[1])))

    # Calculate and return Spearman's rank correlation between actual and predicted scores
    return spearmanr(scores_real, scores_pred)

In [13]:
# Custom Function to Evaluate word similarity and Return the Spearman rank correlation for Gensim

def similar_gensim(lines, model):
    scores_real = []  # Store real human similarity scores
    scores_pred = []  # Store predicted cosine similarities based on embeddings

    for line in lines:
        words = line.split()  # Split each line into words
        vec = []
        
        # Extract word vectors for the first two words
        for word in words[:2]:
            try:
                # Use model[word] to get the embedding directly from the model (Gensim)
                vec.append(model[word])
            except KeyError:
                # Handle missing words by using a placeholder or a default embedding
                # Here I use 'unk' as an example. Adjust based on your vocabulary.
                vec.append(model['unk'])  # You can use your own word for unknown words.
        
        # Append human similarity score (the third element of each line)
        scores_real.append(float(words[2]))

        # Compute the predicted similarity using cosine similarity
        similarity_score = cosine_similarity(np.array(vec[0]), np.array(vec[1]))
        scores_pred.append(similarity_score)

    # Calculate Spearman's rank correlation between real and predicted scores
    correlation, p_value = spearmanr(scores_real, scores_pred)

    # print(f"Spearman Rank Correlation of Gensim: {correlation:.4f}")
    # print(f"P-value: {p_value:.4f}")
    
    return correlation, p_value

In [14]:
# #  Compute cosine similarity between two words using a given model.
# def compute_similarity(word1, word2, model):
#     try:
#         # Handle Gensim-like models (dictionary-like objects)
#         if isinstance(model, dict) or hasattr(model, '__getitem__'):
#             vec1 = model[word1].reshape(1, -1)  # Get vector for word1
#             vec2 = model[word2].reshape(1, -1)  # Get vector for word2
#         # Handle PyTorch models
#         elif hasattr(model, 'get_vector'):
#             vec1 = model.get_vector(word1).detach().numpy().reshape(1, -1)
#             vec2 = model.get_vector(word2).detach().numpy().reshape(1, -1)
#         else:
#             raise ValueError("Unsupported model type")
        
#         # Compute cosine similarity
#         return cosine_similarity(vec1, vec2)[0][0]
#     except KeyError:
#         # Handle unknown words (e.g., return 0 or use a default vector)
#         return 0.0

In [15]:
def compute_similarity(word1, word2, model):
    """
    Compute cosine similarity between two words using a given model.
    
    Args:
        word1 (str): First word.
        word2 (str): Second word.
        model: A word embedding model (either PyTorch-based, Gensim-based, or a dictionary-like object).
    
    Returns:
        float: Cosine similarity between the two word vectors.
    """
    try:
        # Handle Gensim-like models (dictionary-like objects)
        if isinstance(model, dict) or hasattr(model, '__getitem__'):
            vec1 = np.array(model[word1]).reshape(1, -1)  # Get vector for word1 and reshape to 2D
            vec2 = np.array(model[word2]).reshape(1, -1)  # Get vector for word2 and reshape to 2D
        # Handle PyTorch models
        elif hasattr(model, 'get_vector'):
            vec1 = model.get_vector(word1).detach().numpy().reshape(1, -1)  # Get vector for word1 and reshape to 2D
            vec2 = model.get_vector(word2).detach().numpy().reshape(1, -1)  # Get vector for word2 and reshape to 2D
        else:
            raise ValueError("Unsupported model type")
        
        # Compute cosine similarity
        return cosine_similarity(vec1, vec2)  # No need to index [0][0] since it returns a scalar
    except KeyError:
        # Handle unknown words (e.g., return 0 or use a default vector)
        return 0.0

## Prepare Test Dataset

In [16]:
#load the dataset for testing
file_path = "../../data/word-test.v1.txt"

with open(file_path, 'r') as f:
    contents = f.read()
    data = contents.split('\n')

data[:7]

['// Copyright 2013 Google Inc. All Rights Reserved.',
 ': capital-common-countries',
 'Athens Greece Baghdad Iraq',
 'Athens Greece Bangkok Thailand',
 'Athens Greece Beijing China',
 'Athens Greece Berlin Germany',
 'Athens Greece Bern Switzerland']

In [17]:
#explore the dataset
empty_count = 0
for idx, sent in enumerate(data):
    if not sent:  # Check if the string is empty
        empty_count += 1
    elif sent[0] == ':':
        print(idx, sent)

print(f"Number of empty strings in the dataset: {empty_count}")

1 : capital-common-countries
508 : capital-world
5033 : currency
5900 : city-in-state
8368 : family
8875 : gram1-adjective-to-adverb
9868 : gram2-opposite
10681 : gram3-comparative
12014 : gram4-superlative
13137 : gram5-present-participle
14194 : gram6-nationality-adjective
15794 : gram7-past-tense
17355 : gram8-plural
18688 : gram9-plural-verbs
Number of empty strings in the dataset: 0


In [18]:
#create the corpora for testing

# capital-common-countries corpus to be used for semantic analogies as per assignment
capital = data[2:508]
sem_lines = capital
print(sem_lines[:5])

# past-tense corpus to be used for syntatic analogies
past = data[15795:17355]
syn_lines = past
print(syn_lines[:5])

['Athens Greece Baghdad Iraq', 'Athens Greece Bangkok Thailand', 'Athens Greece Beijing China', 'Athens Greece Berlin Germany', 'Athens Greece Bern Switzerland']
['dancing danced decreasing decreased', 'dancing danced describing described', 'dancing danced enhancing enhanced', 'dancing danced falling fell', 'dancing danced feeding fed']


## Semantic Analysis

In [19]:
# Define the models and their names
models = {
    'Word2Vec (Skipgram)': skipgram,
    'Word2Vec (Neg Sampling)': skipgramNeg,
    'GloVe from Scratch': glove,
    'GloVe (Gensim)': gensim
}

In [20]:
# Perform semantic analysis
print("Semantic Analysis:")
for model_name, model in models.items():
    print(f"\nEvaluating {model_name} on semantic analogies:")
    if model_name == 'GloVe (Gensim)':
        # Use evaluate_glove for gensim model
        evaluate_glove(sem_lines, model)
    else:
        # Use similarities for other models
        similarities(sem_lines, model, vocab)

Semantic Analysis:

Evaluating Word2Vec (Skipgram) on semantic analogies:
Accuracy: 0.00%

Evaluating Word2Vec (Neg Sampling) on semantic analogies:
Accuracy: 0.00%

Evaluating GloVe from Scratch on semantic analogies:
Accuracy: 0.00%

Evaluating GloVe (Gensim) on semantic analogies:
Accuracy: 93.87%


## Syntatic Analysis

In [21]:
# Perform syntactic analysis
print("\nSyntactic Analysis:")
for model_name, model in models.items():
    print(f"\nEvaluating {model_name} on syntactic analogies:")
    if model_name == 'GloVe (Gensim)':
        # Use evaluate_glove for gensim model
        evaluate_glove(syn_lines, model)
    else:
        # Use similarities for other models
        similarities(syn_lines, model, vocab)


Syntactic Analysis:

Evaluating Word2Vec (Skipgram) on syntactic analogies:
Accuracy: 0.00%

Evaluating Word2Vec (Neg Sampling) on syntactic analogies:
Accuracy: 0.00%

Evaluating GloVe from Scratch on syntactic analogies:
Accuracy: 0.00%

Evaluating GloVe (Gensim) on syntactic analogies:
Accuracy: 55.45%


## Model Comparison

![alt text](<compariosn of models.png>)

## Using the similarity dataset to find correlation

In [22]:
#load the similarity dataset for testing
file_path = "../../data/wordsim353_sim_rel/wordsim_similarity_goldstandard.txt"

with open(file_path, 'r') as f:
    similarity_lines = f.readlines()

In [23]:
# Define the models and their evaluation functions
models = {
    'Word2Vec (Skipgram)': (skipgram, similar),
    'Word2Vec (Neg Sampling)': (skipgramNeg, similar),
    'GloVe from Scratch': (glove, similar),
    'GloVe (Gensim)': (gensim, similar_gensim)
}

In [24]:
# Evaluate each model
for model_name, (model, eval_func) in models.items():
    if model_name == 'gensim':
        # Use the specific evaluation function for Gensim
        correlation_score = eval_func(similarity_lines, model)[0]
    else:
        # Use the general evaluation function for other models
        correlation_score = eval_func(similarity_lines, model)[0]
    
    # Print the correlation score
    print(f'{model_name} correlation score: {correlation_score:.4f}')

Word2Vec (Skipgram) correlation score: 0.1635
Word2Vec (Neg Sampling) correlation score: 0.1589
GloVe from Scratch correlation score: 0.1757
GloVe (Gensim) correlation score: 0.6038


The correlation scores between the models' predicted similarity scores (dot product) and human-annotated similarity scores are as follows:

Word2Vec (Skipgram): 0.1635 (weak correlation)

Word2Vec (Neg Sampling): 0.1589 (weak correlation)

GloVe from Scratch: 0.1757 (weak correlation)

GloVe (Gensim): 0.6038 (moderate to strong correlation)

**Conclusion:** The GloVe (Gensim) pre-trained model demonstrates the strongest correlation with human judgments, indicating that its embeddings align well with human understanding of word similarity. 

In contrast, the Word2Vec models and the GloVe model trained from scratch show weak correlations, suggesting that their embeddings do not capture semantic relationships as effectively. 

This highlights the importance of using well-trained, high-quality embeddings for tasks requiring alignment with human judgments.

## MSE Calculation

In [25]:
# load similarity data in table format
df = pd.read_table(file_path, header=None)
df.head()

Unnamed: 0,0,1,2
0,tiger,cat,7.35
1,tiger,tiger,10.0
2,plane,car,5.77
3,train,car,6.31
4,television,radio,6.77


In [26]:
#extract features and target
x1 = df.iloc[:, 0]
x2 = df.iloc[:, 1]
y  = df.iloc[:, 2]

In [27]:
# Compute predicted similarity scores for each model
y_pred_skipgram = [compute_similarity(w1, w2, skipgram) for w1, w2 in zip(x1, x2)]
y_pred_neg = [compute_similarity(w1, w2, skipgramNeg) for w1, w2 in zip(x1, x2)]
y_pred_glove = [compute_similarity(w1, w2, glove) for w1, w2 in zip(x1, x2)]
y_pred_glove_gensim = [compute_similarity(w1, w2, gensim) for w1, w2 in zip(x1, x2)]

In [28]:
# Calculate MSE for each model
mse_skipgram = mean_squared_error(y, y_pred_skipgram)
mse_neg = mean_squared_error(y, y_pred_neg)
mse_glove = mean_squared_error(y, y_pred_glove)
mse_glove_gensim = mean_squared_error(y, y_pred_glove_gensim)
mse_y_true = mean_squared_error(y, y)

In [29]:
# Print the results
print(f'MSE (Skipgram): {mse_skipgram:.4f}')
print(f'MSE (NEG): {mse_neg:.4f}')
print(f'MSE (GloVe): {mse_glove:.4f}')
print(f'MSE (GloVe Gensim): {mse_glove_gensim:.4f}')
print(f'MSE (Y true): {mse_y_true:.4f}')

MSE (Skipgram): 32.6069
MSE (NEG): 32.5223
MSE (GloVe): 32.6296
MSE (GloVe Gensim): 27.8081
MSE (Y true): 0.0000


![alt text](<comparison of mse.png>)

The GloVe (Gensim) model performs the best among the four models, with the lowest MSE (27.8081).

However, all models show relatively similar MSE values.