https://nlpassignment1-horilwnh5asymqyy6bu9m6.streamlit.app/ link to web deployment app


In [None]:
# Import Libraries
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import time
import nltk
import matplotlib
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [None]:
np.__version__, torch.__version__

('1.26.4', '2.5.1+cu121')

In [None]:
from nltk.corpus import brown

# Corpus containing documents from the 'earn' category
corpus = brown.sents()

# Limit the corpus to the first 1000 sentences
corpus = [[word.lower() for word in sentence] for sentence in corpus]
corpus = corpus[:1000]

In [None]:
# Get word sequences and unique words
flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(corpus)))

In [None]:
# Numericalization
word2index = {w: i for i, w in enumerate(vocab)}
# Vocab size
voc_size = len(vocab)
print(voc_size)

# Append UNK
vocab.append('<UNK>')
word2index['<UNK>'] = 0

index2word = {v:k for k, v in word2index.items()}

4272


In [None]:
from collections import Counter
from itertools import combinations_with_replacement

# Index the corpus
X_i = Counter(flatten(corpus))

# Function to generate skip-grams dynamically
def generate_skipgrams_dynamic(corpus, window_size=2):
    skip_grams = []
    for doc in corpus:
        for i in range(window_size, len(doc) - window_size):
            center = doc[i]
            outside = []
            for j in range(1, window_size + 1):  # Dynamic range based on window size
                if i - j >= 0:
                    outside.append(doc[i - j])  # Words to the left
                if i + j < len(doc):
                    outside.append(doc[i + j])  # Words to the right
            for each_out in outside:
                skip_grams.append((center, each_out))
    return skip_grams

# Generate skip-grams with dynamic window size
window_size = 2  # Default window size
skip_grams = generate_skipgrams_dynamic(corpus, window_size)
X_ik_skipgrams = Counter(skip_grams)

# Weighting function
def weighting(w_i, w_j, X_ik):
    try:
        x_ij = X_ik[(w_i, w_j)]
    except:
        x_ij = 1
    x_max = 100
    alpha = 0.75
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    else:
        result = 1
    return result

# Generate co-occurrence matrix and weighting dictionary
X_ik = {}  # For co-occurrences
weighting_dic = {}  # For scaled weights

for bigram in combinations_with_replacement(vocab, 2):
    if X_ik_skipgrams.get(bigram) is not None:  # Matches
        co_occer = X_ik_skipgrams[bigram]  # Count from skip-grams
        X_ik[bigram] = co_occer + 1  # Add 1 for stability
        X_ik[(bigram[1], bigram[0])] = co_occer + 1  # Symmetry
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)


In [None]:
#create pairs of center word, and outside word

def random_batch(batch_size, corpus, window_size=2):
    skipgrams = []
    for doc in corpus:
        for i in range(window_size, len(doc)-window_size):
            center = word2index[doc[i]]
            outside = [word2index[doc[i-j]] for j in range(-window_size, window_size+1) if j != 0]
            for each_out in outside:
                skipgrams.append([center, each_out])

    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])

    return np.array(inputs), np.array(labels)


x, y = random_batch(2, corpus)
import math

def random_batch_glove(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):

    #convert to id since our skip_grams is word, not yet id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]

    random_inputs = []
    random_labels = []
    random_coocs  = []
    random_weightings = []
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False) #randomly pick without replacement

    for i in random_index:
        random_inputs.append([skip_grams_id[i][0]])  # target, e.g., 2
        random_labels.append([skip_grams_id[i][1]])  # context word, e.g., 3

        #get cooc
        pair = skip_grams[i]
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])

        #get weighting
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])

    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)
x.shape  #batch_size, 1


(2, 1)

In [None]:
y.shape

(2, 1)

In [None]:
len(vocab)

4273

In [None]:
embedding = nn.Embedding(63314, 2)
x_tensor = torch.LongTensor(x)
embedding(x_tensor).shape  #(batch_size, 1, emb_size)

torch.Size([2, 1, 2])

In [None]:
class Skipgram(nn.Module):

    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)

    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.embedding_center(center)  #(batch_size, 1, emb_size)
        outside_embedding    = self.embedding_center(outside) #(batch_size, 1, emb_size)
        all_vocabs_embedding = self.embedding_center(all_vocabs) #(batch_size, voc_size, emb_size)

        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        #batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)

        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        #batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size)

        lower_term_sum = torch.sum(torch.exp(lower_term), 1)  #(batch_size, 1)

        loss = -torch.mean(torch.log(top_term / lower_term_sum))  #scalar

        return loss


In [None]:
class SkipgramNegSampling(nn.Module):

    def __init__(self, vocab_size, emb_size):
        super(SkipgramNegSampling, self).__init__()
        self.embedding_center = nn.Embedding(vocab_size, emb_size) # center embedding
        self.embedding_outside = nn.Embedding(vocab_size, emb_size) # out embedding
        self.logsigmoid = nn.LogSigmoid()

    def forward(self, center_words, target_words, negative_words):
        center_embeds = self.embedding_center(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_outside(target_words) # [batch_size, 1, emb_size]
        neg_embeds    = -self.embedding_outside(negative_words) # [batch_size, num_neg, emb_size]

        positive_score = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]

        negative_score = neg_embeds.bmm(center_embeds.transpose(1, 2))
        #[batch_size, k, emb_size] @ [batch_size, emb_size, 1] = [batch_size, k, 1]

        loss = self.logsigmoid(positive_score) + torch.sum(self.logsigmoid(negative_score), 1)

        return -torch.mean(loss)

    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)

        return embeds

In [None]:
class Glove(nn.Module):

    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)

        self.center_bias       = nn.Embedding(voc_size, 1)
        self.outside_bias      = nn.Embedding(voc_size, 1)

    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.embedding_center(center) #(batch_size, 1, emb_size)
        outside_embeds = self.embedding_outside(outside) #(batch_size, 1, emb_size)

        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)

        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)

        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)

        return torch.sum(loss)

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2025-01-18 01:12:49--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-01-18 01:12:49--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-01-18 01:12:49--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# setting the dataset
glove_file = datapath('/content/glove.6B.100d.txt')
model_gensim = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

In [None]:
batch_size = 2
voc_size   = len(vocab)
emb_size = 2

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocab), word2index).expand(batch_size, voc_size)
all_vocabs

tensor([[   0,    1,    2,  ..., 4270, 4271,    0],
        [   0,    1,    2,  ..., 4270, 4271,    0]])

In [None]:
model_skipgram = Skipgram(voc_size, emb_size)
model_skipgram

Skipgram(
  (embedding_center): Embedding(4273, 2)
  (embedding_outside): Embedding(4273, 2)
)

In [None]:
model_skipgram_neg = Skipgram(voc_size, emb_size)
model_skipgram_neg

Skipgram(
  (embedding_center): Embedding(4273, 2)
  (embedding_outside): Embedding(4273, 2)
)

In [None]:
model_glove = Glove(voc_size, emb_size)
model_glove

Glove(
  (embedding_center): Embedding(4273, 2)
  (embedding_outside): Embedding(4273, 2)
  (center_bias): Embedding(4273, 1)
  (outside_bias): Embedding(4273, 1)
)

In [None]:
input_tensor = torch.LongTensor(x)
label_tensor = torch.LongTensor(y)
loss_skipgram = model_skipgram(input_tensor, label_tensor, all_vocabs)
loss_skipgram_neg = model_skipgram_neg(input_tensor, label_tensor, all_vocabs)
# x, y, cooc, weighting = random_batch_glove(batch_size, corpus, skip_grams, X_ik, weighting_dic)

# loss_glove = model_glove(torch.LongTensor(x), torch.LongTensor(y), torch.LongTensor(cooc), torch.LongTensor(weighting))
batch_size = 2
emb_size   = 2
model_skipgram     = Skipgram(voc_size, emb_size)
optimizer_skipgram  = optim.Adam(model_skipgram.parameters(), lr=0.001)
optimizer_skipgram_neg  = optim.Adam(model_skipgram_neg.parameters(), lr=0.001)

criterion = nn.CrossEntropyLoss()
optimizer_glove = optim.Adam(model_glove.parameters(), lr=0.001)

In [None]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs
num_epochs = 10
total_start = time.time()

for epoch in range(num_epochs):
    start = time.time()

    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus)

    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)

    #predict
    loss_skipgram= model_skipgram(input_tensor, label_tensor, all_vocabs)

    #backprogate
    optimizer_skipgram.zero_grad()
    loss_skipgram.backward()

    #update alpha
    optimizer_skipgram.step()
    end = time.time()

    epoch_mins, epoch_secs = epoch_time(start, end)

    #print the loss_skipgram_positive
    # if (epoch + 1) % 1000 == 0:
    print("Positive Skigram")
    print(f"Epoch {epoch+1:6.0f} | Loss: {loss_skipgram:2.6f}| time: {epoch_mins}m {epoch_secs}s")
# Record the ending time
total_end = time.time()

# Calculate and print the total runtime
total_runtime = total_end - total_start
print(f"Total runtime: {total_runtime:.2f} seconds")

Positive Skigram
Epoch      1 | Loss: 8.643091| time: 0m 0s
Positive Skigram
Epoch      2 | Loss: 9.371527| time: 0m 0s
Positive Skigram
Epoch      3 | Loss: 12.445749| time: 0m 0s
Positive Skigram
Epoch      4 | Loss: 8.952690| time: 0m 0s
Positive Skigram
Epoch      5 | Loss: 9.025588| time: 0m 0s
Positive Skigram
Epoch      6 | Loss: 9.195283| time: 0m 0s
Positive Skigram
Epoch      7 | Loss: 7.735710| time: 0m 0s
Positive Skigram
Epoch      8 | Loss: 10.200654| time: 0m 0s
Positive Skigram
Epoch      9 | Loss: 9.624472| time: 0m 0s
Positive Skigram
Epoch     10 | Loss: 11.756741| time: 0m 0s
Total runtime: 1.99 seconds


In [None]:
num_epochs = 10


total_start = time.time()
for epoch in range(num_epochs):
    start = time.time()

    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus)

    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)

    #predict
    loss_skipgram_negative = model_skipgram_neg(input_tensor, label_tensor, all_vocabs)

    #backprogate
    optimizer_skipgram_neg.zero_grad()
    loss_skipgram_negative.backward()

    #update alpha
    optimizer_skipgram_neg.step()

    end = time.time()

    epoch_mins, epoch_secs = epoch_time(start, end)

    #print the loss_skipgram_positive
    # if (epoch + 1) % 1000 == 0:
    print("Negative Skigram")
    print(f"Epoch {epoch+1:6.0f} | Loss: {loss_skipgram_negative:2.6f} | time: {epoch_mins}m {epoch_secs}s")
# Record the ending time
total_end = time.time()

# Calculate and print the total runtime
total_runtime = total_end - total_start
print(f"Total runtime: {total_runtime:.2f} seconds")

Negative Skigram
Epoch      1 | Loss: 10.175230 | time: 0m 0s
Negative Skigram
Epoch      2 | Loss: 8.720857 | time: 0m 0s
Negative Skigram
Epoch      3 | Loss: 9.671392 | time: 0m 0s
Negative Skigram
Epoch      4 | Loss: 9.040108 | time: 0m 0s
Negative Skigram
Epoch      5 | Loss: 9.543203 | time: 0m 0s
Negative Skigram
Epoch      6 | Loss: 10.529750 | time: 0m 0s
Negative Skigram
Epoch      7 | Loss: 7.815857 | time: 0m 0s
Negative Skigram
Epoch      8 | Loss: 9.118710 | time: 0m 0s
Negative Skigram
Epoch      9 | Loss: 8.788888 | time: 0m 0s
Negative Skigram
Epoch     10 | Loss: 8.751039 | time: 0m 0s
Total runtime: 1.89 seconds


In [None]:
num_epochs = 10

for epoch in range(num_epochs):
    start = time.time()

    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus)

    input_batch, target_batch, cooc_batch, weighting_batch = random_batch_glove(batch_size, corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch)

    #predict
    loss_glove = model_glove(input_batch, target_batch, cooc_batch, weighting_batch)

    #backprogate
    optimizer_glove.zero_grad()
    loss_glove.backward()

    #update alpha
    optimizer_glove.step()

    end = time.time()

    epoch_mins, epoch_secs = epoch_time(start, end)

    #print the loss_skipgram_positive
    # if (epoch + 1) % 1000 == 0:
    print("Glove")
    print(f"Epoch {epoch+1:6.0f} | Loss: {loss_glove:2.6f} | time: {epoch_mins}m {epoch_secs}s")
total_runtime = total_end - total_start
print(f"Total runtime: {total_runtime:.2f} seconds")

Glove
Epoch      1 | Loss: 1.239861 | time: 0m 0s
Glove
Epoch      2 | Loss: 1.200808 | time: 0m 0s
Glove
Epoch      3 | Loss: 1.010004 | time: 0m 0s
Glove
Epoch      4 | Loss: 21.652637 | time: 0m 0s
Glove
Epoch      5 | Loss: 0.322434 | time: 0m 0s
Glove
Epoch      6 | Loss: 0.217125 | time: 0m 0s
Glove
Epoch      7 | Loss: 1.000654 | time: 0m 0s
Glove
Epoch      8 | Loss: 0.203099 | time: 0m 0s
Glove
Epoch      9 | Loss: 0.078265 | time: 0m 0s
Glove
Epoch     10 | Loss: 2.034088 | time: 0m 0s
Total runtime: 1.89 seconds


In [None]:
def get_embed(model, word):
    try:
        # Find the index
        index = word2index[word]
    except:
        # if not found give the index of unknown token
        index = word2index['<UNK>']

    # get the word in terms of tensor
    word = torch.LongTensor([word2index[word]])
     # embed the center and the outside word and then find the final embed
    embed_c = model.embedding_center(word)
    embed_o = model.embedding_outside(word)
    embed   = (embed_c + embed_o) / 2


    return embed[0][0].item(), embed[0][1].item()
import torch
import numpy as np

def get_embed_for_corpus(model, words):
    embeddings = {}

    for word in words:
        try:
            index = word2index[word]
        except KeyError:
            index = word2index['<UNK>']

        word_tensor = torch.LongTensor([index])

        embed_c = model.embedding_center(word_tensor)
        embed_o = model.embedding_outside(word_tensor)
        embed = (embed_c + embed_o) / 2

        # return as dictionary with key as the word and value as the array of its embedding
        embeddings[word] = np.array([embed[0][0].item(), embed[0][1].item()])

    return embeddings

In [None]:
# Function to compute cosine similarity between two vectors
def cosine_similarity(A, B):
    # Calculate dot product of the two vectors
    dot_product = np.dot(A, B)
    # Compute the norm (magnitude) of each vector
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    # Calculate the cosine similarity
    similarity = dot_product / (norm_a * norm_b)
    return similarity

# Function to compute cosine similarity for all words in the corpus relative to a target word
def cosine_similarity_for_corpus(embeddings, target_word):
    # List to store pairs of (word, cosine_similarity) for each word in the corpus
    similarities = []

    # Get the index of the target word; fallback to '<UNK>' if the word is not found
    target_index = word2index.get(target_word, word2index['<UNK>'])

    # Retrieve the vector representation of the target word
    target_vector = embeddings[target_index]

    # Iterate through all words and their corresponding vectors in the embeddings
    for word, vector in embeddings.items():
        # Calculate the cosine similarity between the target vector and the current word vector
        similarity = cosine_similarity(target_vector, vector)

        # Store the word and its similarity score as a tuple in the list
        similarities.append((word, similarity))

    return similarities


In [None]:
from google.colab import files
uploaded = files.upload()

Saving word-test.v1.txt to word-test.v1.txt


In [None]:
# Specify the path to your .txt file
file_path = '/content/word-test.v1.txt'

# Read the content of the file
with open(file_path, 'r') as file:
    # Skip the first line
    file.readline()

    # Read the remaining content of the file
    file_content = file.readlines()

# Initialize variables to store relevant lines
total_corpus = []

# Variable to keep track of the current heading
current_heading = None

# Iterate through each line in the file content
for line in file_content:
    # Check if the line is a heading
    if line.startswith(':'):
        current_heading = line.strip()
    else:
        # Split the line into individual words and convert to lowercase
        words = [word.lower() for word in line.strip().split()]
        total_corpus.append(words)

# Initialize variables to store relevant lines
capital_common_countries = []
past_tense = []

# Variable to keep track of the current heading
current_heading = None

# Iterate through each line in the file content
for line in file_content:
    # Check if the line is a heading
    if line.startswith(':'):
        current_heading = line.strip()
    elif current_heading == ': capital-common-countries':
        # Split the line into individual words and convert to lowercase
        words = [word.lower() for word in line.strip().split()]
        capital_common_countries.append(words)
    elif current_heading == ': gram7-past-tense':
        # Split the line into individual words and convert to lowercase
        words = [word.lower() for word in line.strip().split()]
        past_tense.append(words)

In [None]:
# Flatten the 2D list into a list of lists
flattened_list_of_country = [word for pair in capital_common_countries for word in pair]

# Wrap the flattened list in another list
resulting_capital_list = [flattened_list_of_country]

# Flatten the 2D list into a list of lists
flattened_list_of_past_tense = [word for pair in past_tense for word in pair]

# Wrap the flattened list in another list
resulting_capital_list = [flattened_list_of_country]
resulting_past_tense_list = [flattened_list_of_past_tense]

# Flatten the 2D list into a list of lists
flattened_list_total_words = [word for pair in total_corpus for word in pair]
# Wrap the flattened list in another list
resulting_total_corpus = [flattened_list_total_words]


flatten = lambda l: [item for sublist in l for item in sublist]
capital_list = list(set(flatten(resulting_capital_list)))
past_tense_list = list(set(flatten(resulting_past_tense_list)))
whole_corpus = list(set(flatten(resulting_total_corpus)))

In [None]:
# Store the embeddings
embed_capital_glove = get_embed_for_corpus(model_glove, capital_list)
embed_capital_skipgram = get_embed_for_corpus(model_skipgram, capital_list)
embed_capital_skipgram_neg = get_embed_for_corpus(model_skipgram_neg, capital_list)

embed_past_tense_glove = get_embed_for_corpus(model_glove, past_tense_list)
embed_past_tense_skipgram = get_embed_for_corpus(model_skipgram, past_tense_list)
embed_past_tense_skipgram_neg = get_embed_for_corpus(model_skipgram_neg, past_tense_list)

embed_total_glove = get_embed_for_corpus(model_glove, whole_corpus)
embed_whole_skipgram = get_embed_for_corpus(model_skipgram, whole_corpus)
embed_whole_skipgram_neg = get_embed_for_corpus(model_skipgram_neg, whole_corpus)
# y_pred for glove for the capital list
y_pred_glove_country = []

for i in capital_common_countries:
    y = embed_capital_glove[i[1]] - embed_capital_glove[i[0]] + embed_capital_glove[i[2]]
    y_pred_glove_country.append(y)
# y_pred for glove for the past tense list
y_pred_glove_past = []

for i in past_tense:
    y = embed_past_tense_glove[i[1]] - embed_past_tense_glove[i[0]] + embed_past_tense_glove[i[2]]
    y_pred_glove_past.append(y)
# y_pred for skipgram negative sampling for the capital list
y_pred_neg_samp_country = []

for i in capital_common_countries:
    y = embed_capital_skipgram_neg[i[1]] - embed_capital_skipgram_neg[i[0]] + embed_capital_skipgram_neg[i[2]]
    y_pred_neg_samp_country.append(y)

In [None]:
# y_pred for skip-gram with negative sampling for the past tense list
y_pred_neg_samp_past = []

for i in past_tense:
    # Calculate predicted vector for past tense using negative sampling embeddings
    y = embed_past_tense_skipgram_neg[i[0]] - embed_past_tense_skipgram_neg[i[0]] + embed_past_tense_skipgram_neg[i[2]]
    y_pred_neg_samp_past.append(y)

# y_pred for skip-gram with positive sampling for the country list
y_pred_positive_samp_country = []

for i in capital_common_countries:
    # Calculate predicted vector for capital-common-countries using positive sampling embeddings
    y = embed_capital_skipgram[i[1]] - embed_capital_skipgram[i[0]] + embed_capital_skipgram[i[2]]
    y_pred_positive_samp_country.append(y)

# y_pred for skip-gram with positive sampling for the past tense list
y_pred_positive_past_tense = []

for i in past_tense:
    # Calculate predicted vector for past tense using positive sampling embeddings
    y = embed_past_tense_skipgram[i[1]] - embed_past_tense_skipgram[i[0]] + embed_past_tense_skipgram[i[2]]
    y_pred_positive_past_tense.append(y)

# Function to compute cosine similarity between two vectors
def cosine_similarity(A, B):
    # Calculate dot product of the two vectors
    dot_product = np.dot(A, B)
    # Compute the norm (magnitude) of each vector
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    # Calculate the cosine similarity
    similarity = dot_product / (norm_a * norm_b)
    return similarity

# Function to find the word with the maximum cosine similarity for each vector in y_pred
def find_max_cosine_words(y_pred, embeddings):
    """
    Find the word with the maximum cosine similarity for each vector in y_pred.

    Parameters:
    - y_pred: List of vectors for which to find the max cosine similarity words.
    - embeddings: Dictionary of word embeddings.

    Returns:
    - List of words with the maximum cosine similarity for each vector in y_pred.
    """
    max_cosine_words = []

    for j in range(len(y_pred)):
        max_cosine = -1
        max_cosine_word = ""

        # Iterate through all words in the embeddings dictionary
        for i in embeddings.keys():
            # Calculate cosine similarity between the predicted vector and the current word vector
            cosine_temp = cosine_similarity(y_pred[j], embeddings[i])

            # Update the word with the highest cosine similarity
            if cosine_temp > max_cosine:
                max_cosine_word = i
                max_cosine = cosine_temp

        max_cosine_words.append(max_cosine_word)

    return max_cosine_words

# Usage: Compute syntactic predictions for models
cosine_neg_samp_syntatical = find_max_cosine_words(y_pred_neg_samp_country, embed_capital_skipgram_neg)
cosine_positive_samp_syntatical = find_max_cosine_words(y_pred_positive_samp_country, embed_capital_skipgram)
cosine_glove_syntatical = find_max_cosine_words(y_pred_glove_country, embed_capital_glove)

# Function to find the next top-N similar words for a specific word using cosine similarity
from heapq import nlargest

def find_next_10_cosine_words_for_word(target_word, embeddings, top_n=10):
    """
    Find the next 10 words with the maximum cosine similarity for a user-provided specific word.

    Parameters:
    - target_word: The word for which to find the next 10 cosine similarity words.
    - embeddings: Dictionary of word embeddings.
    - top_n: Number of top words to retrieve for the target word (default is 10).

    Returns:
    - List of the next 10 words with the maximum cosine similarity for the target word or ["Word not in Corpus"].
    """
    if target_word not in embeddings:
        return ["Word not in Corpus"]

    # Get the embedding vector for the target word
    target_vector = embeddings[target_word]

    # Calculate cosine similarities between the target word and all other words
    cosine_similarities = [(word, cosine_similarity(target_vector, embeddings[word])) for word in embeddings.keys()]

    # Find the top-N most similar words
    top_n_words = nlargest(top_n + 1, cosine_similarities, key=lambda x: x[1])

    # Exclude the target word itself from the results
    top_n_words = [word for word, _ in top_n_words if word != target_word]

    return top_n_words[:10]

# Usage: Find the next 10 most similar words for a specific word
user_target_word = 'italy'
next_10_cosine_for_user_word = find_next_10_cosine_words_for_word(user_target_word, embed_whole_skipgram_neg, top_n=10)

# Print the results
if next_10_cosine_for_user_word == ["Word not in Corpus"]:
    print("Word not in Corpus")
else:
    print(f"Next 10 similar words for user-provided word '{user_target_word}': {next_10_cosine_for_user_word}")


Next 10 similar words for user-provided word 'italy': ['husband', 'kuna', 'ashgabat', 'brother', 'hid', 'pineapple', 'warmest', 'seeing', 'listens', 'cows']


In [None]:
def calculate_accuracy(predictions, true_words):
    """
    Calculate accuracy based on predictions and true words.

    Parameters:
    - predictions: List of predicted words.
    - true_words: List of true words.

    Returns:
    - Accuracy as a percentage.
    """
    total_trials = len(predictions)
    total_correct = sum(1 for pred_word in predictions if pred_word in true_words)

    accuracy = (total_correct / total_trials) * 100

    return accuracy

# Usage:
semantic_accuracy_neg_samp = calculate_accuracy(find_max_cosine_words(y_pred_neg_samp_country, embed_whole_skipgram_neg), [true_word[3] for true_word in capital_common_countries])
semantic_accuracy_pos_samp = calculate_accuracy(find_max_cosine_words(y_pred_positive_samp_country, embed_whole_skipgram), [true_word[3] for true_word in capital_common_countries])
semantic_accuracy_glove = calculate_accuracy(find_max_cosine_words(y_pred_glove_country, embed_total_glove), [true_word[3] for true_word in capital_common_countries])

print("Semantic Accuracy of Skipgram : {:.10f}%".format(semantic_accuracy_pos_samp))
print("Semantic Accuracy of Skipgram Neg: {:.10f}%".format(semantic_accuracy_neg_samp))
print("Semantic Accuracy of Glove: {:.10f}%".format(semantic_accuracy_glove))

Semantic Accuracy of Skipgram : 14.6245059289%
Semantic Accuracy of Skipgram Neg: 14.4268774704%
Semantic Accuracy of Glove: 14.2292490119%


In [None]:
input_file_path = '/content/word-test.v1.txt'
output_file_path = '/content/word-test-cleaned.txt'

# Open the input file for reading
with open(input_file_path, 'r', encoding='utf-8') as input_file:
    # Read all lines from the input file
    lines = input_file.readlines()

# Open the output file for writing
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    # Write all lines except the first line to the output file
    output_file.writelines(lines[1:])

print(f"First line removed and content saved to: {output_file_path}")


First line removed and content saved to: /content/word-test-cleaned.txt


In [None]:
put_file_path = '/content/word-test.v1.txt'
output_file_path = '/content/capital-common-countries.txt'

# Open the input file for reading
with open(input_file_path, 'r', encoding='utf-8') as input_file:
    # Read all lines from the input file into a list
    lines = input_file.readlines()

# Open the output file for writing
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    # Flag to indicate whether the relevant section should be written
    start_writing = False

    # Iterate through each line in the input file
    for line in lines:
        # If the line starts with ': capital-common-countries', set the flag to start writing
        if line.startswith(': capital-common-countries'):
            start_writing = True
        # If a new section header is encountered (line starts with ':'), stop writing
        elif line.startswith(':'):
            start_writing = False

        # Write the line to the output file if within the relevant section
        if start_writing:
            output_file.write(line)

print(f"Lines starting with ': capital-common-countries' saved to: {output_file_path}")

Lines starting with ': capital-common-countries' saved to: /content/capital-common-countries.txt


In [None]:
analogy_score_sem = model_gensim.evaluate_word_analogies(datapath('/content/capital-common-countries.txt'))
print("Semtatical Accuracy of Model Gensim:", analogy_score_sem[0])

Semtatical Accuracy of Model Gensim: 0.9387351778656127


In [None]:
def calculate_accuracy(predictions, true_words):
    """
    Calculate the accuracy of predictions compared to true words.

    Parameters:
    - predictions: List of predicted words generated by the model.
    - true_words: List of ground-truth words to compare against.

    Returns:
    - Accuracy as a percentage value.
    """
    total_trials = len(predictions)  # Total number of predictions made
    total_correct = sum(1 for pred_word, true_word in zip(predictions, true_words) if pred_word == true_word)  # Count correct predictions

    accuracy = (total_correct / total_trials) * 100  # Compute accuracy percentage

    return accuracy

# Usage: Calculate syntactic accuracy for different models
syntatical_accuracy_neg_samp = calculate_accuracy(find_max_cosine_words(y_pred_neg_samp_past, embed_whole_skipgram_neg), [true_word[3] for true_word in past_tense])
syntatical_accuracy_pos_samp = calculate_accuracy(find_max_cosine_words(y_pred_positive_past_tense, embed_whole_skipgram), [true_word[3] for true_word in past_tense])
syntatical_accuracy_glove = calculate_accuracy(find_max_cosine_words(y_pred_glove_past, embed_total_glove), [true_word[3] for true_word in past_tense])

# Print the syntactic accuracy results for each model
print("Syntactic Accuracy of Skipgram Pos Sampling: {:.2f}%".format(syntatical_accuracy_pos_samp))
print("Syntactic Accuracy of Skipgram Neg Sampling: {:.2f}%".format(syntatical_accuracy_neg_samp))
print("Syntactic Accuracy of Glove: {:.2f}%".format(syntatical_accuracy_glove))


Syntactic Accuracy of Skipgram Pos Sampling: 0.06%
Syntactic Accuracy of Skipgram Neg Sampling: 0.00%
Syntactic Accuracy of Glove: 0.13%


In [None]:
input_file_path = '/content/word-test.v1.txt'
output_file_path = '/content/text_past_tense.txt'

# Open the input file for reading
with open(input_file_path, 'r', encoding='utf-8') as input_file:
    # Read all lines from the input file
    lines = input_file.readlines()

# Open the output file for writing
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    # Flag to indicate whether the relevant section (gram7-past-tense) should be written
    start_writing = False

    # Iterate through each line in the input file
    for line in lines:
        # If the line starts with ': gram7-past-tense', set the flag to start writing
        if line.startswith(': gram7-past-tense'):
            start_writing = True
        # If a new section header is encountered (line starts with ':'), stop writing
        elif line.startswith(':'):
            start_writing = False

        # Write the line to the output file if within the relevant section
        if start_writing:
            output_file.write(line)

print(f"The relevant lines have been saved to: {output_file_path}")

def calculate_accuracy(predictions, true_words):
    """
    Calculate the accuracy of predictions compared to true words.

    Parameters:
    - predictions: List of predicted words generated by the model.
    - true_words: List of ground-truth words to compare against.

    Returns:
    - Accuracy as a percentage value.
    """
    total_trials = len(predictions)  # Total number of predictions made
    total_correct = sum(1 for pred_word, true_word in zip(predictions, true_words) if pred_word == true_word)  # Count correct predictions

    accuracy = (total_correct / total_trials) * 100  # Compute accuracy percentage

    return accuracy

# Usage: Calculate syntactic accuracy for different models
syntatical_accuracy_neg_samp = calculate_accuracy(find_max_cosine_words(y_pred_neg_samp_past, embed_whole_skipgram_neg), [true_word[3] for true_word in past_tense])
syntatical_accuracy_pos_samp = calculate_accuracy(find_max_cosine_words(y_pred_positive_past_tense, embed_whole_skipgram), [true_word[3] for true_word in past_tense])
syntatical_accuracy_glove = calculate_accuracy(find_max_cosine_words(y_pred_glove_past, embed_total_glove), [true_word[3] for true_word in past_tense])

# Print the syntactic accuracy results for each model
print("Syntactic Accuracy of Skipgram Negative Sampling: {:.2f}%".format(syntatical_accuracy_neg_samp))
print("Syntactic Accuracy of Skipgram Positive Sampling: {:.2f}%".format(syntatical_accuracy_pos_samp))
print("Syntactic Accuracy of Glove: {:.2f}%".format(syntatical_accuracy_glove))


The relevant lines have been saved to: /content/text_past_tense.txt
Syntactic Accuracy of Skipgram Negative Sampling: 0.00%
Syntactic Accuracy of Skipgram Positive Sampling: 0.06%
Syntactic Accuracy of Glove: 0.13%


In [None]:
analogy_score_syn = model_gensim.evaluate_word_analogies(datapath('/content/text_past_tense.txt'))
print("Syntatical Accuracy of Model Gensim:", analogy_score_syn[0])

Syntatical Accuracy of Model Gensim: 0.5544871794871795


In [None]:
from google.colab import files
uploaded = files.upload()

Saving wordsim_similarity_goldstandard.txt to wordsim_similarity_goldstandard (1).txt


In [None]:
import pandas as pd

file_path = 'wordsim_similarity_goldstandard.txt'

# Define the column names
columns = ['word_1', 'word_2', 'similarity_index']

# Read the text file into a pandas DataFrame with specified column names
df = pd.read_csv(file_path, sep='\t', header=None, names=columns)

df

Unnamed: 0,word_1,word_2,similarity_index
0,tiger,cat,7.35
1,tiger,tiger,10.00
2,plane,car,5.77
3,train,car,6.31
4,television,radio,6.77
...,...,...,...
198,rooster,voyage,0.62
199,noon,string,0.54
200,chord,smile,0.54
201,professor,cucumber,0.31


In [None]:
get_embed(model_skipgram_neg,'<UNK>')

(0.19411882758140564, -0.42883604764938354)

In [None]:
# Iterate through each row in the DataFrame
for index, row in df.iterrows():
    word_1 = row['word_1']
    word_2 = row['word_2']

    try:
        # Attempt to get embeddings and compute the dot product for various models
        embed_1_neg_samp = get_embed(model_skipgram_neg, word_1)
        embed_2_neg_samp = get_embed(model_skipgram_neg, word_2)
        embed_1_pos_samp = get_embed(model_skipgram, word_1)
        embed_2_pos_samp = get_embed(model_skipgram, word_2)
        embed_1_glove = get_embed(model_glove, word_1)
        embed_2_glove = get_embed(model_glove, word_2)

    except KeyError:
        # Handle missing words by substituting with '<UNK>' embeddings
        embed_1_neg_samp = get_embed(model_skipgram_neg, '<UNK>')
        embed_2_neg_samp = get_embed(model_skipgram_neg, '<UNK>')
        embed_1_pos_samp = get_embed(model_skipgram, '<UNK>')
        embed_2_pos_samp = get_embed(model_skipgram, '<UNK>')
        embed_1_glove = get_embed(model_glove, '<UNK>')
        embed_2_glove = get_embed(model_glove, '<UNK>')

    # Compute and store the dot product results in the DataFrame
    df.at[index, 'dot_product_neg_samp'] = np.dot(embed_1_neg_samp, embed_2_neg_samp)
    df.at[index, 'dot_product_pos_samp'] = np.dot(embed_1_pos_samp, embed_2_pos_samp)
    df.at[index, 'dot_product_glove'] = np.dot(embed_1_glove, embed_2_glove)

# Display the first 10 rows of the updated DataFrame
df[:10]


Unnamed: 0,word_1,word_2,similarity_index,dot_product_neg_samp,dot_product_pos_samp,dot_product_glove
0,tiger,cat,7.35,0.221582,2.313294,0.719298
1,tiger,tiger,10.0,0.221582,2.313294,0.719298
2,plane,car,5.77,0.221582,2.313294,0.719298
3,train,car,6.31,0.221582,2.313294,0.719298
4,television,radio,6.77,0.147957,-0.040519,0.349478
5,media,radio,7.42,0.221582,2.313294,0.719298
6,bread,butter,6.19,0.221582,2.313294,0.719298
7,cucumber,potato,5.92,0.221582,2.313294,0.719298
8,doctor,nurse,7.0,-0.225749,-0.02356,0.43534
9,professor,doctor,6.62,-0.750224,1.038951,0.280645


In [None]:
from scipy.stats import spearmanr

# Calculate the Spearman correlation between similarity scores and dot products from different models
correlation_neg, _ = spearmanr(df['similarity_index'], df['dot_product_neg_samp'])
correlation_pos, _ = spearmanr(df['similarity_index'], df['dot_product_pos_samp'])
correlation_glove, _ = spearmanr(df['similarity_index'], df['dot_product_glove'])

# Print the Spearman correlation coefficients for each model
print(f"Spearman Correlation Coefficient of Skipgram Negative Sampling: {correlation_neg:.4f}")
print(f"Spearman Correlation Coefficient of Skipgram Positive Sampling: {correlation_pos:.4f}")
print(f"Spearman Correlation Coefficient of Glove: {correlation_glove:.4f}")


Spearman Correlation Coefficient of Skipgram Negative Sampling: 0.0657
Spearman Correlation Coefficient of Skipgram Positive Sampling: 0.0222
Spearman Correlation Coefficient of Glove: -0.0012


In [None]:
# Finding y_true based on the mean of similarity index in the df
y_true = df['similarity_index'].mean()

print(f"y_true: {y_true:.2f}")

y_true: 5.13


In [None]:
# Using the correlation coeffiecient of the gensim model using the predefined function
correlation_coefficient = model_gensim.evaluate_word_pairs(datapath('/content/wordsim_similarity_goldstandard.txt'))
print(f"Correlation coefficient: {correlation_coefficient[1][0]:.2f}")

Correlation coefficient: 0.60


In [None]:
embed_whole_glove = get_embed_for_corpus(model_glove, vocab)
embed_whole_neg_skg = get_embed_for_corpus(model_skipgram_neg, vocab)
embed_whole_pos_skg = get_embed_for_corpus(model_skipgram, vocab)

In [None]:
import pickle

# Save the Gensim model to a file using pickle
gensim_model_path = 'model_gensim.pkl'

with open(gensim_model_path, 'wb') as model_file:
    pickle.dump(model_gensim, model_file)

print(f"Gensim model saved to: {gensim_model_path}")

Gensim model saved to: model_gensim.pkl


In [None]:
# Specify the path to your pickled Gensim model file
gensim_model_path = 'model_gensim.pkl'

# Load the Gensim model from the pickle file
with open(gensim_model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
for i in range (1,10):
    print(loaded_model.most_similar('language')[i][0])

word
spoken
arabic
english
dialect
vocabulary
text
translation
words


In [None]:
import pickle

# Save the embeddings for Skipgram Positive Sampling
embedding_dict = embed_whole_pos_skg  # Embedding dictionary for the positive sampling model

# Specify the file path for the pickle file
pickle_file_path = 'embed_skipgram_pos.pkl'

# Save the embedding dictionary to a pickle file
with open(pickle_file_path, 'wb') as pickle_file:
    pickle.dump(embedding_dict, pickle_file)

print(f"Embedding dictionary has been saved to: {pickle_file_path}")


Embedding dictionary has been saved to: embed_skipgram_pos.pkl


In [None]:
import pickle

# Save the embeddings for Skipgram Negative Sampling
embedding_dict = embed_whole_neg_skg  # Embedding dictionary for the negative sampling model

# Specify the file path for the pickle file
pickle_file_path = 'embed_skipgram_neg.pkl'

# Open the file in binary write mode and save the embedding dictionary
with open(pickle_file_path, 'wb') as pickle_file:
    pickle.dump(embedding_dict, pickle_file)

# Confirm that the embedding dictionary has been saved successfully
print(f"Embedding dictionary for negative sampling has been saved to: {pickle_file_path}")


Embedding dictionary for negative sampling has been saved to: embed_skipgram_neg.pkl


In [None]:
import pickle

# Save the embeddings for the GloVe model
embedding_dict = embed_whole_glove  # Embedding dictionary for the GloVe model

# Specify the file path for the pickle file
pickle_file_path = 'embed_glove.pkl'

# Open the file in binary write mode and save the embedding dictionary
with open(pickle_file_path, 'wb') as pickle_file:
    pickle.dump(embedding_dict, pickle_file)

# Confirm that the embedding dictionary has been saved successfully
print(f"Embedding dictionary for GloVe has been saved to: {pickle_file_path}")


Embedding dictionary for GloVe has been saved to: embed_glove.pkl


In [None]:
import pickle

# Specify the path to the pickled file for Skipgram Positive Sampling embeddings
pickle_file_path = 'embed_skipgram_pos.pkl'

# Load the embedding dictionary for Skipgram Positive Sampling from the pickle file
with open(pickle_file_path, 'rb') as pickle_file:
    embedding_dict_neg = pickle.load(pickle_file)

import pickle

# Specify the path to the pickled file for Skipgram Negative Sampling embeddings
pickle_file_path = 'embed_skipgram_neg.pkl'

# Load the embedding dictionary for Skipgram Negative Sampling from the pickle file
with open(pickle_file_path, 'rb') as pickle_file:
    embedding_dict_pos = pickle.load(pickle_file)

import pickle

# Specify the path to the pickled file for GloVe embeddings
pickle_file_path = 'embed_glove.pkl'

# Load the embedding dictionary for GloVe from the pickle file
with open(pickle_file_path, 'rb') as pickle_file:
    embedding_dict_glove = pickle.load(pickle_file)

# Define a target word for similarity calculation
user_target_word = "run"

# Find the next 10 most similar words to the target word using the GloVe embeddings
next_10_cosine_for_user_word = find_next_10_cosine_words_for_word(user_target_word, embedding_dict_glove, top_n=10)

# Display the results
if next_10_cosine_for_user_word == ["Word not in Corpus"]:
    print("The target word is not found in the corpus.")
else:
    print(f"Next 10 similar words for the target word '{user_target_word}': {next_10_cosine_for_user_word}")


Next 10 similar words for the target word 'run': ['molvar', "wouldn't", 'inflate', 'letters', 'understand', 'know', 'couple', 'published', 'salinger', 'raised']


## Model Comparison and Analysis

### **Training Loss and Runtime**

| Model             | Window Size | Training Loss (Last Epoch) | Training Time (s) |
|--------------------|-------------|----------------------------|--------------------|
| Skipgram (Positive)| 2           | 11.756741                  | 1.99              |
| Skipgram (Negative)| 2           | 8.751039                   | 1.89              |
| GloVe             | 2           | 2.034088                   | 1.89              |
| GloVe (Gensim)    | -           | -                          | -                 |

---

### **Semantic Accuracy**

| Model                | Accuracy (%)         |
|-----------------------|----------------------|
| Skipgram             | 14.62               |
| Skipgram (Negative)  | 14.43               |
| GloVe               | 14.23               |
| GloVe (Gensim)      | 0.93                |

---

### **Syntactic Accuracy**

| Model                | Accuracy (%)         |
|-----------------------|----------------------|
| Skipgram (Positive)  | 0.06                |
| Skipgram (Negative)  | 0.00                |
| GloVe               | 0.13                |
| GloVe (Gensim)      | 0.55                |

---

### **Notes:**
- **Semantic accuracy** is calculated using the **capital-common-countries** dataset.
- **Syntactic accuracy** is evaluated using the **gram7-past-tense** dataset.
- The results may vary due to corpus limitations. Using a pre-trained model such as **GloVe (Gensim)** might yield better results.
- Training times are approximate and may depend on hardware.



In [None]:
pip install streamlit


Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.41.1-py2.py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[

In [73]:
import pickle
import streamlit as st
import numpy as np

# Define a function to compute cosine similarity
def compute_cosine_similarity(vector1, vector2):
    """Calculate cosine similarity between two vectors."""
    dot_prod = np.dot(vector1, vector2)
    norm1 = np.linalg.norm(vector1)
    norm2 = np.linalg.norm(vector2)
    return dot_prod / (norm1 * norm2) if norm1 > 0 and norm2 > 0 else 0.0

# Load pre-saved embedding dictionaries
def load_embedding_files():
    """Load embeddings for different models from pickle files."""
    positive_path = 'embed_skipgram_pos.pkl'
    negative_path = 'embed_skipgram_neg.pkl'
    glove_path = 'embed_glove.pkl'

    with open(positive_path, 'rb') as pos_file:
        pos_embeddings = pickle.load(pos_file)
    with open(negative_path, 'rb') as neg_file:
        neg_embeddings = pickle.load(neg_file)
    with open(glove_path, 'rb') as glove_file:
        glove_embeddings = pickle.load(glove_file)

    return pos_embeddings, neg_embeddings, glove_embeddings

# Find the most similar words for a given word
def find_similar_words(target_word, embedding_dict, top_n=10):
    """
    Identify top N words with the highest cosine similarity to the target word.

    Parameters:
    - target_word: Word for which similar words are sought.
    - embedding_dict: Dictionary containing word embeddings.
    - top_n: Number of similar words to retrieve.

    Returns:
    - List of the most similar words.
    """
    if target_word not in embedding_dict:
        return ["Word not in Corpus"]

    target_vec = embedding_dict[target_word]
    similarities = []

    for word, vec in embedding_dict.items():
        if word != target_word:  # Exclude the target word itself
            similarity = compute_cosine_similarity(target_vec, vec)
            similarities.append((word, similarity))

    # Rank words by similarity score in descending order
    similarities.sort(key=lambda x: x[1], reverse=True)

    return [word for word, _ in similarities[:top_n]]

# Main Streamlit app functionality
def main():
    # Load embedding dictionaries for all models
    pos_embeddings, neg_embeddings, glove_embeddings = load_embedding_files()

    # Display the app title and description
    st.title("Word Similarity Finder")
    st.write("Enter a word and choose a model to find similar words based on cosine similarity.")

    # Input field for user to type a word
    input_word = st.text_input("Type a word:", "example")  # Default word is "example"

    # Dropdown menu to select the embedding model
    selected_model = st.selectbox(
        "Select Embedding Model",
        ["GloVe Embeddings", "Skipgram Positive Embeddings", "Skipgram Negative Embeddings"]
    )

    # Select the appropriate embedding dictionary
    if selected_model == "GloVe Embeddings":
        embeddings = glove_embeddings
    elif selected_model == "Skipgram Positive Embeddings":
        embeddings = pos_embeddings
    elif selected_model == "Skipgram Negative Embeddings":
        embeddings = neg_embeddings

    # Display the top 10 similar words if input is provided
    if input_word:
        with st.spinner('Processing your request...'):
            similar_words = find_similar_words(input_word, embeddings, top_n=10)

            # Show results
            if similar_words == ["Word not in Corpus"]:
                st.error("The word you entered is not in the corpus.")
            else:
                st.success(f"Top 10 similar words for '{input_word}':")
                st.write(similar_words)

if __name__ == "__main__":
    main()


2025-01-18 02:32:18.789 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-01-18 02:32:18.813 Session state does not function when running a script without `streamlit run`


In [75]:

# Save the Streamlit app as app.py in Colab's /content directory
modified_code = """
import pickle
import streamlit as st
import numpy as np

# Define a function to compute cosine similarity
def compute_cosine_similarity(vector1, vector2):
    \"\"\"Calculate cosine similarity between two vectors.\"\"\"
    dot_prod = np.dot(vector1, vector2)
    norm1 = np.linalg.norm(vector1)
    norm2 = np.linalg.norm(vector2)
    return dot_prod / (norm1 * norm2) if norm1 > 0 and norm2 > 0 else 0.0

# Load pre-saved embedding dictionaries
def load_embedding_files():
    \"\"\"Load embeddings for different models from pickle files.\"\"\"
    positive_path = 'embed_skipgram_pos.pkl'
    negative_path = 'embed_skipgram_neg.pkl'
    glove_path = 'embed_glove.pkl'

    with open(positive_path, 'rb') as pos_file:
        pos_embeddings = pickle.load(pos_file)
    with open(negative_path, 'rb') as neg_file:
        neg_embeddings = pickle.load(neg_file)
    with open(glove_path, 'rb') as glove_file:
        glove_embeddings = pickle.load(glove_file)

    return pos_embeddings, neg_embeddings, glove_embeddings

# Find the most similar words for a given word
def find_similar_words(target_word, embedding_dict, top_n=10):
    \"\"\"
    Identify top N words with the highest cosine similarity to the target word.

    Parameters:
    - target_word: Word for which similar words are sought.
    - embedding_dict: Dictionary containing word embeddings.
    - top_n: Number of similar words to retrieve.

    Returns:
    - List of the most similar words.
    \"\"\"
    if target_word not in embedding_dict:
        return [\"Word not in Corpus\"]

    target_vec = embedding_dict[target_word]
    similarities = []

    for word, vec in embedding_dict.items():
        if word != target_word:  # Exclude the target word itself
            similarity = compute_cosine_similarity(target_vec, vec)
            similarities.append((word, similarity))

    # Rank words by similarity score in descending order
    similarities.sort(key=lambda x: x[1], reverse=True)

    return [word for word, _ in similarities[:top_n]]

# Main Streamlit app functionality
def main():
    # Load embedding dictionaries for all models
    pos_embeddings, neg_embeddings, glove_embeddings = load_embedding_files()

    # Display the app title and description
    st.title("Word Similarity Finder")
    st.write("Enter a word and choose a model to find similar words based on cosine similarity.")

    # Input field for user to type a word
    input_word = st.text_input("Type a word:", "example")  # Default word is "example"

    # Dropdown menu to select the embedding model
    selected_model = st.selectbox(
        "Select Embedding Model",
        ["GloVe Embeddings", "Skipgram Positive Embeddings", "Skipgram Negative Embeddings"]
    )

    # Select the appropriate embedding dictionary
    if selected_model == "GloVe Embeddings":
        embeddings = glove_embeddings
    elif selected_model == "Skipgram Positive Embeddings":
        embeddings = pos_embeddings
    elif selected_model == "Skipgram Negative Embeddings":
        embeddings = neg_embeddings

    # Display the top 10 similar words if input is provided
    if input_word:
        with st.spinner('Processing your request...'):
            similar_words = find_similar_words(input_word, embeddings, top_n=10)

            # Show results
            if similar_words == [\"Word not in Corpus\"]:
                st.error(\"The word you entered is not in the corpus.\")
            else:
                st.success(f\"Top 10 similar words for '{input_word}':\")
                st.write(similar_words)

if __name__ == \"__main__\":
    main()
"""

# Write the modified code to app.py
with open('/content/app.py', 'w') as f:
    f.write(modified_code)

print("Streamlit app has been saved as app.py in /content directory.")


Streamlit app has been saved as app.py in /content directory.


In [78]:
# Install Streamlit and pyngrok first
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [87]:
!rm -rf /root/.config/ngrok/ngrok.yml

In [88]:
!ngrok authtoken 2lVEdM8aMwFnpNeDzeqw2US1ZDZ_5VHYQQDKamomRpqWsm6R1


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [89]:
from pyngrok import ngrok

# Start the ngrok tunnel explicitly specifying HTTP protocol
public_url = ngrok.connect(addr=8501, proto="http")
print(f"Public URL: {public_url}")

# Run the Streamlit app
!streamlit run /content/app.py

Public URL: NgrokTunnel: "https://2263-34-57-80-108.ngrok-free.app" -> "http://localhost:8501"

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.57.80.108:8501[0m
[0m
[34m  Stopping...[0m
[34m  Stopping...[0m
