In [3]:
import numpy as np
import word_embedding as we # contains the word embedding functions

In [4]:
import nltk
#prevent unnecessary downloads
def download_nltk_data_if_needed(dataset_name):
    try:
        nltk.data.find(f'corpora/{dataset_name}')
    except LookupError:
        nltk.download(dataset_name)

download_nltk_data_if_needed('treebank')

In [5]:
#quick look at data
tagged_corpus = nltk.corpus.treebank.tagged_sents()
print(tagged_corpus[0])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


In [14]:
window_sizes = [2, 3, 4, 5]  
words_to_check = ['bank', 'teacher', 'firm']  # test with different words

Co-occurence matrix

In [15]:

for window_size in window_sizes:
    print(f"\n--- Training with window size: {window_size} ---")
    co_occurrence_matrix, vocab_index = we.word2vec_pos(tagged_corpus, window_size=window_size, pos_weighting=None, pos_weights=None)

    for word in words_to_check:
        word_with_pos = f"{word.lower()}_NN"  # look for nouns, change if looking for other POS
        if word_with_pos in vocab_index:
            neighbors = we.find_nearest_neighbors(word_with_pos, co_occurrence_matrix, vocab_index, top_n=5)
            print(f"Nearest neighbors for '{word}' with window size {window_size}: {neighbors}")
        else:
            print(f"Word '{word}' not found in the vocabulary.")




--- Training with window size: 2 ---
Nearest neighbors for 'bank' with window size 2: [('bank_NN', 0.9999999999999998), ('problem_NN', 0.777471561405543), ('day_NN', 0.7444897800851263), ('law_NN', 0.7185290673597967), ('thrift_NN', 0.7184157229917689)]
Nearest neighbors for 'teacher' with window size 2: [('teacher_NN', 0.9999999999999999), ('group_NN', 0.6980366760668572), ('game_NN', 0.6570210739972892), ('metallgesellschaft_NN', 0.636929755298482), ('problem_NN', 0.6361733063910819)]
Nearest neighbors for 'firm' with window size 2: [('firm_NN', 1.0), ('company_NN', 0.8929046695492058), ('market_NN', 0.8656699838558913), ('department_NN', 0.8609000940514345), ('world_NN', 0.8573188420505795)]

--- Training with window size: 3 ---
Nearest neighbors for 'bank' with window size 3: [('bank_NN', 1.0), ('problem_NN', 0.7974392389917512), ('group_NN', 0.7739648027081939), ('law_NN', 0.7659590171910243), ('move_NN', 0.7503959759184194)]
Nearest neighbors for 'teacher' with window size 3: [(

Pointwise Mutual Information

In [17]:
for window_size in window_sizes:
    print(f"\n--- Training with window size: {window_size} ---")
    pos_weights = {"_NN": 1.5, "_VB": 1.2}  # adjust POS weight manually
    
    co_occurrence_matrix, vocab_index = we.word2vec_pos(tagged_corpus, window_size=window_size, pos_weights=pos_weights)
    ppmi_matrix = we.compute_ppmi(co_occurrence_matrix)

    for word in words_to_check:
        word_with_pos = f"{word.lower()}_NN"  # look for nouns, change if looking for other POS
        if word_with_pos in vocab_index:
            word_idx = vocab_index[word_with_pos]
            word_vector = ppmi_matrix[word_idx, :]
            
            neighbors = we.find_nearest_neighbors(word_with_pos, ppmi_matrix, vocab_index, top_n=5)  
            print(f"Nearest neighbors for '{word}' (as a noun) with window size {window_size}: {neighbors}")
        else:
            print(f"Word '{word}' not found in the vocabulary.")






--- Training with window size: 2 ---


  ppmi_matrix = np.maximum(np.log2(joint_prob_matrix / (word_prob[:, None] * context_prob[None, :])), 0)
  ppmi_matrix = np.maximum(np.log2(joint_prob_matrix / (word_prob[:, None] * context_prob[None, :])), 0)


Nearest neighbors for 'bank' (as a noun) with window size 2: [('bank_NN', 1.0000000000000002), ('hub_NN', 0.18358817305338407), ('rap_NN', 0.16677927145748123), ('banking_NN', 0.15012653319507727), ('habit_NN', 0.13220693122246424)]
Nearest neighbors for 'teacher' (as a noun) with window size 2: [('teacher_NN', 1.0), ('incentive-bonus_NN', 0.1871396590702692), ('death_NN', 0.177774234273022), ('cadet_NN', 0.17092311444926309), ('balloting_NN', 0.15334396204507705)]
Nearest neighbors for 'firm' (as a noun) with window size 2: [('firm_NN', 1.0), ('participant_NN', 0.14409819226854925), ('replacement-car_NN', 0.11632386218533429), ('career_NN', 0.10954830004933941), ('merchant_NN', 0.10018303200017441)]

--- Training with window size: 3 ---
Nearest neighbors for 'bank' (as a noun) with window size 3: [('bank_NN', 1.0), ('presidency_NN', 0.13227420408595775), ('banking_NN', 0.12774255806054122), ('milestone_NN', 0.12749062655172147), ('swing_NN', 0.12068961200897817)]
Nearest neighbors for