In [1]:
import numpy as np
import word_embedding as we

In [2]:
import nltk

# Function to check if the dataset is already downloaded
def download_nltk_data_if_needed(dataset_name):
    try:
        # Check if the dataset is already available locally
        nltk.data.find(f'corpora/{dataset_name}')
    except LookupError:
        # If not found, download the dataset
        nltk.download(dataset_name)

# Replace 'treebank' with any other dataset you want to check
download_nltk_data_if_needed('treebank')

In [3]:
tagged_corpus = nltk.corpus.treebank.tagged_sents()
print(tagged_corpus[0])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


In [4]:
window_sizes = [2, 3, 4, 5]  # Experiment with different window sizes
words_to_check = ['bank', 'life', 'deal']  # Words to check nearest neighbors for

In [6]:

for window_size in window_sizes:
    print(f"\n--- Training with window size: {window_size} ---")
    
    # Train using the updated word2vec_pos with the current window size
    co_occurrence_matrix, vocab_index = we.word2vec_pos(tagged_corpus, window_size=window_size, pos_weighting=None, pos_weights=None)

    for word in words_to_check:
    # Include the POS tag with the word to differentiate its grammatical role
        word_with_pos = f"{word.lower()}_NN"  # Example: looking for nouns
        if word_with_pos in vocab_index:
            # The function now expects the word with its POS tag as the first argument
            neighbors = we.find_nearest_neighbors(word_with_pos, co_occurrence_matrix, vocab_index, top_n=5)
            print(f"Nearest neighbors for '{word}' with window size {window_size}: {neighbors}")
        else:
            print(f"Word '{word}' not found in the vocabulary.")




--- Training with window size: 2 ---
Nearest neighbors for 'bank' with window size 2: [('bank_NN', 0.9999999999999998), ('problem_NN', 0.777471561405543), ('day_NN', 0.7444897800851263), ('law_NN', 0.7185290673597967), ('thrift_NN', 0.7184157229917689)]
Nearest neighbors for 'life' with window size 2: [('life_NN', 1.0000000000000002), ('business_NN', 0.6281363246550238), ('harm_NN', 0.614023645162669), ('spouse_NN', 0.6092076990801715), ('history_NN', 0.5993505457967399)]
Nearest neighbors for 'deal' with window size 2: [('deal_NN', 1.0), ('problem_NN', 0.8048757988498922), ('team_NN', 0.7601551737556985), ('bill_NN', 0.7574528079233805), ('report_NN', 0.7528979557460042)]

--- Training with window size: 3 ---
Nearest neighbors for 'bank' with window size 3: [('bank_NN', 1.0), ('problem_NN', 0.7974392389917512), ('group_NN', 0.7739648027081939), ('law_NN', 0.7659590171910243), ('move_NN', 0.7503959759184194)]
Nearest neighbors for 'life' with window size 3: [('life_NN', 1.000000000000

In [8]:
for window_size in window_sizes:
    print(f"\n--- Training with window size: {window_size} ---")
    
    # Apply dynamic POS weighting (adjust 'pos_weights' as needed)
    pos_weights = {"_NN": 1.5, "_VB": 1.2}  # Example weights for nouns and verbs
    
    # Generate the co-occurrence matrix with POS weighting
    co_occurrence_matrix, vocab_index = we.word2vec_pos(tagged_corpus, window_size=window_size, pos_weights=pos_weights)
    
    # Transform the co-occurrence matrix using PPMI
    ppmi_matrix = we.compute_ppmi(co_occurrence_matrix)

    for word in words_to_check:
        word_with_pos = f"{word.lower()}_NN"  # Target nouns specifically
        if word_with_pos in vocab_index:
            word_idx = vocab_index[word_with_pos]
            word_vector = ppmi_matrix[word_idx, :]
            
            # Find nearest neighbors, considering only those with the same POS tag ('NN' here)
            neighbors = we.find_nearest_neighbors(word_with_pos, ppmi_matrix, vocab_index, top_n=5)  # Assuming this function is adapted for POS
            print(f"Nearest neighbors for '{word}' (as a noun) with window size {window_size}: {neighbors}")
        else:
            print(f"Word '{word}' not found in the vocabulary.")






--- Training with window size: 2 ---
Nearest neighbors for 'bank' (as a noun) with window size 2: [('bank_NN', 1.0), ('hub_NN', 0.17952764701792673), ('rap_NN', 0.1633844953010835), ('banking_NN', 0.157220764306915), ('habit_NN', 0.130114415948027)]
Nearest neighbors for 'life' (as a noun) with window size 2: [('life_NN', 0.9999999999999999), ('wine-making_NN', 0.24164500764015825), ('batting_NN', 0.20151008696454353), ('sex_NN', 0.1545129465608383), ('spouse_NN', 0.14828250341079866)]
Nearest neighbors for 'deal' (as a noun) with window size 2: [('deal_NN', 0.9999999999999999), ('laser_NN', 0.15279723156683317), ('count_NN', 0.15129338638974235), ('bridge_NN', 0.14226287106357155), ('sidewalk_NN', 0.1306482734255041)]

--- Training with window size: 3 ---
Nearest neighbors for 'bank' (as a noun) with window size 3: [('bank_NN', 1.0), ('banking_NN', 0.12815605433362717), ('milestone_NN', 0.123461102504619), ('hub_NN', 0.1163675461216579), ('directorship_NN', 0.10588980236779791)]
Near