### TTIC 31190 - HW1
#### Yingzi Jin

#### Code

In [3]:
import itertools as iter
import sys
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
import tqdm
from collections import defaultdict

In [4]:
def distributional_counting(text_file, V, Vc, w):
    """
    Count how often context words appear inside a window around target words.

    Each line of the file is treated as one sentence and tokenized on
    whitespace. For every token that belongs to V, the tokens at most w
    positions to its left or right (the token itself excluded) are inspected,
    and each one that belongs to Vc adds one to the (target, context) count.
    Windows never cross line boundaries.

    Input:
        text_file (string): path of the corpus file, one sentence per line
        V (set): target vocabulary
        Vc (set): context vocabulary
        w (int): window size, in tokens on each side of the target

    Returns:
        counts (dict): maps (x, y) pairs to co-occurrence counts; pairs that
            were never observed are absent
    """
    pair_counts = {}

    with open(text_file, 'r') as corpus:
        for raw_line in tqdm.tqdm(corpus.readlines()):
            tokens = raw_line.strip().split()
            n_tokens = len(tokens)

            for pos, target in enumerate(tokens):
                if target not in V:
                    continue

                # Clip the window to the sentence boundaries.
                first = max(0, pos - w)
                stop = min(n_tokens, pos + w + 1)

                # Left neighbours first, then right neighbours; the target
                # position itself is skipped.
                for k in iter.chain(range(first, pos), range(pos + 1, stop)):
                    context = tokens[k]
                    if context in Vc:
                        pair = (target, context)
                        pair_counts[pair] = pair_counts.get(pair, 0) + 1

    return pair_counts

In [5]:
def calculate_idf(text_file, Vc):
    """
    Count, for each context word, the number of sentences that contain it.

    Note that the returned values are raw sentence counts (document
    frequencies), not inverse frequencies; the caller combines them with S.

    Inputs:
        text_file (string): path of the corpus file, one sentence per line
        Vc (set): context vocabulary

    Returns:
        idfs_dict (dict): maps every word of Vc to the number of lines in
            which it occurs at least once (0 if it never occurs)
        S (int): total number of lines in the file
    """
    sentence_counts = {word: 0 for word in Vc}

    with open(text_file, 'r') as corpus:
        lines = corpus.readlines()

    n_sentences = len(lines)
    for raw_line in tqdm.tqdm(lines):
        # De-duplicate tokens so a word is counted once per sentence.
        unique_tokens = set(raw_line.strip().split())
        for token in unique_tokens:
            if token in Vc:
                sentence_counts[token] += 1

    return sentence_counts, n_sentences

In [6]:
def word_vector(counts, Vc):
    """
    Turn sparse (word, context) counts into dense count vectors.

    Inputs:
        counts (dict): keys are word pairs (x, y), values are co-occurrence
            counts
        Vc (set): context vocabulary; its iteration order fixes which column
            each context word occupies

    Returns:
        vectors (dict): maps each x that appears in counts to a numpy array
            of length len(Vc); entries without a count stay 0
    """
    columns = list(Vc)
    dim = len(Vc)
    column_of = dict(zip(columns, range(dim)))

    vectors = {}
    for (target, context), count in tqdm.tqdm(counts.items()):
        row = vectors.get(target)
        if row is None:
            # First time this target is seen: start from an all-zero row.
            row = vectors[target] = np.zeros(dim)
        row[column_of[context]] = count

    return vectors

In [7]:
def import_vocab(filename):
    """
    Imports vocabulary file.

    The file is read in full and split on any whitespace, so words may be
    separated by spaces or newlines; duplicates collapse into one entry.

    Input: 
        filename (string): vocabulary file

    Returns:
        a set of the distinct words in the file (empty for an empty file)
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(filename) as f:
        return set(f.read().split())

In [8]:
def cosine_similarity(vec1, vec2):
    """
    Computes the cosine similarity between two vectors.

    Inputs:
    vec1 (numpy.ndarray): The first vector
    vec2 (numpy.ndarray): The second vector

    Returns:
    float: The cosine similarity between vec1 and vec2, ranging from -1 to 1.
        If either vector has zero magnitude the similarity is undefined and
        0.0 is returned.
    """
    dot_product = np.dot(vec1, vec2)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)

    # An all-zero vector would otherwise produce 0/0 = NaN (plus a runtime
    # warning), which breaks sorting and rank correlation downstream.
    if denominator == 0:
        return 0.0

    return dot_product / denominator

In [9]:
def calculate_spearman(vectors_dict, wordpairs):
    """
    Calculates the Spearman correlations between annotated scores and word vector scores.

    The word-pairs file is expected to start with one header line (skipped),
    followed by whitespace-separated lines of the form "word1 word2 score".
    Blank lines are ignored. For every method, each pair is scored with the
    cosine similarity of the two word vectors; a pair gets score 0 when
    either word has no vector under that method.

    Inputs:
        vectors_dict (dict): methods as keys, the word vectors as values
        wordpairs (string): filename

    Returns:
        spearmans (dict): methods as key, correlation coefficient as value
    
    """
    scores_dict = defaultdict(list)
    annoted_scores = []
    spearmans = {}

    with open(wordpairs, 'r') as f:
        next(f)  # skip the header line
        for line in f:
            inputs = line.strip().split()
            if not inputs:
                # Tolerate blank lines (e.g. a trailing newline at the end
                # of the file) instead of failing with IndexError.
                continue

            word1 = inputs[0]
            word2 = inputs[1]
            annoted_scores.append(float(inputs[2]))

            for method, vectors in vectors_dict.items():
                if word1 in vectors and word2 in vectors:
                    cos = cosine_similarity(vectors[word1], vectors[word2])
                else:
                    # Out-of-vocabulary pair: fall back to a neutral score so
                    # every method is ranked over the same set of pairs.
                    cos = 0

                scores_dict[method].append(cos)

    annoted_scores = np.array(annoted_scores)
    for method in vectors_dict:
        scores = np.array(scores_dict[method])
        cc, _ = spearmanr(scores, annoted_scores)
        spearmans[method] = cc

    return spearmans

In [10]:
def transform_vector(vectors, tfidf=False, pmi=False, idf=None, S=None):
    """
    Transform word vectors according to the selected method.

    With pmi=True every entry becomes log2(count * N / (row_sum * col_sum)),
    where N is the sum of all counts; entries that are not finite (zero
    counts, empty rows or columns) are set to 0. With tfidf=True every column
    is multiplied by S / idf[y]; columns whose idf value is 0 become 0. If
    both flags are set, the tfidf result is returned. If neither flag is set,
    the count vectors are returned unchanged (as fresh arrays).

    Inputs:
        vectors (dict): word vectors based on counts
        tfidf (=False): True if method is tfidf
        pmi (=False): True if method is pmi
        idf (dict): per-context-word sentence counts, as returned by
            calculate_idf; values are taken in dict order and must line up
            with the columns of the vectors
        S (int): number of sentences in the corpus

    Returns:
        a dict of transformed word vectors, with the same keys and key order
        as the input

    Raises:
        ValueError: if tfidf is True but idf or S is missing
    """
    if not vectors:
        # np.vstack rejects an empty sequence; nothing to transform anyway.
        return {}

    vector_key_list = list(vectors.keys())
    matrix = np.vstack(list(vectors.values()))

    # Default: no transformation requested.
    val = matrix

    # Divisions by zero and log2(0) are expected here and cleaned up below,
    # so silence the corresponding numpy warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        if tfidf:
            if idf is None or S is None:
                raise ValueError("tfidf=True requires both idf and S")

            idf_array = np.array(list(idf.values()))
            val = np.where(idf_array != 0, matrix * (S / idf_array), 0)

        elif pmi:
            px = np.sum(matrix, axis=1)
            py = np.sum(matrix, axis=0)
            N = np.sum(matrix)

            val = np.log2(matrix * N / np.outer(px, py))
            val[~np.isfinite(val)] = 0

    return dict(zip(vector_key_list, val))

In [11]:
def top_bottom_n_pmi(vectors_pmi, ctrword, Vc, n):
    """
    Select context words that have the top n and bottom n pmi values with a center word.

    Column i of the vector is labelled with the i-th element of list(Vc), so
    Vc must iterate in the same order that was used to build the vectors.

    Inputs:
        vectors_pmi (dict): word vectors transformed based on pmi
        ctrword (string): center word
        Vc: set of context words
        n: int, number of words to return on each side (values below 0 are
            treated as 0)

    Returns:
        top_n_dict: dict of context word -> pmi, highest value first
        bottom_n_dict: dict of context word -> pmi, lowest value first
    """
    n = max(n, 0)
    values = vectors_pmi[ctrword]

    # Sort once (ascending) and read both ends from the same ordering.
    order = values.argsort()
    # Reversing before slicing keeps n == 0 empty; the former `[-n:]` slice
    # turned into `[0:]` and returned the entire vocabulary.
    top_n_indices = order[::-1][:n]
    bottom_n_indices = order[:n]

    vc_list = list(Vc)

    top_n_dict = {vc_list[idx]: values[idx] for idx in top_n_indices}
    bottom_n_dict = {vc_list[idx]: values[idx] for idx in bottom_n_indices}

    return top_n_dict, bottom_n_dict

In [12]:
def comparison(methods, window_sizes, text_file, V_file, Vc_files, wordpairs_files):
    """
    Evaluate every combination of context vocabulary, window size and
    word-pair dataset, for each requested vector method.

    For each context vocabulary file and window size the corpus is re-counted,
    count vectors are built, and the requested variants ("counts", "IDF",
    "PMI") are scored with Spearman correlation on every word-pair file.

    Inputs:
        methods (list of strings): any of "counts", "IDF", "PMI"
        window_sizes: iterable of int window sizes
        text_file: string, corpus path
        V_file: string, target vocabulary path
        Vc_files: list of strings, context vocabulary paths
        wordpairs_files: list of strings, word-pair dataset paths

    Returns:
        results: dict keyed by (Vc file name, word-pair file name, window
            size); each value is a dict mapping method name to its Spearman
            correlation
    """
    use_counts = "counts" in methods
    use_idf = "IDF" in methods
    use_pmi = "PMI" in methods

    target_vocab = import_vocab(V_file)
    results = {}

    for context_path in Vc_files:
        context_vocab = import_vocab(context_path)
        context_name = context_path.rsplit("/", 1)[-1]

        # Sentence frequencies depend only on the context vocabulary, so they
        # are computed once per vocabulary rather than once per window size.
        if use_idf:
            sentence_freq, n_sentences = calculate_idf(text_file, context_vocab)

        for window in window_sizes:
            raw_counts = distributional_counting(text_file, target_vocab, context_vocab, window)
            count_vectors = word_vector(raw_counts, context_vocab)

            per_method = {}
            if use_counts:
                per_method['counts'] = count_vectors
            if use_idf:
                per_method['IDF'] = transform_vector(
                    count_vectors, tfidf=True, idf=sentence_freq, S=n_sentences
                )
            if use_pmi:
                per_method['PMI'] = transform_vector(count_vectors, pmi=True)

            for pairs_path in wordpairs_files:
                key = (context_name, pairs_path.rsplit("/", 1)[-1], window)
                results[key] = calculate_spearman(per_method, pairs_path)

    return results

In [13]:
def query_cosine(query, vectors_pmi_w):
    """
    Score every other word against a query word, separately per window size.

    Inputs:
        query: string, must have a vector under every window size
        vectors_pmi_w: dict of dicts, window sizes as keys, and the
            corresponding pmi word vectors as values

    Returns:
        scores_dict: dict of dicts, window size -> {word: cosine similarity
            to the query}; the query itself is left out
    """
    scores_dict = defaultdict(dict)

    for window, embeddings in vectors_pmi_w.items():
        anchor = embeddings[query]
        for candidate, candidate_vector in tqdm.tqdm(embeddings.items()):
            if candidate == query:
                continue
            scores_dict[window][candidate] = cosine_similarity(anchor, candidate_vector)

    return scores_dict

In [14]:
def n_nearest(query_scores, n, w):
    """
    Return the n words most similar to the query for window size w.

    query_scores maps window size -> {word: similarity}. The words are
    returned best first; ties keep the order in which the words were stored.
    """
    similarities = query_scores[w]
    ranked_words = sorted(similarities, key=similarities.get, reverse=True)
    return ranked_words[:n]

In [15]:
def queries_n_nearest(query_category, vectors_pmi_w, n):
    """
    Tabulate the n nearest neighbors of several queries.

    query_category maps a category label to a list of query words. The
    result is a DataFrame with one column per (window size, category, query)
    triple, holding that query's n nearest neighbors from best to worst.
    """
    columns = {}

    for label in query_category:
        for word in query_category[label]:
            per_window_scores = query_cosine(word, vectors_pmi_w)
            for window in per_window_scores:
                columns[(window, label, word)] = n_nearest(per_window_scores, n, window)

    return pd.DataFrame(columns)

#### 1.2

In [18]:
# Corpus path reused by the later cells, plus a tiny target vocabulary and
# context vocabulary for sanity-checking distributional_counting.
text_file = "hw1-data/wiki-1percent.txt"
V_sample = ["chicken", "chicago", "coffee"]
Vc_sample = ["the", "wings", "chicago", "cup", "coffee"]

In [16]:
counts_sample_3 = distributional_counting(text_file, V_sample, Vc_sample, 3)

100%|██████████| 997898/997898 [00:03<00:00, 281097.48it/s]


In [17]:
print(counts_sample_3)

{('chicago', 'the'): 875, ('coffee', 'the'): 95, ('chicago', 'chicago'): 38, ('chicken', 'the'): 52, ('coffee', 'cup'): 10, ('chicken', 'wings'): 6, ('coffee', 'coffee'): 4, ('chicago', 'cup'): 1}


In [18]:
counts_sample_6 = distributional_counting(text_file, V_sample, Vc_sample, 6)

100%|██████████| 997898/997898 [00:03<00:00, 294473.08it/s]


In [19]:
print(counts_sample_6)

{('chicago', 'the'): 1467, ('coffee', 'the'): 201, ('chicago', 'chicago'): 122, ('chicken', 'the'): 103, ('coffee', 'cup'): 14, ('coffee', 'coffee'): 36, ('chicken', 'wings'): 7, ('chicago', 'cup'): 7, ('chicago', 'wings'): 2}


#### 1.3

In [21]:
# Target vocabulary, context vocabulary and the two word-pair evaluation sets.
V_file = "hw1-data/vocab-15kws.txt"
Vc_file = "hw1-data/vocab-5k.txt"
wordpairs_files = ["hw1-data/simlex-999.txt", "hw1-data/men.txt"]

In [22]:
V = import_vocab(V_file)
Vc = import_vocab(Vc_file)

In [24]:
counts = distributional_counting(text_file, V, Vc, 3)

100%|██████████| 997898/997898 [01:40<00:00, 9970.88it/s] 


In [25]:
vectors = word_vector(counts, Vc)

100%|██████████| 7692505/7692505 [00:07<00:00, 1093193.62it/s]


In [28]:
vectors_dict = {'counts': vectors}

In [29]:
# Spearman correlation of the raw-count vectors on each word-pair file,
# keyed by the file's base name.
wordpairs_cc = {}
for wordpairs in wordpairs_files:
    cc = calculate_spearman(vectors_dict, wordpairs)
    wordpairs_cc[wordpairs.split("/")[-1]] = cc

In [30]:
wordpairs_cc

{'simlex-999.txt': {'counts': 0.0587613533134978},
 'men.txt': {'counts': 0.2251396048448754}}

#### 2.1

In [33]:
idf, S = calculate_idf(text_file, Vc)

100%|██████████| 997898/997898 [00:07<00:00, 137263.47it/s]


In [34]:
vectors_idf = transform_vector(vectors, tfidf=True, idf=idf, S=S)

In [35]:
vectors_dict = {"IDF": vectors_idf}

In [36]:
# Same evaluation as before, now with vectors_dict holding the IDF-weighted
# vectors.
wordpairs_cc = {}
for wordpairs in wordpairs_files:
    cc = calculate_spearman(vectors_dict, wordpairs)
    wordpairs_cc[wordpairs.split("/")[-1]] = cc

In [37]:
wordpairs_cc

{'simlex-999.txt': {'IDF': 0.1643113945921928},
 'men.txt': {'IDF': 0.47281906258988254}}

#### 3.1

In [38]:
vectors_pmi = transform_vector(vectors, pmi=True)

In [40]:
ctrword = "coffee"
top, bottem = top_bottom_n_pmi(vectors_pmi, ctrword, Vc, 10)

In [41]:
top

{'tea': 8.16600126243293,
 'drinking': 7.58797865873193,
 'shop': 7.411693771493207,
 'costa': 7.350256393786161,
 'shops': 7.260751873418467,
 'sugar': 6.533949521544205,
 'coffee': 6.501977131805925,
 'mix': 6.131195903101976,
 'seattle': 5.950816325067398,
 'houses': 5.868161497268183}

In [42]:
bottem

{'he': -2.26033826495274,
 'be': -2.1509730526875237,
 'had': -1.9875291676196303,
 'this': -1.979549817934235,
 'not': -1.9115928402014317,
 'its': -1.839457915441101,
 'after': -1.598505205571959,
 'more': -1.4785257922880328,
 'when': -1.4043486976803334,
 'page': -1.2805627423998573}

#### 3.2

In [43]:
vectors_dict = {"PMI": vectors_pmi}

In [44]:
# Same evaluation as before, now with vectors_dict holding the PMI vectors.
wordpairs_cc = {}
for wordpairs in wordpairs_files:
    cc = calculate_spearman(vectors_dict, wordpairs)
    wordpairs_cc[wordpairs.split("/")[-1]] = cc

In [45]:
wordpairs_cc

{'simlex-999.txt': {'PMI': 0.18643183126956037},
 'men.txt': {'PMI': 0.4656324083603801}}

#### 4.1 Code

In [47]:
# Grid for the full comparison: all three vector methods, three window sizes
# and two context vocabularies.
methods = ['counts', 'IDF', 'PMI']
window_sizes = [1, 3, 6]
Vc_files = ["hw1-data/vocab-5k.txt", "hw1-data/vocab-15kws.txt"]

In [48]:
results = comparison(methods, window_sizes, text_file, V_file, Vc_files, wordpairs_files)

100%|██████████| 997898/997898 [00:08<00:00, 122775.82it/s]
100%|██████████| 997898/997898 [01:09<00:00, 14293.04it/s]
100%|██████████| 2757620/2757620 [00:04<00:00, 599674.49it/s]
100%|██████████| 997898/997898 [01:56<00:00, 8557.05it/s] 
100%|██████████| 7692505/7692505 [00:09<00:00, 831192.89it/s] 
100%|██████████| 997898/997898 [03:39<00:00, 4540.07it/s]
100%|██████████| 12186839/12186839 [00:18<00:00, 644901.84it/s]
100%|██████████| 997898/997898 [00:11<00:00, 90200.80it/s] 
100%|██████████| 997898/997898 [01:07<00:00, 14702.58it/s]
100%|██████████| 3693994/3693994 [00:11<00:00, 309433.45it/s]
100%|██████████| 997898/997898 [02:20<00:00, 7095.16it/s] 
100%|██████████| 10549201/10549201 [00:38<00:00, 272654.61it/s]
100%|██████████| 997898/997898 [03:59<00:00, 4167.92it/s] 
100%|██████████| 17199804/17199804 [01:28<00:00, 193975.98it/s]


In [49]:
df_results = pd.DataFrame(results).T

In [50]:
df_results

Unnamed: 0,Unnamed: 1,Unnamed: 2,counts,IDF,PMI
vocab-5k.txt,simlex-999.txt,1,0.067786,0.189229,0.227498
vocab-5k.txt,men.txt,1,0.209092,0.347559,0.433603
vocab-5k.txt,simlex-999.txt,3,0.058761,0.164311,0.186432
vocab-5k.txt,men.txt,3,0.22514,0.472819,0.465632
vocab-5k.txt,simlex-999.txt,6,0.044696,0.110603,0.150331
vocab-5k.txt,men.txt,6,0.241067,0.532399,0.472408
vocab-15kws.txt,simlex-999.txt,1,0.070014,0.1872,0.268065
vocab-15kws.txt,men.txt,1,0.206398,0.366168,0.470237
vocab-15kws.txt,simlex-999.txt,3,0.057142,0.147853,0.212292
vocab-15kws.txt,men.txt,3,0.220778,0.480952,0.519393


#### 4.1

In general, PMI outperforms IDF and Counts (Counts always performs the worst). The exceptions are `MEN.txt` with `vocab-5k.txt` at window sizes 3 and 6, where IDF performs best.

PMI's ability to capture the strength of word associations based on co-occurrence patterns makes it generally more effective for tasks related to word semantics, such as word similarity measurements in our case. 

Also, across all context vocabularies and window sizes, all three methods perform much better on `Men` than on `Simlex`.

When context vocabulary changes, the correlations generally are very similar between the two vocabularies. The differences are minor, suggesting that the context vocabulary doesn't dramatically change the quality of the word vectors in this dataset.

While correlations for all methods decrease with larger windows for `SimLex`, they increase for `Men`.


#### 4.2 

For `Men`, the similarity scores are determined more by relatedness (themes) than by senses (semantics). For example, "morning" and "sunrise" are assigned a very high similarity score in `Men`, but they are not synonyms. Therefore, a larger window, which allows capturing more meaningful semantic relationships, leads to higher correlations with `Men`.

For `Simlex`, unlike `Men`, the similarity scores are determined more based on senses. For example, "reader" and "author" are annotated as very low similarity in `Simlex` since they are not synonyms, but they are closely related in terms of theme. Larger windows that can include more noise and less specific context tend to capture relatedness more than pure similarity in senses.


#### 5.1 

In [16]:
Vc = import_vocab("hw1-data/vocab-5k.txt")
window_sizes = [1, 6]

In [23]:
# Build PMI vectors once per window size so the nearest-neighbour queries
# below can reuse them; keys are the window sizes.
vectors_pmi_w = {}
for w in window_sizes:
    counts = distributional_counting(text_file, V, Vc, w)
    vectors = word_vector(counts, Vc)
    vectors_pmi = transform_vector(vectors, pmi=True)
    vectors_pmi_w[w] = vectors_pmi

100%|██████████| 997898/997898 [01:03<00:00, 15740.90it/s]
100%|██████████| 2757620/2757620 [00:03<00:00, 837369.72it/s] 
100%|██████████| 997898/997898 [03:10<00:00, 5231.97it/s]
100%|██████████| 12186839/12186839 [00:18<00:00, 645420.99it/s]


In [24]:
query = "judges"

In [25]:
query_scores = query_cosine(query, vectors_pmi_w)
nearest_10_1 = n_nearest(query_scores, 10, 1)
nearest_10_6 = n_nearest(query_scores, 10, 6)

100%|██████████| 15225/15225 [00:01<00:00, 12508.88it/s]
100%|██████████| 15225/15225 [00:00<00:00, 52084.18it/s]


In [26]:
nearest_10_1

['judge',
 'players',
 'appeals',
 'officials',
 'ministers',
 'justices',
 'leaders',
 'members',
 'unanimously',
 'contestants']

In [27]:
nearest_10_6

['judge',
 'jury',
 'appeals',
 'courts',
 'panel',
 'supreme',
 'justice',
 'contestants',
 'candidates',
 'appeal']

#### 5.2 Code

In [53]:
# Query words grouped by part of speech; the group label becomes one level of
# the column index produced by queries_n_nearest.
query_category = {"nouns": ["dog", "goal", "table", "vehicle"],
                  "verbs": ["pass", "passing", "passed"],
                  "adjs": ["happy", "negative", "visible"],
                  "preps": ["from", "above", "after"]}

In [58]:
df = queries_n_nearest(query_category, vectors_pmi_w, 10)

100%|██████████| 15225/15225 [00:01<00:00, 12486.26it/s]
100%|██████████| 15225/15225 [00:00<00:00, 44359.30it/s]
100%|██████████| 15225/15225 [00:00<00:00, 27243.87it/s]
100%|██████████| 15225/15225 [00:01<00:00, 12275.79it/s]
100%|██████████| 15225/15225 [00:00<00:00, 60265.49it/s]
100%|██████████| 15225/15225 [00:00<00:00, 59348.34it/s]
100%|██████████| 15225/15225 [00:00<00:00, 61837.97it/s]
100%|██████████| 15225/15225 [00:00<00:00, 65732.98it/s]
100%|██████████| 15225/15225 [00:00<00:00, 71177.14it/s]
100%|██████████| 15225/15225 [00:00<00:00, 71009.90it/s]
100%|██████████| 15225/15225 [00:00<00:00, 68298.72it/s]
100%|██████████| 15225/15225 [00:00<00:00, 71130.92it/s]
100%|██████████| 15225/15225 [00:00<00:00, 70230.58it/s]
100%|██████████| 15225/15225 [00:00<00:00, 70926.45it/s]
100%|██████████| 15225/15225 [00:00<00:00, 71094.01it/s]
100%|██████████| 15225/15225 [00:00<00:00, 67776.65it/s]
100%|██████████| 15225/15225 [00:00<00:00, 70905.66it/s]
100%|██████████| 15225/15225 [0

In [59]:
df

Unnamed: 0_level_0,1,6,1,6,1,6,1,6,1,6,...,1,6,1,6,1,6,1,6,1,6
Unnamed: 0_level_1,nouns,nouns,nouns,nouns,nouns,nouns,nouns,nouns,verbs,verbs,...,adjs,adjs,adjs,adjs,preps,preps,preps,preps,preps,preps
Unnamed: 0_level_2,dog,dog,goal,goal,table,table,vehicle,vehicle,pass,pass,...,negative,negative,visible,visible,from,from,above,above,after,after
0,cat,girl,purpose,goals,category,lists,car,vehicles,passes,passes,...,positive,positive,noticeable,surface,in,in,below,page,before,before
1,turtle,cat,goals,scored,map,tables,vehicles,engine,run,drive,...,critical,reaction,evident,shape,between,between,here,should,when,he
2,bear,boy,aim,scoring,tables,template,aircraft,fuel,passed,touchdown,...,adverse,critical,accessible,relatively,through,",",debate,discussion,while,during
3,goat,horse,match,score,section,bottom,boat,speed,reach,passing,...,favorable,effects,recognizable,inside,until,until,following,talk,during,later
4,horse,animals,objectives,match,lists,text,bus,cars,drive,ball,...,significant,serious,available,color,since,at,list,below,following,his
5,dogs,animal,position,points,file,box,ship,motor,running,yards,...,serious,behavior,noteworthy,objects,into,south,review,link,since,when
6,hat,joe,concern,win,text,results,automobile,equipment,go,passed,...,strong,certain,notable,walls,by,moved,however,article,by,until
7,rabbit,wild,debut,victory,template,function,traffic,car,look,goal,...,useful,impact,susceptible,yellow,",",),talk,here,until,had
8,girl,steve,task,winning,below,simple,operations,traffic,passing,running,...,specific,potential,significant,structures,after,graduated,link,do,into,first
9,pig,dogs,demise,touchdown,picture,value,engine,aircraft,turn,runs,...,personal,effect,active,skin,.,into,same,review,but,following


In [61]:
cols = [col for col in df.columns if col[1] == "verbs"]
df[cols]

Unnamed: 0_level_0,1,6,1,6,1,6
Unnamed: 0_level_1,verbs,verbs,verbs,verbs,verbs,verbs
Unnamed: 0_level_2,pass,pass,passing,passing,passed,passed
0,passes,passes,crossing,passes,enacted,pass
1,run,drive,running,pass,adopted,act
2,passed,touchdown,wrecked,crossing,approved,declared
3,reach,passing,ran,runs,changed,granted
4,drive,ball,heading,drive,introduced,ran
5,running,yards,runs,highway,passes,adopted
6,go,passed,passes,traffic,returned,approved
7,look,goal,carrying,route,dropped,legislature
8,passing,running,moving,creek,voted,issued
9,turn,runs,driving,onto,carried,bill


In [62]:
cols = [col for col in df.columns if col[1] == "adjs"]
df[cols]

Unnamed: 0_level_0,1,6,1,6,1,6
Unnamed: 0_level_1,adjs,adjs,adjs,adjs,adjs,adjs
Unnamed: 0_level_2,happy,happy,negative,negative,visible,visible
0,pleased,anyone,positive,positive,noticeable,surface
1,surprised,'ll,critical,reaction,evident,shape
2,worried,everyone,adverse,critical,accessible,relatively
3,glad,'d,favorable,effects,recognizable,inside
4,sorry,let,significant,serious,available,color
5,afraid,ask,serious,behavior,noteworthy,objects
6,proud,feel,strong,certain,notable,walls
7,satisfied,wants,useful,impact,susceptible,yellow
8,willing,hope,specific,potential,significant,structures
9,sure,saying,personal,effect,active,skin


In [63]:
cols = [col for col in df.columns if col[1] == "preps"]
df[cols]

Unnamed: 0_level_0,1,6,1,6,1,6
Unnamed: 0_level_1,preps,preps,preps,preps,preps,preps
Unnamed: 0_level_2,from,from,above,above,after,after
0,in,in,below,page,before,before
1,between,between,here,should,when,he
2,through,",",debate,discussion,while,during
3,until,until,following,talk,during,later
4,since,at,list,below,following,his
5,into,south,review,link,since,when
6,by,moved,however,article,by,until
7,",",),talk,here,until,had
8,after,graduated,link,do,into,first
9,.,into,same,review,but,following


#### 5.2

For nouns, the nearest neighbors predominantly belong to the noun category. However, for other categories like verbs, adjectives, and prepositions, there's a higher variation in the types of nearest neighbors, especially as the window size changes. A smaller window size captures more syntactic relationships, leading to a more homogeneous list of nearest neighbors in terms of part-of-speech. In contrast, a larger window size tends to emphasize semantic relationships, resulting in a diverse set of nearest neighbors.

- **Nouns:** 
    - For example, the nearest neighbors for "dog" are mostly nouns. In the case of a smaller window size, it's observed that it returns specific animals like cat, rabbit, and chicken. On the other hand, a larger window size captures a broader semantic context, thus returning words such as man, boy, wild, and breed. This suggests that the larger window size perceives the concept of "dog" more as an animal in a broader sense.
    
- **Verbs:** 
    - "Pass" and its variations produce a mix of verbs and non-verbs. For the window size of 1, the results are mostly verbs, while for window size 6, there's a blend.
    
- **Adjectives:** 
    - For example, "happy" predominantly generates adjectives for the window size of 1. However, with window size 6, the nearest neighbors include verbs and pronouns, indicating a shift towards capturing semantic relationships.
    
- **Prepositions:** 
    - For example, the nearest neighbors for "from" are mostly prepositions, but for a window size of 6, there are some outliers like "graduated".

#### 5.3 Code

In [64]:
# Query words with multiple senses, grouped by the kind of ambiguity being
# probed.
query_category = {"polysemy": ["bank", "head", "fall", "book", "light"],
                  "homonymy": ['lead', 'bear', 'wound', "apple", "well"]}

In [65]:
df2 = queries_n_nearest(query_category, vectors_pmi_w, 10)

100%|██████████| 15225/15225 [00:00<00:00, 16423.74it/s]
100%|██████████| 15225/15225 [00:01<00:00, 15084.30it/s]
100%|██████████| 15225/15225 [00:00<00:00, 67527.09it/s]
100%|██████████| 15225/15225 [00:00<00:00, 50212.92it/s]
100%|██████████| 15225/15225 [00:00<00:00, 46705.33it/s]
100%|██████████| 15225/15225 [00:00<00:00, 60351.27it/s]
100%|██████████| 15225/15225 [00:00<00:00, 59319.45it/s]
100%|██████████| 15225/15225 [00:00<00:00, 43735.28it/s]
100%|██████████| 15225/15225 [00:00<00:00, 53413.35it/s]
100%|██████████| 15225/15225 [00:00<00:00, 59290.65it/s]
100%|██████████| 15225/15225 [00:00<00:00, 50990.72it/s]
100%|██████████| 15225/15225 [00:00<00:00, 48173.99it/s]
100%|██████████| 15225/15225 [00:00<00:00, 43974.65it/s]
100%|██████████| 15225/15225 [00:00<00:00, 45936.18it/s]
100%|██████████| 15225/15225 [00:00<00:00, 43629.50it/s]
100%|██████████| 15225/15225 [00:00<00:00, 40866.79it/s]
100%|██████████| 15225/15225 [00:00<00:00, 41709.36it/s]
100%|██████████| 15225/15225 [0

In [66]:
df2

Unnamed: 0_level_0,1,6,1,6,1,6,1,6,1,6,1,6,1,6,1,6,1,6,1,6
Unnamed: 0_level_1,polysemy,polysemy,polysemy,polysemy,polysemy,polysemy,polysemy,polysemy,polysemy,polysemy,homonymy,homonymy,homonymy,homonymy,homonymy,homonymy,homonymy,homonymy,homonymy,homonymy
Unnamed: 0_level_2,bank,bank,head,head,fall,fall,book,book,light,light,lead,lead,bear,bear,wound,wound,apple,apple,well,well
0,side,capital,director,coach,falling,fell,books,books,heavy,heavy,backing,role,pine,wild,injuries,injuries,pine,microsoft,however,such
1,coast,corporation,chief,chief,fell,spring,novel,published,water,surface,leading,character,dog,dog,broken,injury,atari,computers,united,other
2,railway,railway,club,director,go,falls,story,written,dark,dark,guest,featured,goose,golden,wounds,neck,cherry,os,preserved,many
3,park,northern,career,president,falls,summer,album,story,line,color,take,song,oak,bears,injury,arm,christmas,desktop,list,most
4,africa,branch,black,position,come,beginning,song,novel,large,body,main,away,beaver,mountain,washed,injured,olive,mac,discussion,are
5,banks,southern,former,former,break,finally,film,wrote,regiment,water,featured,stage,deer,wolf,thrown,leg,bear,hardware,debate,some
6,corporation,valley,member,white,move,ended,series,author,fire,sometimes,supporting,love,bird,blue,stretched,wounds,mini,macintosh,such,like
7,property,lake,assistant,manager,fallen,forced,game,film,power,low,single,guitar,trout,deep,suffer,knee,desktop,software,there,have
8,railroad,banks,life,assistant,summer,falling,music,song,battalion,blue,title,goal,maple,dragon,kicked,chest,egg,windows,known,more
9,province,centre,minister,brother,walk,winter,episode,series,small,type,bass,big,bears,creek,blow,severe,oak,devices,u,all


In [67]:
query_category = {"nouns": ["soviet"]}

In [68]:
queries_n_nearest(query_category, vectors_pmi_w, 10)

100%|██████████| 15225/15225 [00:00<00:00, 41861.32it/s]
100%|██████████| 15225/15225 [00:00<00:00, 46028.30it/s]


Unnamed: 0_level_0,1,6
Unnamed: 0_level_1,nouns,nouns
Unnamed: 0_level_2,soviet,soviet
0,german,russian
1,british,communist
2,israeli,union
3,russian,polish
4,french,forces
5,polish,russia
6,japanese,troops
7,italian,poland
8,iraqi,republic
9,allied,germany


#### 5.3
-  While embeddings capture various senses of polysemous words effectively, they often fall short with homonyms, with "apple" being an exception. Homonymous words are inherently tricky. The embeddings' inability to capture all senses might be influenced by the frequency of usage of each sense in the training data. 

    - For "apple",  8/10 nearest neighbors with window size 1 are related to the fruit meaning, while all 10 nearest neighbors with window size 6 are related to the Apple company.

    - For "bear," it's surprising that the embeddings didn't capture the meaning of "endure" as a verb, as it is used as a verb quite often. 


- A smaller window size (w=1) hones in on syntactic relationships, focusing on immediate contexts. A larger window size (w=6), on the other hand, broadens its horizon to capture semantic relationships and multiple senses more effectively.
    - For example:
        - Book: 
            - Window size = 1: Nearest neighbors include: "novel," "story," "film," "song," "music," and "game." These terms suggest a strong association with different forms of storytelling and entertainment mediums
            
            - Window size = 6: Nearest neighbors are: "published," "written," "wrote," "author," and "series." This collection showcases a broader context around the book, focusing on aspects of publishing, writing, and a mix of related media.

        - Bank:
            - Window size = 1: Nearest neighbors include terms closely related to financial institutions, such as "company," "insurance," "corporation," "railway," and "banking."

            - Window size = 6: While it still retains some financial connotations like "banks," "company," and "capital," there are other senses like "river" and "west" that indicate the riverbank meaning.

- It is not likely that a query generates exactly the same nearest neighbors with the two window sizes. However, when the word is a noun, is very specific and concrete, and has only one meaning, the neighbors with the two window sizes can be very similar. For example, "soviet" generates very similar neighbors. 