In [2]:
import numpy as np
import pandas as pd
from itertools import chain

from dataset import read_word_list, pairs, find_common_words, filter_pairs_by_vocabulary

In [3]:
def pmi_bigrams(cooccurence_matrix, vocabulary):
    """Lazily yield every co-occurring (word, context) pair with its PMI.

    The pointwise mutual information is computed from raw counts as
    ``log2(c(w1, w2) * N / (c(w1, .) * c(., w2)))`` where ``c(w1, .)`` is the
    row sum for the word, ``c(., w2)`` is the column sum for the context word
    and ``N`` is the sum of the whole matrix.

    Args:
        cooccurence_matrix (np.array): 2-D array (or array-like) of
            co-occurrence counts; entry ``[i, j]`` is the count for the word
            with id ``i`` and the context word with id ``j``.
        vocabulary (dict): Word -> id mapping; the ids index both the rows
            and the columns of ``cooccurence_matrix``.

    Yields:
        dict: ``"word"``, ``"context"`` and ``"pmi"`` for each pair whose
        co-occurrence count and both marginal counts are non-zero. Pairs are
        produced in ``vocabulary`` iteration order (word first, then context).
    """
    counts = np.asarray(cooccurence_matrix)
    sum_occurences = np.sum(counts)
    # todo: fix beginning/ending words
    cnts_word1 = np.sum(counts, axis=1)  # one total per row (word)
    cnts_word2 = np.sum(counts, axis=0)  # one total per column (context)

    # Sanity check of the sum axes. Compares shapes rather than probing a
    # hard-coded row, so matrices of any size (including tiny ones) work.
    assert cnts_word1.shape[0] == counts.shape[0]
    assert cnts_word2.shape[0] == counts.shape[1]

    for word1, idx1 in vocabulary.items():
        cnt_word1 = cnts_word1[idx1]
        if cnt_word1 == 0:
            # Every pair starting with this word would be skipped below anyway.
            continue
        for word2, idx2 in vocabulary.items():
            cnt_coocurence = counts[idx1, idx2]
            cnt_word2 = cnts_word2[idx2]
            if cnt_coocurence == 0 or cnt_word2 == 0:
                # log2 is undefined for a zero numerator or denominator.
                continue
            pmi = (cnt_coocurence * sum_occurences) / (cnt_word1 * cnt_word2)
            pmi = np.log2(pmi)
            yield {
                "word": word1,
                "context": word2,
                "pmi": pmi
            }

In [7]:
def conduct_experiment(name, data_path, distances):
    """Run one PMI experiment and display its best and worst word pairs.

    Args:
        name (str): Label used as the prefix of the printed headings.
        data_path (str): Location handed to ``read_word_list`` to load the text.
        distances (iterable): Each element is passed as ``distance`` to
            ``pairs``; the pairs of all distances are counted together.
    """
    corpus_words = read_word_list(data_path)

    # Restrict to the words selected by find_common_words with a threshold of
    # 10, then give each of them a row/column index for the count matrix.
    frequent_words = find_common_words(corpus_words, occurence_threshold=10)
    word_ids = {word: position for position, word in enumerate(frequent_words)}

    # One filtered stream of pairs per requested distance, chained into a
    # single stream for counting.
    pair_streams = [
        filter_pairs_by_vocabulary(pairs(corpus_words, distance=gap), word_ids)
        for gap in distances
    ]
    counted_pairs = chain.from_iterable(pair_streams)

    # Accumulate the co-occurrence counts.
    size = len(word_ids)
    counts = np.zeros((size, size))
    for first, second in counted_pairs:
        counts[word_ids[first], word_ids[second]] += 1

    # Rank every co-occurring pair by PMI, highest first.
    pmi_records = list(pmi_bigrams(counts, word_ids))
    ranking = pd.DataFrame(pmi_records).sort_values("pmi", ascending=False)

    print(f"{name} with the highest Pointwise Mutual Information:")
    display(ranking.head(20))
    print(f"{name} with the lowest PMI:")
    display(ranking.tail(5))

In [8]:
# Experiment definitions. Each entry is a (name, data_path, distances) triple,
# unpacked in that order by the loop that calls conduct_experiment:
#   name      - label used in the printed headings,
#   data_path - text file to load,
#   distances - pair distances to count; range(2, 51) covers 2..50 inclusive.
experiments = [
    ("Czech pairs with distance 1", "data/TEXTCZ1.txt", [1]),
    ("English pairs with distance 1", "data/TEXTEN1.txt", [1]),
    ("Czech pairs with distance [2,50]", "data/TEXTCZ1.txt", range(2, 51)),
    ("English pairs with distance [2,50]", "data/TEXTEN1.txt", range(2, 51)),
]

In [9]:
# Run every configured experiment in order; each call prints and displays
# its own result tables.
for name, datapath, distances in experiments:
    conduct_experiment(name=name, data_path=datapath, distances=distances)

Czech pairs with distance 1 with the highest Pointwise Mutual Information:


Unnamed: 0,word,context,pmi
5046,zavedení,příspěvku,14.599157
26784,Peter,Carrington,14.277229
7882,pražském,hotelu,13.791802
35712,deník,The,13.599157
4937,starých,struktur,13.599157
35006,vojenského,materiálu,13.599157
26937,SE,,13.599157
33078,teplota,minus,13.469874
17728,platební,bilance,13.469874
34476,Hamburger,SV,13.429232


Czech pairs with distance 1 with the lowest PMI:


Unnamed: 0,word,context,pmi
2722,(,.,-6.140203
8620,na,.,-6.659
38084,.,že,-6.879808
38083,.,se,-7.194381
14315,",",.,-7.93717


English pairs with distance 1 with the highest Pointwise Mutual Information:


Unnamed: 0,word,context,pmi
27351,La,Plata,13.917209
43887,competent,observers,13.917209
33976,Asa,Gray,13.917209
638,de,Candolle,13.694816
42229,worth,while,13.402636
25569,faced,tumbler,13.332246
8422,Fritz,Muller,13.180243
37743,Malay,Archipelago,13.04274
36909,lowly,organised,13.04274
11094,shoulder,stripe,13.04274


English pairs with distance 1 with the lowest PMI:


Unnamed: 0,word,context,pmi
12613,in,of,-7.528919
44589,.,of,-7.56234
31007,of,.,-7.709464
44527,.,the,-8.268955
9000,the,",",-8.42218


Czech pairs with distance [2,50] with the highest Pointwise Mutual Information:


Unnamed: 0,word,context,pmi
202300,výher,výher,9.454761
823506,žel,žel,9.019011
109996,Sandžaku,Sandžaku,8.869873
743963,h,teplota,8.8567
129885,CIA,CIA,8.665419
771503,ODÚ,VPN,8.66048
341666,Petrof,Petrof,8.555163
657082,IFS,IFS,8.552574
346877,silniční,doprava,8.540066
578287,Bělehrad,Benfica,8.510431


Czech pairs with distance [2,50] with the lowest PMI:


Unnamed: 0,word,context,pmi
105816,1,kteří,-5.470394
135148,!,jsou,-5.52146
583862,6,jsem,-5.522305
105781,1,jednání,-5.546834
830064,2,jsem,-5.990622


English pairs with distance [2,50] with the highest Pointwise Mutual Information:


Unnamed: 0,word,context,pmi
454442,dried,floated,8.929476
474114,floated,dried,8.878393
454411,dried,germinated,8.619891
454438,dried,dried,8.524467
474085,floated,germinated,8.481611
548777,avicularia,vibracula,8.412526
474119,floated,floated,8.395611
139015,stripe,shoulder,8.380586
821461,layer,hexagonal,8.356975
718343,eastern,Pacific,8.296716


English pairs with distance [2,50] with the lowest PMI:


Unnamed: 0,word,context,pmi
586336,selection,islands,-3.787475
984133,genera,conditions,-3.842065
47397,species,wax,-4.0062
236503,wax,species,-4.02091
292786,varieties,organs,-4.148708
