In [None]:
# TODO:
# 2. Write test code for ppmi function.

In [13]:
# Step 1. Import NLTK in Python: http://www.nltk.org/.
# Download the Brown Corpus http://www.nltk.org/book/ch02.html for analyses below.

import nltk
# nltk.download('brown')
from nltk.corpus import brown

In [14]:
# Step 2. Extract the 5000 most common English words (denoted by W) based on unigram
# frequencies in the Brown corpus. Report the 5 most and least common words you have found
# in the 5000 words. Update W by adding n words where n is the set of words in Table 1
# of RG65 that were not included in the top 5000 words from the Brown corpus. Denote the
# total number of words in W as |W|.

from collections import Counter
import csv
import common


def get_most_common_words(n):
    """
    Return the `n` most frequent tokens of the Brown corpus as a set.

    Tokens are counted exactly as they appear in `brown.sents()` (no case
    folding or filtering is applied here).
    """
    # Flatten the sentences into a single token stream and tally it.
    token_counts = Counter(
        token
        for sentence in brown.sents()
        for token in sentence
    )
    return {token for token, _ in token_counts.most_common(n)}


# Top-5000 Brown tokens by unigram frequency.
most_common_5000 = get_most_common_words(5000)
# NOTE(review): `common.load_rg65` is a project helper not shown here; from its
# use below it presumably returns the RG65 words and a mapping from word pairs
# to human similarity scores -- confirm against common.py.
rg65_words, rg65_word_pairs = common.load_rg65()
# W = the frequent words plus any RG65 words that were not already among them.
w_vocab = most_common_5000.union(rg65_words)


In [15]:
# Step 3. Construct a word-context vector model (denoted by M1) by collecting bigram counts
# for words in W. The output should be a |W|×|W| matrix (consider using sparse matrices
# for better efficiency), where each row is a word in W, and each column is a context in W
# that precedes row words in sentences. For example, if the phrase taxi driver appears 5 times
# in the entire corpus, then row taxi and column driver should have a value of 5 in the matrix.

import scipy.sparse
from nltk.util import bigrams
from tqdm import tqdm
import numpy as np


def get_w_matrix(w_vocab):
    """
    Build the word-by-context bigram count matrix for the Brown corpus.

    Each row is a word and each column is a context, where the context is the
    token that immediately precedes the word. Only bigrams whose two tokens
    are both in `w_vocab` are counted, and only words/contexts that occur in
    at least one such bigram get a row/column (so there are no all-zero rows
    or columns).

    Bigrams are taken over the flat `brown.words()` stream, so a pair that
    spans a sentence boundary is counted like any other pair.

    Parameters
    ----------
    w_vocab : collection of str
        Vocabulary to restrict both words and contexts to.

    Returns
    -------
    word_to_index : dict
        Maps each row word to its row index in the matrix.
    w_matrix : numpy.ndarray
        Dense float array of shape (n_words, n_contexts) with bigram counts.
    """
    vocab = frozenset(w_vocab)

    # One pass over the corpus: tally every in-vocabulary (context, word) pair.
    pair_counts = {}
    for context, word in tqdm(bigrams(brown.words())):
        if context not in vocab or word not in vocab:
            continue
        pair = (context, word)
        pair_counts[pair] = pair_counts.get(pair, 0) + 1

    # Sort before indexing: set iteration order for strings changes between
    # interpreter sessions (hash randomisation), which would make the row and
    # column layout differ from run to run.
    row_words = sorted({word for _, word in pair_counts})
    column_contexts = sorted({context for context, _ in pair_counts})
    word_to_index = {word: i for i, word in enumerate(row_words)}
    context_to_index = {context: i for i, context in enumerate(column_contexts)}

    w_matrix = np.zeros((len(word_to_index), len(context_to_index)))
    for (context, word), count in pair_counts.items():
        w_matrix[word_to_index[word], context_to_index[context]] = count
    return word_to_index, w_matrix


# M1: raw bigram counts (rows = words, columns = preceding contexts);
# word_to_index maps a word to its row in M1.
word_to_index, M1 = get_w_matrix(w_vocab)

1161191it [00:04, 277003.98it/s]
1161191it [00:05, 212287.37it/s]


In [6]:
# Step 4. Compute positive pointwise mutual information on M1. Denote this model as M1+.

def get_ppmi_matrix(count_matrix):
    """
    Convert a word-by-context count matrix to positive pointwise mutual
    information (PPMI).

    PPMI(w, c) = max(0, log2(P(w, c) / (P(w) * P(c)))), where P(w, c) is the
    cell's share of the total count and P(w), P(c) are the row and column
    marginals. Cells with a zero count get a PPMI of exactly 0.

    Parameters
    ----------
    count_matrix : array-like of shape (n_words, n_contexts)
        Non-negative co-occurrence counts. The input is not modified.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape holding the PPMI values. An all-zero
        input yields an all-zero result.
    """
    counts = np.asarray(count_matrix, dtype=float)
    ppmi = np.zeros_like(counts)

    total = counts.sum()
    if total == 0:
        # No observations at all: every probability is undefined, so report
        # "no association" everywhere instead of dividing 0 by 0.
        return ppmi

    joint = counts / total
    p_word = joint.sum(axis=1, keepdims=True)      # shape (n_words, 1)
    p_context = joint.sum(axis=0, keepdims=True)   # shape (1, n_contexts)
    observed = joint != 0

    # A ufunc called with `where=` but without `out=` leaves the masked-out
    # cells as uninitialised memory, so always write into an initialised
    # buffer. Unobserved cells keep ratio 1 here and PPMI 0 below.
    ratio = np.ones_like(joint)
    np.divide(joint, p_word * p_context, out=ratio, where=observed)
    np.log2(ratio, out=ppmi, where=observed)

    # Keep only positive associations.
    np.maximum(ppmi, 0, out=ppmi)
    return ppmi


# M1+: PPMI-weighted version of the raw count matrix M1 (same row order).
M1_plus = get_ppmi_matrix(M1)

In [7]:
# Step 5. Construct a latent semantic model (denoted by M2) by applying principal components
# analysis to M1+. The output should return 3 matrices, with different truncated
# dimensions at 10 (or a |W|×10 matrix, denoted by M210), 100 (M2100), and 300 (M2300).

from sklearn.decomposition import PCA
    
# For a matrix of this size scikit-learn's default `svd_solver="auto"` can pick
# the randomized SVD solver, whose output depends on `random_state`. Pin the
# seed so the embeddings (and the correlations reported later) are the same on
# every Restart & Run All.
PCA_RANDOM_STATE = 0

# Each model is fitted separately; rows stay aligned with M1_plus, so
# `word_to_index` can be used to look up rows in all three.
M210 = PCA(n_components=10, random_state=PCA_RANDOM_STATE).fit_transform(M1_plus)
M2100 = PCA(n_components=100, random_state=PCA_RANDOM_STATE).fit_transform(M1_plus)
M2300 = PCA(n_components=300, random_state=PCA_RANDOM_STATE).fit_transform(M1_plus)

In [16]:
# Step 6. Find all pairs of words in Table 1 of RG65 that are also available in W. Denote
# these pairs as P. Record the human-judged similarities of these word pairs from the table
# and denote similarity values as S.

# P: RG65 pairs for which both words have a row in the model matrices.
P = [
    (first, second)
    for first, second in rg65_word_pairs.keys()
    if all(token in word_to_index for token in (first, second))
]
# S: the human similarity score recorded for each pair in P, in the same order.
S = [rg65_word_pairs[(first, second)] for first, second in P]


In [17]:
# Step 7. Perform the following calculations on each of these models M1, M1+, M210, M2100,
# M2300, separately: Calculate cosine similarity between each pair of words in P, based on the
# constructed word vectors. Record model-predicted similarities: SM1, SM210 , SM2100 , SM2300 .

from sklearn.metrics.pairwise import cosine_similarity

def get_sim(word_matrix, word_to_index, word_pairs):
    """
    Return the cosine similarity of the row vectors for each word pair.

    `word_to_index` maps a word to its row in `word_matrix`; the result is a
    list with one similarity per entry of `word_pairs`, in the same order.
    A word missing from `word_to_index` raises KeyError.
    """
    def pair_similarity(first, second):
        row_a = word_matrix[word_to_index[first]]
        row_b = word_matrix[word_to_index[second]]
        # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here.
        return cosine_similarity([row_a], [row_b])[0][0]

    return [pair_similarity(first, second) for first, second in word_pairs]


# Model-predicted similarities for the pairs in P, one list per model,
# evaluated in the order M1, M1+, M2_10, M2_100, M2_300.
SM1, SM1_plus, SM210, SM2100, SM2300 = (
    get_sim(model, word_to_index, P)
    for model in (M1, M1_plus, M210, M2100, M2300)
)

In [18]:
# Step 8. Report Pearson correlation between S and each of the model-predicted similarities.
# Create a GitHub repository that implements all of your analyses; you will need this repo for
# the next lab.

from scipy.stats import pearsonr

# Label each model's predicted similarities so the report below can name them.
name_to_sim = dict(zip(
    ("M1", "M1_plus", "M210", "M2100", "M2300"),
    (SM1, SM1_plus, SM210, SM2100, SM2300),
))

# Pearson correlation (and its p-value) between each model and the human scores S.
for name, sim in name_to_sim.items():
    r, p = pearsonr(sim, S)
    print(f"{name}, r={r:.4f}, p={p:.4f}")
    

M1, r=0.0895, p=0.5078
M1_plus, r=0.2759, p=0.0378
M210, r=0.1004, p=0.4573
M2100, r=0.2700, p=0.0423
M2300, r=0.3085, p=0.0196


In [19]:
# Save word embeddings:
import common

# Persist the 300-dimensional PCA model together with its word -> row mapping.
# NOTE(review): `common.write_embedding_dict` is a project helper not shown
# here; the on-disk format is presumably a pickle (per the file name) -- confirm.
common.write_embedding_dict("data/M2300.pickle", M2300, word_to_index)
