# CSC2611 Lab: Exercise

## Imports

In [1]:
import numpy as np
import nltk
from scipy import stats
from nltk.corpus import brown
from sklearn import decomposition

## Load corpus and table data

In [2]:
def load_table(file):
    """Read word pairs and their similarity scores from a text file.

    Every non-blank line is split on whitespace and is expected to hold at
    least three fields, ``word1 word2 score``; fields after the third are
    ignored. Blank lines are skipped.

    Args:
        file: Path of the file to read.

    Returns:
        A ``(word_pairs, sim)`` tuple, where ``word_pairs`` is a list of
        ``(word1, word2)`` tuples and ``sim`` is the list of scores converted
        to ``float``, both in file order.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
        ValueError: If the third field cannot be converted to ``float``.
    """
    word_pairs = []
    sim = []
    with open(file, 'r') as f:
        for line in f:
            data = line.split()
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no pair; skip it instead of failing on data[2].
            if not data:
                continue
            word_pairs.append(tuple(data[:2]))
            sim.append(float(data[2]))
    return word_pairs, sim

# P: list of (word1, word2) pairs; S: the matching similarity scores,
# both read from table1.txt.
P, S = load_table('table1.txt')
# Token sequence of the NLTK Brown corpus.
brown_corpus = brown.words()
# Distinct corpus tokens exactly as they occur (no lowercasing is applied here).
# NOTE(review): this is a list, so every `x in ALL_WORDS` test is a linear scan.
ALL_WORDS = list(set(brown_corpus))

## Construct word vectors

In [3]:
def extract_most_freq(n, corpus):
    """Return the ``n`` most frequent words of ``corpus``.

    Only tokens for which ``str.isalnum()`` is true are counted, and they are
    lowercased before counting, so differently-cased forms share one count.

    Args:
        n: Number of words to return.
        corpus: Iterable of string tokens.

    Returns:
        A list of at most ``n`` lowercased words, ordered from most to least
        frequent.
    """
    lowered_tokens = (token.lower() for token in corpus if token.isalnum())
    ranked = nltk.FreqDist(lowered_tokens).most_common(n)
    # most_common yields (word, count) entries; only the word is kept.
    return [entry[0] for entry in ranked]

# Base vocabulary: the 5000 most frequent (lowercased, alphanumeric) words of
# the Brown corpus, ordered from most to least frequent.
W = extract_most_freq(5000, brown_corpus)
print(f"5 most common words: {W[:5]}\n5 least common words: {W[-5:]}")

5 most common words: ['the', 'of', 'and', 'to', 'a']
5 least common words: ['tobacco', 'ignore', 'applies', 'relax', 'brass']


In [4]:
def get_uniq_words(word_pairs):
    """Return the distinct words of ``word_pairs`` in first-appearance order.

    Args:
        word_pairs: Iterable of word tuples (any length); the words must be
            hashable.

    Returns:
        A list holding each word once, ordered by where it first occurs when
        the pairs are read left to right.
    """
    # dict keys are unique and keep insertion order, which gives an
    # order-preserving de-duplication in linear time (the list-membership
    # version was quadratic in the number of words).
    return list(dict.fromkeys(word for pair in word_pairs for word in pair))

In [5]:
def update_W(W, pairs):
    """Extend the vocabulary ``W`` with the words used in ``pairs``.

    A word is appended only if it is not already in ``W`` and it occurs in the
    module-level ``ALL_WORDS`` collection.

    Args:
        W: List of vocabulary words. It is modified in place.
        pairs: Iterable of word tuples whose words should be added.

    Returns:
        The same list object ``W``, after the additions.
    """
    # Build the lookup sets once: testing membership against the lists
    # directly would rescan them for every candidate word.
    in_vocab = set(W)
    corpus_words = set(ALL_WORDS)
    for word in get_uniq_words(pairs):
        if word not in in_vocab and word in corpus_words:
            W.append(word)
            in_vocab.add(word)
    return W

# Add the words of the pairs in P to the vocabulary. update_W appends to W in
# place and returns that same list, so W_ and W refer to one object.
W_= update_W(W, P)
print(f"\n|W| = {len(W)}")


|W| = 5028


In [6]:
# ('had', 'been') 760
def construct_word_context(words, corpus):
    """Build a bigram count matrix restricted to a vocabulary.

    Tokens are lowercased before matching. For every pair of adjacent tokens
    ``(w1, w2)`` in ``corpus`` whose lowercased forms are both in ``words``,
    the count at row ``index(w2)``, column ``index(w1)`` is increased by one.
    Each row therefore describes a word by the words that immediately
    precede it.

    Args:
        words: Sequence of vocabulary words; a word's position is its
            row/column index.
        corpus: Iterable of string tokens.

    Returns:
        A ``(mat, word_idx)`` tuple: ``mat`` is a float array of shape
        ``(len(words), len(words))`` holding the counts and ``word_idx`` maps
        each word to its index.
    """
    n = len(words)
    word_idx = {word: i for i, word in enumerate(words)}
    mat = np.zeros((n, n))
    # Index of the previous token, or None when there is no previous token or
    # it is outside the vocabulary. Looking words up in the dict (rather than
    # scanning the `words` list) keeps this a single linear pass.
    prev = None
    for token in corpus:
        cur = word_idx.get(token.lower())
        if prev is not None and cur is not None:
            mat[cur, prev] += 1
        prev = cur
    return mat, word_idx
            
# M1: bigram count matrix over the vocabulary W; W_idx: word -> row/column index.
M1, W_idx = construct_word_context(W, brown_corpus)

In [7]:
def ppmi(mat):
    """Compute positive pointwise mutual information from a count matrix.

    Each entry is divided by ``row_total * column_total / grand_total``, the
    natural log is taken, and negative results are replaced by 0. Entries
    whose ratio is zero or undefined (0/0) also end up as 0.

    Args:
        mat: 2-D array of co-occurrence counts. It is not modified.

    Returns:
        A new array of the same shape holding the PPMI values.
    """
    totals_by_col = np.sum(mat, axis=0)
    totals_by_row = np.sum(mat, axis=1)
    grand_total = np.sum(totals_by_col)
    expected = np.outer(totals_by_row, totals_by_col) / grand_total
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = mat / expected
    # Zero and undefined ratios get a small positive stand-in so the log is
    # finite; its log is negative and is zeroed out by the clipping below.
    needs_floor = np.isnan(ratio) | (ratio == 0)
    ratio = np.where(needs_floor, 0.0001, ratio)
    log_ratio = np.log(ratio)
    return np.where(log_ratio < 0, 0.0, log_ratio)
    
# PPMI-weighted version of the bigram count matrix M1.
M1_plus = ppmi(M1)

In [8]:
# One PCA reducer per target dimensionality: 10, 100 and 300 components.
pca_10 = decomposition.PCA(n_components=10)
pca_100 = decomposition.PCA(n_components=100)
pca_300 = decomposition.PCA(n_components=300)

# Fit each reducer on the PPMI matrix and project its rows (one per word)
# into the reduced space.
M1_10 = pca_10.fit_transform(M1_plus)
M1_100 = pca_100.fit_transform(M1_plus)
M1_300 = pca_300.fit_transform(M1_plus)

## Calculate and compare similarity

In [9]:
def get_valid_pairs(pairs, sims, words):
    """Keep the pairs whose two words are both in ``words``.

    Args:
        pairs: Sequence of ``(word1, word2)`` pairs.
        sims: Similarity scores aligned with ``pairs``.
        words: Iterable of known (hashable) words.

    Returns:
        A ``(new_pairs, new_sims)`` tuple of lists holding the retained pairs
        and their scores, in the original order.
    """
    # Build the lookup set once so each membership test is O(1) instead of a
    # scan over the whole vocabulary list.
    vocab = set(words)
    new_pairs = []
    new_sims = []
    for pair, score in zip(pairs, sims):
        if pair[0] in vocab and pair[1] in vocab:
            new_pairs.append(pair)
            new_sims.append(score)
    return new_pairs, new_sims

# Drop the pairs (and their scores) that contain a word missing from the
# vocabulary W.
P, S = get_valid_pairs(P, S, W)

In [10]:
def extract_embeddings(pairs, mat, w_idx):
    """Look up the row of ``mat`` for each word of each pair.

    Args:
        pairs: Iterable of ``(word1, word2)`` pairs.
        mat: Row-indexable container of word vectors.
        w_idx: Mapping from word to its row index in ``mat``.

    Returns:
        A list of ``(vector1, vector2)`` tuples, one per pair, in order.

    Raises:
        KeyError: If a word is missing from ``w_idx``.
    """
    return [(mat[w_idx[first]], mat[w_idx[second]]) for first, second in pairs]


In [11]:
def cosine_sim(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Args:
        vec1: First vector (1-D array-like).
        vec2: Second vector, of the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either vector
        has zero norm.
    """
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero-norm vector would give 0/0 (NaN plus a RuntimeWarning), and a NaN
    # here would propagate into any statistic computed over the similarities.
    if denom == 0:
        return 0.0
    return np.dot(vec1, vec2) / denom

In [12]:
def calc_pair_sims(pairs):
    """Compute the cosine similarity of every vector pair.

    Args:
        pairs: Iterable of ``(vector1, vector2)`` tuples.

    Returns:
        A list with one ``cosine_sim`` value per pair, in the same order.
    """
    return [cosine_sim(left, right) for left, right in pairs]

In [13]:
def compare_sims(models, pairs, w_idx, similarities):
    """Correlate each model's cosine similarities with reference scores.

    For every embedding matrix in ``models``, the vectors of the words in
    ``pairs`` are looked up, their cosine similarities are computed, and the
    Pearson correlation with ``similarities`` is printed as
    ``Model <k>: <r>`` (``k`` counts from 1).

    Args:
        models: Iterable of embedding matrices, one row per word.
        pairs: Sequence of ``(word1, word2)`` pairs to evaluate.
        w_idx: Mapping from word to its row index in every model.
        similarities: Reference similarity scores aligned with ``pairs``.

    Returns:
        The list of Pearson correlation coefficients, one per model, in the
        order of ``models``.
    """
    correlations = []
    for i, model in enumerate(models, start=1):
        vec_pairs = extract_embeddings(pairs, model, w_idx)
        sim_calc = calc_pair_sims(vec_pairs)
        # pearsonr also returns a p-value; only the coefficient is reported.
        r, _ = stats.pearsonr(similarities, sim_calc)
        print(f'Model {i}: {r:.4f}')
        correlations.append(r)
    return correlations
        
# Evaluate the 10-, 100- and 300-component PCA embeddings (printed as models
# 1, 2 and 3) against the reference similarity scores S.
r = compare_sims([M1_10, M1_100, M1_300], P, W_idx, S)

Model 1: 0.1491
Model 2: 0.2686
Model 3: 0.3311
