# Meaning Construction from Text
**CSC2611** 

Kanika Chopra

In [1]:
import nltk
from nltk import bigrams
import numpy as np
from nltk.util import ngrams
from collections import Counter
from scipy.sparse import lil_matrix
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
import math
import pandas as pd

#### Section 1

In [2]:
nltk.download('brown')
from nltk.corpus import brown

[nltk_data] Downloading package brown to /home/jupyter/nltk_data...
[nltk_data]   Package brown is already up-to-date!


##### Preliminary Analysis

In [3]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [4]:
brown.words(categories='adventure')

['Dan', 'Morgan', 'told', 'himself', 'he', 'would', ...]

In [5]:
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [6]:
len(brown.words())

1161192

We have 1,162, 192 words in our corpus. 

#### Section 2

First, we want to extract the 5000 most common English words (denoted W) based on unigram frequencies in the Brown corpus. 

In [7]:
# Get all unigrams (words)
words = brown.words()

To clean the data before we get the unigrams, we want to remove all punctuation.

In [8]:
# Remove punctuation
words = [word.lower() for word in words if word.isalpha()]

In [9]:
# Get the Frequency Distribution
fdist = nltk.FreqDist(words)

In [97]:
# Extract top 5000 most common words
W = [pair[0].lower() for pair in fdist.most_common(5000)]

Next, we want to report the 5 most and least common words we have found.

In [98]:
# Most common words 
W[:5]

['the', ',', '.', 'of', 'and']

In [99]:
# Least common words 
W[-5:]

['expanded', 'emphasize', 'manhattan', 'temporarily', 'puts']

Then, we update W by adding n words where n is the set of words in Table 1 of RG65 that were not included in the top 5000 words from the Brown corpus. Denote the total number of words in W as |W|.

Hence, we need to get a list of the words from Table 1 and then compare to see which words are already in W. If they are, we skip the word. If they aren't, we add it to our set. 

In [100]:
rg65 = pd.read_csv('rg65.csv')

In [101]:
table1 = np.array(rg65[['word1', 'word2']]).reshape(-1, 1)

First, we want to remove duplicates from our table 1.

In [102]:
new_words = np.unique(table1).tolist()

In [103]:
len(new_words)

48

In [104]:
len(W)

5000

Currently, we have that table 1 has 48 words and W has 5000 words.

In [105]:
missing_words = [word for word in new_words if word not in W]
W.extend(missing_words)

In [106]:
len(W)

5031

In [107]:
W = np.array(W)

We have that |W| = 5031. Hence, there were 31 words that were not in the original corpus that were in Table 1. 

#### Section 3
We want to construct a word-context model (denoted M1) by collecting bigram counts for the words in W. This should become a |W| x |W| matrix where each word is a row in W and each column is a context in W that precedes row words in sentences. 

For example, if the phrase _taxi driver_ appears 5 times in the entire corpus, then the row _taxi_, column _driver_ would have a value of 5 in the matrix.

In [108]:
# Get the bigrams
bwbg = bigrams(words)

# Get the Frequency Distribution
fdist_bwbg = nltk.FreqDist(bwbg)

In [109]:
fdist_bwbg

FreqDist({('of', 'the'): 9625, (',', 'and'): 6288, ('.', 'The'): 6081, ('in', 'the'): 5546, (',', 'the'): 3754, ('.', '``'): 3515, ('to', 'the'): 3426, ("''", '.'): 3332, (';', ';'): 2784, ('.', 'He'): 2660, ...})

In [110]:
fdist_bwbg[('the', 'driver')]

21

Now, we want to create our word-context model M1 for the bigram counts for words in W.

In [111]:
M1 = np.zeros((len(W), len(W)))

In [112]:
# Iterate through words in W and count frequencies
for i, word1 in enumerate(W):
    for j, word2 in enumerate(W):
        M1[i,j] = fdist_bwbg[(word1, word2)]

#### Section 4: Positive Pointwise Mutual Information
Using M1, we want to compute positive pointwise mutual information and denote this as M1+.

We want to only transform the values where our count is non-zero. When the count is zero, we would end up with -$\infty$ so we ignore these since we would set negative values to 0 in PPMI.

In [113]:
fdist.freq('the')

0.05400743374050114

In [114]:
M1.sum()

1688414.0

In [115]:
M1_plus_tmp = np.zeros((len(W), len(W)))

In [117]:
n = len(fdist_bwbg)
for i in range(len(W)):
    for j in range(len(W)):
        Pwc = M1[i,j]/n
        Pc = fdist.freq(W[i])
        Pw = fdist.freq(W[j])
        if (Pc != 0) & (Pw != 0) & (M1[i,j] != 0):
            M1_plus_tmp[i,j] = max(math.log(Pwc/(Pc*Pw),2),0)
        else:
            M1_plus_tmp[i,j] = 0 

In [118]:
M1_plus = lil_matrix(M1_plus_tmp)

#### Section 5: Principal Component Analysis

In [119]:
M2_10 = PCA(n_components=10).fit_transform(M1_plus_tmp)
M2_100 = PCA(n_components=100).fit_transform(M1_plus_tmp)
M2_300 = PCA(n_components=300).fit_transform(M1_plus_tmp)

#### Section 6: 
Next, we want to find all pairs of words in Table 1 of RG65 that are also available in W . Denote these pairs as P . Record the human-judged similarities of these word pairs from the table and denote similarity values as S.

Since, we have added all of the words in Table 1 that are not in W, we can simply use all of the pair combinations as P.

In [120]:
S = []
P = []

In [121]:
for i in range(rg65.shape[0]):
    word1 = rg65.word1[i]
    word2 = rg65.word2[i]
    
    if (word1 in W) & (word2 in W):
        pair = (word1, word2)
        P.append(pair)
        s = rg65.similarity[i]
        S.append(s)

In [122]:
len(S), len(P)

(65, 65)

#### Step 7: Cosine Similarity
We want the cosine similarity between each pair of words in P based on the constructed word vectors for. We store these in SM1, SM1_plus, SM210, SM2100, SM2300.

**Something is wrong with cosine similarity** 

In [146]:
def cos_sim(M, W, P):
    sim = []
    for (word1, word2) in P:
        arr1 = M[np.where(W == word1)[0]]
        arr2 = M[np.where(W == word2)[0]]
        val = cosine_similarity(arr1, arr2)[0][0]
        sim.append(val)
    return sim

In [147]:
SM1 = cos_sim(M1, W, P)

In [148]:
SM1_plus = cos_sim(M1_plus, W, P)

In [149]:
SM2_10 = cos_sim(M2_10, W, P)

In [150]:
SM2_100 = cos_sim(M2_100, W, P)

In [151]:
SM2_300 = cos_sim(M2_300, W, P)

#### Step 8: Pearson Correlation
Lastly, we want to report the Pearson correlation between S and each of our model-predicted similarities. 

##### S vs. $SM_1$

In [152]:
pearsonr(S, SM1)

(0.20088035191717596, 0.10859784707272817)

##### S vs. $SM_{1+}$

In [153]:
pearsonr(S, SM1_plus)

(0.28743460050663483, 0.02025265881547502)

##### S vs. $SM_{2_{10}}$

In [154]:
pearsonr(S, SM2_10)

(0.1364997946513758, 0.2782638242188669)

##### S vs. $SM_{2_{100}}$

In [155]:
pearsonr(S, SM2_100)

(0.30373949422240853, 0.013904293173821856)

##### S vs. $SM_{2_{300}}$

In [156]:
pearsonr(S, SM2_300)

(0.29904592325561313, 0.015526426496618749)