In [1]:
import numpy as np
import pickle
import nltk
nltk.download('brown')
nltk.download('stopwords')
from nltk.corpus import brown, stopwords
from scipy.cluster.vq import kmeans2
from sklearn.decomposition import PCA

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\josep\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\josep\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [5]:
#The corpus consists of 500 samples of text drawn from a wide range of sources. When these are concatenated, they form a very long stream of over a million words, which is available as brown.words(). Let's look at the first 250 words.
# brown.words() is called once and sliced, instead of being called again for every printed word.
for word in brown.words()[:250]:
    print(word)

The
Fulton
County
Grand
Jury
said
Friday
an
investigation
of
Atlanta's
recent
primary
election
produced
``
no
evidence
''
that
any
irregularities
took
place
.
The
jury
further
said
in
term-end
presentments
that
the
City
Executive
Committee
,
which
had
over-all
charge
of
the
election
,
``
deserves
the
praise
and
thanks
of
the
City
of
Atlanta
''
for
the
manner
in
which
the
election
was
conducted
.
The
September-October
term
jury
had
been
charged
by
Fulton
Superior
Court
Judge
Durwood
Pye
to
investigate
reports
of
possible
``
irregularities
''
in
the
hard-fought
primary
which
was
won
by
Mayor-nominate
Ivan
Allen
Jr.
.
``
Only
a
relative
handful
of
such
reports
was
received
''
,
the
jury
said
,
``
considering
the
widespread
interest
in
the
election
,
the
number
of
voters
and
the
size
of
this
city
''
.
The
jury
said
it
did
find
that
many
of
Georgia's
registration
and
election
laws
``
are
outmoded
or
inadequate
and
often
ambiguous
''
.
It
recommended
that
Fulton
legislators
act
``
to
have
th

In [6]:
#Before doing anything else, let's remove stopwords and punctuation and make everything lowercase. The resulting sequence will be stored in my_word_stream.
my_stopwords = set(stopwords.words('english'))
# Pass 1: lowercase every token and drop the ones that are stopwords.
word_stream = []
for token in brown.words():
    lowered = str(token).lower()
    if lowered not in my_stopwords:
        word_stream.append(lowered)
# Pass 2: keep only purely alphanumeric tokens of at least two characters.
# This removes punctuation tokens and also hyphenated forms such as "term-end".
my_word_stream = [tok for tok in word_stream if len(tok) >= 2 and tok.isalnum()]

In [8]:
# Preview the first 40 tokens of the cleaned stream.
my_word_stream[:40]

['fulton',
 'county',
 'grand',
 'jury',
 'said',
 'friday',
 'investigation',
 'recent',
 'primary',
 'election',
 'produced',
 'evidence',
 'irregularities',
 'took',
 'place',
 'jury',
 'said',
 'presentments',
 'city',
 'executive',
 'committee',
 'charge',
 'election',
 'deserves',
 'praise',
 'thanks',
 'city',
 'atlanta',
 'manner',
 'election',
 'conducted',
 'term',
 'jury',
 'charged',
 'fulton',
 'superior',
 'court',
 'judge',
 'durwood',
 'pye']

In [9]:
#Computing co-occurrence probabilities
#Step 1: Get a list of words and their frequencies.
N = len(my_word_stream)
words = []   # distinct words, in order of first appearance
totals = {}  # word -> number of occurrences in my_word_stream
# Iterate over every token, including the first and the last one.
for w in my_word_stream:
    # Test membership against the dict (constant time) rather than the list.
    if w not in totals:
        words.append(w)
        totals[w] = 0
    totals[w] += 1

In [10]:
#Step 2: Decide on the vocabulary. There are two potentially distinct vocabularies: the words for which we will obtain embeddings (vocab_words) and the words we will consider when looking at context information (context_words). We will take the former to be all words that occur at least 20 times, and the latter to be all words that occur at least 100 times. These choices are pretty arbitrary: by all means, play around with them and find something better.
MIN_VOCAB_COUNT = 20     # minimum occurrences for a word to receive an embedding
MIN_CONTEXT_COUNT = 100  # minimum occurrences for a word to be used as a context feature
vocab_words = [w for w in words if totals[w] >= MIN_VOCAB_COUNT]
context_words = [w for w in words if totals[w] >= MIN_CONTEXT_COUNT]

In [11]:
# Sizes of the embedding vocabulary and of the context vocabulary.
len(vocab_words), len(context_words)

(4720, 918)

In [12]:
#Step 3: Get co-occurrence counts. These are defined as follows, for a small constant window_size=2.

#Let w0 be any word in vocab_words and w any word in context_words.
#Each time w0 occurs in the corpus, look at the window of window_size words before and after it. If w appears in this window, we say it appears in the context of (this particular occurrence of) w0.
#Define counts[w0][w] as the total number of times w occurs in the context of w0.
#The function get_counts computes the counts array, and returns it as a dictionary (of dictionaries).
def get_counts(window_size=2):
    """Count how often each context word appears near each vocabulary word.

    Reads the module-level `my_word_stream`, `vocab_words` and `context_words`.

    Args:
        window_size: number of positions examined on each side of an
            occurrence of a vocabulary word.

    Returns:
        A dict of dicts: counts[w0][w] is the number of times context word w
        was seen within `window_size` positions of vocabulary word w0. Every
        word in `vocab_words` has an entry (possibly empty); context words
        that were never seen next to w0 are absent from counts[w0].

    Note:
        Occurrences closer than `window_size` positions to either end of the
        stream are not used as centre words.
    """
    # Sets give constant-time membership tests; the lists would be scanned
    # linearly for every token of the corpus.
    vocab_set = set(vocab_words)
    context_set = set(context_words)
    n_tokens = len(my_word_stream)
    # Relative positions of the window, excluding the centre word itself.
    offsets = [j for j in range(-window_size, window_size + 1) if j != 0]

    counts = {w0: {} for w0 in vocab_words}
    for i in range(window_size, n_tokens - window_size):
        w0 = my_word_stream[i]
        if w0 not in vocab_set:
            continue
        row = counts[w0]
        for j in offsets:
            w = my_word_stream[i + j]
            if w in context_set:
                row[w] = row.get(w, 0) + 1
    return counts

In [13]:
#Define probs[w0][] to be the distribution over the context of w0, that is:

#probs[w0][w] = counts[w0][w] / (sum of all counts[w0][])
#This is computed by the function get_co_occurrence_dictionary, given counts.
def get_co_occurrence_dictionary(counts):
    """Normalise each word's context counts into a probability distribution.

    Args:
        counts: dict of dicts, counts[w0][w] = number of times w occurred in
            the context of w0.

    Returns:
        A dict of dicts with probs[w0][w] = counts[w0][w] / sum(counts[w0].values()).
        Words whose counts sum to zero (including words with no context
        entries at all) are left out of the result.
    """
    probs = {}
    for w0, context_counts in counts.items():
        total = sum(context_counts.values())
        if total > 0:
            probs[w0] = {w: float(c) / float(total) for w, c in context_counts.items()}
    return probs

In [14]:
#The final piece of information we need is the frequency of different context words. The function below, get_context_word_distribution, takes counts as input and returns (again, in dictionary form) the array:

#context_frequency[w] = sum of all counts[][w] / sum of all counts[][]
def get_context_word_distribution(counts):
    """Return the overall relative frequency of every context word.

    Reads the module-level `context_words`.

    Args:
        counts: dict of dicts, counts[w0][w] = number of times context word w
            occurred in the context of w0.

    Returns:
        A dict mapping each word in `context_words` to
        (sum over w0 of counts[w0][w]) / (sum of all entries of counts).
    """
    per_word_total = dict.fromkeys(context_words, 0)
    grand_total = 0
    for neighbours in counts.values():
        for ctx, hits in neighbours.items():
            per_word_total[ctx] += hits
            grand_total += hits
    return {
        ctx: float(per_word_total[ctx]) / float(grand_total)
        for ctx in context_words
    }

In [15]:
#3. The embedding
#Based on the various pieces of information above, we compute the pointwise mutual information matrix:

#PMI[i,j] = MAX(0, log probs[ith vocab word][jth context word] - log context_frequency[jth context word])
#The embedding of any word can then be taken as the corresponding row of this matrix. However, to reduce the dimension, we will apply PCA.
print ("Computing counts and distributions")
counts = get_counts(2)
probs = get_co_occurrence_dictionary(counts)
context_frequency = get_context_word_distribution(counts)
#
print ("Computing pointwise mutual information")
n_vocab = len(vocab_words)
n_context = len(context_words)
# Column of each context word, built once so the inner loop does a dict lookup
# instead of a linear context_words.index() scan per entry.
context_index = {w: j for j, w in enumerate(context_words)}
pmi = np.zeros((n_vocab, n_context))
for i, w0 in enumerate(vocab_words):
    # probs has no entry for a vocabulary word whose context counts sum to
    # zero; such a word keeps an all-zero row instead of raising KeyError.
    for w, p in probs.get(w0, {}).items():
        j = context_index[w]
        # Positive PMI: negative values are clipped to zero.
        pmi[i, j] = max(0.0, np.log(p) - np.log(context_frequency[w]))

Computing counts and distributions
Computing pointwise mutual information


In [17]:
#Now reduce the dimension of the PMI vectors using principal component analysis. Here we bring it down to 100 dimensions, and then normalize the vectors to unit length
# random_state is fixed so that re-running the notebook gives the same embedding
# when PCA selects a randomized SVD solver.
pca = PCA(n_components=100, random_state=0)
vecs = pca.fit_transform(pmi)
# Normalize all rows at once; a row of zero length is left unchanged rather than turned into NaNs.
norms = np.linalg.norm(vecs, axis=1, keepdims=True)
norms[norms == 0.0] = 1.0
vecs = vecs / norms

In [18]:
#It is useful to save this embedding so that it doesn't need to be computed every time.
# The three objects are written one after another into the same file, so they
# have to be read back with three pickle.load calls in the same order.
# The with-block closes the file even if one of the dumps raises.
with open("embedding.pickle", "wb") as fd:
    pickle.dump(vocab_words, fd)
    pickle.dump(context_words, fd)
    pickle.dump(vecs, fd)

In [19]:
#We can get some insight into the embedding by looking at the nearest neighbor of different words in the embedded space.
def word_NN(w):
    """Return the vocabulary word whose embedding is closest to that of `w`.

    Reads the module-level `vocab_words` and `vecs`; row i of `vecs` is used
    as the embedding of vocab_words[i]. Distance is Euclidean.

    Args:
        w: the query word.

    Returns:
        The nearest other word in `vocab_words`, or None (after printing
        "Unknown word") when `w` is not in the vocabulary. If the vocabulary
        holds only the query word, that word itself is returned.
    """
    if w not in vocab_words:
        print ("Unknown word")
        return
    query_index = vocab_words.index(w)
    dists = np.linalg.norm(vecs - vecs[query_index], axis=1)
    # Exclude the query word by position. Relying on "distance > 0" fails when
    # the query is the first vocabulary word: the search then starts from a
    # best distance of 0 and can never move away from the query itself.
    dists[query_index] = np.inf
    return vocab_words[int(np.argmin(dists))]

In [20]:
# Spot check: nearest neighbour of a medical term.
word_NN('pulmonary')

'artery'

In [21]:
# Spot check: nearest neighbour of a political term.
word_NN('communism')

'era'

In [22]:
# Prints "Unknown word": this word is not in vocab_words
# (presumably it occurs fewer than 20 times in the cleaned stream; TODO confirm).
word_NN('ass')

Unknown word


In [24]:
# Spot check: nearest neighbour of a day of the week.
word_NN('friday')

'sunday'

In [26]:
# Prints "Unknown word": 'be' is not in vocab_words
# (presumably removed by the stopword filter before the vocabulary was built; TODO confirm).
word_NN('be')

Unknown word


In [27]:
# Spot check: nearest neighbour of a place name. The recorded result ('later')
# is not an obviously related word.
word_NN('london')

'later'