In [2]:
# All Import Statements Defined Here
# Note: Do not add to this list.
# ----------------

import sys
# Fail fast unless the kernel runs Python 3 with a minor version of at least 5.
assert sys.version_info[0]==3
assert sys.version_info[1] >= 5

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import pprint
import matplotlib.pyplot as plt
# Default figure size (width, height) for every plot created in this notebook.
plt.rcParams['figure.figsize'] = [10, 5]
import nltk
# Make sure the Reuters corpus data is available locally before importing the reader.
nltk.download('reuters')
from nltk.corpus import reuters
import numpy as np
import random
import scipy as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

# Marker strings for document boundaries (see the "<START> ... <END>" example in the
# compute_co_occurrence_matrix docstring).
# NOTE(review): presumably added to each document by the corpus-reading code, which is not shown here.
START_TOKEN = '<START>'
END_TOKEN = '<END>'

# Seed both the NumPy and the Python standard-library global random generators.
np.random.seed(0)
random.seed(0)
# ----------------

[nltk_data] Downloading package reuters to
[nltk_data]     /Users/hamzagorgulu/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [3]:
def distinct_words(corpus):
    """ Collect the vocabulary of a corpus.
        Params:
            corpus (list of list of strings): corpus of documents, each document being a list of words
        Return:
            corpus_words (list of strings): sorted list of distinct words across the corpus
            num_corpus_words (integer): number of distinct words across the corpus
    """

    # ------------------
    # Walk every word of every document; the set keeps one copy of each word.
    vocabulary = {token for document in corpus for token in document}

    # Sets are unordered, so materialise a list and sort it to get a stable ordering.
    corpus_words = list(vocabulary)
    corpus_words.sort()
    # ------------------

    return corpus_words, len(corpus_words)

In [4]:
def compute_co_occurrence_matrix(corpus, window_size=4):
    """ Build the word-by-word co-occurrence count matrix of a corpus for a given window_size (default of 4).

        Note: Every word position in a document is used as the center of a window that reaches
              window_size positions to the left and to the right. Positions close to the start or
              end of a document have a truncated window and therefore fewer co-occurring words.

              For example, if we take the document "<START> All that glitters is not gold <END>" with window size of 4,
              "All" will co-occur with "<START>", "that", "glitters", "is", and "not".

        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): size of context window
        Return:
            M (a symmetric numpy matrix of shape (number of unique words in the corpus , number of unique words in the corpus)):
                Co-occurrence matrix of word counts.
                Rows/columns follow the word ordering returned by the distinct_words function.
            word2ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """
    words, num_words = distinct_words(corpus)

    # ------------------
    # Each word's position in the sorted vocabulary is its row/column number in M.
    word2ind = {word: index for index, word in enumerate(words)}

    M = np.zeros((num_words, num_words))

    for document in corpus:
        # Translate the document to matrix indices once instead of on every lookup.
        token_ids = [word2ind[token] for token in document]
        doc_length = len(token_ids)

        for center, center_id in enumerate(token_ids):
            # Clip the window so it never reaches outside the document.
            window_start = max(0, center - window_size)
            window_stop = min(doc_length, center + window_size + 1)

            for neighbor in range(window_start, window_stop):
                # A position does not co-occur with itself.
                if neighbor == center:
                    continue
                M[center_id, token_ids[neighbor]] += 1
    # ------------------

    return M, word2ind

In [5]:
def reduce_to_k_dim(M, k=2):
    """ Project a co-occurrence count matrix of dimensionality (num_corpus_words, num_corpus_words)
        down to a matrix of dimensionality (num_corpus_words, k) with Scikit-Learn's truncated SVD:
            - http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html

        Progress messages are printed before and after the decomposition.

        Params:
            M (numpy matrix of shape (number of unique words in the corpus , number of unique words in the corpus)): co-occurrence matrix of word counts
            k (int): embedding size of each word after dimension reduction
        Return:
            M_reduced (numpy matrix of shape (number of corpus words, k)): matrix of k-dimensional word embeddings.
                    In terms of the SVD from math class, this actually returns U * S
    """
    n_iters = 10     # number of iterations handed to `TruncatedSVD`
    print("Running Truncated SVD over %i words..." % (M.shape[0]))

    # ------------------
    # random_state is pinned to a fixed value rather than left to the global RNG.
    svd_options = {"n_components": k, "n_iter": n_iters, "random_state": 42}
    # Fit on M and return its k-dimensional projection in a single step.
    M_reduced = TruncatedSVD(**svd_options).fit_transform(M)
    # ------------------

    print("Done.")
    return M_reduced

In [6]:
def plot_embeddings(M_reduced, word2ind, words):
    """ Draw a labelled scatterplot of the embeddings of the words given in the list "words".
        NOTE: only the requested words are plotted, not everything in M_reduced / word2ind.
        Each point gets its word written next to it.

        Params:
            M_reduced (numpy matrix of shape (number of unique words in the corpus , 2)): matrix of 2-dimensional word embeddings
            word2ind (dict): dictionary that maps word to indices for matrix M
            words (list of strings): words whose embeddings we want to visualize
    """

    # ------------------
    for label in words:
        # The row of M_reduced belonging to this word holds its (x, y) coordinates.
        embedding = M_reduced[word2ind[label]]
        x_coord = embedding[0]
        y_coord = embedding[1]
        plt.scatter(x_coord, y_coord, marker='x', color='red')
        plt.text(x_coord, y_coord, label, fontsize=11)
    plt.show()
    # ------------------
    # ------------------