<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#word-vector-exploration" data-toc-modified-id="word-vector-exploration-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>word vector exploration</a></span><ul class="toc-item"><li><span><a href="#First-manual-exploration-of-GloVe-vectors" data-toc-modified-id="First-manual-exploration-of-GloVe-vectors-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>First manual exploration of GloVe vectors</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Cosine-similarity" data-toc-modified-id="Cosine-similarity-1.1.0.1"><span class="toc-item-num">1.1.0.1&nbsp;&nbsp;</span>Cosine similarity</a></span></li></ul></li></ul></li><li><span><a href="#test-corpus-form-CS241-asignment-1" data-toc-modified-id="test-corpus-form-CS241-asignment-1-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>test corpus form CS241 asignment 1</a></span></li><li><span><a href="#Straight-summed-corpus" data-toc-modified-id="Straight-summed-corpus-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Straight summed corpus</a></span></li><li><span><a href="#tf-idf-exploration" data-toc-modified-id="tf-idf-exploration-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>tf-idf exploration</a></span></li></ul></li></ul></div>

# word vector exploration

In [None]:
import os

import numpy as np
import sklearn

## First manual exploration of GloVe vectors

GloVe data from https://nlp.stanford.edu/projects/glove/. We will start with the 50-dimensional vectors.

In [None]:
# Location of the 50-dimensional GloVe vectors. Set the GLOVE_PATH environment
# variable to point elsewhere; otherwise the file is looked up under the
# current user's home directory instead of one specific user's absolute path.
fname = os.environ.get(
    "GLOVE_PATH",
    os.path.expanduser("~/ml_data/glove.6B/glove.6B.50d.txt"),
)

Note that there are 400K embeddings; we can use this to chunk things into manageable bits.

In [None]:
def split_gloveline(path):
    """Lazily read a GloVe text file, yielding one tokenised line at a time.

    Each line is stripped of surrounding whitespace and split on single
    spaces, so every yielded list holds the fields of that line as strings.

    Args:
        path: location of the GloVe text file.

    Yields:
        list[str]: the space-separated fields of one line.
    """
    # Decode explicitly as UTF-8 (the encoding of the GloVe downloads) instead
    # of relying on the platform's locale default, which differs per machine.
    with open(path, encoding="utf-8") as f:
        for line in f:
            yield line.strip().split(' ')

In [None]:
# Materialise every tokenised GloVe line into a single array of strings.
glove_array = np.array(list(split_gloveline(fname)))

In [None]:
glove_array.shape

In [None]:
# Column 0 holds the token; the remaining columns hold its vector components.
words = glove_array[:, 0]
# float32 rather than float16: half precision keeps only about three
# significant digits, which degrades the norms and dot products computed from
# these vectors later in the notebook.
embeddings = glove_array[:, 1:].astype(np.float32)

In [None]:
# Lookup table from each word to its row number in `words` / `embeddings`.
word_index = dict(zip(words, np.arange(len(words))))

In [None]:
# L2 norm of every embedding row, kept as a column so it broadcasts row-wise.
embedding_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)

In [None]:
# Divide each row by its own norm so that every embedding has unit length.
normalized_embeddings = embeddings / embedding_norms

In [None]:
test_embed = normalized_embeddings[word_index["test"]]

In [None]:
test_embed

In [None]:
normalized_embeddings[word_index["test"]].dot(test_embed)

In [None]:
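# Left disabled: presumably because an all-pairs similarity matrix over ~400K
# embeddings (400K x 400K entries) is too large to build; we score one query
# vector against the table at a time instead.
# sim_matrix = normalized_embeddings.dot(normalized_embeddings.T)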

In [None]:
test_embed.dot(normalized_embeddings.T)

In [None]:
# np.argsort sorts ascending, so the last 10 positions are the rows with the
# 10 largest dot products against `test_embed` (largest one last).
top_index  = np.argsort(test_embed.dot(normalized_embeddings.T))[-10:]
top_index

In [None]:
words[top_index]

In [None]:
def cosine_closest_words(vec, embeddings=normalized_embeddings, words=words, num=10):
    """Return the ``num`` words whose embeddings score highest against ``vec``.

    The score is the dot product of ``vec`` with every row of ``embeddings``;
    with unit-length rows this ranks words by cosine similarity to ``vec``.

    Args:
        vec: query vector.
        embeddings: table of embeddings, one row per word.
        words: array of words, aligned row-for-row with ``embeddings``.
        num: how many words to return.

    Returns:
        The selected entries of ``words``, best match first. Empty when
        ``num`` is zero or negative.
    """
    if num <= 0:
        # A slice of ``[-0:]`` selects *everything*, so num=0 used to return
        # the whole vocabulary instead of nothing.
        return words[:0]
    indices = np.argsort(vec.dot(embeddings.T))[-num:]
    # argsort is ascending; reverse so the closest word comes first.
    return words[indices][::-1]
def embedding_of(word, word_index=word_index, embeddings=normalized_embeddings):
    """Look up the embedding stored for ``word``.

    Args:
        word: the word to look up.
        word_index: mapping from word to its row in ``embeddings``.
        embeddings: table of embeddings, one row per word.

    Returns:
        The row of ``embeddings`` registered for ``word``.

    Raises:
        KeyError: if ``word`` is not a key of ``word_index``.
    """
    row = word_index[word]
    return embeddings[row]
    

In [None]:
cosine_closest_words(test_embed)

#### Cosine similarity


$$ \text{similarity} =\frac{\vec A\cdot\vec B}{\| A\|  \|B\| }$$

quote from https://scikit-learn.org/stable/modules/metrics.html#cosine-similarity :

cosine_similarity computes the L2-normalized dot product of vectors. That is, if $x$ and $y$ are row vectors, their cosine similarity $k$ is defined as:

$$ k(x, y) = \frac{x y^\top}{\|x\| \|y\|} $$

This is called cosine similarity, because Euclidean (L2) normalization projects the vectors onto the unit sphere, and their dot product is then the cosine of the angle between the points denoted by the vectors.

This [cosine similarity] kernel is a popular choice for computing the similarity of documents represented as tf-idf vectors. cosine_similarity accepts scipy.sparse matrices. (Note that the tf-idf functionality in sklearn.feature_extraction.text can produce normalized vectors, in which case cosine_similarity is equivalent to linear_kernel, only slower.)

## test corpus form CS241 asignment 1

In [None]:
import sys
# The notebook uses f-strings, which need Python 3.6 or newer; comparing the
# version tuple also avoids rejecting future major versions.
assert sys.version_info >= (3, 6)

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
from pprint import pprint
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]
import nltk
nltk.download('reuters')
nltk.download('stopwords')
from nltk.corpus import reuters,stopwords
import numpy as np
import random
import scipy as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
import string
import re


np.random.seed(0)
random.seed(0)

In [None]:
def select_corpus(category="crude"):
    """Return the raw text of the Reuters documents filed under ``category``.

    Args:
        category (string): Reuters category name.

    Returns:
        The raw text as returned by ``reuters.raw`` for that category.
    """
    # The argument used to be ignored in favour of a hard-coded "crude", and
    # the corpus reader expects the keyword ``categories`` (plural).
    return reuters.raw(categories=category)

def process_doc(doc, v=False):
    """Lower-case a tokenised document and filter out unwanted tokens.

    Filters are applied in this order: English stop words and single
    punctuation characters, tokens of length one, tokens missing from
    ``word_index``, and tokens containing a digit.

    Args:
        doc: iterable of string tokens.
        v: when truthy, print the token count after each stage, but only for
           documents that end up empty.

    Returns:
        list[str]: the surviving lower-cased tokens, in their original order.
    """
    reject = set(stopwords.words('english')) | set(string.punctuation)
    lowered = [w.lower() for w in doc]
    # Each stage gets its own name so the diagnostics below report the real
    # count per stage (the stop-word count used to be overwritten).
    without_stopwords = [w for w in lowered if w not in reject]
    multi_char = [w for w in without_stopwords if len(w) > 1]
    known = [w for w in multi_char if w in word_index]
    # Raw string so that ``\d`` is a regex class, not an invalid str escape.
    cleaned = [w for w in known if not re.search(r'\d+', w)]
    if v and len(cleaned) == 0:
        print(f'{len(doc)} initially')
        print(f'{len(lowered)} words after lower')
        print(f'{len(without_stopwords)} words after stopwords/punctuation')
        print(f'{len(multi_char)} words after len=1 removal')
        print(f'{len(known)} words after check in index')
        print(f'{len(cleaned)} words after number removal')
    return cleaned
                                



def read_corpus(category="crude",v=False):
    """Load and pre-process every file of one Reuters category.

    Params:
        category (string): category name
        v (bool): forwarded to ``process_doc`` to switch on its diagnostics
    Return:
        tuple ``(processed, raw)``: ``processed`` is a list with the
        ``process_doc`` output for each file; ``raw`` is a list with the
        unprocessed words of each file joined by single spaces.
    """
    tokenised_docs = [list(reuters.words(file_id)) for file_id in reuters.fileids(category)]
    processed_docs = []
    raw_docs = []
    for tokens in tokenised_docs:
        processed_docs.append(process_doc(tokens, v=v))
        raw_docs.append(' '.join(tokens))
    return processed_docs, raw_docs

In [None]:
corpus,raw_corpus = read_corpus(category='crude',v=True)

In [None]:
pprint(raw_corpus[0], compact=True, width=100)

In [None]:
test_doc=corpus[0]

In [None]:
# Map each token of the test document to its row in the embedding table,
# skipping any token that has no entry in `word_index`.
test_doc_indices = np.array([word_index[word] for word in test_doc if word in word_index])
test_doc_indices

In [None]:
docsum = np.sum(normalized_embeddings[test_doc_indices], axis=0)

In [None]:
docsum.shape

In [None]:
cosine_closest_words(docsum,normalized_embeddings,words,20)

This rightly shows that the sum reflects the most common words found in that document, which are mostly useless words. 

Need to try searching the corpus to see whether differentiating words are still captured, i.e. a search for "ocean" should ideally cut through the noise from very common words such as 'the' and 'a'.

Alternatively, the vectors in the sum need to be weighted according to their frequency of occurrence; see tf-idf.

## Straight summed corpus

In [None]:
corpus, raw_corpus = read_corpus()

In [None]:
# Print the raw text of every document whose processed token list is empty,
# i.e. documents that would contribute no word vectors at all.
for i,doc in enumerate(corpus):
    if not(doc):
        print(raw_corpus[i])
        

In [None]:
def doc_embeddings(doc, word_index=word_index, embeddings=normalized_embeddings):
    """Collect the embedding of every known word in ``doc``.

    Args:
        doc: iterable of words.
        word_index: mapping from word to its row in ``embeddings``.
        embeddings: table of embeddings, one row per word.

    Returns:
        np.ndarray: one row per word of ``doc`` that appears in
        ``word_index``, in document order. When no word is known, ``doc`` is
        printed as a diagnostic and an empty selection of rows is returned.
    """
    rows = [word_index[word] for word in doc if word in word_index]
    if not rows:
        print(doc)
    # Explicit integer dtype: a bare ``np.array([])`` is float64, which NumPy
    # rejects as an index, so a document without known words used to raise
    # IndexError right after the diagnostic print.
    selected = embeddings[np.array(rows, dtype=np.intp)]
    return np.array(selected)

In [None]:
# Documents contain different numbers of words, so their embedding arrays
# cannot be stacked: wrapping this ragged list in np.array raises ValueError
# on current NumPy. Keep a plain list, as `tfidf_embedded_corpus` does.
embedded_corpus = [doc_embeddings(doc) for doc in corpus]

In [None]:
# Collapse each document to the sum of its word vectors, then rescale every
# document vector to unit length.
summed_corpus = np.array([np.sum(doc_vectors, axis=0) for doc_vectors in embedded_corpus])
summed_norms = np.linalg.norm(summed_corpus, axis=1, keepdims=True)
summed_normalized_corpus = summed_corpus / summed_norms

In [None]:
summed_normalized_corpus.shape

In [None]:
def search(word, embedded_corpus=summed_normalized_corpus, text_corpus=corpus, num_display=5,v=False):
    """Rank documents by the dot product of their vector with a word's vector.

    Args:
        word: query word; looked up with ``embedding_of``.
        embedded_corpus: array with one vector per document.
        text_corpus: the documents as lists of words, aligned with
            ``embedded_corpus``; only used for printing.
        num_display: number of top documents to return.
        v: verbosity. Truthy prints rank, score and document number for each
           hit; a value greater than 1 also prints the document's words.

    Returns:
        np.ndarray: indices of the ``num_display`` best documents, best first.
    """
    wordvec = embedding_of(word)
    similarity = wordvec.dot(embedded_corpus.T)
    # argsort places NaN last, which the reversed slice below would turn into
    # the *top* hits; rank NaN scores as the worst possible match instead.
    ranking_key = np.where(np.isnan(similarity), -np.inf, similarity)
    indices = np.argsort(ranking_key)[:-num_display-1:-1]
    scores = similarity[indices]
    if v:
        for i, index in enumerate(indices):
            print(f'==================================================')
            print(f'{i+1}: score = {scores[i]} document #{index}')
            print(f'==================================================')
            if v > 1:
                # Show the document that was actually ranked here: look it up
                # by its corpus index (not by rank position) in the corpus the
                # caller supplied.
                pprint(' '.join(text_corpus[index]), compact=True, width=100)
    return np.array(indices)

In [None]:
search("flower",num_display=2,v=True)

In [None]:
doc_lengths = np.array([len(doc) for doc in embedded_corpus])

In [None]:
doc_lengths[search("flower",num_display=10)]

In [None]:
search("japan",num_display=1,v=True)

## tf-idf exploration

tf-idf stands for term frequency inverse document frequency.

$$ \text{tfidf}(t,d,D)=\text{tf}(t,d)\cdot \text{idf}(t,D) $$

where $t\in d$ is the term, $d\in D$ is the document in the corpus $D$.

We will use $\text{tf}(t,d) = f_{t,d}/|d|$ as the term frequency, which is the count of a term in a document $f_{t,d}$ normalized for document length, and a log-scaled inverse of the document frequency, $\text{idf}(t,D) = \log \left[ \frac{N}{1+|\{d \in D : t \in d\}|} \right]$ where $N = |D|$ is the number of documents, to give:

$$ \text{tfidf}(t,d,D)=\frac{f_{t,d}}{|d|}\cdot \log \left[ \frac{N}{1+|\{d \in D : t \in d\}|} \right]$$

In [None]:
def idf(term, corpus):
    """Log-scaled inverse document frequency of ``term`` within ``corpus``.

    Args:
        term: the term to score.
        corpus: sequence of documents, each supporting ``term in doc``.

    Returns:
        ``log(N / (1 + n_t))`` where ``N`` is the number of documents and
        ``n_t`` the number of documents containing ``term``.
    """
    total_docs = len(corpus)
    docs_with_term = sum(term in doc for doc in corpus)
    # The extra 1 keeps the denominator non-zero for terms found in no document.
    return np.log(total_docs / (1 + docs_with_term))
    

In [None]:
# Build one set of terms per document up front: `idf` tests membership in
# every document for every vocabulary word, which is a linear scan on the
# original lists. The values are unchanged, and an empty corpus now yields an
# empty table instead of an IndexError.
doc_term_sets = [set(doc) for doc in corpus]
corpus_idf = {word: idf(word, doc_term_sets) for word in set().union(*doc_term_sets)}

In [None]:
def tfidf(term, doc, corpus_idf=corpus_idf):
    """tf-idf weight of ``term`` in ``doc``.

    Args:
        term: the term to score.
        doc: list of words making up the document.
        corpus_idf: mapping from term to its inverse document frequency.

    Returns:
        The share of ``doc`` taken up by ``term`` (its count divided by the
        document length) multiplied by ``corpus_idf[term]``.

    Raises:
        ZeroDivisionError: if ``doc`` is empty.
        KeyError: if ``term`` has no entry in ``corpus_idf``.
    """
    term_frequency = doc.count(term) / len(doc)
    return term_frequency * corpus_idf[term]

In [None]:
#wordset in corpus
len(set(corpus[0]).union(*corpus[1:]))

In [None]:
len(corpus[0])

In [None]:
corpus[0].count('long')

In [None]:
tfidf('long',corpus[0])

In [None]:
tfidf('japan',corpus[0])

In [None]:
def tfidf_doc_embeddings(doc, corpus_idf=corpus_idf, word_index=word_index, embeddings=normalized_embeddings):
    """Embed each known word of ``doc``, scaled by its tf-idf weight.

    Args:
        doc: list of words making up the document.
        corpus_idf: mapping from term to inverse document frequency; passed
            on to ``tfidf``.
        word_index: mapping from word to its row in ``embeddings``.
        embeddings: table of embeddings, one row per word.

    Returns:
        np.ndarray: one scaled embedding per occurrence of a word of ``doc``
        that appears in ``word_index``, in document order.
    """
    # NOTE(review): a word occurring k times contributes k rows, each already
    # scaled by its term frequency, so summing the rows weights it by
    # k * tfidf. Confirm this is the intended weighting.
    weight_of = {}
    scaled_embeddings = []
    for word in doc:
        if word not in word_index:
            continue
        if word not in weight_of:
            # Compute each distinct word's weight once, and honour the
            # ``corpus_idf`` argument (it used to be silently ignored in
            # favour of tfidf's own default table).
            weight_of[word] = tfidf(word, doc, corpus_idf)
        scaled_embeddings.append(embeddings[word_index[word]] * weight_of[word])
    return np.array(scaled_embeddings)

In [None]:
tfidf_embedded_corpus = [tfidf_doc_embeddings(doc) for doc in corpus]

In [None]:
tfidf_doc_embeddings(corpus[0]).shape

In [None]:
# Sum the tf-idf-weighted word vectors of each document, then rescale every
# document vector to unit length.
tfidf_summed_corpus = np.array([np.sum(doc_vectors, axis=0) for doc_vectors in tfidf_embedded_corpus])
tfidf_summed_norms = np.linalg.norm(tfidf_summed_corpus, axis=1, keepdims=True)
tfidf_summed_normalized_corpus = tfidf_summed_corpus / tfidf_summed_norms

In [None]:
tfidf_summed_normalized_corpus.shape

### normal vs tfidf search

#### normal search

In [None]:
search("japan",num_display=5,v=1)

In [None]:
search("japan",num_display=2,v=2)

#### tfidf search

In [None]:
search("japan",num_display=5,v=1,embedded_corpus=tfidf_summed_normalized_corpus)

In [None]:
doc_lengths = np.array([len(doc) for doc in embedded_corpus])

In [None]:
search("japan",num_display=2,embedded_corpus=tfidf_summed_normalized_corpus,v=2)

### tfidf summary