# Document Embedding and Similarity
> Lilian Luong

In [58]:
# Imports
import docx
import fitz
import gensim
import numpy as np
import pickle as pkl
import time

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation

In [56]:
# SETUP
# If this is your first time running, uncomment the code below and run once

# # Load model from Google .bin and pickle

# model = gensim.models.KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin', binary=True)
# with open("./data/model.pkl", "wb") as f:
#     pkl.dump(model, f)
    
# # Download from nltk

# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')

# # Custom model: starts as an empty dict; embed_document adds vectors for tokens missing from the main model

# with open("./data/custom_model.pkl", "wb") as f:
#     pkl.dump({}, f)

In [57]:
# Load model from pickle
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that were produced locally (e.g. by the setup code above).
t0 = time.time()
with open("./data/model.pkl", "rb") as f:
    model = pkl.load(f)
print("Model loaded in", time.time() - t0, "seconds")

# Load the custom model, which embed_document looks up by token and extends
# with vectors for tokens that are missing from the main model.
t0 = time.time()
with open("./data/custom_model.pkl", "rb") as f:
    custom_model = pkl.load(f)
print("Custom model loaded in", time.time() - t0, "seconds")

# Load stopwords from nltk
# (requires the nltk 'stopwords' corpus; see the download step in the setup code)
eng_stopwords = stopwords.words('english')
print("Loaded list of English stopwords")

Model loaded in 8.513442993164062 seconds
Custom model loaded in 0.005997896194458008 seconds
Loaded list of English stopwords


In [25]:
# Document loading functions
# @param filepath   path to a file of the appropriate type
# @return           string of text in the document

def read_docx(filepath):
    """Return the text of a .docx file, with paragraphs joined by newlines."""
    document = docx.Document(filepath)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def read_pdf(filepath):
    """Return the plain text of a PDF, with pages joined by newlines.

    @param filepath   path to a PDF file that fitz (PyMuPDF) can open
    @return           string of text in the document
    """
    doc = fitz.open(filepath)
    try:
        page_texts = []
        # len(doc) is the page count and avoids the legacy pageCount name.
        for page_number in range(len(doc)):
            page = doc[page_number]
            # PyMuPDF renamed getText() to get_text(); use whichever name
            # this installation provides.
            extract = getattr(page, "get_text", None) or page.getText
            page_texts.append(extract("text"))
    finally:
        # Close the document even if text extraction fails part-way through.
        doc.close()
    return "\n".join(page_texts)

def read_txt(filepath):
    """Return the entire contents of a plain-text file as one string."""
    with open(filepath) as handle:
        return handle.read()

def read_file(filepath):
    """Read a document by dispatching on its file extension.

    The text after the last "." in filepath, lowercased, selects the reader:
    "docx", "pdf" or "txt". Any other value raises KeyError.
    """
    suffix = filepath.rpartition(".")[2].lower()
    readers = {"docx": read_docx, "pdf": read_pdf, "txt": read_txt}
    loader = readers[suffix]
    return loader(filepath)

In [34]:
# Load one sample document of each supported type through the generic loader.
docx_text = read_file('./data/docx_test.docx')
pdf_text = read_file('./data/pdf_test.pdf')
txt_text = read_file('./data/txt_test.txt')

In [62]:
def embed_document(text):
    """Return a 300-dim embedding for the document with the given text.

    Punctuation is replaced by spaces, every digit is replaced by the
    placeholder " $number ", and the result is tokenized with word_tokenize.
    The embedding is the sum of one vector per non-stopword token: the vector
    from model when the token is in it, otherwise a vector from custom_model.

    Side effect: a token found in neither model gets a new random vector
    (uniform in [-0.5, 0.5)) stored in custom_model, so later calls reuse it.
    The updated custom_model is not written back to disk here.

    @param text   string of document text
    @return       numpy array of shape (300,)
    """
    # One translation pass: punctuation -> space, digit -> number placeholder.
    # This matches replacing punctuation first and digits second, because
    # translate never re-scans the text it inserts.
    replacements = {char: " " for char in punctuation}
    replacements.update({digit: " $number " for digit in "0123456789"})
    tokens = word_tokenize(text.translate(str.maketrans(replacements)))

    # Build a set once so each membership test is O(1) instead of a list scan.
    # NOTE(review): matching is case-sensitive, so a capitalised stopword such
    # as "The" is not skipped if the stopword list is lowercase -- confirm
    # whether that is intended.
    stopword_set = set(eng_stopwords)

    embedding = np.zeros((300,))
    for token in tokens:
        if token in stopword_set:
            continue
        if token in model:
            embedding += model[token]
            continue
        # Out-of-vocabulary token: give it a random vector the first time it
        # is seen and remember it in custom_model for consistency.
        if token not in custom_model:
            custom_model[token] = np.random.random(size=(300,)) - 0.5
        embedding += custom_model[token]
    return embedding

In [60]:
def cos_similarity(v1, v2):
    """Return the cosine similarity of vectors v1 and v2.

    @param v1   first vector
    @param v2   second vector, same length as v1
    @return     dot(v1, v2) / (|v1| * |v2|), or 0.0 if either norm is zero
    """
    # Dividing once by the product of the norms keeps the result identical
    # when the arguments are swapped; dividing by each norm in turn does not.
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        # A zero vector (e.g. an empty document) has no direction; return 0.0
        # instead of dividing by zero and producing NaN.
        return 0.0
    return np.dot(v1, v2) / norm_product

In [63]:
# Embed each document once, then print the cosine similarity of every ordered
# pair of distinct documents. Pairs are distinguished by position, so two
# documents that happen to contain identical text are still compared.
documents = (docx_text, pdf_text, txt_text)
embeddings = [embed_document(text) for text in documents]
for i, emb_a in enumerate(embeddings):
    for j, emb_b in enumerate(embeddings):
        if i != j:
            print(cos_similarity(emb_a, emb_b))

0.648661611959087
0.6763562931654774
0.648661611959087
0.3869043236108866
0.6763562931654774
0.3869043236108865
