# BLM Keyword Identification
Sandboxing some simple keyword identification routines.
Full-scale work will probably not be done in a notebook; this is mostly exploration.

In [15]:
from os import getcwd, listdir
import os.path
import threading
from concurrent.futures import ThreadPoolExecutor, wait

import spacy
from scipy import sparse
import numpy as np

In [2]:
# Load the "en_core_web_lg" spaCy pipeline once at module level; extract_entities
# calls this shared `nlp` object for every document it parses.
nlp = spacy.load("en_core_web_lg")

Gather a list of codex entries to analyze.

In [20]:
# Upper bound on how many corpus files the later cells process
# (they slice file_names[:DOC_LIMIT]).
DOC_LIMIT = 1000

# Corpus directory, resolved relative to the kernel's current working directory.
base_path = os.path.join(getcwd(), "research/data/codex")

# Keep regular ".txt" files only. os.listdir returns names in arbitrary order,
# so the listing is sorted: otherwise the subset selected by
# file_names[:DOC_LIMIT] could differ between runs and machines.
file_names = sorted(
    f for f in listdir(base_path)
    if f.endswith(".txt") and os.path.isfile(os.path.join(base_path, f))
)

Identifying an algorithm for named entity extraction.

In [4]:
def extract_entities(text):
    '''
    Return the lower-cased text of each non-overlapping span found in `text`.

    The candidate spans are the named entities and the noun chunks reported
    by the module-level `nlp` pipeline. They are pooled into one set and
    passed through `spacy.util.filter_spans` so that the result contains no
    overlapping spans. Lower-casing is done here so that later steps do not
    have to repeat it.
    '''
    parsed = nlp(text)
    candidates = set(parsed.ents) | set(parsed.noun_chunks)
    kept = spacy.util.filter_spans(candidates)
    return [span.lower_ for span in kept]

Identify unique entities across the corpus, and build a numeric index of each document and entity.
The document and entity indices will be used to reference values in a sparse matrix in a later step.

In [17]:
# Shared bookkeeping that inspect_file fills in:
#   docs_m       - file name   -> numeric document index
#   spans_n      - entity text -> numeric entity index
#   doc_index_m  - last document index handed out (-1 means none yet)
#   span_index_n - last entity index handed out (-1 means none yet)
opts = dict(
    docs_m={},
    spans_n={},
    doc_index_m=-1,
    span_index_n=-1,
)

# inspect_file is run from many worker threads against one shared state dict.
# "Is the key known? -> bump the counter -> store the index" is not atomic, so
# without this lock two threads can hand out duplicate or skipped indices.
_INDEX_LOCK = threading.Lock()


def inspect_file(file_name, o):
    '''
    Register one corpus file and the entities it contains in the state dict `o`.

    Parameters:
        file_name: name of a file inside `base_path`.
        o: dict with the keys "docs_m" (file name -> index), "spans_n"
           (entity -> index), "doc_index_m" and "span_index_n" (the last
           index handed out for each mapping). It is updated in place.

    Returns:
        None. The file name and every entity not seen before receive the
        next free index; names and entities already present are left alone.

    Safe to call concurrently: all reads and writes of `o` happen under a
    module-level lock, while file reading and entity extraction run outside it.
    '''
    with open(os.path.join(base_path, file_name), 'r') as f:
        text = f.read()
    with _INDEX_LOCK:
        if file_name not in o["docs_m"]:
            o["doc_index_m"] = o["doc_index_m"] + 1
            o["docs_m"][file_name] = o["doc_index_m"]
    # The expensive NLP step is kept outside the lock so workers are not serialized on it.
    entities = extract_entities(text)
    with _INDEX_LOCK:
        for entity in entities:
            if entity not in o["spans_n"]:
                o["span_index_n"] = o["span_index_n"] + 1
                o["spans_n"][entity] = o["span_index_n"]

In [21]:
# Fan the per-file inspection out over a thread pool; every task records its
# findings in the shared `opts` dict.
with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [
        executor.submit(inspect_file, name, opts)
        for name in file_names[:DOC_LIMIT]
    ]
done, not_done = wait(futures)
# Re-raise a worker's exception here so a failed file is not silently ignored.
for future in done:
    if future.exception():
        raise future.exception()
print("done")

done


Pass through all of the documents a second time.
This time, tabulate the span (entity) frequency per document for all known entities across the corpus.

In [None]:
# Document-by-entity count matrix: one row per indexed document and one column
# per indexed entity, addressed with the indices stored in `opts`.
M = sparse.lil_matrix((len(opts["docs_m"]), len(opts["spans_n"])))
for file_name in file_names[:DOC_LIMIT]:
    doc_index = opts["docs_m"][file_name]
    with open(os.path.join(base_path, file_name), 'r') as f:
        # Entities were lower-cased when they were indexed, so compare against
        # lower-cased document text.
        text = f.read().lower()
    for span, span_index in opts["spans_n"].items():
        # NOTE(review): str.count matches raw substrings, so a short entity is
        # also counted when it appears inside a longer word - confirm intended.
        occurrences = text.count(span)
        # M was just created all-zero, so only non-zero counts need writing;
        # skipping the zeros avoids one sparse assignment per absent entity.
        if occurrences:
            M[doc_index, span_index] = occurrences

Checking the density of the matrix for sanity.

In [12]:
# Share of matrix cells holding a stored (non-zero) value, as a percentage.
cell_count = np.prod(M.shape)
density = 100 * M.nnz / cell_count
print("Matrix density: {}%".format(round(density, 2)))

Matrix density: 14.9%


Calculate tfidf values within the matrix.