# BLM Keyword Identification
Sandboxing some simple keyword identification routines.
Full-scale work will probably not be done in a notebook; this is mostly exploration.

In [18]:
from os import getcwd, listdir
import os.path

import spacy
from scipy import sparse
import numpy as np

In [2]:
# Load the spaCy pipeline once; it is applied to every document's text below,
# and the named entities and noun chunks are read from the parsed result.
nlp = spacy.load("en_core_web_lg")

Gather a list of codex entries to analyze.

In [49]:
# Maximum number of documents processed by the cells below (they slice
# file_names[:DOC_LIMIT]); kept small while exploring.
DOC_LIMIT = 5

# Directory holding the codex entries, resolved against the current working directory.
base_path = os.path.join(getcwd(), "research/data/codex")

# listdir() returns entries in arbitrary order, so sort the names to make the
# DOC_LIMIT slice pick the same documents on every run and on every machine.
file_names = sorted(
    f for f in listdir(base_path)
    if f.endswith(".txt") and os.path.isfile(os.path.join(base_path, f))
)

The documents and "spans" (named entities and noun chunks) need to be assigned unique category numbers so that they can be operated upon numerically.

In [50]:
# file name -> row index, and lower-cased span text -> column index.
docs_m = dict()
spans_n = dict()

# Running counters; each holds the last index handed out (-1 means none yet).
doc_index_m = -1
span_index_n = -1
for file_name in file_names[:DOC_LIMIT]:
    # Decode explicitly so the text does not depend on the platform's default encoding.
    with open(os.path.join(base_path, file_name), 'r', encoding='utf-8') as f:
        doc = nlp(f.read())
    if file_name not in docs_m:
        doc_index_m += 1
        docs_m[file_name] = doc_index_m
    # Merge named entities with noun chunks, then let filter_spans drop
    # duplicate and overlapping spans.
    merged_spans = spacy.util.filter_spans(
        set(doc.ents).union(doc.noun_chunks))
    # Lower-case the span text so the same phrase maps to a single column.
    spans = [span.lower_ for span in merged_spans]
    for span in spans:
        if span not in spans_n:
            span_index_n += 1
            spans_n[span] = span_index_n

Pass through all of the documents a second time,
this time tabulating the span (entity) frequency per document for all known entities across the corpus.

In [51]:
# Document-by-span count matrix: one row per document, one column per known span.
M = sparse.lil_matrix((len(docs_m), len(spans_n)))
for file_name in file_names[:DOC_LIMIT]:
    doc_index = docs_m[file_name]
    # Decode explicitly so the text does not depend on the platform's default encoding.
    with open(os.path.join(base_path, file_name), 'r', encoding='utf-8') as f:
        text = f.read().lower()
    for span, span_index in spans_n.items():
        # NOTE(review): str.count tallies non-overlapping *substring* matches, so a
        # short span is also counted when it occurs inside a longer word.
        # Confirm whether word-boundary matching is wanted before relying on these counts.
        occurrences = text.count(span)
        # The matrix starts out all-zero, so only non-zero counts need to be written.
        if occurrences:
            M[doc_index, span_index] = occurrences

Checking the density of the matrix for sanity.

In [52]:
# Percentage of matrix cells that hold a stored (non-zero) entry.
total_cells = np.prod(M.shape)
density = 100 * M.nnz / total_cells
report = "Matrix density: {}%"
print(report.format(round(density, 2)))

Matrix density: 25.64%


Calculate TF-IDF values within the matrix.