# BLM Keyword Identification
Sandboxing some simple keyword identification routines.
Full scale work will probably not be done in a notebook, mostly exploration here.

In [1]:
from os import getcwd, listdir
import os.path
from concurrent.futures import ThreadPoolExecutor, wait
from pickle import Pickler, Unpickler

import spacy
from scipy import sparse
import numpy as np
import matplotlib.pyplot as plt

In [2]:
nlp = spacy.load("en_core_web_lg")

Gather a list of codex entries to analyze.

In [3]:
DOC_LIMIT = 500
MAX_WORKERS = None

base_path = os.path.join(getcwd(), "research/data/codex")

file_names = [f for f in listdir(base_path) if os.path.isfile(os.path.join(base_path, f)) and f.endswith(".txt")]

Identifing an algorithm for named entity extraction.

In [4]:
def extract_entities(text):
    '''
    Identify entities and noun_chunks within the document,
    and filter them such that there are no overlapping spans.
    Also taking this opportunity to convert text to lower-case
    to avoid having to do this later on.
    '''
    doc = nlp(text)
    spans = spacy.util.filter_spans(
        set(doc.ents).union(set(doc.noun_chunks)))
    entities = [t.lower_ for t in [span for span in spans]]
    return entities

Identify unique entities across the corpus, and build a numeric index of each document and entity.
The document and entitity indices will be used to reference values in a sparse matrix in a later step.

In [5]:
class Opts:
    def __init__(self, init_from=None):
        if init_from:
            self.docs_m = init_from.docs_m
            self.spans_n = init_from.spans_n
            self.doc_index_m = init_from.doc_index_m
            self.span_index_n = init_from.span_index_n
        else:
            self.docs_m = dict()
            self.spans_n = dict()
            self.doc_index_m = -1
            self.span_index_n = -1
        
    def __str__(self):
        return "doc_index_m: {}, span_index_n: {}".format(self.doc_index_m, self.span_index_n)
    
    def dump(self, file_name):
        with open(file_name, 'bw') as f:
            Pickler(f).dump(self)
            
    def load(self, file_name):
        with open(file_name, 'br') as f:
            self = Unpickler(f).load()

In [None]:
opts = Opts()

def inspect_file(file_name, o):
    with open(os.path.join(base_path, file_name), 'r') as f:
        text = f.read()
    if not file_name in o.docs_m:
        o.doc_index_m = o.doc_index_m + 1
        o.docs_m[file_name] = o.doc_index_m
    entities = extract_entities(text)
    for entity in entities:
        if not entity in o.spans_n:
            o.span_index_n = o.span_index_n + 1
            o.spans_n[entity] = o.span_index_n 

In [None]:
futures = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as e:
    for file_name in file_names[:DOC_LIMIT]:
        futures.append(e.submit(inspect_file, file_name, opts))
done, not_done = wait(futures, return_when="FIRST_EXCEPTION")
for d in done:
    if d.exception():
        [n.cancel() for n in not_done]
        raise d.exception()
print("done")

In [6]:
opts_file_name = os.path.join(os.getcwd(), "research/data/doc_entity_indices.pickle")

In [None]:
opts = Opts(init_from=opts) # make sure we're dumping whatever is currently in the repl's memory.
opts.dump(opts_file_name)

Pass through all of the documents a second time.
This time tabulating the span (entity) frequency per document for all known entities across the corpus.

In [None]:
opts.load(opts_file_name)

M = sparse.lil_matrix((len(opts.docs_m), len(opts.spans_n)))
def analyze_corpus(file_name, matrix, o):
    doc_index = o.docs_m[file_name]
    with open(os.path.join(base_path, file_name), 'r') as f:
        text = f.read().lower()
    for span, span_index in o.spans_n.items():
        matrix[doc_index, span_index] = text.count(span)

In [None]:
futures = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as e:
    for file_name in file_names[:DOC_LIMIT]:
        futures.append(e.submit(analyze_corpus, file_name, M, opts))
done, not_done = wait(futures, return_when="FIRST_EXCEPTION")
for d in done:
    if d.exception():
        [n.cancel() for n in not_done]
        raise d.exception()
print("done")   

In [8]:
npz_file_name = os.path.join(os.getcwd(), "research/data/corpus_entity_matrix.npz")

In [None]:
sparse.save_npz(npz_file_name), M.tocsr())

Checking the density of the matrix for sanity.

In [111]:
def density(M):
    d = 100 * M.nnz / np.prod(M.shape)
    print("Matrix density: {}%".format(round(d,2)))

In [112]:
density(M)

Matrix density: 1.28%


Calculate tfidf values within the matrix.

In [43]:
M = sparse.load_npz(npz_file_name)

In [100]:
M.shape

(500, 63271)

In [98]:
AXIS_DOCS = 0 # rows
AXIS_ENTS = 1 # columns

In [99]:
def tfidf(M):
    tf = sparse.csr_matrix(M / M.sum(axis=AXIS_ENTS))
    N = M.shape[AXIS_DOCS]
    Nt = np.ravel(M.astype(bool).sum(axis=AXIS_DOCS))
    idf = sparse.csr_matrix(np.log10(N/Nt))
    return tf.multiply(idf)

In [113]:
M_tfidf = tfidf(M)
density(M_tfidf)
print(M_tfidf)


  return np.true_divide(self.todense(), other)


Matrix density: 1.88%
  (0, 62605)	6.78082746157581e-05
  (0, 62388)	0.00012057166043859091
  (0, 61564)	6.33254851920391e-05
  (0, 61392)	0.000430852176695942
  (0, 61270)	0.00025546199680175226
  (0, 61211)	0.00031891741038330063
  (0, 61139)	0.00012749133060802967
  (0, 61134)	4.023624894882511e-05
  (0, 60875)	0.000215426088347971
  (0, 60755)	0.0001023368407786555
  (0, 60617)	4.04998435310284e-05
  (0, 60404)	8.291302605978031e-05
  (0, 60373)	0.0002210745475872741
  (0, 59882)	6.412770517409933e-05
  (0, 59376)	4.895986904193222e-05
  (0, 59234)	0.0002954979052555335
  (0, 58645)	3.447163046175633e-05
  (0, 58637)	4.0690954603795234e-05
  (0, 58411)	1.7380074495854882e-05
  (0, 58111)	0.00011992251402139868
  (0, 58110)	8.342435758010343e-05
  (0, 57751)	2.1843259557225113e-05
  (0, 57451)	4.45514263316302e-05
  (0, 57416)	9.531836298662726e-05
  (0, 57285)	0.0002659928181939088
  :	:
  (499, 201)	5.4591378080075324e-05
  (499, 197)	4.890036014954087e-05
  (499, 192)	0.000143351

Testing tfidf function using data from https://en.wikipedia.org/wiki/Tf%E2%80%93idf

In [92]:
N = sparse.csr_matrix(np.array([[1,1,2,1,0,0],
                                [1,1,0,0,2,3]]))
print(N)

  (0, 0)	1
  (0, 1)	1
  (0, 2)	2
  (0, 3)	1
  (1, 0)	1
  (1, 1)	1
  (1, 4)	2
  (1, 5)	3


In [97]:
print(tfidf(N))

  (0, 3)	0.06020599913279624
  (0, 2)	0.12041199826559248
  (1, 5)	0.12901285528456335
  (1, 4)	0.08600857018970891
