In [1]:
import sys

In [2]:
# `os` is used below but was never imported in this notebook, so this cell
# raised NameError on a fresh kernel.
import os

# Put the `src` directory that sits two levels above the working directory on
# the import path (presumably where the project modules imported below live).
_src_dir = os.path.join(os.path.abspath(os.path.join('..', '..')), 'src')
if _src_dir not in sys.path:  # keep re-runs of this cell from adding duplicates
    sys.path.append(_src_dir)

In [3]:
import numpy
import scipy
import pandas

In [4]:
import mysql_utils
import doc_proc
import init_tdm_tables

## Init Needed Tools

In [5]:
def dd(doc):
    """Return the text feature that doc_proc.build_text_feature builds for `doc`.

    The feature is built from the 'title' and 'summary' components, passing
    lower=False, remove_stops=False and html_text=True to the helper.
    """
    feature_options = {
        'components': ['title', 'summary'],
        'lower': False,
        'remove_stops': False,
        'html_text': True,
    }
    return doc_proc.build_text_feature(doc, **feature_options)

In [6]:
# Project document processor; its doc2BOW method is applied to the query
# document's text feature further down.
dp = init_tdm_tables.DocProcessor()

In [7]:
# Open the database connection and a cursor through the project's mysql_utils
# helpers. The query functions in this notebook take `cur` as first argument;
# `cnx` is closed in the last cell.
cnx = mysql_utils.getCnx()
cur = mysql_utils.getCur(cnx)

## Process

In [8]:
def query_wordIDLookup(cur, words):
    """Fetch the `words` rows whose `word` column matches any of `words`.

    NOTE: despite the function name, the returned dict is keyed by word id and
    maps to the word text (``{id: word}``). Code later in this notebook tests
    word ids against the keys of the result, so that orientation is kept.

    Parameters
    ----------
    cur : cursor
        DB-API style cursor that accepts ``%s`` placeholders.
    words : iterable of str
        Word texts to look up. Any iterable is accepted (list, set, generator).

    Returns
    -------
    dict
        ``{id: word}`` for every matching row; ``{}`` when `words` is empty.
    """
    words = list(words)
    if not words:
        # An empty list would render as "IN ()", which is not valid SQL,
        # so answer without touching the database.
        return {}
    placeholders = ','.join(['%s'] * len(words))
    query = ("SELECT id, word FROM words "
             "WHERE word IN ({})".format(placeholders))
    cur.execute(query, words)
    # Rows come back as (id, word) in the order of the SELECT list.
    return {row[0]: row[1] for row in cur.fetchall()}


def query_idWordLookup(cur, wids):
    """Fetch the `words` rows whose `id` column matches any of `wids`.

    Parameters
    ----------
    cur : cursor
        DB-API style cursor that accepts ``%s`` placeholders.
    wids : iterable of int
        Word ids to look up. Any iterable is accepted (list, set, generator).

    Returns
    -------
    dict
        ``{id: word}`` for every matching row; ``{}`` when `wids` is empty.
        Ids with no row in the table are simply absent from the result.
    """
    wids = list(wids)
    if not wids:
        # An empty list would render as "IN ()", which is not valid SQL,
        # so answer without touching the database.
        return {}
    placeholders = ','.join(['%s'] * len(wids))
    query = ("SELECT id, word FROM words "
             "WHERE id IN ({})".format(placeholders))
    cur.execute(query, wids)
    # Rows come back as (id, word) in the order of the SELECT list.
    return {row[0]: row[1] for row in cur.fetchall()}

In [9]:
# Look up the entries of doc_proc.nltk_stops in the words table. The dict that
# query_wordIDLookup returns is keyed by word id ({id: word}), which is what
# lets a later cell filter word ids against it.
stops_lookup = query_wordIDLookup(cur, doc_proc.nltk_stops)

### Get Query Words

In [10]:
# Example query document. Only its 'title' and 'summary' fields are read when
# the text feature is built (see `dd`).
doc = {"title" : "FBI refused White House request to knock down recent Trump-Russia stories",
       "summary" : "Washington (CNN) The FBI rejected a recent White House request to publicly knock down media reports about communications between Donald Trump's associates and Russians known to US intelligence during the 2016 presidential campaign, multiple US officials briefed on the matter tell CNN."}

In [11]:
# Build the text feature for the query document and run it through the
# processor's doc2BOW (bag of words, going by the method name).
bow = dp.doc2BOW(dd(doc))

In [12]:
# Keep the entries of `bow` that are not in the stop-word collection.
# NOTE(review): iterating `bow` is assumed to yield word strings - confirm
# against DocProcessor.doc2BOW.
query_words = [w for w in bow if w not in doc_proc.nltk_stops]

### Query for Docs that Share Words

In [13]:
def query_docsOnWords(cur, words, word_type="word", exclude_docs=None):
    """Find the documents containing each of `words` and summarise the hits.

    One query is issued per word. The word is passed as a bound parameter, so
    words containing quotes (e.g. "trump's") no longer break the statement.

    Parameters
    ----------
    cur : cursor
        DB-API style cursor that accepts ``%s`` placeholders.
    words : iterable
        Word texts (``word_type="word"``) or word ids (``word_type="id"``).
    word_type : {"word", "id"}
        Which column of the `words` table the entries of `words` refer to.
    exclude_docs : iterable of int, optional
        Document ids to drop from every per-word result. ``None`` or an empty
        collection means nothing is excluded.

    Returns
    -------
    doc_ids : list of int
        Ids of all documents (after exclusion) that contain at least one word.
    word_count_store : pandas.DataFrame
        One row per word that still has at least one document, with columns
        'word', 'n_docs' (number of rows) and 'n_tot' (sum of `wcount`).

    Raises
    ------
    ValueError
        If `word_type` is neither "word" nor "id".
    """
    match_column = {"word": "words.word", "id": "words.id"}
    if word_type not in match_column:
        raise ValueError(
            "word_type must be 'word' or 'id', got {!r}".format(word_type))

    word_doc_query = ("SELECT doc_bows.doc_id, doc_bows.wcount "
                      "FROM  doc_bows LEFT JOIN words ON (doc_bows.word_id = words.id) "
                      "WHERE {} = %s".format(match_column[word_type]))

    # Build the exclusion set once; membership tests against a list (which is
    # what callers pass) would be linear per row.
    excluded = set(exclude_docs) if exclude_docs else set()

    doc_ids = set()
    word_count_store = []
    for word in words:
        cur.execute(word_doc_query, (word,))
        result = mysql_utils.dfDocsFromCursor(cur)
        if excluded:
            result = result[~result['doc_id'].isin(excluded)]
        if result.shape[0] > 0:
            doc_ids.update(result['doc_id'])
            word_count_store.append({'word' : word,
                                     'n_docs' : result.shape[0],
                                     'n_tot' : result['wcount'].sum()})

    # Plain Python ints so the ids can be bound as query parameters later.
    doc_ids = [int(i) for i in doc_ids]
    word_count_store = pandas.DataFrame(word_count_store)

    return(doc_ids, word_count_store)


def word_summary_info(cur, words, wtype='id', exclude_docs=None):
    """Per-word document and occurrence counts from `doc_bows`.

    Runs a single grouped query returning, for every matching word id,
    ``n_docs`` (COUNT of doc ids) and ``n_total`` (SUM of `wcount`).

    Parameters
    ----------
    cur : cursor
        DB-API style cursor that accepts ``%s`` placeholders.
    words : iterable
        Word texts (``wtype='word'``) or word ids (``wtype='id'``).
    wtype : {'id', 'word'}
        How to interpret `words`. 'word' joins the `words` table and matches
        on ``words.word``; 'id' matches ``doc_bows.word_id`` directly.
    exclude_docs : iterable of int, optional
        Document ids whose rows are ignored. ``None`` or empty excludes nothing.

    Returns
    -------
    pandas.DataFrame
        Whatever mysql_utils.dfDocsFromCursor builds from the cursor. When
        `words` is empty no query is run and an empty frame with columns
        'word_id', 'n_docs', 'n_total' is returned instead.

    Raises
    ------
    ValueError
        If `wtype` is neither 'word' nor 'id'.
    """
    if wtype == 'word':
        from_clause = "FROM  doc_bows LEFT JOIN words ON (doc_bows.word_id = words.id) "
        match_column = "words.word"
    elif wtype == 'id':
        from_clause = "FROM  doc_bows "
        match_column = "doc_bows.word_id"
    else:
        raise ValueError("wtype must be 'word' or 'id', got {!r}".format(wtype))

    words = list(words)
    if not words:
        # "IN ()" is not valid SQL; there is nothing to summarise anyway.
        return pandas.DataFrame(columns=['word_id', 'n_docs', 'n_total'])

    # Coerce to int as the original did, then bind as parameters.
    excluded = [int(did) for did in exclude_docs] if exclude_docs else []

    where_clause = "WHERE {} IN ({}) ".format(
        match_column, ', '.join(['%s'] * len(words)))
    if excluded:
        where_clause += "AND doc_bows.doc_id NOT IN ({}) ".format(
            ', '.join(['%s'] * len(excluded)))

    query = ("SELECT doc_bows.word_id, COUNT(doc_bows.doc_id) as n_docs, SUM(doc_bows.wcount) as n_total "
             + from_clause
             + where_clause
             + "GROUP BY doc_bows.word_id")

    # Placeholders appear in the order: words first, then excluded doc ids.
    cur.execute(query, words + excluded)
    result = mysql_utils.dfDocsFromCursor(cur)
    return(result)

#### Orig Doc

In [14]:
# Per-word document counts (n_docs) and occurrence totals (n_total) for the
# query document's non-stop words, matched by word text.
word_count_info = word_summary_info(cur, query_words, wtype='word')

In [15]:
# Distribution of the number of documents each query word appears in.
word_count_info.n_docs.describe()

count      31.000000
mean      723.741935
std      1476.303152
min        10.000000
25%        61.500000
50%       233.000000
75%       705.500000
max      7586.000000
Name: n_docs, dtype: float64

In [16]:
# Arbitrary threshold: only words found in fewer than `ndoc_cutoff` documents
# are kept as level-1 query words. The original note says this cutoff should
# be "decayed" (see below) - presumably tightened at deeper levels.
ndoc_cutoff = 100

qw_l01 = [
    int(w)
    for w in word_count_info.loc[word_count_info.n_docs < ndoc_cutoff, 'word_id']
]

In [17]:
# 2nd Pass: Get Doc IDs to use
docs_l01, wcs_l01 = query_docsOnWords(cur, qw_l01, word_type='id')

In [18]:
# Display the level-1 word ids that passed the cutoff.
qw_l01

[2294, 2900, 2901, 3072, 4148, 5427, 6202, 6327, 12578, 27973]

#### Level 1 Documents

In [19]:
def query_docBOW(cur, doc_id, word_list=None):
    """Return the stored bag of words for one document.

    Parameters
    ----------
    cur : cursor
        DB-API style cursor that accepts ``%s`` placeholders and exposes
        ``column_names`` after a query.
    doc_id : int
        Id of the document whose `doc_bows` rows are fetched. It is passed as
        a bound parameter rather than formatted into the SQL text.
    word_list : collection of int, optional
        When non-empty, only entries whose word id is in this collection are
        kept. ``None`` or an empty collection disables filtering.

    Returns
    -------
    list of dict
        One dict per row, keyed by the cursor's column names (``word_id`` and
        ``wcount`` for this SELECT).
    """
    cur.execute("SELECT word_id, wcount FROM doc_bows WHERE doc_id = %s",
                (doc_id,))
    cols = cur.column_names
    rows = cur.fetchall()
    if word_list:
        # A set makes each membership test O(1) even if a list was passed in.
        wanted = set(word_list)
        rows = [row for row in rows if row[0] in wanted]
    bow = [{cols[0] : row[0], cols[1] : row[1]} for row in rows]
    return(bow)

In [20]:
def query_AllDocWords(cur, doc_ids):
    """Return the distinct word ids that occur in any of the given documents.

    Parameters
    ----------
    cur : cursor
        DB-API style cursor that accepts ``%s`` placeholders.
    doc_ids : iterable of int
        Document ids to look in.

    Returns
    -------
    list
        Distinct `word_id` values from `doc_bows`; ``[]`` when `doc_ids` is
        empty (no query is issued in that case).
    """
    doc_ids = list(doc_ids)
    if not doc_ids:
        # "IN ()" is not valid SQL, and no documents means no words.
        return []
    placeholders = ','.join(['%s'] * len(doc_ids))
    query = ("SELECT DISTINCT word_id "
             "FROM doc_bows "
             "WHERE doc_id IN ({})".format(placeholders))
    cur.execute(query, doc_ids)
    words = [row[0] for row in cur.fetchall()]
    return(words)

### This step is a bottleneck; can I pre-reduce the query words by using info from the subset?

Have the word stats for all words queued up already..

In [21]:
# Every word id occurring in the level-1 documents, minus the ids of stop
# words (stops_lookup is keyed by word id).
query_words = [
    w for w in query_AllDocWords(cur, docs_l01)
    if w not in stops_lookup
]

In [22]:
# Per-word counts for the level-1 vocabulary, computed over all documents
# except the level-1 documents themselves.
word_count_info = word_summary_info(cur, query_words, wtype='id', exclude_docs=docs_l01)

In [23]:
# Distribution of document counts for the level-1 vocabulary.
word_count_info.n_docs.describe()

count    3786.000000
mean      110.771527
std       247.038269
min         1.000000
25%        13.000000
50%        45.000000
75%       123.750000
max      7506.000000
Name: n_docs, dtype: float64

In [24]:
# Level-2 selection: keep word ids seen in fewer than `ndoc_cutoff_l2`
# documents, then fetch the documents containing them, leaving out the
# level-1 documents.
ndoc_cutoff_l2 = 15
qw_l02 = [
    int(w)
    for w in word_count_info.loc[word_count_info.n_docs < ndoc_cutoff_l2, 'word_id']
]
docs_l02, wcs_l02 = query_docsOnWords(cur, qw_l02, word_type="id", exclude_docs=docs_l01)

In [25]:
# Number of level-2 documents found.
len(docs_l02)

5141

In [26]:
# Map the level-2 word ids back to their word text ({id: word}).
qww = query_idWordLookup(cur, qw_l02)

In [27]:
# Print the text of the first ten level-2 words as a quick sanity check.
for i in qw_l02[:10]:
    print(qww[i])

surgeon
tales
bureau
barker
directed
jacob
masked
afp
headscarf
mufti


## Generate Network from Selected Vocab, Docs

In [28]:
# Combined vocabulary: the level-1 and level-2 word ids as one set.
words = set(qw_l01).union(qw_l02)

In [29]:
# Bag of words for every level-1 and level-2 document, restricted to the
# selected vocabulary. Level-1 documents are fetched first, then level-2.
bows = {}
for d in docs_l01 + docs_l02:
    bows[d] = query_docBOW(cur, d, word_list=words)

In [30]:
# Close the database connection opened near the top of the notebook.
cnx.close()