# Exploring how to create a bag-of-words document model

In [1]:
import os
import sys
from pathlib import Path
from collections import defaultdict
from itertools import chain

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

# Connect to data store
# The store is opened read-only inside a context manager so the HDF5 file
# handle is released even if reading the table raises.
with pd.HDFStore('../output/sra.h5', mode='r') as store:
    # Unique values of the srx column of the 'aln/complete' table.
    samples = store['aln/complete'].srx.unique().tolist()

In [2]:
from pymongo import MongoClient
# Resolve the MongoDB host: use the name stored in ../output/.mongodb_host
# when that file exists, otherwise fall back to a server on this machine.
try:
    host = Path('../output/.mongodb_host').read_text().strip()
except FileNotFoundError:
    host = 'localhost'

# Handles for the 'sra' database and its 'ncbi' collection.
mongoClient = MongoClient(host=host, port=27017)
db = mongoClient['sra']
ncbi = db['ncbi']

In [3]:
def get_document(vals):
    """Flatten one sample record into an ``(srx, text)`` pair.

    Parameters
    ----------
    vals : dict
        Record with a required ``'srx'`` key and optional ``'sample_title'``
        and ``'attrs'`` keys.  ``'attrs'`` is an iterable of mappings that
        each carry a ``'value'`` entry; a missing or ``None`` ``'attrs'`` is
        treated as empty.

    Returns
    -------
    tuple
        ``(srx, text)`` where ``text`` is the title followed by every
        attribute value, each converted with ``str`` and joined by single
        spaces.  ``None`` titles and values are left out.

    Raises
    ------
    KeyError
        If ``vals`` has no ``'srx'`` key or an attribute has no ``'value'``.
    """
    srx = vals['srx']
    title = vals.get('sample_title', '')
    # ``or []`` also covers a field that is present but null.
    attrs = vals.get('attrs') or []

    parts = [title]
    parts.extend(attr['value'] for attr in attrs)

    # Skip None entries: str(None) would put the literal word 'None' into
    # the text.
    return srx, ' '.join(str(part) for part in parts if part is not None)

In [4]:
# Pull srx, sample title and sample attributes for every record whose _id is
# in `samples`, then flatten each record into an (srx, text) pair.
docs = list(map(get_document, ncbi.aggregate([
    {'$match': {'_id': {'$in': samples}}},
    {
        '$project': {
            '_id': 0,
            'srx': '$srx',
            'sample_title': '$sra.sample.title',
            'attrs': '$sra.sample.attributes',
        },
    },
])))

In [5]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _tokenizer_resources():
    """Return the stop-word set and lemmatizer, built once and then reused."""
    stop_words = frozenset(stopwords.words('english')) | {'drosophila', 'melanogaster'}
    return stop_words, WordNetLemmatizer()


def tokenize_document(doc):
    """Turn a document string into a list of lemmatized tokens.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order, duplicates kept.
    """
    # A set gives constant-time stop-word lookups; the set and lemmatizer are
    # cached so they are not rebuilt for every document.
    stop_words, lemmatizer = _tokenizer_resources()

    # split the lower-cased document into individual words
    tokens = regexp_tokenize(doc.lower(), r"[\w\(\);\-\+\[\]\/]+")

    kept = [
        token for token in tokens
        # drop any token containing a non-alphanumeric character (the token
        # is discarded whole, not stripped of its punctuation)
        if token.isalnum()
        # drop lone numbers
        and not token.isnumeric()
        # drop single characters
        and len(token) > 1
        # drop stop words
        and token not in stop_words
    ]

    # lemmatize
    return [lemmatizer.lemmatize(token) for token in kept]

In [6]:
# tokenize document
# Read the text from each (srx, text) pair directly. This avoids building a
# fixed-width NumPy string array, which pads every entry to the longest
# document and cannot be column-sliced when docs is empty.
tokenized_documents = [tokenize_document(text) for _, text in docs]

# create bag of words: one list of (token_id, count) pairs per document
dictionary = Dictionary(tokenized_documents)
corpus = [dictionary.doc2bow(document) for document in tokenized_documents]

# Get corpus level counts: token_id -> total count over all documents
total_cnts = defaultdict(int)
for word_id, word_cnt in chain.from_iterable(corpus):
    total_cnts[word_id] += word_cnt

# Remove tokens that are only in one document
def drop_unique(document):
    """Return the (token_id, count) pairs of `document` whose token also
    occurs elsewhere in the corpus.

    A pair is dropped when its count equals the corpus-wide total in
    `total_cnts`, i.e. every occurrence of the token is in this document.
    """
    return [
        (token_id, token_cnt)
        for token_id, token_cnt in document
        if token_cnt != total_cnts[token_id]
    ]

# Strip the single-document tokens from every bag of words
corpus_no_unique = [drop_unique(bow) for bow in corpus]

# Create document model from the filtered corpus
tfidf = TfidfModel(corpus_no_unique)

# Calculate weights: apply the model to each filtered document
tfidf_weights = [tfidf[bow] for bow in corpus_no_unique]

In [7]:
def human_weights(document, min_weight=0.25):
    """Return the highest-weighted terms of a document as readable pairs.

    Parameters
    ----------
    document : iterable of (token_id, weight)
        Weighted bag of words for one document.
    min_weight : float, optional
        Only terms with a weight strictly greater than this are kept.
        Defaults to 0.25.

    Returns
    -------
    list of (word, weight)
        Kept terms sorted by descending weight, with each token id replaced
        by the result of ``dictionary.get(token_id)``.
    """
    # Filter before sorting so only the kept terms are sorted; the sort is
    # stable, so ties keep their original relative order.
    strong = [(token_id, weight) for token_id, weight in document if weight > min_weight]
    strong.sort(key=lambda pair: pair[1], reverse=True)

    return [(dictionary.get(token_id), weight) for token_id, weight in strong]

In [8]:
# pull out the best terms and make them readable
weights = [human_weights(document) for document in tfidf_weights]

# Concatenate words in order of weight
# Each entry pairs a document's srx with its kept words joined by '|'.
# Joining an empty list gives '', so documents without kept words need no
# special case, and no NumPy string arrays are built along the way.
bows = [
    (srx, '|'.join(word for word, _ in wts))
    for (srx, _), wts in zip(docs, weights)
]

In [9]:
# Build a table of keyword strings indexed by the srx column and write it
# to a parquet file.
keywords = pd.DataFrame(bows, columns=['srx', 'keywords']).set_index('srx')
keywords.to_parquet('../output/notebook/2018-07-11_bow_keywords.parquet')