# Looking into creating a bag of words document model

In [1]:
import os
import sys
from pathlib import Path
from collections import defaultdict
from itertools import chain

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

# Connect to data store
# Read the unique SRX accessions from the `aln/complete` table. The context
# manager closes the HDF5 file even if the read raises.
with pd.HDFStore('../output/sra.h5', mode='r') as store:
    samples = store['aln/complete'].srx.unique().tolist()

In [2]:
from pymongo import MongoClient
# Resolve the MongoDB host: use the name stored in the optional host file,
# otherwise fall back to a server on this machine.
try:
    host = Path('../output/.mongodb_host').read_text().strip()
except FileNotFoundError:
    host = 'localhost'

# Handles for the `sra` database and its `ncbi` collection.
mongoClient = MongoClient(host=host, port=27017)
db = mongoClient['sra']
ncbi = db['ncbi']

In [320]:
def get_document(vals):
    """Flatten one sample record into an ``(srx, text)`` pair.

    The text is the sample title followed by every attribute value, joined
    with single spaces. Attribute values equal to ``'not applicable'`` are
    skipped. Missing or ``None`` titles, attribute lists, and attribute
    values are treated as absent rather than being stringified to the word
    ``"None"`` (which would end up as a bogus term in the document).

    Args:
        vals: Mapping with a required ``'srx'`` key and optional
            ``'sample_title'`` and ``'attrs'`` keys. ``attrs`` is an
            iterable of mappings that may carry a ``'value'`` entry.

    Returns:
        Tuple ``(srx, doc)`` where ``doc`` is the space-joined text.

    Raises:
        KeyError: If ``vals`` has no ``'srx'`` entry.
    """
    srx = vals['srx']

    # The title always occupies the first slot (empty string when absent),
    # so a record without a title still yields a leading separator.
    title = vals.get('sample_title')
    parts = ['' if title is None else title]

    for attr in vals.get('attrs') or []:
        value = attr.get('value')
        if value is None or value == 'not applicable':
            continue
        parts.append(value)

    # Values may be non-string (e.g. numbers), so stringify before joining.
    doc = ' '.join(str(part) for part in parts)
    return srx, doc

In [321]:
# Fetch the title and attribute list for every SRX listed in `samples` and
# flatten each record into an (srx, text) pair.
sample_fields = {
    '_id': 0,
    'srx': '$_id',
    'sample_title': '$sra.sample.title',
    'attrs': '$sra.sample.attributes',
}
pipeline = [
    {'$match': {'_id': {'$in': samples}}},
    {'$project': sample_fields},
]
docs = list(map(get_document, ncbi.aggregate(pipeline)))

In [322]:
# Split the (srx, text) pairs into two parallel lists. Underscores in the
# text are replaced by spaces (presumably so the vectorizer splits on them).
ids = [srx for srx, _ in docs]
dds = [text.replace('_', ' ') for _, text in docs]

In [323]:
# Unigram TF-IDF with English stop words removed and sub-linear term-frequency
# scaling. min_df=1 keeps every term that occurs in at least one document,
# which is what the previous min_df=0 meant in practice; an integer 0 is
# rejected by parameter validation in recent scikit-learn releases.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    stop_words='english',
    min_df=1,
    max_df=1.0,
    sublinear_tf=True,
)
# Alternative token patterns that were considered:
#token_pattern=r"[\w\(\);\-\+\[\]\/]+",
#r'(?u)\b\w\w+\b'

In [324]:
# Fit the vectorizer on the documents and build the TF-IDF matrix
# (one row per entry of `dds`).
tfidf_mat = vectorizer.fit_transform(dds)

In [325]:
# Vocabulary terms, in matrix-column order. get_feature_names() was deprecated
# in scikit-learn 1.0 and later removed, so prefer get_feature_names_out() and
# fall back only for older releases. Kept as a plain list either way.
try:
    feature_names = list(vectorizer.get_feature_names_out())
except AttributeError:
    feature_names = vectorizer.get_feature_names()

In [326]:
# Dense 1-D vector of TF-IDF weights for a single document (row 9500).
row = tfidf_mat[9500].toarray()[0]

In [327]:
# Keep only the terms with a positive weight in the selected document and
# rank them from highest to lowest weight.
present = row > 0
names = np.asarray(feature_names)[present]
values = row[present]
sr = pd.Series(data=values, index=names).sort_values(ascending=False)
sr

h4         0.554909
100u       0.496991
mnase      0.366444
ab10158    0.293531
s2         0.263175
chip       0.232604
100        0.203676
abcam      0.176606
rep2       0.124992
cells      0.113886
dtype: float64

In [328]:
# Vocabulary as an array so it can be indexed with a boolean mask.
bob = np.asarray(feature_names)
# True for every term whose weight exceeds 0.5 in at least one document.
# A column has a value above 0.5 exactly when its maximum is above 0.5, so
# take the column-wise max on the sparse matrix instead of densifying the
# whole documents-by-terms matrix first.
mask = tfidf_mat.max(axis=0).toarray().ravel() > .5

In [329]:
# Show the terms selected by `mask` as a plain Python list.
bob[mask].tolist()

['000',
 '0001',
 '0002',
 '0003',
 '0004',
 '0005',
 '0006',
 '0007',
 '0008',
 '0009',
 '001',
 '0010',
 '0011',
 '0012',
 '0013',
 '0014',
 '0015',
 '0016',
 '0017',
 '0018',
 '0019',
 '002',
 '0020',
 '0021',
 '0022',
 '0023',
 '0024',
 '0025',
 '0026',
 '0027',
 '0028',
 '0029',
 '003',
 '0030',
 '0031',
 '0032',
 '0033',
 '0034',
 '0035',
 '0036',
 '0037',
 '0038',
 '0039',
 '0040',
 '0041',
 '0042',
 '0043',
 '0044',
 '0045',
 '0046',
 '0047',
 '0048',
 '0049',
 '0050',
 '0051',
 '0052',
 '0053',
 '0054',
 '0055',
 '0056',
 '0057',
 '0058',
 '0059',
 '0060',
 '0061',
 '0062',
 '0063',
 '0064',
 '0065',
 '0066',
 '0067',
 '0068',
 '0069',
 '0070',
 '0071',
 '0072',
 '0073',
 '0074',
 '0075',
 '0076',
 '0077',
 '0078',
 '0079',
 '0080',
 '0081',
 '0082',
 '0083',
 '0084',
 '0085',
 '0086',
 '0087',
 '0088',
 '0089',
 '0090',
 '0091',
 '011',
 '012',
 '016',
 '017',
 '01a',
 '01n',
 '01ng',
 '02',
 '020',
 '022',
 '023',
 '024',
 '026',
 '027',
 '028',
 '029',
 '030',
 '031',
 '032

In [157]:
# Inspect the stored `sra.sample` sub-document of a single experiment.
ncbi.find_one({'_id': 'SRX2261278'}, {'sra.sample': 1})

{'_id': 'SRX2261278',
 'sra': {'sample': {'sample_id': 'SRS1755604',
   'BioSample': 'SAMN05931230',
   'external_id': [],
   'secondary_id': [],
   'submitter_id': [],
   'uuid': [],
   'title': 'Gateway Entry Clones from Drosophila melanogaster Gold Collection ORFs',
   'taxon_id': '7227',
   'scientific_name': 'Drosophila melanogaster',
   'common_name': 'fruit fly',
   'attributes': [{'name': 'strain', 'value': 'y1; cn1 bw1 sp1'},
    {'name': 'age', 'value': 'not applicable'},
    {'name': 'dev_stage', 'value': 'not applicable'},
    {'name': 'sex', 'value': 'not applicable'},
    {'name': 'tissue', 'value': 'not applicable'},
    {'name': 'BioSampleModel', 'value': 'Model organism or animal'}],
   'url_links': [],
   'xref_links': [],
   'entrez_links': [],
   'ddbj_links': [],
   'ena_links': []}}}

In [5]:
def tokenize_document(doc):
    """Split a document into cleaned, lemmatized lower-case tokens.

    NOTE(review): ``regexp_tokenize``, ``stopwords`` and ``WordNetLemmatizer``
    are not imported in the visible cells -- presumably they come from NLTK;
    confirm the import cell before running.

    Args:
        doc: Text of one document.

    Returns:
        List of lemmatized tokens, in order of appearance. A token is kept
        only if it is purely alphanumeric (tokens containing any punctuation
        matched by the pattern are dropped whole, not stripped), is not a
        bare number, is longer than one character, and is not an English
        stop word or one of ``'drosophila'`` / ``'melanogaster'``.
    """
    # tokenize document into individual words
    tokens = regexp_tokenize(doc.lower(), r"[\w\(\);\-\+\[\]\/]+")

    # A set gives constant-time membership tests for the stop-word filter.
    stop_words = set(stopwords.words('english')) | {'drosophila', 'melanogaster'}
    lemmatizer = WordNetLemmatizer()

    # Filters are applied in the same order as before, in a single pass.
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalnum()
        and not token.isnumeric()
        and len(token) > 1
        and token not in stop_words
    ]

In [6]:
# tokenize document
# Take the text straight from the (srx, text) pairs. Going through
# np.asarray(docs) would build a fixed-width unicode array padded to the
# longest document and would fail on an empty `docs`.
tokenized_documents = [tokenize_document(text) for _, text in docs]

# create bag of words
# NOTE(review): `Dictionary` is not imported in the visible cells --
# presumably gensim.corpora.Dictionary; confirm.
dictionary = Dictionary(tokenized_documents)
corpus = [dictionary.doc2bow(document) for document in tokenized_documents]

# Get corpus level counts: token id -> total occurrences over all documents
total_cnts = defaultdict(int)
for word_id, word_cnt in chain.from_iterable(corpus):
    total_cnts[word_id] += word_cnt

# Remove tokens that are only in one document
def drop_unique(document):
    """Return `document` without tokens that occur nowhere else in the corpus.

    `document` is a sequence of (token_id, count) pairs. A pair is dropped
    when its count equals the corpus-wide total in `total_cnts`, i.e. every
    occurrence of that token is in this one document.
    """
    return [
        (tid, cnt)
        for tid, cnt in document
        if cnt != total_cnts[tid]
    ]

# Strip document-specific tokens from every bag of words.
corpus_no_unique = [drop_unique(bow) for bow in corpus]

# Create document model
tfidf = TfidfModel(corpus_no_unique)

# Calculate weights
tfidf_weights = [tfidf[bow] for bow in corpus_no_unique]

In [7]:
def human_weights(document, *, min_weight=0.25):
    """Translate a weighted document into readable ``(word, weight)`` pairs.

    Args:
        document: Iterable of ``(token_id, weight)`` pairs.
        min_weight: Only pairs whose weight is strictly greater than this
            value are kept. Defaults to 0.25.

    Returns:
        List of ``(word, weight)`` tuples sorted by descending weight, where
        ``word`` is looked up in the module-level ``dictionary``.
    """
    ranked = sorted(document, key=lambda pair: pair[1], reverse=True)
    return [
        (dictionary.get(pair[0]), pair[1])
        for pair in ranked
        if pair[1] > min_weight
    ]

In [8]:
# pull out the best terms and make them readable
weights = [human_weights(document) for document in tfidf_weights]

# Concatenate words in order of weight
# Joining an empty sequence already yields '', so no special case is needed,
# and unpacking the pairs directly avoids converting `docs` and each weight
# list to numpy string arrays (which also failed when `docs` was empty).
bows = [
    (srx, '|'.join(word for word, _ in wts))
    for (srx, _), wts in zip(docs, weights)
]

In [9]:
# One row per SRX with its '|'-joined keyword string; saved as parquet.
keywords = pd.DataFrame(bows, columns=['srx', 'keywords']).set_index('srx')
keywords.to_parquet('../output/notebook/2018-07-11_bow_keywords.parquet')