In [1]:
import os
import sys
from pathlib import Path
import re

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams

from gensim.corpora.dictionary import Dictionary
from gensim.models.phrases import Phrases

# Project level imports
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs
from ncbi_remap.nlp import lookups

  _nan_object_mask = _nan_object_array != _nan_object_array


In [2]:
# Setup notebook
# Runs the project's notebook initialisation and binds its return value to
# `nbconfig`.
# NOTE(review): the status lines recorded in this cell's output (the
# fbgn2chrom.tsv check, last-updated date and git hash) appear to be emitted
# by this call -- confirm against ncbi_remap.notebook.Nb.
nbconfig = Nb.setup_notebook()

Please check output/fbgn2chrom.tsv. If it does not exist, run bin/fbgn2chrom.py
last updated: 2018-10-08 
Git hash: d9f50945fa864956cc17f22a30aafc5244874783


In [3]:
# Connect to data store
# Opens '../sra.h5' read-only (mode='r'). The path is relative, so it is
# resolved against the kernel's current working directory.
# NOTE(review): no matching store.close() is visible in this part of the
# notebook -- confirm the handle is released once the analysis is finished.
store = pd.HDFStore('../sra.h5', mode='r')

In [4]:
from pymongo import MongoClient

# Resolve the MongoDB host name: use the one stored in the project's host
# file when that file exists, otherwise assume a server on this machine.
try:
    with open('../output/.mongodb_host', 'r') as fh:
        host = fh.read().strip()
except FileNotFoundError:
    host = 'localhost'

# Handles used by the cells below: client -> 'sra' database -> 'ncbi' collection.
mongoClient = MongoClient(host=host, port=27017)
db = mongoClient.get_database('sra')
ncbi = db.get_collection('ncbi')

In [20]:
def get_documents(doc):
    """Flatten one projected record into an ``(srx, text)`` pair.

    Parameters
    ----------
    doc : dict
        Mapping that holds the record id under ``'_id'``; every other value
        is treated as a piece of free text. The mapping is left unmodified.

    Returns
    -------
    tuple
        ``(srx, txt)`` where ``srx`` is ``doc['_id']`` and ``txt`` is the
        space-joined, lower-cased and normalised text of the other fields.

    Raises
    ------
    KeyError
        If ``doc`` has no ``'_id'`` entry.

    Notes
    -----
    The text is lower-cased *before* the ``lookups['library_strategy']``
    table is applied, so only lookup keys that are already lower-case can
    match.
    """
    srx = doc['_id']

    # Gather every field except the id without touching the caller's dict.
    # Null fields are skipped and any other non-string value is coerced, so a
    # single odd record cannot abort the whole run with a TypeError in join().
    fields = [
        value if isinstance(value, str) else str(value)
        for key, value in doc.items()
        if key != '_id' and value is not None
    ]
    txt = ' '.join(fields).lower()

    # Order matters: the hyphenated assay names must be collapsed before the
    # generic '-' -> ' ' rule, and 'sequencing'/'sequenced' must be handled
    # before 'sequence' (otherwise 'sequenced' would become 'seqd').
    substitutions = (
        ('_', ' '),
        ('hi-c', 'hic'),
        ('3-c', '3c'),
        ('4-c', '4c'),
        ("3'", '3prime'),
        ('-', ' '),
        ('sequencing', 'seq'),
        ('sequenced', 'seq'),
        ('sequence', 'seq'),
    )
    for old, new in substitutions:
        txt = txt.replace(old, new)

    # Translate based on known phrases in lookup table
    for k, v in lookups['library_strategy'].items():
        txt = txt.replace(k, v)

    return srx, txt
    
# Pull only the free-text study/experiment fields of every record and turn
# each projected record into an (srx, text) pair.
documents = list(map(
    get_documents,
    ncbi.aggregate([{'$project': {
        'title': '$sra.study.title',
        'abstract': '$sra.study.abstract',
        'type': '$sra.study.study_type',
        'exp_title': '$sra.experiment.title',
        'exp_design': '$sra.experiment.design',
        'exp_library_name': '$sra.experiment.library_name',
    }}]),
))

In [21]:
# Lemmatizer instance shared by the tokenization step.
wordnets_lemmatizer = WordNetLemmatizer()

# Stop words: NLTK's English list followed by the project's custom additions.
eng_stops = stopwords.words('english') + lookups['stopwords']

In [22]:
# tokenize documents
# Iterate over the (srx, text) pairs directly. Going through
# np.asarray(documents) would copy every text into a fixed-width unicode
# array (sized by the longest document) and fails outright when `documents`
# is empty.

# Set membership is O(1) per token; the stop-word list would be scanned
# linearly for every token otherwise.
stop_words = set(eng_stops)

tokenized_documents = []
for _srx, doc in documents:
    # The pattern keeps hyphenated tokens (e.g. 'HiC-Seq') in one piece.
    tokens = regexp_tokenize(doc, r"[\w-]+")

    # Drop pure numbers, single characters and stop words, then lemmatize
    # whatever is left.
    lemma = [
        wordnets_lemmatizer.lemmatize(token)
        for token in tokens
        if not token.isnumeric()
        and len(token) > 1
        and token not in stop_words
    ]

    # store
    tokenized_documents.append(lemma)

In [23]:
# Distinct replacement values of the library_strategy lookup table.
strategies = {*lookups['library_strategy'].values()}

In [24]:
# For every tokenized document, collect the distinct strategy tokens it
# contains and join them with '|' (empty string when none are present).
# The matches are sorted before joining: set iteration order for strings is
# not stable between interpreter runs, so an unsorted join could label the
# same record 'A|B' in one session and 'B|A' in the next.
doc_strategies = [
    '|'.join(sorted(strategies.intersection(doc)))
    for doc in tokenized_documents
]

In [25]:
# One row per record, indexed by its id; column 0 holds the '|'-joined
# strategies. The ids are taken straight from the (srx, text) pairs instead
# of np.asarray(documents)[:, 0], which would copy all document text into a
# fixed-width array just to read the ids and raises on an empty list.
pd.DataFrame(doc_strategies, index=[srx for srx, _ in documents])

Unnamed: 0,0
SRX4104113,HiC-Seq|WGS
SRX4104112,HiC-Seq|WGS
SRX4104111,HiC-Seq|WGS
SRX4104110,HiC-Seq|WGS
SRX4104109,HiC-Seq|WGS
SRX4104108,HiC-Seq|WGS
SRX4104107,HiC-Seq|WGS
SRX4104106,HiC-Seq|WGS
SRX4104105,HiC-Seq|WGS
SRX4104104,HiC-Seq|WGS
