In [1]:
# Auto-reload imported modules before each cell executes, so edits to local
# modules (e.g. `common`) are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [19]:
import os, re
from collections import defaultdict, deque
import pymongo
from datetime import datetime
import pandas as pd
import random
import numpy as np
from common import Label_DbFields, Synthetic_Category_Group_Names, Other_Synthetic_Group_Names, MultiLabel_Group_Name, Labels

random.seed(42)

In [3]:
# Switch for the true-casing pipeline: when enabled, the truecaser and NLTK
# dependencies are imported and the shared tokenizer/detokenizer objects are built.
USE_TRUECASER = True
if USE_TRUECASER:
    from truecaser.Truecaser import getTrueCase
    try:
        import cPickle as pickle
    except ImportError:
        # Only a missing cPickle module should trigger the fallback; a bare
        # `except:` would also hide unrelated failures and KeyboardInterrupt.
        import pickle
    import nltk
    import nltk.data
    from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok
    # Module-level helpers used by trueCaseSnippets:
    #   sent_detector - splits a snippet part into sentences
    #   detokenizer   - joins true-cased tokens back into a sentence string
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    detokenizer = Detok()

In [4]:
def loadTrueCaserModel(model_filename):
    """Load the truecaser distributions from a pickle file.

    The file is expected to hold five consecutive pickled objects, in this
    order: unigram distribution, backward-bigram distribution, forward-bigram
    distribution, trigram distribution, word-casing lookup.

    If `model_filename` does not exist, the same path prefixed with '../' is
    tried instead.

    NOTE: unpickling executes arbitrary code; only load model files from a
    trusted source.

    Returns:
        tuple: (wordCasingLookup, uniDist, backwardBiDist, forwardBiDist,
        trigramDist) -- note the lookup is moved to the front relative to the
        on-disk order.
    """
    print('Loading truecaser model ...')
    resolved_path = model_filename if os.path.exists(model_filename) else '../' + model_filename
    with open(resolved_path, 'rb') as model_file:
        # Read the five pickles sequentially from the same file handle.
        loaded = [pickle.load(model_file) for _ in range(5)]
    uni, backward_bi, forward_bi, trigram, casing_lookup = loaded
    return (casing_lookup, uni, backward_bi, forward_bi, trigram)

In [8]:
# Load the truecaser distributions once for the whole notebook; the resulting
# tuple is unpacked as the trailing arguments to getTrueCase (via genSnippet).
TC_MODEL = loadTrueCaserModel('truecaser/distributions.obj')

Loading truecaser model ...


### Pull all snippets from MongoDB and preprocess them for later classification, saving one file per quarter (4 per year)

In [9]:
def trueCaseSnippets(snippets, model):
    '''True-case every snippet part in `snippets` and strip control characters.

    For each part:
      * characters in the range \\x00-\\x0c (which includes tab and newline)
        are deleted;
      * the part is split into sentences with the module-level `sent_detector`;
      * a leading "> " marker on a sentence is set aside and re-attached after
        true-casing;
      * hexadecimal HTML character references (e.g. "&#x07;") are replaced by
        the placeholder token 'xxbell';
      * the sentence is lower-cased, tokenized, true-cased with `getTrueCase`
        and detokenized.

    Args:
        snippets: iterable of snippet strings (usually 3 parts).
        model: sequence unpacked as the trailing arguments of `getTrueCase`.

    Returns:
        list of str, one true-cased string per input part, with its sentences
        joined by single spaces.
    '''
    results = []
    for raw_part in snippets:
        cleaned = re.sub(r"[\x00-\x0c]+", '', raw_part)  # remove control chars
        recased_sentences = []
        for sentence in sent_detector.tokenize(cleaned):
            # A sentence counts as marked only if something follows the "> ".
            has_marker = len(sentence) > 2 and sentence.startswith("> ")
            body = sentence[2:] if has_marker else sentence
            body = re.sub(r"&#[xX][\da-fA-F]+;", 'xxbell', body)
            cased_tokens = getTrueCase(nltk.word_tokenize(body.lower()), 'lower', *model)
            rebuilt = detokenizer.detokenize(cased_tokens)
            recased_sentences.append("> " + rebuilt if has_marker else rebuilt)
        results.append(' '.join(recased_sentences))
    return results

In [15]:
def genSnippet(doc):
    """Build one true-cased snippet string from a snippet document.

    Reads the 'snippet_part1', 'snippet_part2' and 'snippet_part3' entries of
    `doc`, true-cases them with the module-level TC_MODEL, and joins the three
    results with single spaces.
    """
    part_keys = ('snippet_part1', 'snippet_part2', 'snippet_part3')
    raw_parts = [doc[key] for key in part_keys]
    cased_parts = trueCaseSnippets(raw_parts, TC_MODEL)
    return " ".join(cased_parts)

In [None]:
# Note: the TC_MODEL uses most of the memory... almost 8gb
MONGODB_PORT = 25541
OUTPUT_DIR = '/data/'  # one "<quarter-start>_snippets.tsv" file is written here per quarter

# generate a prepped-snippets file for each quarter in 2010-2016
with pymongo.MongoClient('localhost', MONGODB_PORT) as mclient:
    mdb = mclient.snippetdb
    msnippets = mdb.snippets
    # Optional one-off index creation (left disabled):
    #msnippets.create_index([('holdout', pymongo.ASCENDING)])
    #msnippets.create_index([('train_set', pymongo.ASCENDING)])
    #msnippets.create_index([('airdatetime', pymongo.ASCENDING)])

    # Quarter start dates in ascending order, each paired with the start of the
    # following quarter. The last quarter has no upper bound (None), so it also
    # picks up anything dated after 2016-10-01.
    quarter_starts = ["%d-%s-01" % (year, month)
                      for year in range(2010, 2017)
                      for month in ("01", "04", "07", "10")]
    date_ranges = list(zip(quarter_starts, quarter_starts[1:] + [None]))

    # Columns of the output file; 'snippet' is generated, the rest are copied
    # from the document. The projection additionally fetches the raw parts.
    fields_to_save = ['airdatetime', 'file_idx', 'filename', 'station', 'snippet']
    fields_to_fetch = ['airdatetime', 'station', 'filename', 'file_idx',
                       'snippet_part1', 'snippet_part2', 'snippet_part3']

    for quarter in date_ranges:
        date_query = {"$gte": datetime.strptime(quarter[0], "%Y-%m-%d")}
        if quarter[1]:
            date_query["$lt"] = datetime.strptime(quarter[1], "%Y-%m-%d")
        snippet_filter = {"airdatetime": date_query}

        # Collection.count() is deprecated since PyMongo 3.7 and removed in 4.0;
        # count_documents() is the supported replacement.
        count = msnippets.count_documents(snippet_filter)
        print("%s: Processing %d snippets from quarter beginning %s" % (str(datetime.today()), count, quarter[0]))

        # get all snippets in that quarter (even for the rare case they're in train/holdout sets)
        # do this sequentially to save memory
        doc_cursor = msnippets.find(snippet_filter, fields_to_fetch)

        with open(os.path.join(OUTPUT_DIR, quarter[0] + '_snippets.tsv'), 'w') as f:
            f.write("\t".join(fields_to_save) + "\n")

            for doc in doc_cursor:
                snippet = genSnippet(doc)
                # str() because the copied fields are not all strings
                f.write("\t".join([str(doc[field]) for field in fields_to_save[:-1]] + [snippet]) + "\n")
                

2019-03-16 11:35:12.108427: Processing 629380 snippets from quarter beginning 2010-01-01
2019-03-16 12:17:02.272085: Processing 532573 snippets from quarter beginning 2010-04-01
2019-03-16 12:51:54.451804: Processing 697109 snippets from quarter beginning 2010-07-01


In [None]:
    # iterative version, saves on memory, slightly slower
    # Relies on `quarter` and `doc_cursor` already being defined; neither is
    # assigned in this fragment.
    fields_to_save = ['airdatetime', 'file_idx', 'filename', 'station', 'snippet']
    # NOTE(review): the content written is tab-separated even though the file
    # name ends in .csv -- confirm whether the extension should be .tsv.
    with open(quarter[0] + '_snippets.csv', 'w') as f:
        f.write("\t".join(fields_to_save) + "\n")

        for doc in doc_cursor:
            snippet = genSnippet(doc)
            # str.join() raises TypeError on non-string items, so every copied
            # field is converted first (airdatetime is queried with datetime
            # objects, so it is presumably not stored as a string).
            f.write("\t".join([str(doc[field]) for field in fields_to_save[:-1]] + [snippet]) + "\n")

In [None]:
        # batch version (uses too much memory)
        # Relies on `quarter` and `doc_cursor` already being defined; neither
        # is assigned in this fragment.
        output_columns = ['airdatetime', 'file_idx', 'filename', 'station', 'snippet']

        # Materialise every document from the cursor into one DataFrame.
        df = pd.DataFrame.from_records(doc_cursor)

        # Clean up, true-case and join the snippet parts, one row at a time.
        df['snippet'] = df.apply(genSnippet, axis=1)

        # Keep only the output columns (drops the index field and raw snippet
        # parts); selecting was found faster than deleting columns one by one,
        # although it presumably needs more memory.
        df = df[output_columns]

        # Save as a tab-separated file without the DataFrame index.
        df.to_csv(quarter[0] + '_snippets.tsv', sep='\t', index=False)