## Convert snippets to our best guess for correct capitalization ("true case")

In [15]:
%reload_ext autoreload
%autoreload 2

# NOTE: Python 2.7 is preferred; Python 3.7 may produce slightly different results because of
# differences in .lower(), but the notebook should work under either version.

import re
# Wildcard import kept as-is; within this notebook it supplies getTrueCase.
from Truecaser import *
try:
    # Python 2 ships cPickle; prefer it when it is importable.
    import cPickle as pickle
except ImportError:
    # Only a missing module should trigger the fallback; any other error
    # raised during import is allowed to propagate instead of being hidden.
    import pickle
import nltk
import nltk.data
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok

# Sentence splitter loaded from NLTK's data directory; used to break each
# snippet part into sentences before true-casing.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
# Rebuilds a sentence string from a list of tokens.
detokenizer = Detok()

In [16]:
def loadTrueCaserModel(model_filename):
    """Read the truecaser model stored in `model_filename`.

    The file is expected to hold five consecutive pickle records, in this
    order: unigram distribution, backward bigram distribution, forward bigram
    distribution, trigram distribution, word-casing lookup.

    Returns a 5-tuple reordered as
    (wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist),
    which is the order in which trueCaseSnippet unpacks it into getTrueCase.

    Raises whatever `open` / `pickle.load` raise (e.g. EOFError when the file
    holds fewer than five records).

    NOTE: unpickling can execute arbitrary code; only load model files that
    come from a trusted source.
    """
    with open(model_filename, 'rb') as model_file:
        # Pull the five records off the stream one after another.
        (unigrams,
         backward_bigrams,
         forward_bigrams,
         trigrams,
         casing_lookup) = (pickle.load(model_file) for _ in range(5))
    return (casing_lookup, unigrams, backward_bigrams, forward_bigrams, trigrams)

# Load the model once at notebook level; the cells below pass `model` to
# trueCaseSnippet. The relative path is resolved against the kernel's current
# working directory.
model = loadTrueCaserModel('distributions.obj')

In [17]:
def trueCaseSnippet(snippet, model):
    """Return `snippet` with its capitalization re-guessed by the truecaser.

    Processing steps:
      1. Split the snippet on the part separator (space, two backslashes, space).
      2. Split each part into sentences with `sent_detector`.
      3. For each sentence: strip a leading "> " marker if present, replace
         hexadecimal character references such as "&#x266a;" with the literal
         token "xxbell", lowercase, word-tokenize, and hand the tokens to
         getTrueCase together with the unpacked `model` tuple.
      4. Detokenize the result and put the "> " marker back if it was stripped.

    Sentences and parts are both re-joined with a single space, so the part
    separator does not appear in the returned string.

    snippet -- text to re-case.
    model   -- tuple as returned by loadTrueCaserModel; unpacked positionally
               into getTrueCase after the 'lower' argument.
    """
    recased_parts = []
    for part in snippet.split(" \\\\ "):
        recased_sentences = []
        for sentence in sent_detector.tokenize(part):
            # A sentence counts as a speech start only if there is text after "> ".
            is_speech_start = len(sentence) > 2 and sentence[:2] == "> "
            body = sentence[2:] if is_speech_start else sentence

            # Swap entities for a plain-word placeholder before tokenizing.
            body = re.sub(r"&#[xX][\da-fA-F]+;", 'xxbell', body)

            lowered_tokens = nltk.word_tokenize(body.lower())
            cased_tokens = getTrueCase(lowered_tokens, 'lower', *model)

            # Rebuild the sentence string from the re-cased tokens.
            recased = detokenizer.detokenize(cased_tokens)
            recased_sentences.append(("> " + recased) if is_speech_start else recased)
        recased_parts.append(' '.join(recased_sentences))
    return ' '.join(recased_parts)
            

In [18]:
#%%timeit
# Alternative sample input with "> " speech markers, kept disabled:
#snippet = "> I WANT TO TALK ABOUT THE HOME STANDBY GENERATOR MARKET SINCE A LOT OF VIEWERS ARE OUT THERE AND MAYBE HAVE A GENERATOR OR THINKING OF GETTING ONE. ABOUT 53% MORE THAN HALF OF SALES. \\\\ I'M JUST KAURS. THEY SEEM TO BE A BETTER MARGIN. IT'S A MORE FAVORABLE PRODUCT MIX. \\\\ > THE TREND HAS BEEN FANTASTIC. IT'S A PRETTY NEW MARKET, STILL. ONLY ABOUT 3 PRS OF HOMES HAVE THE PRODUCT TODAY."
# Quick manual check of trueCaseSnippet: the sample contains "&#x266a;"
# entities, runs of repeated spaces, one part separator, and mixed-case text.
snippet = "&#x266a; EVERY LITTLE BABY WANTS  50% MORE CASH... &#x266a;  PT! FINE, YOU TRY. HA HA. \\\\ THE    Capital One Cash Rewards Card. THE CARD FOR PEOPLE WHO WANT 50% MORE CASH. &#x266a;"
# Last expression of the cell, so the re-cased string is shown as the cell output.
trueCaseSnippet(snippet, model)

'xxbell every little baby wants 50% more cash...xxbell PT! Fine, you try. Ha ha. The Capital One cash rewards card. The card for people who want 50% more cash. xxbell'

In [19]:
# True-case every line of the 1% snippet sample and write one result per line.
sample_in_path = '../snippets_sample.1pct-nocontrolchars.tsv'
sample_out_path = '../snippets_sample.1pct-truecase-tmp5.tsv'
with open(sample_in_path) as src, open(sample_out_path, 'w') as dst:
    for raw_line in src:
        # Each input line is treated as one snippet; a newline is appended
        # after the re-cased text.
        dst.write(trueCaseSnippet(raw_line, model) + '\n')

In [17]:
# True-case the text column of the fold-0 validation set, keeping the first
# column untouched.
valid_in_path = '../labeled_data/fold_0/label_category/label_category_valid.tsv'
valid_out_path = '../labeled_data/fold_0/label_category/label_category_valid-truecase.tsv'
with open(valid_in_path) as src, open(valid_out_path, 'w') as dst:
    for row in src:
        fields = row.split('\t')
        # Every row must have exactly two tab-separated columns.
        assert len(fields) == 2
        dst.write('\t'.join((fields[0], trueCaseSnippet(fields[1], model))) + '\n')