### tldr prototyping

short-term goal:

* focus on tools to generate input functions for tensorflow Estimator API for different categories of problem

seems like there are 3 generic steps before you get to the tensorflow part:

  1. **loading** the data. depends on how data is saved (e.g. a CSV, folder of text files, etc) but not language or model directly
  2. **preprocessing** the text. tokenizing, stemming, stopwords, etc. depends on language and application, and (to a lesser extent) the model. but not on how the files were saved.
  3. **encoding** the data to send to tensorflow. whatever process maps tokens to a bag of words, sequence of one-hot encoded vectors, etc. depends on the model structure more than anything

In [63]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import re
import nltk
import string
import pandas as pd

%matplotlib inline
from IPython.core.pylabtools import figsize

In [2]:
# load our sample data
textfile = "winereviews.txt"
rawtext = [x.decode("ascii", errors="ignore") for x in open(textfile, "rb").readlines() if len(x) > 5]
len(rawtext)

1230

In [3]:
print(rawtext[0])

Lovely delicate, fragrant Rhone wine. Polished leather and strawberries. Perhaps a bit dilute, but good for drinking now. ***



In [4]:
def extract_rating(x):
    x = x.lower().strip()
    if "no stars" in x.lower():
        return 0, x.replace("no stars", "")
    r = re.findall("\*+", x)
    if len(r) > 0:
        return len(r[0]), x.replace("*", "")
    else:
        return np.nan, x

In [5]:
extracted = [extract_rating(r) for r in rawtext]

In [6]:
text = [x[1] for x in extracted if not np.isnan(x[0])]
ratings = [x[0] for x in extracted if not np.isnan(x[0])]

In [66]:
df = pd.DataFrame({"text":text, "rating":ratings})
df.head()

Unnamed: 0,rating,text
0,3,"lovely delicate, fragrant rhone wine. polished..."
1,2,"liquorice, cherry fruit. simple and coarse at ..."
2,1,thin and completely uninspiring.
3,0,rough.
4,3,"big, fat, textured chardonnay - nuts and butte..."


In [68]:
df.to_csv("wineratings.tsv", sep="\t", index=False)

In [7]:
tokens = [nltk.word_tokenize(x) for x in text]

In [8]:
text[0]

'lovely delicate, fragrant rhone wine. polished leather and strawberries. perhaps a bit dilute, but good for drinking now. '

In [9]:
tokens[0]

['lovely',
 'delicate',
 ',',
 'fragrant',
 'rhone',
 'wine',
 '.',
 'polished',
 'leather',
 'and',
 'strawberries',
 '.',
 'perhaps',
 'a',
 'bit',
 'dilute',
 ',',
 'but',
 'good',
 'for',
 'drinking',
 'now',
 '.']

In [10]:
wordlist = list(set([token for doc in tokens for token in doc]))
word_index = {wordlist[i]:i for i in range(len(wordlist))}
print(len(word_index))

2757


In [11]:
word_index["dilute"]

240

In [12]:
def doc_to_array(doc, wi):
    arr = np.zeros(len(wi))
    for token in doc:
        arr[wi[token]] += 1
    return arr

In [13]:
doc_to_array(tokens[0], word_index)

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [57]:
def generic_tokenizer(x):
    """
    Nothing fancy here.
    """
    x = x.lower().strip()
    for c in string.digits+string.punctuation:
        x = x.replace(c, " ")
    return x.split()
    
class Prepper(object):
    """
    Boring prototype class for step 2
    """
    
    def __init__(self, corpus, tokenizer=generic_tokenizer):
        """
        """
        self.tokenize = tokenizer
        self.token_list = self._make_tokenlist(corpus)
        self.token_index = self._make_index(self.token_list)
        self._numtokens = len(self.token_index)

        
    def _make_tokenlist(self, corpus, minlen=2):
        """
        Input a list of strings representing the documents in the
        corpus; return a list of all the distinct words in the corpus
        """
        return list(set([token for doc in corpus 
                         for token in self.tokenize(doc)
                        if len(token) >= minlen]))
    
    def _make_index(self, tokenlist):
        """
        invert the token list to get a dictionary, where each
        key is a token and each value is the token's index
        """
        return {tokenlist[i]:i for i in range(len(tokenlist))}
        
    def __len__(self):
        return self._numtokens
    
    def __call__(self, tokens):
        """
        Input a list of tokens, return a list of indices
        """
        return [self.token_index[t] for t in tokens if t in self.token_index]
    
    def __getitem__(self, indices):
        """
        Input a list of indices, return the associated tokens
        """
        return [self.token_list[i] for i in indices if i < self._numtokens]
        

In [58]:
text[0]

'lovely delicate, fragrant rhone wine. polished leather and strawberries. perhaps a bit dilute, but good for drinking now. '

In [59]:
prep = Prepper(text)
len(prep)

2579

In [60]:
text[0]

'lovely delicate, fragrant rhone wine. polished leather and strawberries. perhaps a bit dilute, but good for drinking now. '

In [61]:
ind = prep(text[0].split())
ind

[1908, 2282, 1534, 831, 386, 1396, 1661, 1563, 946, 718, 485, 1474]

In [62]:
prep[ind]

['lovely',
 'fragrant',
 'rhone',
 'polished',
 'leather',
 'and',
 'perhaps',
 'bit',
 'but',
 'good',
 'for',
 'drinking']

In [73]:
def densify(x, N):
    """
    input a list of token indices and a vector length; return a dense array
    """
    dense = np.zeros(N, dtype=int)
    dense[np.array(x)] = 1
    return dense

In [74]:
densify(ind, len(prep))

array([0, 0, 0, ..., 0, 0, 0])

In [75]:
tf.estimator.inputs.numpy_input_fn?

In [78]:
age = np.arange(4) * 1.0
height = np.arange(32, 36)
x = {'age': age, 'height': height}
y = np.arange(-32, -28)

with tf.Session() as session:
    for i in range(10):
        input_fn = tf.estimator.inputs.numpy_input_fn(
          x, y, batch_size=2, shuffle=False, num_epochs=1)