In [2]:
import numpy as np
import pandas as pd
%load_ext autoreload
%autoreload 2

# Overview

What should this thing look like?
- An object that you can import
- Exposes train, load, featurize and import operations
- Should it inherit from an sklearn transformer base class (e.g. `sklearn.base.TransformerMixin`)? Multiple inheritance is hard...

# I. Load Data

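- words: list of all characters in the corpus, in order
- dataset: np.ndarray (int32) of character indices, one per entry in words
- vocab: dict mapping each character to its index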

In [137]:
import codecs 

#=====[ Load a whole corpus ]=====
def load_data(data_dir='./data/tinyshakespeare/'):
    """Load a character-level corpus from ``<data_dir>/input.txt``.

    The file is decoded as UTF-8 and split into individual characters.
    Each distinct character is assigned an integer index in order of
    first appearance.

    Args:
        data_dir: directory containing ``input.txt``.

    Returns:
        A ``(dataset, words, vocab)`` tuple where
        ``dataset`` is a 1-D ``np.int32`` array of character indices,
        ``words`` is the list of characters in the corpus, and
        ``vocab`` is a dict mapping each character to its index.
    """
    # Kept as plain string formatting (rather than os.path.join) so the
    # printed path is unchanged; a trailing slash in data_dir merely yields
    # a harmless double slash.
    corpus_path = '%s/input.txt' % data_dir
    print(corpus_path)

    # Use a context manager so the file handle is closed even if decoding fails.
    with codecs.open(corpus_path, 'rb', 'utf-8') as corpus_file:
        words = list(corpus_file.read())

    # setdefault evaluates len(vocab) before inserting, so a new character
    # receives the next free index and a known one keeps its existing index.
    vocab = {}
    dataset = np.array(
        [vocab.setdefault(char, len(vocab)) for char in words],
        dtype=np.int32,
    )

    # Single-argument print calls behave identically on Python 2 and 3.
    print('corpus length (in characters): %d' % len(words))
    print('vocab size: %d' % len(vocab))
    return dataset, words, vocab
#print 'corpus length (in characters):', len(words)
#dataset, words, vocab = load_data()

#=====[ Load only the vocabulary ]=====
# pickle is imported here because this cell runs before the model-loading
# cell that imports it; without this a fresh-kernel run raises NameError.
import pickle

# NOTE(security): pickle.load can execute arbitrary code; only load files
# from a trusted source.
with open('./data/audit_data/vocab.bin', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

# Inverse mapping: index -> character.
ivocab = {index: char for char, index in vocab.items()}
print('vocab size: %d' % len(vocab))

vocab size: 125


# II. Load Model

In [139]:
import pickle
from CharRNN import CharRNN, make_initial_state
from chainer import cuda

#####[ PARAMS ]#####
n_units = 128      # placeholder; replaced below by the loaded model's embedding width
seq_length = 50    # not read anywhere in the visible cells
batchsize = 50     # not read anywhere in the visible cells
seed = 123
length = 50        # not read anywhere in the visible cells
####################

np.random.seed(seed)

# NOTE(security): pickle.load can execute arbitrary code; only load model
# files from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('./data/audit_data/audit_model.chainermodel', 'rb') as model_file:
    model = pickle.load(model_file)

# Take the number of units from the model itself rather than trusting the
# constant above.
n_units = model.embed.W.data.shape[1]
initial_state = make_initial_state(n_units, batchsize=1, train=False)
print('# of units:  %s' % n_units)

# of units:  128


# III. Create TextFeaturizer

In [202]:
class TextFeaturizer(object):
    """Featurizes text using a CharRNN.

    Attributes:
        model: the recurrent model; must expose ``embed.W.data`` and
            ``forward_one_step``.
        vocab: dict mapping each character to its integer index.
        n_units: second dimension of ``model.embed.W.data``.
    """

    def __init__(self, model, vocab):
        """Store the model and vocabulary and read the unit count from the model."""
        # Explicit assignments instead of ``self.__dict__.update(locals())``,
        # which also stored ``self`` on the instance (a reference cycle).
        self.model = model
        self.vocab = vocab
        self.n_units = model.embed.W.data.shape[1]

    def preprocess(self, text):
        """Convert ``text`` to an ``np.int32`` array of vocabulary indices.

        Raises:
            NotImplementedError: if ``text`` is not a ``str``. Under
                Python 2 this also rejects ``unicode`` objects.
            KeyError: if a character is missing from the vocabulary.
        """
        if not isinstance(text, str):
            raise NotImplementedError("Must pass in a string")
        # Use this instance's vocabulary, not whatever global ``vocab``
        # happens to exist in the notebook.
        return np.array([self.vocab[c] for c in text]).astype(np.int32)

    def featurize(self, text):
        """Return a list with one feature vector per character of ``text``.

        Each entry is a copy of ``state['h2'].data`` taken after feeding the
        corresponding character to the model, and has shape
        ``(1, self.n_units)``.

        Raises:
            Exception: if any collected vector has an unexpected shape.
        """
        #=====[ Step 1: Convert to an array ]=====
        dataset = self.preprocess(text)

        #=====[ Step 2: Create initial state ]=====
        # Prime the network with character index 0 before the real input.
        # Uses the instance's model and unit count rather than notebook
        # globals (the original referenced an undefined ``rnn`` here).
        initial_state = make_initial_state(self.n_units, batchsize=1, train=False)
        init_char = np.array([0]).astype(np.int32)
        state, _ = self.model.forward_one_step(
            init_char, init_char, initial_state, train=False)

        #=====[ Step 3: Find feature vectors ]=====
        states = []
        for char_index in dataset:
            cur_char = np.array([char_index]).astype(np.int32)
            # The current character is passed as both input and target; the
            # second return value is ignored and only the state is kept.
            state, _ = self.model.forward_one_step(
                cur_char, cur_char, state, train=False)
            # Copy so later steps cannot modify the stored vector in place.
            states.append(state['h2'].data.copy())

        #=====[ Step 4: Sanity check ]=====
        if not all([s.shape == (1, self.n_units) for s in states]):
            raise Exception("For some reason, generated the wrong shape! {}".format(np.array(states).shape))
        return states

In [203]:
featurizer = TextFeaturizer(model, vocab)

#=====[ TEST ]=====
text = 'Conducted an investigation of WalMart and concluded air and fire safety were correct'
states = featurizer.featurize(text)