In [30]:
import string

class Vectorizer:
    """Minimal word-level text vectorizer.

    Text is lowercased, stripped of every character in ``string.punctuation``,
    split on whitespace, and each token is mapped to an integer index.

    Index 0 is reserved for the empty string (presumably the padding slot --
    confirm against downstream use) and index 1 for the out-of-vocabulary
    marker ``"[UNK]"``.

    Attributes:
        vocabulary: dict mapping token -> integer index.
        inverse_vocabulary: dict mapping integer index -> token.
    """

    # Marker and index used for tokens that are not in the vocabulary.
    UNK_TOKEN = "[UNK]"
    UNK_INDEX = 1

    # Translation table that deletes every character in string.punctuation.
    _PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

    def __init__(self):
        # Start from the reserved entries only, so encode()/decode() are
        # usable (every token maps to [UNK]) before make_vocabulary() runs
        # instead of failing with AttributeError.
        self.vocabulary = self._reserved_vocabulary()
        self.inverse_vocabulary = self._invert(self.vocabulary)

    def _reserved_vocabulary(self):
        """Return a fresh vocabulary holding only the two reserved entries."""
        return {"": 0, self.UNK_TOKEN: self.UNK_INDEX}

    @staticmethod
    def _invert(vocabulary):
        """Return the index -> token mapping for a token -> index mapping."""
        return {index: token for token, index in vocabulary.items()}

    def _standardize(self, text):
        """Lowercase ``text`` and remove all ``string.punctuation`` characters."""
        return text.lower().translate(self._PUNCTUATION_TABLE)

    def _tokenize(self, text):
        """Split ``text`` on runs of whitespace and return the list of tokens."""
        return text.split()

    def standardize(self, text):
        """Public entry point for standardization; same result as ``_standardize``."""
        return self._standardize(text)

    def tokenize(self, text):
        """Public entry point for tokenization; same result as ``_tokenize``."""
        return self._tokenize(text)

    def make_vocabulary(self, dataset):
        """Build ``vocabulary`` and ``inverse_vocabulary`` from an iterable of strings.

        Any previously built vocabulary is discarded. Tokens receive
        consecutive indices, starting at 2, in order of first appearance.

        Args:
            dataset: iterable of raw text strings.
        """
        vocabulary = self._reserved_vocabulary()
        for text in dataset:
            for token in self._tokenize(self._standardize(text)):
                if token not in vocabulary:
                    # The next free index is always the current size.
                    vocabulary[token] = len(vocabulary)

        self.vocabulary = vocabulary
        self.inverse_vocabulary = self._invert(vocabulary)

    def encode(self, text):
        """Convert ``text`` to a list of token indices.

        Tokens missing from the vocabulary are encoded as ``UNK_INDEX`` (1).
        """
        tokens = self._tokenize(self._standardize(text))
        return [self.vocabulary.get(token, self.UNK_INDEX) for token in tokens]

    def decode(self, int_seq):
        """Convert an iterable of indices back to a space-joined string.

        Indices missing from the inverse vocabulary are rendered as ``"[UNK]"``.
        """
        return " ".join(
            self.inverse_vocabulary.get(index, self.UNK_TOKEN) for index in int_seq
        )



In [32]:
# Create the vectorizer instance used by the cells below.
vectorizer = Vectorizer()

In [8]:
# The class defines this step as the private helper `_standardize`;
# it lowercases the text and strips punctuation.
convert_text = vectorizer._standardize("I write, you write, and someone writes.")

In [9]:
# Show the standardized text: lowercased, with punctuation removed.
print(convert_text)

i write you write and someone writes


In [10]:
# The class defines whitespace tokenization as the private helper `_tokenize`.
vectorizer._tokenize(convert_text)

['i', 'write', 'you', 'write', 'and', 'someone', 'writes']

In [25]:
# Small three-sentence corpus used to build the vocabulary.
# NOTE(review): "Arithmetic Error" at the start of the second sentence looks
# like a stray paste -- confirm. It is kept as-is because the recorded
# vocabulary output contains the tokens 'arithmetic' and 'error'.
dataset = [
    "Keras Applications are deep learning models that are made available alongside pre-trained weights.",
    "Arithmetic Error These models can be used for prediction, feature extraction, and fine-tuning.",
    "Weights are downloaded automatically when instantiating a model. They are stored at ~/.keras/models/."
]

In [33]:
# Build `vocabulary` and `inverse_vocabulary` from the sample corpus.
vectorizer.make_vocabulary(dataset)

In [15]:
# Inspect the token -> index mapping. Because punctuation is deleted rather
# than replaced by a space, "pre-trained" becomes 'pretrained' and
# "~/.keras/models/." becomes 'kerasmodels'.
vectorizer.vocabulary

{'': 0,
 '[UNK]': 1,
 'keras': 2,
 'applications': 3,
 'are': 4,
 'deep': 5,
 'learning': 6,
 'models': 7,
 'that': 8,
 'made': 9,
 'available': 10,
 'alongside': 11,
 'pretrained': 12,
 'weights': 13,
 'arithmetic': 14,
 'error': 15,
 'these': 16,
 'can': 17,
 'be': 18,
 'used': 19,
 'for': 20,
 'prediction': 21,
 'feature': 22,
 'extraction': 23,
 'and': 24,
 'finetuning': 25,
 'downloaded': 26,
 'automatically': 27,
 'when': 28,
 'instantiating': 29,
 'a': 30,
 'model': 31,
 'they': 32,
 'stored': 33,
 'at': 34,
 'kerasmodels': 35}

In [16]:
# Inspect the index -> token mapping that decode() uses.
vectorizer.inverse_vocabulary

{0: '',
 1: '[UNK]',
 2: 'keras',
 3: 'applications',
 4: 'are',
 5: 'deep',
 6: 'learning',
 7: 'models',
 8: 'that',
 9: 'made',
 10: 'available',
 11: 'alongside',
 12: 'pretrained',
 13: 'weights',
 14: 'arithmetic',
 15: 'error',
 16: 'these',
 17: 'can',
 18: 'be',
 19: 'used',
 20: 'for',
 21: 'prediction',
 22: 'feature',
 23: 'extraction',
 24: 'and',
 25: 'finetuning',
 26: 'downloaded',
 27: 'automatically',
 28: 'when',
 29: 'instantiating',
 30: 'a',
 31: 'model',
 32: 'they',
 33: 'stored',
 34: 'at',
 35: 'kerasmodels'}

In [34]:
# "hello" and "world" are not in the vocabulary, so they encode to index 1 ([UNK]).
encoded_text = vectorizer.encode("hello, world. They are stored.")

In [35]:
# Round-trip back to text: unknown words come back as [UNK], and the original
# casing and punctuation are not recovered.
vectorizer.decode(encoded_text)

'[UNK] [UNK] they are stored'