# Text features

This notebook explores tokenization on a few examples.

In [None]:
import awe.utils
# Run the project's notebook initialization helper before anything else.
# NOTE(review): what exactly it configures is defined in `awe.utils`.
awe.utils.init_notebook()

In [None]:
import pandas as pd

In [None]:
import awe.features.text
import awe.training.params
import awe.training.trainer
# Reload the `awe` package so that source edits are picked up without
# restarting the kernel; `awe.data.glove` is excluded from the reload
# (presumably because reloading it is expensive — TODO confirm).
awe.utils.reload('awe', exclude=['awe.data.glove'])

## Load parameters

The loaded parameters determine which tokenizer is used.

In [None]:
# Load the user's parameters (with `normalize=True`) and build a trainer
# from them; both objects are reused by the cells below.
params = awe.training.params.Params.load_user(normalize=True)
trainer = awe.training.trainer.Trainer(params)

## Initialize feature

Create the textual feature, which instantiates the tokenizer selected by the parameters.

In [None]:
# Initialize the trainer's features first.
# NOTE(review): presumably `WordIdentifiers` relies on this having run — confirm.
trainer.init_features()
# Textual feature bound to the trainer; it provides the `tokenize` and
# `get_token_id` methods used by the examples in this notebook.
word_identifiers = awe.features.text.WordIdentifiers(trainer)

## Examples

In [None]:
def tokenize(text: str, humanize: bool = False):
    """Tokenize `text` and show each token next to its identifier.

    The text is split by the notebook-level `word_identifiers` feature, and
    every resulting token is looked up with `get_token_id`.

    Args:
        text: The string to tokenize.
        humanize: Forwarded unchanged to `word_identifiers.tokenize`.

    Returns:
        A `pandas.DataFrame` with one row per token and the columns
        `token` and `id`, in the order the tokenizer produced the tokens.
    """
    rows = []
    for piece in word_identifiers.tokenize(text, humanize=humanize):
        piece_id = word_identifiers.get_token_id(piece)
        rows.append({'token': piece, 'id': piece_id})
    return pd.DataFrame(rows)

In [None]:
# Price with a currency symbol, a thousands separator and a decimal part.
tokenize('$20,750.00')

In [None]:
# Mixed specification string: numbers, units, commas and an `@` sign.
tokenize('2.0L I4, 16 valves, 138 hp @ 6000 rpm')

In [None]:
# Currency symbol separated from the amount by a space.
tokenize('$ 2000')

In [None]:
# Non-ASCII currency symbol (pound sign) directly followed by the amount.
tokenize('£20')

In [None]:
# Non-ASCII currency symbol (euro sign) directly followed by the amount.
tokenize('€30')

In [None]:
# Label text with an upper-case abbreviation and a trailing colon.
tokenize('MSRP Price:')

In [None]:
# Camel-case identifier tokenized with `humanize=True`
# (presumably to check how it is split into words — confirm).
tokenize('SnapShotControlvehicle', humanize=True)

In [None]:
# Hyphen-separated identifier tokenized with `humanize=True`
# (presumably to check how it is split into words — confirm).
tokenize('yat-market-pricing-bd', humanize=True)