In [1]:
import pandas as pd

Load training data

In [2]:
# Read the Kaggle training CSV into a DataFrame; the cells below read its
# 'comment_text' column to compare tokenizers.
data_train = pd.read_csv('data/kaggle/train.csv')

## Evaluate tokenizers

To find a suitable tokenization model, several tokenizers are qualitatively evaluated on the training data.

#### NLTK TweetTokenizer

In [3]:
from nltk.tokenize import TweetTokenizer

# Qualitative check: print the first five comments next to their
# TweetTokenizer tokenization so the two can be compared by eye.
tknzr = TweetTokenizer()
for i in range(5):
    # `.loc` replaces the deprecated `.ix` indexer (removed in pandas 1.0).
    # With the default integer index produced by `read_csv`, the label-based
    # lookup selects the same rows as before.
    comment = data_train.loc[i, 'comment_text']
    print(comment)
    print('----------------------------------------------')
    print(tknzr.tokenize(comment))
    print('==============================================')

Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27
----------------------------------------------
['Explanation', 'Why', 'the', 'edits', 'made', 'under', 'my', 'username', 'Hardcore', 'Metallica', 'Fan', 'were', 'reverted', '?', 'They', "weren't", 'vandalisms', ',', 'just', 'closure', 'on', 'some', 'GAs', 'after', 'I', 'voted', 'at', 'New', 'York', 'Dolls', 'FAC', '.', 'And', 'please', "don't", 'remove', 'the', 'template', 'from', 'the', 'talk', 'page', 'since', "I'm", 'retired', 'now', '.', '89.205', '.', '38.27']
D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)
----------------------------------------------
["D'aww", '!', 'He', 'matches', 'this', 'background', 'colour', "I'm", 'seemingly', 'stuck', 'with', '.',

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.


#### Keras text_to_word_sequence with default filter
This tokenizer is very basic and has a default filter set which removes punctuation and special characters.

In [18]:
from keras.preprocessing.text import text_to_word_sequence

# Qualitative check: print the first five comments next to the word sequence
# produced by Keras' `text_to_word_sequence` with its default `filters`.
for i in range(5):
    # `.loc` replaces the deprecated `.ix` indexer (removed in pandas 1.0).
    # With the default integer index produced by `read_csv`, the label-based
    # lookup selects the same rows as before.
    comment = data_train.loc[i, 'comment_text']
    print(comment)
    print('----------------------------------------------')
    print(text_to_word_sequence(comment))
    print('==============================================')

Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27
----------------------------------------------
['explanation', 'why', 'the', 'edits', 'made', 'under', 'my', 'username', 'hardcore', 'metallica', 'fan', 'were', 'reverted', 'they', "weren't", 'vandalisms', 'just', 'closure', 'on', 'some', 'gas', 'after', 'i', 'voted', 'at', 'new', 'york', 'dolls', 'fac', 'and', 'please', "don't", 'remove', 'the', 'template', 'from', 'the', 'talk', 'page', 'since', "i'm", 'retired', 'now', '89', '205', '38', '27']
D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)
----------------------------------------------
["d'aww", 'he', 'matches', 'this', 'background', 'colour', "i'm", 'seemingly', 'stuck', 'with', 'thanks', 'talk', '21', '51'

#### Keras text_to_word_sequence without filter
This tokenizer is very basic. With the filter disabled (`filters=''`), no characters are removed. In this setting punctuation is not recognized as separate tokens but stays attached to the neighbouring word, and the newline character is not removed either.

In [21]:
from keras.preprocessing.text import text_to_word_sequence

# Qualitative check: print the first five comments next to the word sequence
# produced by Keras' `text_to_word_sequence` with character filtering
# disabled (`filters=''`).
for i in range(5):
    # `.loc` replaces the deprecated `.ix` indexer (removed in pandas 1.0).
    # With the default integer index produced by `read_csv`, the label-based
    # lookup selects the same rows as before.
    comment = data_train.loc[i, 'comment_text']
    print(comment)
    print('----------------------------------------------')
    print(text_to_word_sequence(comment, filters=''))
    print('==============================================')

Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27
----------------------------------------------
['explanation\nwhy', 'the', 'edits', 'made', 'under', 'my', 'username', 'hardcore', 'metallica', 'fan', 'were', 'reverted?', 'they', "weren't", 'vandalisms,', 'just', 'closure', 'on', 'some', 'gas', 'after', 'i', 'voted', 'at', 'new', 'york', 'dolls', 'fac.', 'and', 'please', "don't", 'remove', 'the', 'template', 'from', 'the', 'talk', 'page', 'since', "i'm", 'retired', 'now.89.205.38.27']
D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)
----------------------------------------------
["d'aww!", 'he', 'matches', 'this', 'background', 'colour', "i'm", 'seemingly', 'stuck', 'with.', 'thanks.', '(talk)', '21:51,', 'janua

In [4]:
# Spot-check the TweetTokenizer instance created in the NLTK cell above on a
# single sentence containing commas, a contraction and an emoticon.
tknzr.tokenize("You, sir, are my hero. Any chance you remember what page that's on? :-)")

['You',
 ',',
 'sir',
 ',',
 'are',
 'my',
 'hero',
 '.',
 'Any',
 'chance',
 'you',
 'remember',
 'what',
 'page',
 "that's",
 'on',
 '?',
 ':-)']