In [1]:
from nltk.tokenize import WordPunctTokenizer, TweetTokenizer, sent_tokenize, word_tokenize

In [9]:
## Six-sentence sample paragraph used as the input for the tokenizer examples.
## It is written as adjacent string literals, one sentence per line; Python joins
## them into a single string. Every sentence, including the last, ends with one space.
sample_text = (
    "The Mark XIV bomb sight was developed by Royal Air Force Bomber Command during the Second World War. "
    "It was their standard bombsight for the second half of the War, replacing the First World War-era CSBS beginning in 1942. "
    "Essentially an automated version of the CSBS, it used a mechanical computer to update the sights in real-time. "
    "It required only 10 seconds of straight flight before a bomb drop, and automatically accounted for shallow climbs and dives. "
    "It contained a gyro stabilization platform that kept the sight pointed at the target as the bomber manoeuvred, dramatically increasing its accuracy and ease of sighting. "
    "It demonstrated accuracy roughly equal to the contemporary Norden bombsight, and was smaller, easier to use, faster-acting and better suited to night bombing. "
)

In [3]:
## word_tokenize() splits the text into word and punctuation tokens.
## Per the NLTK docs it first splits the text into sentences and then applies a
## Treebank-style word tokenizer to each sentence. That is why sentence-final
## periods and commas appear as separate tokens in the output, while hyphenated
## words such as 'War-era' and 'real-time' stay whole.
word_tokenize(sample_text)

['The',
 'Mark',
 'XIV',
 'bomb',
 'sight',
 'was',
 'developed',
 'by',
 'Royal',
 'Air',
 'Force',
 'Bomber',
 'Command',
 'during',
 'the',
 'Second',
 'World',
 'War',
 '.',
 'It',
 'was',
 'their',
 'standard',
 'bombsight',
 'for',
 'the',
 'second',
 'half',
 'of',
 'the',
 'War',
 ',',
 'replacing',
 'the',
 'First',
 'World',
 'War-era',
 'CSBS',
 'beginning',
 'in',
 '1942',
 '.',
 'Essentially',
 'an',
 'automated',
 'version',
 'of',
 'the',
 'CSBS',
 ',',
 'it',
 'used',
 'a',
 'mechanical',
 'computer',
 'to',
 'update',
 'the',
 'sights',
 'in',
 'real-time',
 '.',
 'It',
 'required',
 'only',
 '10',
 'seconds',
 'of',
 'straight',
 'flight',
 'before',
 'a',
 'bomb',
 'drop',
 ',',
 'and',
 'automatically',
 'accounted',
 'for',
 'shallow',
 'climbs',
 'and',
 'dives',
 '.',
 'It',
 'contained',
 'a',
 'gyro',
 'stabilization',
 'platform',
 'that',
 'kept',
 'the',
 'sight',
 'pointed',
 'at',
 'the',
 'target',
 'as',
 'the',
 'bomber',
 'manoeuvred',
 ',',
 'dramatic

In [4]:
## sent_tokenize() splits the text into sentences (six for this sample).
## Per the NLTK docs it uses the pre-trained Punkt sentence tokenizer, so the
## Punkt model data must be installed -- presumably already downloaded here.
## Note: the regular-expression-based Penn Treebank tokenizer is the method
## invoked by word_tokenize() for word-level splitting; this call does not use it.
sent_tokenize(sample_text)

['The Mark XIV bomb sight was developed by Royal Air Force Bomber Command during the Second World War.',
 'It was their standard bombsight for the second half of the War, replacing the First World War-era CSBS beginning in 1942.',
 'Essentially an automated version of the CSBS, it used a mechanical computer to update the sights in real-time.',
 'It required only 10 seconds of straight flight before a bomb drop, and automatically accounted for shallow climbs and dives.',
 'It contained a gyro stabilization platform that kept the sight pointed at the target as the bomber manoeuvred, dramatically increasing its accuracy and ease of sighting.',
 'It demonstrated accuracy roughly equal to the contemporary Norden bombsight, and was smaller, easier to use, faster-acting and better suited to night bombing.']

In [5]:
## TweetTokenizer is NLTK's tokenizer for tweets (per the NLTK docs it is built
## to cope with tweet-specific tokens such as handles, hashtags and emoticons).
## The sample text contains none of those, so the visible output looks like the
## word_tokenize() result: punctuation is split off and hyphenated words such as
## 'War-era' and 'real-time' stay whole.
TweetTokenizer().tokenize(sample_text)

['The',
 'Mark',
 'XIV',
 'bomb',
 'sight',
 'was',
 'developed',
 'by',
 'Royal',
 'Air',
 'Force',
 'Bomber',
 'Command',
 'during',
 'the',
 'Second',
 'World',
 'War',
 '.',
 'It',
 'was',
 'their',
 'standard',
 'bombsight',
 'for',
 'the',
 'second',
 'half',
 'of',
 'the',
 'War',
 ',',
 'replacing',
 'the',
 'First',
 'World',
 'War-era',
 'CSBS',
 'beginning',
 'in',
 '1942',
 '.',
 'Essentially',
 'an',
 'automated',
 'version',
 'of',
 'the',
 'CSBS',
 ',',
 'it',
 'used',
 'a',
 'mechanical',
 'computer',
 'to',
 'update',
 'the',
 'sights',
 'in',
 'real-time',
 '.',
 'It',
 'required',
 'only',
 '10',
 'seconds',
 'of',
 'straight',
 'flight',
 'before',
 'a',
 'bomb',
 'drop',
 ',',
 'and',
 'automatically',
 'accounted',
 'for',
 'shallow',
 'climbs',
 'and',
 'dives',
 '.',
 'It',
 'contained',
 'a',
 'gyro',
 'stabilization',
 'platform',
 'that',
 'kept',
 'the',
 'sight',
 'pointed',
 'at',
 'the',
 'target',
 'as',
 'the',
 'bomber',
 'manoeuvred',
 ',',
 'dramatic

In [6]:
## WordPunctTokenizer splits the text into runs of word characters (letters,
## digits, underscore) and runs of other non-whitespace characters,
## using the regexp \w+|[^\w\s]+.
## As a result hyphenated words are broken apart: in the output 'War-era'
## becomes 'War', '-', 'era' and 'real-time' becomes 'real', '-', 'time'.
WordPunctTokenizer().tokenize(sample_text)

['The',
 'Mark',
 'XIV',
 'bomb',
 'sight',
 'was',
 'developed',
 'by',
 'Royal',
 'Air',
 'Force',
 'Bomber',
 'Command',
 'during',
 'the',
 'Second',
 'World',
 'War',
 '.',
 'It',
 'was',
 'their',
 'standard',
 'bombsight',
 'for',
 'the',
 'second',
 'half',
 'of',
 'the',
 'War',
 ',',
 'replacing',
 'the',
 'First',
 'World',
 'War',
 '-',
 'era',
 'CSBS',
 'beginning',
 'in',
 '1942',
 '.',
 'Essentially',
 'an',
 'automated',
 'version',
 'of',
 'the',
 'CSBS',
 ',',
 'it',
 'used',
 'a',
 'mechanical',
 'computer',
 'to',
 'update',
 'the',
 'sights',
 'in',
 'real',
 '-',
 'time',
 '.',
 'It',
 'required',
 'only',
 '10',
 'seconds',
 'of',
 'straight',
 'flight',
 'before',
 'a',
 'bomb',
 'drop',
 ',',
 'and',
 'automatically',
 'accounted',
 'for',
 'shallow',
 'climbs',
 'and',
 'dives',
 '.',
 'It',
 'contained',
 'a',
 'gyro',
 'stabilization',
 'platform',
 'that',
 'kept',
 'the',
 'sight',
 'pointed',
 'at',
 'the',
 'target',
 'as',
 'the',
 'bomber',
 'manoeuvre