# Finding Unusual Words for a Given Language

In [1]:
# Sample sentence pair that the rest of the notebook scans for unusual words.
text = "Truly Kryptic is a free browser-based puzzle game. Google it if you like puzzles."

## 1. Tokenization

In [2]:
from nltk import word_tokenize

# Lowercase the text before tokenizing so that every resulting token is lowercase.
text_tokenized = word_tokenize(text.lower())
# Display the token list (punctuation marks come out as their own tokens).
text_tokenized

['truly',
 'kryptic',
 'is',
 'a',
 'free',
 'browser-based',
 'puzzle',
 'game',
 '.',
 'google',
 'it',
 'if',
 'you',
 'like',
 'puzzles',
 '.']

## 2. Importing and exploring the `words` corpus

In [3]:
from nltk.corpus import words

# Show only the first 65 characters of the corpus README; print() is used so
# that the embedded newlines are rendered instead of shown as escape sequences.
print(words.readme()[:65])

Wordlists

en: English, http://en.wikipedia.org/wiki/Words_(Unix)


In [4]:
# List the identifiers of the word lists available in the corpus.
words.fileids()

['en', 'en-basic']

In [5]:
# Peek at the first ten entries of the 'en' word list (note the mixed casing).
words.words('en')[:10]

['A',
 'a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'Aani',
 'aardvark',
 'aardwolf',
 'Aaron']

In [6]:
# Peek at the first ten entries of the 'en-basic' word list.
words.words('en-basic')[:10]

['I',
 'a',
 'able',
 'about',
 'account',
 'acid',
 'across',
 'act',
 'addition',
 'adjustment']

In [7]:
# Number of entries in the 'en' word list.
len(words.words('en'))

235886

In [8]:
# Number of entries in the 'en-basic' word list, for comparison with 'en'.
len(words.words('en-basic'))

850

## 3. Finding unusual words

In [9]:
# Reference vocabulary: every entry of the corpus (no file id given, so all
# word lists are used), lowercased so the comparison is case-insensitive.
english_vocab = set(map(str.lower, words.words()))

# Vocabulary of the text: keep purely alphabetic tokens only, which discards
# punctuation tokens (and, as a side effect, hyphenated words).
text_vocab = {token.lower() for token in text_tokenized if token.isalpha()}

# Tokens from the text that are absent from the reference vocabulary.
unusual = text_vocab - english_vocab
unusual

{'google', 'puzzles'}

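**NB:** By using `isalpha()` to remove punctuation tokens, we also removed hyphenated words such as `browser-based`. In a real application, we would probably prefer to split them around the hyphen and check their parts too. Note also that `puzzles` is reported as unusual even though `puzzle` is not, which suggests that the word list does not contain every inflected form; lemmatizing or stemming the tokens before the lookup would reduce such false positives.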