# Finding Unusual Words for a Given Language

In [1]:
# Sample sentence that the later cells tokenize and compare against the NLTK word lists.
text = "Truly Kryptic is a free browser-based puzzle game. Google it if you like puzzles."

## 1. Tokenizing text

In [2]:
from nltk import word_tokenize
# Lowercase the text before tokenizing, so the tokens are produced from lowercase input.
text_tokenized = word_tokenize(text.lower())
# Bare last expression: display the token list as the cell output.
text_tokenized

['truly',
 'kryptic',
 'is',
 'a',
 'free',
 'browser-based',
 'puzzle',
 'game',
 '.',
 'google',
 'it',
 'if',
 'you',
 'like',
 'puzzles',
 '.']

## 2. Importing and exploring the words corpus

In [3]:
from nltk.corpus import words
# Replace newlines with spaces so the corpus README is shown as a single line.
words.readme().replace('\n', ' ')

'Wordlists  en: English, http://en.wikipedia.org/wiki/Words_(Unix) en-basic: 850 English words: C.K. Ogden in The ABC of Basic English (1932) '

In [4]:
# Display the corpus reader object itself; its repr includes the directory the corpus was loaded from.
words

<WordListCorpusReader in 'C:\\Users\\hzsab\\AppData\\Roaming\\nltk_data\\corpora\\words'>

In [5]:
# List the file identifiers (individual word lists) available in this corpus.
words.fileids()

['en', 'en-basic']

In [6]:
# Preview the first ten entries of the 'en' word list.
words.words('en')[:10]

['A',
 'a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'Aani',
 'aardvark',
 'aardwolf',
 'Aaron']

In [7]:
# Preview the first ten entries of the 'en-basic' word list.
words.words('en-basic')[:10]

['I',
 'a',
 'able',
 'about',
 'account',
 'acid',
 'across',
 'act',
 'addition',
 'adjustment']

In [8]:
# Count the entries in the 'en' word list.
len(words.words('en'))

235886

In [9]:
# Count the entries in the 'en-basic' word list.
len(words.words('en-basic'))

850

## 3. Finding unusual words

In [10]:
# Reference vocabulary: every entry returned by words.words(), lowercased so the
# comparison below is case-insensitive.
english_vocab = {entry.lower() for entry in words.words()}
# Keep only purely alphabetic tokens. This drops punctuation tokens, but it also
# drops hyphenated tokens such as 'browser-based', because .isalpha() is False for them.
text_vocab = {token.lower() for token in text_tokenized if token.isalpha()}
# Anything in the text that is absent from the reference vocabulary counts as "unusual".
# Tokens are compared exactly as written: no stemming or lemmatization is applied.
unusual = text_vocab - english_vocab
unusual

{'google', 'puzzles'}