# Using a Dictionary in Python via NLTK

Documentation:  https://www.nltk.org/howto/corpus.html#corpus-reader-objects

Dataset Options:  https://www.nltk.org/nltk_data/


In [1]:
from nltk.corpus import words
import re
import pandas as pd


## Find words that contain certain letters

### Specify letters to search for

In [2]:
# Enter the letters that must be included.
# The string is consumed one character at a time when the search pattern is
# built, so each character is a separate required letter. The pattern is
# compiled with re.IGNORECASE, so the case used here does not matter.
letters_to_include = 'Et'


### Create the search pattern

In [3]:
# Build a regex that matches any word containing every required letter.
# Each letter contributes one zero-width lookahead, (?=.*X), anchored at the
# beginning of the string, so the letters may appear in any order and at any
# position in the word.
# re.escape keeps a regex metacharacter in letters_to_include (e.g. '.' or
# '(') from being interpreted as pattern syntax instead of a literal letter.
lookaheads = "".join(
    f"(?=.*{re.escape(letter)})" for letter in letters_to_include
)

# Anchor the lookaheads at the start and complete the pattern with the
# ending string that consumes the rest of the word
pattern_temp = f"^{lookaheads}.*$"

# Now we have the following search pattern
print(f'pattern_temp = {pattern_temp}')

# Compile the string pattern_temp into a REGEX pattern that we can match on.
# re.IGNORECASE lets each required letter match in either case.
pattern01 = re.compile(pattern_temp, re.IGNORECASE)
print(f'pattern01 = {pattern01}')

pattern_temp = ^(?=.*E)(?=.*t).*$
pattern01 = re.compile('^(?=.*E)(?=.*t).*$', re.IGNORECASE)


### Test the pattern matching

In [4]:

# Sanity-check the compiled pattern against a single known word.
# Pattern.match returns a match object on success and None otherwise.
w = 'test'
if pattern01.match(w) is not None:
    print(w)

test


### Search through the words corpus

In [5]:
# Keep every entry of the words corpus that satisfies the compiled pattern.
# A match object is truthy and None is falsy, so the result of match() can
# be used directly as the filter condition.
matched_words = [
    candidate for candidate in words.words() if pattern01.match(candidate)
]

print(f"The letters [{letters_to_include}] matched {len(matched_words)} words")
matched_words

The letters [Et] matched 84725 words


['Aaronite',
 'abacate',
 'abacinate',
 'Abadite',
 'abalienate',
 'abalienation',
 'abandonment',
 'Abantes',
 'abasement',
 'abashment',
 'abastardize',
 'abatable',
 'abate',
 'abatement',
 'abater',
 'abatised',
 'abature',
 'abbeystede',
 'abbreviate',
 'abbreviately',
 'abbreviation',
 'abbreviator',
 'abbreviatory',
 'abbreviature',
 'Abderite',
 'abdest',
 'abdicate',
 'abdicative',
 'abditive',
 'abdominoanterior',
 'abdominocentesis',
 'abdominogenital',
 'abdominohysterectomy',
 'abdominohysterotomy',
 'abdominoposterior',
 'abducent',
 'Abelite',
 'abelite',
 'abeltree',
 'abenteric',
 'abepithymia',
 'aberrant',
 'aberrate',
 'aberration',
 'aberrational',
 'aberrator',
 'aberrometer',
 'aberuncator',
 'abet',
 'abetment',
 'abettal',
 'abettor',
 'abevacuation',
 'abeyant',
 'abhorrent',
 'abhorrently',
 'abietate',
 'abietene',
 'abietic',
 'abietin',
 'Abietineae',
 'abietineous',
 'abietinic',
 'abigeat',
 'abintestate',
 'abiogenesist',
 'abiogenetic',
 'abiogenetical

## Sort words into lists depending on patterns matched

In [6]:
# One compiled filter per letter sequence of interest
pattern01 = re.compile(".*ie.*")
pattern02 = re.compile(".*ei.*")

# One result list per filter
pattern01_words, pattern02_words = [], []

# Walk the corpus in lowercase form and file each word under the first
# filter it satisfies. pattern01 is checked first, so a word that matches
# both filters is recorded only in pattern01_words.
for w in map(str.lower, words.words()):
    if pattern01.match(w):
        pattern01_words.append(w)
        continue
    if pattern02.match(w):
        pattern02_words.append(w)

print("Done!")

Done!


In [7]:
# Display the lowercased words that matched pattern01 (the ".*ie.*" filter)
pattern01_words

['abalienate',
 'abalienation',
 'abbie',
 'abdiel',
 'abie',
 'abies',
 'abietate',
 'abietene',
 'abietic',
 'abietin',
 'abietineae',
 'abietineous',
 'abietinic',
 'abiezer',
 'abortient',
 'abortifacient',
 'absorbefacient',
 'abthainrie',
 'acadie',
 'accidie',
 'accipient',
 'accompanier',
 'acetifier',
 'acetothienone',
 'achievable',
 'achieve',
 'achievement',
 'achiever',
 'achromobacterieae',
 'acidifier',
 'acier',
 'acierage',
 'acieral',
 'acierate',
 'acieration',
 'acquiesce',
 'acquiescement',
 'acquiescence',
 'acquiescency',
 'acquiescent',
 'acquiescently',
 'acquiescer',
 'acquiescingly',
 'acrasieae',
 'actifier',
 'actinodielectric',
 'acturience',
 'adagietto',
 'addie',
 'adiel',
 'adieu',
 'adieux',
 'adoulie',
 'adrienne',
 'advenience',
 'advenient',
 'aeolsklavier',
 'aerie',
 'aeried',
 'afield',
 'afterfriend',
 'aftergrief',
 'afterpiece',
 'aggie',
 'aggrievance',
 'aggrieve',
 'aggrieved',
 'aggrievedly',
 'aggrievedness',
 'aggrievement',
 'agiel',
 