In [14]:
#open the words file
wordFile = open('words','r')

In [15]:
#read from the wordFile
words = wordFile.readlines()

#look at the first ten words
words[:10]

['A\n',
 'a\n',
 'aa\n',
 'aal\n',
 'aalii\n',
 'aam\n',
 'Aani\n',
 'aardvark\n',
 'aardwolf\n',
 'Aaron\n']

In [19]:
#clean the words
cleanWords = [word.strip().lower() for word in words]
cleanWords[:10]

['a',
 'a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aani',
 'aardvark',
 'aardwolf',
 'aaron']

In [22]:
#only the unique words
uniqueWords = list(set(cleanWords))

#sort the words abc...
uniqueWords.sort()
uniqueWords[:10]

['a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aani',
 'aardvark',
 'aardwolf',
 'aaron',
 'aaronic']

In [27]:
#video did this with a list comprehension but I think this is too messy and error prone
uniqueWords = sorted(list(set([word.strip().lower() for word in open('words','r')])))
uniqueWords[:10]

['a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aani',
 'aardvark',
 'aardwolf',
 'aaron',
 'aaronic']

In [32]:
#anagram detection
sorted('lives') == sorted('elvis')

True

In [31]:
#not an anagram
sorted('love') == sorted('hate')

False

In [33]:
#joins
def signature(word):
    #join with an empty str
    return ''.join(sorted(word))

In [35]:
signature('lives')

'eilsv'

In [36]:
#anagram detection on all words
def anagram(w):
    return [word for word in uniqueWords if signature(word) == signature(w)]

In [37]:
anagram('dictionary')

['dictionary', 'indicatory']

In [45]:
%timeit anagram('dictionary')

928 ms ± 76.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [47]:
wordBySignature = {}
for word in uniqueWords:
    wordBySignature[signature(word)].append(word)

KeyError: 'a'

In [51]:
import collections
#has a method to fix this previous error
wordBySignature = collections.defaultdict(list)

In [52]:
for word in uniqueWords:
    wordBySignature[signature(word)].append(word)

In [53]:
#by using a dict we are able to get a much more performant algorithim using the dict
def anagram_fast(word):
    return(wordBySignature[signature(word)])

In [54]:
%timeit anagram_fast('dictionary')

1.78 µs ± 172 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [66]:
%timeit anagrams_nonTrivial = {word: anagram_fast(word) for word in uniqueWords if len(anagram_fast(word))> 1}

605 ms ± 52.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [57]:
#so on over 230k words it ran in stastically less time than using just an array
len(uniqueWords)

234371

In [65]:
#the close of that comprehension got all only the non-trivial anagrams
len(anagrams_nonTrivial)

32890