In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
import seaborn as sns

In [5]:
sns.set_context('poster', font_scale=1.25)

In [6]:
import findspark as fs

In [7]:
fs.init()

In [8]:
import pyspark as ps

In [9]:
import multiprocessing as mp

In [10]:
mp.cpu_count()

12

In [11]:
config = ps.SparkConf()
config = config.setMaster('local[' + str(2*mp.cpu_count()) + ']')
config = config.setAppName('anagram_solver')

In [12]:
sc = ps.SparkContext(conf=config)

In [13]:
wlist = sc.textFile('EOWL_words.txt', use_unicode=True)

In [14]:
word_count = wlist.map(lambda x: 1)

In [15]:
word_count.sum()

128985

Ok. That is a lot of words. So, calculating permutations for each is likely hopeless. We probably have to do it one at a time...not sure.

In [16]:
wlist.take(10)

[u'aa',
 u'aah',
 u'aal',
 u'aalii',
 u'aardvark',
 u'aardvarks',
 u'aardwolf',
 u'aardwolves',
 u'aargh',
 u'aarrghh']

Wow. We need to compute all possible anagrams of *each word* in this anagram. Hardcore. 

For a given word, we need to create all possible mixes of a word. And then filter those by real words. Let's figure out how to do that.

In [17]:
import string

In [18]:
alphabet = string.ascii_lowercase
alphabet_broadcast = sc.broadcast(alphabet)

In [19]:
def get_anagrams_key(input_str):
    key = [0 for z in range(26)]
    for count, cur_letter in enumerate(alphabet_broadcast.value):
        key[count] = input_str.count(cur_letter)
    key = tuple(key)
    return (key, input_str)

What we need to do is make a key corresponding to 26 values...and then loop through each key, count the a's, b's, c's, etc. and go from there.

In [20]:
key_string_rdd = wlist.map(get_anagrams_key)

In [21]:
grouped_by_anagram_rdd = key_string_rdd.groupByKey()

We now need to convert this into the final form!

In [22]:
def get_into_final_form(x):
    words = list(x[1])
    num_anagrams = len(words)
    sorted_key = ''.join(sorted(words[0]))
    return (sorted_key, num_anagrams, words)

In [23]:
final_form_rdd = grouped_by_anagram_rdd.map(get_into_final_form)

Well...this is working better than my last attempt. Still sucking though. It's gonna take too long. What am I missing here? There has to be a faster way.

In [24]:
final_form_result = final_form_rdd.collect()

Cool. Let's take the 20 most common words.

In [25]:
final_form_result.sort(key=lambda x: x[1], reverse=True)

In [26]:
final_form_result[0:5]

[(u'aerst\xea',
  12,
  [u'ar\xeates',
   u'arets',
   u'aster',
   u'rates',
   u'reast',
   u'resat',
   u'stare',
   u'stear',
   u'strae',
   u'tares',
   u'tears',
   u'teras']),
 (u'aeprs',
  11,
  [u'asper',
   u'pares',
   u'parse',
   u'pears',
   u'prase',
   u'presa',
   u'rapes',
   u'reaps',
   u'spaer',
   u'spare',
   u'spear']),
 (u'aelst',
  11,
  [u'least',
   u'leats',
   u'salet',
   u'slate',
   u'stale',
   u'steal',
   u'stela',
   u'taels',
   u'tales',
   u'teals',
   u'tesla']),
 (u'aelrst',
  10,
  [u'alerts',
   u'alters',
   u'artels',
   u'laster',
   u'ratels',
   u'salter',
   u'slater',
   u'staler',
   u'stelar',
   u'talers']),
 (u'aelpst',
  10,
  [u'palest',
   u'palets',
   u'pastel',
   u'peltas',
   u'petals',
   u'plaste',
   u'plates',
   u'pleats',
   u'septal',
   u'staple'])]