In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download()
from nltk import ngrams, FreqDist
import re
import string
from multiprocessing import Pool
from nltk.corpus import brown
from nltk.stem import PorterStemmer
import pandas as pd

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [2]:
#Creating a wordlist with all the words for this script
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
print('Entire word list has {} words.'.format(len(set(wordlist))))
print(wordlist[:100])
#list of the alphabet to iterate through later
alphabet = list(string.ascii_lowercase)

Entire word list has 210687 words.
['a', 'aa', 'aal', 'aalii', 'aam', 'aardvark', 'aardwolf', 'aba', 'abac', 'abaca', 'abacate', 'abacay', 'abacinate', 'abacination', 'abaciscus', 'abacist', 'aback', 'abactinal', 'abactinally', 'abaction', 'abactor', 'abaculus', 'abacus', 'abaff', 'abaft', 'abaisance', 'abaiser', 'abaissed', 'abalienate', 'abalienation', 'abalone', 'abampere', 'abandon', 'abandonable', 'abandoned', 'abandonedly', 'abandonee', 'abandoner', 'abandonment', 'abaptiston', 'abarthrosis', 'abarticular', 'abarticulation', 'abas', 'abase', 'abased', 'abasedly', 'abasedness', 'abasement', 'abaser', 'abash', 'abashed', 'abashedly', 'abashedness', 'abashless', 'abashlessly', 'abashment', 'abasia', 'abasic', 'abask', 'abastardize', 'abatable', 'abate', 'abatement', 'abater', 'abatis', 'abatised', 'abaton', 'abator', 'abattoir', 'abature', 'abave', 'abaxial', 'abaxile', 'abaze', 'abb', 'abbacomes', 'abbacy', 'abbas', 'abbasi', 'abbassi', 'abbatial', 'abbatical', 'abbess', 'abbey', '

In [3]:
#Creating a stemmed word list to eliminate derivatives of words
#Actually not going to use stemmed words for this analysis since the spelling of the words is integral in the game
porter = PorterStemmer()
stem_list = []
for word in wordlist:
    temp = porter.stem(word)
    stem_list.append(temp)
stem_list = list(dict.fromkeys(stem_list))
print('Entire stemmed word list has {} words.'.format(len(stem_list)))
print(stem_list[:100])

Entire stemmed word list has 156802 words.
['a', 'aa', 'aal', 'aalii', 'aam', 'aardvark', 'aardwolf', 'aba', 'abac', 'abaca', 'abacay', 'abacin', 'abaciscu', 'abacist', 'aback', 'abactin', 'abact', 'abactor', 'abaculu', 'abacu', 'abaff', 'abaft', 'abais', 'abaiss', 'abalien', 'abalon', 'abamper', 'abandon', 'abandonedli', 'abandone', 'abaptiston', 'abarthrosi', 'abarticular', 'abarticul', 'abas', 'abasedli', 'abased', 'abash', 'abashedli', 'abashed', 'abashless', 'abashlessli', 'abasia', 'abask', 'abastard', 'abat', 'abati', 'abatis', 'abaton', 'abattoir', 'abatur', 'abav', 'abaxi', 'abaxil', 'abaz', 'abb', 'abbacom', 'abbaci', 'abba', 'abbasi', 'abbassi', 'abbati', 'abbat', 'abbess', 'abbey', 'abbeysted', 'abbot', 'abbotci', 'abbotnulliu', 'abbotship', 'abbrevi', 'abbreviatori', 'abbreviatur', 'abcoulomb', 'abdal', 'abdat', 'abdest', 'abdic', 'abdit', 'abditori', 'abdomen', 'abdomin', 'abdominalian', 'abdominoanterior', 'abdominocardiac', 'abdominocentesi', 'abdominocyst', 'abdominoge

In [4]:
def createWordList(letter):
    '''
    function to create the list of all the words that begin with that letter
    '''
    my_regex = '^' + re.escape(letter)
    root_word = [w for w in wordlist if re.search(my_regex, w, re.IGNORECASE)]
    
    letter_list = []

    for w in range(len(root_word)):
        letter_list.append(root_word[w])
        
    return letter_list

In [31]:
def getRoots(word_list, j):
    '''
    function to find the best root (most off-shoot words) for the given number of letters, along with the number of off-shoots
    ex. If you say 4, how many words start with 'anti', how many start with 'alph', the root with the most words will be returned
    '''
    n = 0
    m = 0
    winner_root = None

    for i in range(len(word_list)):
        last_word = word_list[i-1]
        word = word_list[i]
        try:
            if last_word[:j] == word[:j]:
                n = n + 1                      
            else:
                #replace the winner root with the new one that has more off-shoots
                if n > m:
                    n = n + 1
                    m = n
                    winner_root = last_word[:j]
                    n = 0
                else:
                    #if the word's root wasn't better then
                    n = 0
        except:
            #This exception runs once because you can't see the i-1 word the first time
            this = none
    if winner_root == None:
        winner_root = word_list[0][0]
        m = len(word_list)
    
    return winner_root, m

In [32]:
getRoots(wordlist, 4)

('over', 2043)

In [33]:
def bestRoots(rootNumber):
    '''
    input: number for how many letters in root
    output: the best root with the most off-shoot words for every letter, along with the count of how many off-shoot words
    '''
    root = []
    for i in range(len(alphabet)):
        root.append(getRoots(createWordList(alphabet[i]), rootNumber))
        #print(getRoots(createWordList(alphabet[i]), rootNumber))
    
    df = pd.DataFrame(root)
    df.columns = ['Root' + str(rootNumber), 'Count' + str(rootNumber)]
    df.rename(index = {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e', 5: 'f', 6: 'g', 7: 'h', 8: 'i', 9: 'j', 10: 'k', 11: 'l', 12: 'm', 13: 'n', 14: 'o', 15: 'p', 16: 'q', 17: 'r', 18: 's', 19: 't', 20: 'u', 21: 'v', 22: 'w', 23: 'x', 24: 'y', 25: 'z'}, inplace=True)        
        
    return df

In [34]:
#example - getting the best root for 'q' obviously yields 'qu', and we learn it has 1067 words that start with 'qu'
print(getRoots(createWordList('q'), 2))
print(getRoots(createWordList('a'), 2)[0])
print(bestRoots(1))

('qe', 2)
an
  Root1  Count1
a     a   14537
b     b    9675
c     c   17406
d     d    9946
e     e    7818
f     f    6382
g     g    5843
h     h    7889
i     i    8303
j     j    1158
k     k    1735
l     l    5211
m     m   10709
n     n    6098
o     o    7219
p     p   22171
q     q    1075
r     r    8955
s     s   22759
t     t   11389
u     u   16179
v     v    3079
w     w    3607
x     x     293
y     y     532
z     z     719


In [17]:
#df = pd.DataFrame({'root':[getRoots(createWordList('a'), 2)[0], getRoots(createWordList('b'), 2)[0]]}, index = ['a', 'b'])
df = bestRoots(1)
i=2
while True:
    df = pd.concat([df, bestRoots(i)], axis=1, join = 'inner')
    i = i + 1
    if i > 5:
        break
               
df.to_csv("best_roots.csv")

In [21]:
yellow = []
for word in wordlist:
    if word.startswith('anthropomo'):
        yellow.append(word)
print(yellow)
print(len(yellow))

['anthropomorph', 'anthropomorphic', 'anthropomorphical', 'anthropomorphically', 'anthropomorphism', 'anthropomorphist', 'anthropomorphite', 'anthropomorphitic', 'anthropomorphitical', 'anthropomorphitism', 'anthropomorphization', 'anthropomorphize', 'anthropomorphological', 'anthropomorphologically', 'anthropomorphology', 'anthropomorphosis', 'anthropomorphotheist', 'anthropomorphous', 'anthropomorphously']
19


In [22]:
#best four letter roots for all 26 letters
bestRoots(4)

Unnamed: 0,Root4,Count4
a,anti,1078
b,back,152
c,coun,512
d,disc,338
e,elec,333
f,fore,468
g,gast,199
h,hype,486
i,inte,1323
j,jack,42


In [23]:
#best three letter roots for all 26 letters
bestRoots(3)

Unnamed: 0,Root3,Count3
a,ant,1622
b,bra,460
c,con,1895
d,dis,1687
e,epi,567
f,for,886
g,gra,546
h,hyp,1034
i,int,1799
j,jac,76


In [24]:
def wordsWithThatRoot(root):
    root_list = []
    for word in wordlist:
        if word.startswith(root):
            root_list.append(word)
        else:
            continue

    return root_list

In [25]:
brown_list = set(brown.words())
'accommodate' in brown_list

True

In [26]:
def countWithThatRoot(root):
    root_list = []
    for word in wordlist:
        if word.startswith(root):
            root_list.append(word)
        else:
            continue

    return len(root_list)

In [27]:
print(wordsWithThatRoot('accom'))

['accombination', 'accommodable', 'accommodableness', 'accommodate', 'accommodately', 'accommodateness', 'accommodating', 'accommodatingly', 'accommodation', 'accommodational', 'accommodative', 'accommodativeness', 'accommodator', 'accompanier', 'accompaniment', 'accompanimental', 'accompanist', 'accompany', 'accompanyist', 'accompletive', 'accomplice', 'accompliceship', 'accomplicity', 'accomplish', 'accomplishable', 'accomplished', 'accomplisher', 'accomplishment', 'accomplisht', 'accompt']


In [28]:
def getLetterBreakdown(word):
    n = 1
    total = 0
    number = []
    results = []
    word_list = list(word)
    for i in range(len(word_list)):
        short = word[:n]
        each = wordsWithThatRoot(short)
        number = countWithThatRoot(short)
        results.append(each)
        if number == 1:
            total = total + number
            print(short + ' - ' + str(number) + " "+ str(', '.join(each)))
            break
        n = n + 1
        total = total + number
        if number < 100:
            print(short + ' - ' + str(number) + " "+ str(', '.join(each)))
        else:
            print(short + ' - ' + str(number))
    print("The total number of possible words for " + word + ": " + str(total))

In [29]:
def getLetterCount(word):
    n = 1
    total = 0
    number = []
    results = []
    word_list = list(word)
    for i in range(len(word_list)):
        short = word[:n]
        number = countWithThatRoot(short)
        if number == 1:
            total = total + number
            break
        n = n + 1
        total = total + number

    return total

In [30]:
getLetterBreakdown("apple")

a - 14537
ap - 899
app - 296
appl - 54 applanate, applanation, applaud, applaudable, applaudably, applauder, applaudingly, applause, applausive, applausively, apple, appleberry, appleblossom, applecart, appledrane, applegrower, applejack, applejohn, applemonger, applenut, appleringy, appleroot, applesauce, applewife, applewoman, appliable, appliableness, appliably, appliance, appliant, applicability, applicable, applicableness, applicably, applicancy, applicant, applicate, application, applicative, applicatively, applicator, applicatorily, applicatory, applied, appliedly, applier, applique, applosion, applosive, applot, applotment, apply, applyingly, applyment
apple - 15 apple, appleberry, appleblossom, applecart, appledrane, applegrower, applejack, applejohn, applemonger, applenut, appleringy, appleroot, applesauce, applewife, applewoman
The total number of possible words for apple: 15801


In [19]:
getLetterCount("acanthopterygian")

16072

In [20]:
def bestWord(words):
    best_word = None
    count = 0
    for i in range(len(words)):
        last_word = words[i-1]
        current_word = words[i]
        try:
            if getLetterCount(current_word) > getLetterCount(best_word):
                best_word = current_word
                count = getLetterCount(current_word)
            else:
                continue
        except:
            best_word = current_word

    return best_word, count

In [21]:
letterWordCount = []
for letter in alphabet:
    letterWordCount.append(tuple((letter, str(len(createWordList(letter))))))
    print(letter + ': ' + str(len(createWordList(letter))))

a: 14537
b: 9675
c: 17406
d: 9946
e: 7818
f: 6382
g: 5843
h: 7889
i: 8303
j: 1158
k: 1735
l: 5211
m: 10709
n: 6098
o: 7219
p: 22171
q: 1075
r: 8955
s: 22759
t: 11389
u: 16179
v: 3079
w: 3607
x: 293
y: 532
z: 719


In [24]:
best_words = []
for letter in alphabet[15]:
    temp = bestWord(createWordList(letter))
    print(temp)
    best_words.append(temp)

('preconcentratedly', 32100)


In [48]:
best_words2 = [('antiparliamentarist', 20641), ('bacterioscopically', 12285), ('contradictiously', 26519), ('discontinuously', 15751), ('extraterritoriality', 10335), ('forestaller', 9323), ('gastroenterological', 7935), ('hyperconscientiousness', 11860), ('intercommunication', 18192), ('jackassification', 1638), ('kinetographer', 2395), ('laryngoscopical', 7155), ('microcolorimetrically', 14430), ('nonconformistically', 13243), ('overstridently', 13928), ('preconcentratedly', 32100), ('quadricuspidal', 3238), ('reconciliatory', 15613), ('supersensualistic', 30454), ('transportational', 16688), ('understandableness', 35127), ('vermiculated', 4601), ('waterloggedness', 4843), ('xanthochromia', 709), ('yellowshanks', 854), ('zoophytological', 1363)]

In [49]:
print(best_words2)

[('antiparliamentarist', 20641), ('bacterioscopically', 12285), ('contradictiously', 26519), ('discontinuously', 15751), ('extraterritoriality', 10335), ('forestaller', 9323), ('gastroenterological', 7935), ('hyperconscientiousness', 11860), ('intercommunication', 18192), ('jackassification', 1638), ('kinetographer', 2395), ('laryngoscopical', 7155), ('microcolorimetrically', 14430), ('nonconformistically', 13243), ('overstridently', 13928), ('preconcentratedly', 32100), ('quadricuspidal', 3238), ('reconciliatory', 15613), ('supersensualistic', 30454), ('transportational', 16688), ('understandableness', 35127), ('vermiculated', 4601), ('waterloggedness', 4843), ('xanthochromia', 709), ('yellowshanks', 854), ('zoophytological', 1363)]


In [105]:
fdist = FreqDist(i.lower() for i in brown.words())

In [223]:
fdist['zygozoospore']

0

In [231]:
def mostRare(word_list):
    best_word = None
    for i in range(len(word_list)):
        last_word = word_list[i-1]
        current_word = word_list[i]
        if i == 0:
            best_word = current_word
        if fdist[current_word] < fdist[best_word] and fdist[current_word] > 1:
            best_word = current_word
        if fdist[current_word] == fdist[best_word] and (len(current_word) > len(last_word)):
            best_word = current_word
        else:
            continue

    return best_word

In [236]:
back_words = [w for w in nltk.corpus.words.words('en') if w.islower() and w.startswith('zoo')]

In [237]:
print(back_words)

['zoo', 'zoobenthos', 'zooblast', 'zoocarp', 'zoocecidium', 'zoochemical', 'zoochemistry', 'zoochemy', 'zoochore', 'zoocoenocyte', 'zoocultural', 'zooculture', 'zoocurrent', 'zoocyst', 'zoocystic', 'zoocytial', 'zoocytium', 'zoodendria', 'zoodendrium', 'zoodynamic', 'zoodynamics', 'zooecia', 'zooecial', 'zooecium', 'zooerastia', 'zooerythrin', 'zoofulvin', 'zoogamete', 'zoogamous', 'zoogamy', 'zoogene', 'zoogenesis', 'zoogenic', 'zoogenous', 'zoogeny', 'zoogeographer', 'zoogeographic', 'zoogeographical', 'zoogeographically', 'zoogeography', 'zoogeological', 'zoogeologist', 'zoogeology', 'zoogloea', 'zoogloeal', 'zoogloeic', 'zoogonic', 'zoogonidium', 'zoogonous', 'zoogony', 'zoograft', 'zoografting', 'zoographer', 'zoographic', 'zoographical', 'zoographically', 'zoographist', 'zoography', 'zooid', 'zooidal', 'zooidiophilous', 'zooks', 'zoolater', 'zoolatria', 'zoolatrous', 'zoolatry', 'zoolite', 'zoolith', 'zoolithic', 'zoolitic', 'zoologer', 'zoologic', 'zoological', 'zoologically', '

In [238]:
print(mostRare(back_words))

zoologist


In [241]:
def bestRoots2(rootNumber):
    '''
    input: number for how many letters in root
    output: the best root with the most off-shoot words for every letter, along with the count of how many off-shoot words
    '''
    root = []
    for i in range(len(alphabet)):
        rootLetters = getRoots(createWordList(alphabet[i]), rootNumber)
        rareList = [w for w in nltk.corpus.words.words('en') if w.islower() and w.startswith(rootLetters[0])]
        rareWord = mostRare(rareList)
        root.append(tuple((rootLetters[0], rootLetters[1], rareWord)))
    
    df = pd.DataFrame(root)
    df.columns = ['Root' + str(rootNumber), 'Count' + str(rootNumber), 'Rare Word' + str(rootNumber)]
    df.rename(index = {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e', 5: 'f', 6: 'g', 7: 'h', 8: 'i', 9: 'j', 10: 'k', 11: 'l', 12: 'm', 13: 'n', 14: 'o', 15: 'p', 16: 'q', 17: 'r', 18: 's', 19: 't', 20: 'u', 21: 'v', 22: 'w', 23: 'x', 24: 'y', 25: 'z'}, inplace=True)        
        
    return df

In [242]:
bestRoots2(3)

Unnamed: 0,Root3,Count3,Rare Word3
a,ant,1621,anthropologist
b,bra,459,brazilwood
c,con,1895,convincingly
d,dis,1687,disyllabic
e,epi,567,epizootiology
f,for,885,forthrightness
g,gra,545,grazingly
h,hyp,1033,hypsothermometer
i,int,1799,intussusception
j,jac,75,jaculiferous


In [102]:
df = bestRoots2(1)
i=2
while True:
    df = pd.concat([df, bestRoots2(i)], axis=1, join = 'inner')
    i = i + 1
    if i > 4:
        break
               
print(df)

  Root1  Count1      Rarest Word1 Root2  Count2 Rarest Word2 Root3  Count3  \
a     a   14537           azymous    an    3151    anywither   ant    1621   
b     b    9675            bywork    be    2063        bezzo   bra     459   
c     c   17405          czarship    co    6357         cozy   con    1895   
d     d    9945            dzeren    di    3471        dizzy   dis    1687   
e     e    7817              ezba    en    1611    enzymotic   epi     567   
f     f    6381              fyrd    fo    1433         fozy   for     885   
g     g    5843              gyve    ga    1209     gazzetta   gra     545   
h     h    7889  hystricomorphous    he    2297       heyday   hyp    1033   
i     i    8303            izzard    in    5237       inyoke   int    1799   
j     j    1157              jynx    ja     339        jazzy   jac      75   
k     k    1735              kyte    ki     395         kiyi   kin     145   
l     l    5211            lyxose    la    1553    lazzaroni   l