In [24]:
import numpy as np
import pandas as pd

Let's read the one column csv file into a dataframe variable

In [25]:
word_df = pd.read_csv('./csv/words.csv')

Now lets clean it up slightly by dropping "na"  and take a look at it's shape

In [26]:
word_df = word_df.dropna()
word_df.shape

(370100, 1)

We can also take a quick peek at the first 5 rows

In [27]:
word_df.head()

Unnamed: 0,a
0,aa
1,aaa
2,aah
3,aahed
4,aahing


Or the last 5 rows

In [28]:
word_df.tail()

Unnamed: 0,a
370097,zwinglianism
370098,zwinglianist
370099,zwitter
370100,zwitterion
370101,zwitterionic


Or maybe just a random sampling.  By passing a value to the method, we can specify how may rows are returned in head(), tail() and sample()

In [29]:
word_df.sample(5)

Unnamed: 0,a
135870,hemagogs
264948,refectorary
168301,laparotomist
131041,guidonian
346531,unmollifiable


Since our csv file did not have a header.  Let's name the column 'word'

In [30]:
word_df.columns = ['word']

In [31]:
word_df.sample(5)

Unnamed: 0,word
310736,sulphonyl
115333,flukeworm
232123,perturbative
349569,unrepining
351142,unsilent


How long is the longest word in a this dataframe?

In [32]:

word_df.word.str.len().max()

31

What word is that?!?

In [33]:
word_df.loc[word_df.word.str.len().idxmax(), 'word']

'dichlorodiphenyltrichloroethane'

How many unique letters make up that word and what is their frequency?

In [34]:
longest_word = word_df.loc[word_df.word.str.len().idxmax(), 'word']
longest_word_letters = list(longest_word)
longest_word_letters = pd.DataFrame(longest_word_letters)
longest_word_letters.columns = ['Letters']
longest_word_letters['Count'] = np.zeros(len(longest_word_letters))
letter_count = longest_word_letters.groupby('Letters').count()
letter_count.sort_values('Count', ascending=False, inplace=True)
letter_count

Unnamed: 0_level_0,Count
Letters,Unnamed: 1_level_1
h,4
o,4
e,3
i,3
l,3
r,3
c,2
d,2
n,2
t,2


What is the average length (rounded down) of words in this dataframe?

In [35]:
word_df.word.str.len().mean().round()

9.0

How many words start with the letter 'j'?

In [36]:
word_df.word.str.startswith('j').sum()

2840

How many words start with the letter 's'?

In [37]:
word_df.word.str.startswith('s').sum()

38764

How many words contain the word 'shaker'?

In [38]:
word_df.word.str.contains('shaker', regex=False).sum()

13

I'd like a list of those cool sounding words

In [39]:
shaker_list = word_df[word_df.word.str.contains(pat='shaker', regex=True)]
shaker_df = pd.DataFrame(shaker_list)
shaker_df

Unnamed: 0,word
37867,boneshaker
94857,earthshaker
132698,hallanshaker
133277,handshaker
134705,headshaker
277343,saltshaker
287204,shaker
287205,shakerag
287206,shakerdom
287207,shakeress


The word 'shaker' IS cool.  But what is the word right before it?

In [40]:
right_before = word_df.loc[word_df['word'] == 'shaker'].index[0] - 1
word_df.loc[right_before, 'word']

'shakeproof'

How about 10 after?

In [41]:
ten_after = word_df.loc[word_df['word'] == 'shaker'].index[0] + 10
word_df.loc[ten_after, 'word']

'shakespearean'

Now find all the palindromes and provide a sample.  Exclude 1, 2 and 3 letter words.

In [42]:
# Convert the column into a list
words_as_list = word_df.word.tolist()
# Use list comprehension to 
# 1. Equal to or larger than 3 characters
# 2. Is a palindrome.
# Return all the values that pass the conditionals to a list
palindromes = [w for w in words_as_list if len(w) >= 4 and w == w[::-1]]
# Convert the list to a dataframe so that we can draw a small sample.
palindrome_df = pd.DataFrame(palindromes, columns=["Palindromes"])
palindrome_df.sample(5)

Unnamed: 0,Palindromes
100,ululu
67,reifier
93,teet
86,solos
68,renner


Find all the anagrams using a small sample size to reduce processing time. <br>
Only return anagrams that have 3 or more word combinations.<br>
*Note*: Random word samples can give incomplete results.

In [43]:
# Convert the column into a list
words_as_list = word_df.word.sample(15000).tolist()
sorted_words = {}
for word in words_as_list:
    sorted_words[word] = ''.join(sorted(word))
anagrams = []
for i in range(len(words_as_list)):
    temp_anagram = [words_as_list[i]]
    for j in range(i + 1, len(words_as_list)):
        if sorted_words[words_as_list[i]] == sorted_words[words_as_list[j]]:
            temp_anagram.append(words_as_list[j])
    if len(temp_anagram) != 1:
        anagrams.append(temp_anagram)

for current in anagrams:
    if len(current) >= 3:
    	print(current)

['striate', 'artiste', 'attires']
['rache', 'chare', 'reach']


<br> Collect 7 letter words from a small sample and place them in their own DataFrame

In [44]:
# Convert column to a list using a small sample
words_as_list = word_df.word.sample(50).tolist()
temp_list = []
letters = {}
for word in words_as_list:
    if len(word) == 7:
        temp_list.append(word)        

word_sample_df = pd.DataFrame(temp_list, columns=['7 Letter Word'])
word_sample_df

Unnamed: 0,7 Letter Word
0,reefing
1,insults
2,infaust
3,palilia
4,beleper


What is the frequency of each letter in the '7 Letter Word' DataFrame above?

In [45]:
word_series = pd.Series(word_sample_df['7 Letter Word'])
list_of_letters = []
for single_word in word_series:
    split_word = list(single_word)
    for new_letter in split_word:
        list_of_letters.append(new_letter)
            
letters_df = pd.DataFrame(list_of_letters)
letters_df.columns = ['Letters']
letters_df['Count'] = np.zeros(len(letters_df))
letter_count = letters_df.groupby('Letters').count()
letter_count.sort_values('Count', ascending=False, inplace=True)
letter_count

Unnamed: 0_level_0,Count
Letters,Unnamed: 1_level_1
e,5
i,5
l,4
a,3
n,3
s,3
f,2
p,2
r,2
t,2


Collect 7 letter words from a small sample and place them in their own DataFrame <br>
From small 7 letter word sample DataFrame, calculate their Scrabble score and sort by value.


In [46]:

def scrabble_points(scrabble_words):
    word_points = {
        'a' : 1, 'e' : 1, 'i' : 1, 'o' : 1, 'u' : 1, 'l' : 1, 'n' : 1, 's' : 1,
        't' : 1, 'r' : 1, 'd' : 2, 'g' : 2, 'b' : 3, 'c' : 3, 'm' : 3, 'p' : 3,
        'f' : 4, 'h' : 4, 'v' : 4, 'w' : 4, 'y' : 4, 'k' : 5, 'j' : 8, 'x' : 8,
        'q' : 10, 'z' : 10,
    }

    results = {}
    
    for s_word in scrabble_words:
        score = 0
        for letter in list(s_word):
            score = score + word_points[letter]
            results[s_word] = score
            
    return results
 
words_as_list = word_df.word.sample(50).tolist()
word_results = []
wor = []
for word in words_as_list:
    if len(word) == 7:
        wor.append(word)

scrabble_results = scrabble_points(wor)
results_df = pd.DataFrame(scrabble_results.items(), columns=['Word', 'Scrabble Value'])
results_df.set_index('Word', inplace=True)
results_df.sort_values(by=['Scrabble Value'], inplace=True, ascending=False)

results_df

Unnamed: 0_level_0,Scrabble Value
Word,Unnamed: 1_level_1
exhumes,19
divided,13
midrash,13
fisetin,10
posting,10
caserne,9
alunite,7
