# [#408: Word Cloud Data](https://www.interviewcake.com/question/python3/word-cloud?)

You want to build a word cloud, an infographic where the size of a word corresponds to how often it appears in the body of text.
To do this, you'll need data. Write code that takes a long string and builds its word cloud data in a dictionary, where the keys are words and the values are the number of times the words occurred.

Think about capitalized words. For example, look at these sentences:

```
'After beating the eggs, Dana read the next step:'
'Add milk and eggs, then add flour and sugar.'
```

What do we want to do with "After", "Dana", and "add"? In this example, your final dictionary should include one "Add" or "add" with a value of 2. Make reasonable (not necessarily perfect) decisions about cases like "After" and "Dana".

Assume the input will only contain words and standard punctuation.

You could make a reasonable argument to use regex in your solution. We won't, mainly because performance is difficult to measure and can get pretty bad.

In [29]:
def _record_word(words_dict: dict, new_word: str) -> None:
    """Add one occurrence of ``new_word`` to ``words_dict`` in place.

    If ``new_word`` starts with an uppercase letter and its all-lowercase
    form is already a key, that lowercase tally is folded into the
    capitalised key so the two spellings share one entry.
    """
    lowered = new_word.lower()
    if new_word[0].isupper() and lowered in words_dict:
        # Add to (rather than overwrite) whatever the capitalised key already
        # holds, so no earlier occurrences are lost in the merge.
        words_dict[new_word] = words_dict.get(new_word, 0) + words_dict.pop(lowered)
    words_dict[new_word] = words_dict.get(new_word, 0) + 1


def count_words_naive(string: str) -> dict:
    """Count how often each word occurs in ``string``.

    The text is scanned once, character by character. Spaces, newlines,
    hyphens, quotes, parentheses and common punctuation separate words;
    every other character is treated as part of a word.

    Capitalisation rules:
      * The first letter after the start of the text or after ``.``, ``?``
        or ``!`` is lowercased, so sentence-initial words match their
        lowercase form.
      * A word capitalised mid-sentence keeps its capital and absorbs any
        count already stored under its lowercase form. A lowercase
        occurrence seen *after* that is stored under the lowercase key
        until the capitalised form appears again.

    A possessive/contraction suffix ``'s`` (apostrophe directly after a word
    character, ``s`` directly before a separator or the end of the text) is
    dropped, so ``"Bill's"`` counts as ``"Bill"``.

    Args:
        string: The text to analyse. May be empty.

    Returns:
        A dict mapping each word to its number of occurrences.
    """
    sentence_terminators = {'.', '?', '!'}
    non_word_chars = sentence_terminators | {
        ' ', '-', '\n', ',', ':', ';', '"', "'", '(', ')',
    }
    words_dict = {}
    word = []
    new_sentence = True
    last_index = len(string) - 1

    for idx, char in enumerate(string):
        if char in non_word_chars:
            if char in sentence_terminators:
                new_sentence = True
            # A separator ends the word being built, if there is one.
            if word:
                _record_word(words_dict, ''.join(word))
                word = []
            continue

        if new_sentence:
            new_sentence = False
            char = char.lower()

        # Only treat "'s" as a suffix when it hangs off the end of a word;
        # a quoted word such as 'sugar' must keep its leading "s".
        is_possessive_s = (
            char == 's'
            and idx >= 2
            and string[idx - 1] == "'"
            and string[idx - 2] not in non_word_chars
            and (idx == last_index or string[idx + 1] in non_word_chars)
        )
        if is_possessive_s:
            continue

        word.append(char)

    # The text may end without a trailing separator.
    if word:
        _record_word(words_dict, ''.join(word))
    return words_dict


In [30]:
import unittest

# Private unittest setting; presumably raises the limit at which long values
# are shortened in assertion failure output -- TODO confirm against unittest.
unittest.util._MAX_LENGTH = 2000

# Sample sentences mapped to the word counts expected for each of them.
strings_counts_dict = {
    # Sentence-initial capital is lowercased; a mid-sentence name keeps its capital.
    'After beating the eggs, Dana read the next step:': {
        'after': 1,
        'beating': 1,
        'the': 2,
        'eggs': 1,
        'Dana': 1,
        'read': 1,
        'next': 1,
        'step': 1,
    },
    # "Add" and "add" are counted together.
    'Add milk and eggs, then add flour and sugar.': {
        'add': 2,
        'milk': 1,
        'and': 2,
        'eggs': 1,
        'then': 1,
        'flour': 1,
        'sugar': 1,
    },
    # A hyphen splits a compound into separate words.
    'Pseudo-sentence with a hyphenated word': {
        'pseudo': 1,
        'sentence': 1,
        'with': 1,
        'a': 1,
        'hyphenated': 1,
        'word': 1,
    },
    # Ellipsis, possessive 's, parentheses and a hyphenated proper noun.
    "We came, we saw, we conquered...then we ate Bill's (Mille-Feuille) cake.": {
        'we': 4,
        'came': 1,
        'saw': 1,
        'conquered': 1,
        'then': 1,
        'ate': 1,
        'Bill': 1,
        'Mille': 1,
        'Feuille': 1,
        'cake': 1,
    },
    # Lowercase "bill" as a common noun.
    'The bill came to five dollars.': {
        'the': 1,
        'bill': 1,
        'came': 1,
        'to': 1,
        'five': 1,
        'dollars': 1,
    },
}

class TestWordCloudData(unittest.TestCase):
    """Checks count_words_naive against the expectations in strings_counts_dict."""

    def test_count_words_naive(self):
        """Each sample sentence must produce exactly its expected word counts."""
        for string, answer in strings_counts_dict.items():
            # subTest reports every failing sentence individually (with the
            # sentence shown) instead of stopping at the first mismatch.
            with self.subTest(string=string):
                self.assertEqual(
                    count_words_naive(string),
                    answer
                )
        

# Run the tests in place: the placeholder argv keeps unittest from parsing the
# host process's command line, and exit=False stops it from calling sys.exit().
unittest.main(
    exit=False,
    verbosity=2,
    argv=[''],
)


test_count_words_naive (__main__.TestWordCloudData) ... ok

----------------------------------------------------------------------
Ran 1 test in 0.004s

OK


<unittest.main.TestProgram at 0x1035387f0>