# Word Counter

## Set up

In [1]:
import requests

In [2]:
url = "https://raw.githubusercontent.com/msaxton/18th-century-historians/main/gibbon/gibbon_decline_and_fall/gibbon_decline_volume1_chap01.txt"

In [3]:
response = requests.get(url)

In [4]:
text = response.text

In [5]:
text[:200]

'The extent and military force of the Roman empire, in the age of the Antonines Introduction\n IN the second century of the Christian era, the Empire of\n Rome comprehended the fairest part of the earth,'

## Frist Attempt

In [6]:
word_counts = {}

In [7]:
words = text.split()

In [8]:
for word in words:
    lower_word = word.lower()
    if lower_word in word_counts.keys():
        word_counts[lower_word] += 1
    else:
        word_counts[lower_word] = 1

In [9]:
sorted_word_counts = sorted(word_counts.items(), key=lambda x:x[1], reverse=True)

In [10]:
sorted_word_counts

[('the', 1150),
 ('of', 715),
 ('and', 377),
 ('to', 207),
 ('a', 156),
 ('was', 148),
 ('in', 119),
 ('by', 102),
 ('their', 88),
 ('were', 77),
 ('that', 75),
 ('as', 74),
 ('which', 72),
 ('from', 64),
 ('his', 61),
 ('roman', 54),
 ('with', 51),
 ('or', 51),
 ('it', 50),
 ('on', 45),
 ('they', 45),
 ('most', 38),
 ('had', 38),
 ('for', 38),
 ('but', 35),
 ('its', 30),
 ('an', 30),
 ('is', 29),
 ('we', 29),
 ('he', 28),
 ('more', 27),
 ('military', 23),
 ('than', 23),
 ('into', 23),
 ('at', 23),
 ('two', 21),
 ('every', 21),
 ('under', 20),
 ('be', 19),
 ('hundred', 17),
 ('those', 16),
 ('province', 16),
 ('this', 16),
 ('not', 16),
 ('may', 16),
 ('ancient', 15),
 ('first', 15),
 ('have', 15),
 ('arms', 15),
 ('country', 15),
 ('now', 14),
 ('between', 14),
 ('part', 13),
 ('all', 13),
 ('less', 13),
 ('thousand', 13),
 ('who', 13),
 ('when', 13),
 ('augustus', 12),
 ('might', 12),
 ('well', 12),
 ('only', 12),
 ('formed', 12),
 ('any', 12),
 ('very', 12),
 ('modern', 12),
 ('been

## Discussion

The first attempt at a word counter was a success, but the word count should not include stop words because they do not carry significant meaning.

## Solution 1
Remove stop words before counting

In [11]:
# create list of stop words
stop_words = ['the', 'of', 'and', 'to', 'a', 'was', 'in', 'by', 'their', 'were', 'that', 'as', 'which', 'from', 'his', 'with',
             'or', 'it', 'on','they', 'had', 'for', 'but', 'its', 'an', 'is', 'we', 'he', 'than', 'into', 'at', 'be', 'those',
             'this','not', 'who', 'when', 'any', 'very', 'been', 'within', 'nor', 'are', 'these', 'so', 'would', 'which,',
             'without', 'either', 'after', 'about', 'among', 'during', 'before', 'yet',]

In [12]:
# create list of words without stop words
words_no_stops = []
for word in words:  # using original list of words
    word_lower = word.lower()  # we still need to 'normailize' the text by putting everything in lower case
    if word_lower not in stop_words:
        words_no_stops.append(word_lower)
    

In [13]:
word_counts2 = {}

In [14]:
for word in words_no_stops:
    word_lower = word.lower()
    if word_lower in word_counts2.keys():
        word_counts2[word_lower] += 1
    else:
        word_counts2[word_lower] = 1

In [15]:
sorted_word_counts2 = sorted(word_counts2.items(), key=lambda x:x[1], reverse=True)

In [16]:
sorted_word_counts2

[('roman', 54),
 ('most', 38),
 ('more', 27),
 ('military', 23),
 ('two', 21),
 ('every', 21),
 ('under', 20),
 ('hundred', 17),
 ('province', 16),
 ('may', 16),
 ('ancient', 15),
 ('first', 15),
 ('have', 15),
 ('arms', 15),
 ('country', 15),
 ('now', 14),
 ('between', 14),
 ('part', 13),
 ('all', 13),
 ('less', 13),
 ('thousand', 13),
 ('augustus', 12),
 ('might', 12),
 ('well', 12),
 ('only', 12),
 ('formed', 12),
 ('modern', 12),
 ('soon', 11),
 ('many', 11),
 ('hadrian', 11),
 ('name', 11),
 ('empire', 10),
 ('other', 10),
 ('long', 10),
 ('strength', 10),
 ('emperors', 9),
 ('legions', 9),
 ('considered', 9),
 ('legions,', 9),
 ('divided', 9),
 ('antoninus', 9),
 ('over', 9),
 ('own', 9),
 ('above', 9),
 ('three', 9),
 ('extent', 8),
 ('ever', 8),
 ('still', 8),
 ('whole', 8),
 ('barbarians.', 8),
 ('scarcely', 8),
 ('valour', 8),
 ('one', 8),
 ('almost', 8),
 ('such', 8),
 ('received', 8),
 ('emperor', 8),
 ('trajan', 8),
 ('danube,', 8),
 ('provinces', 8),
 ('troops', 8),
 ('pu

## Solution 2
Remove stop words while counting

In [17]:
word_counts3 = {}

In [18]:
for word in words:  # using original list of words
    word_lower = word.lower()
    if word_lower not in stop_words:
        if word_lower in word_counts3.keys():
            word_counts3[word_lower] += 1
        else:
            word_counts3[word_lower] = 1

In [19]:
sorted_word_counts3 = sorted(word_counts3.items(), key=lambda x:x[1], reverse=True)

## Remove punctuation

You may have noticed that punctuation marks are messing up the word counts. So, those need to be removed. We will later learn how to remove them more efficently, but we can do so with some tools we have aready learned.

In [20]:
for word in words:
    if word.endswith('.'):
        word = word.replace('.', '')
    elif word.endswith(','):
        word = word.replace(',', '')
    else:
        continue

In [21]:
word_counts4 = {}

In [22]:
for word in words:
    word_lower = word.lower()
    if word_lower not in stop_words:
        if word_lower in word_counts4.keys():
            word_counts4[word_lower] += 1
        else:
            word_counts4[word_lower] = 1

In [23]:
sorted_word_counts4 = sorted(word_counts4.items(), key=lambda x:x[1], reverse=True)

In [24]:
sorted_word_counts4

[('roman', 54),
 ('most', 38),
 ('more', 27),
 ('military', 23),
 ('two', 21),
 ('every', 21),
 ('under', 20),
 ('hundred', 17),
 ('province', 16),
 ('may', 16),
 ('ancient', 15),
 ('first', 15),
 ('have', 15),
 ('arms', 15),
 ('country', 15),
 ('now', 14),
 ('between', 14),
 ('part', 13),
 ('all', 13),
 ('less', 13),
 ('thousand', 13),
 ('augustus', 12),
 ('might', 12),
 ('well', 12),
 ('only', 12),
 ('formed', 12),
 ('modern', 12),
 ('soon', 11),
 ('many', 11),
 ('hadrian', 11),
 ('name', 11),
 ('empire', 10),
 ('other', 10),
 ('long', 10),
 ('strength', 10),
 ('emperors', 9),
 ('legions', 9),
 ('considered', 9),
 ('legions,', 9),
 ('divided', 9),
 ('antoninus', 9),
 ('over', 9),
 ('own', 9),
 ('above', 9),
 ('three', 9),
 ('extent', 8),
 ('ever', 8),
 ('still', 8),
 ('whole', 8),
 ('barbarians.', 8),
 ('scarcely', 8),
 ('valour', 8),
 ('one', 8),
 ('almost', 8),
 ('such', 8),
 ('received', 8),
 ('emperor', 8),
 ('trajan', 8),
 ('danube,', 8),
 ('provinces', 8),
 ('troops', 8),
 ('pu

## More calculation
As pointed out in class, it might be useful to know what percentage of our text a give word occupies. We can do that with a small calculation and restructuring our dictinary

In [25]:
for k, v in word_counts4.items():
    percentage = v / len(words)
    word_counts4[k] = {'count': v, 'percentage': percentage}

In [26]:
word_counts4

{'extent': {'count': 8, 'percentage': 0.0008648648648648649},
 'military': {'count': 23, 'percentage': 0.0024864864864864865},
 'force': {'count': 6, 'percentage': 0.0006486486486486486},
 'roman': {'count': 54, 'percentage': 0.0058378378378378375},
 'empire,': {'count': 6, 'percentage': 0.0006486486486486486},
 'age': {'count': 5, 'percentage': 0.0005405405405405405},
 'antonines': {'count': 2, 'percentage': 0.00021621621621621621},
 'introduction': {'count': 1, 'percentage': 0.00010810810810810811},
 'second': {'count': 3, 'percentage': 0.0003243243243243243},
 'century': {'count': 2, 'percentage': 0.00021621621621621621},
 'christian': {'count': 3, 'percentage': 0.0003243243243243243},
 'era,': {'count': 2, 'percentage': 0.00021621621621621621},
 'empire': {'count': 10, 'percentage': 0.001081081081081081},
 'rome': {'count': 6, 'percentage': 0.0006486486486486486},
 'comprehended': {'count': 6, 'percentage': 0.0006486486486486486},
 'fairest': {'count': 2, 'percentage': 0.0002162162

In [27]:
sorted_word_counts4 = sorted(word_counts4.items(), key=lambda x:x[1]['count'], reverse=True)

In [28]:
sorted_word_counts4

[('roman', {'count': 54, 'percentage': 0.0058378378378378375}),
 ('most', {'count': 38, 'percentage': 0.0041081081081081085}),
 ('more', {'count': 27, 'percentage': 0.0029189189189189188}),
 ('military', {'count': 23, 'percentage': 0.0024864864864864865}),
 ('two', {'count': 21, 'percentage': 0.00227027027027027}),
 ('every', {'count': 21, 'percentage': 0.00227027027027027}),
 ('under', {'count': 20, 'percentage': 0.002162162162162162}),
 ('hundred', {'count': 17, 'percentage': 0.0018378378378378379}),
 ('province', {'count': 16, 'percentage': 0.0017297297297297297}),
 ('may', {'count': 16, 'percentage': 0.0017297297297297297}),
 ('ancient', {'count': 15, 'percentage': 0.0016216216216216215}),
 ('first', {'count': 15, 'percentage': 0.0016216216216216215}),
 ('have', {'count': 15, 'percentage': 0.0016216216216216215}),
 ('arms', {'count': 15, 'percentage': 0.0016216216216216215}),
 ('country', {'count': 15, 'percentage': 0.0016216216216216215}),
 ('now', {'count': 14, 'percentage': 0.00