In [91]:
from bs4 import BeautifulSoup
import re
# Read the saved debate transcript and parse it into a BeautifulSoup tree.
# The context manager closes the file even if read() raises.
with open("transcript.html") as f:
    html_string = f.read()
html_tree = BeautifulSoup(html_string, 'html.parser')

def filter_html(tag):
    """Return True for ``<p>`` tags that should be kept for analysis.

    Written to be passed as the filter callable to ``find_all``.

    A tag is rejected when any of the following holds:

    * it is not a ``<p>`` tag;
    * it contains an ``<i>`` tag;
    * its text starts with an all-caps, parenthesised placeholder such as
      ``(APPLAUSE)`` or ``(LAUGHTER)``.

    Args:
        tag: An object exposing ``name``, ``i`` and ``text`` attributes.

    Returns:
        bool: True if the tag should be kept, False otherwise.
    """
    if tag.name != 'p' or tag.i:
        return False
    # Raw string: '\(' in a plain literal is an invalid escape sequence.
    # re.match anchors at the start only, so a placeholder prefix is enough
    # to reject the tag.
    return re.match(r'\([A-Z ]+\)', tag.text) is None
    
    
# Every tag inside <article> that filter_html accepts.
p_collection = html_tree.find('article').find_all(filter_html)

# We don't want to include the moderators in our table.
moderators = ['TAPPER', 'BASH']

# Matches "NAME: text" or "NAME (?): text".
# Group 2 is the speaker's name, group 4 is what they said.
tag_matcher = re.compile(r'^(([A-Z]+)( \(\?\))?): (.+)$')
# Characters stripped from each word before counting:
# . , ? ! " digits $ and -
word_cleaner = re.compile(r'[.,\?!"\d\$-]+')

last_match = None       # name of the most recent speaker seen so far
word_frequencies = {}   # speaker name -> {word: count}
for p in p_collection:
    # Tags with no single string to read are skipped.
    if p.string is None:
        continue
    tag_text = p.string.replace('\n', '')
    speaker_match = tag_matcher.match(tag_text)
    # Some new p tags continue the last speaker's thought and don't have a
    # name in front.
    if speaker_match:
        # A name in front means there is a new speaker.
        name = speaker_match.group(2)
        last_match = name
        dialog = speaker_match.group(4)
    else:
        # Otherwise, attribute the text to the last name we saw.
        name = last_match
        dialog = tag_text
    # Skip text that appears before any speaker has been identified (it would
    # otherwise be counted under a None key), and throw out moderator dialog.
    if name is None or name in moderators:
        continue
    # Count up words.
    counts = word_frequencies.setdefault(name, {})
    word_list = dialog.split(' ')
    for word in word_list:
        word = word_cleaner.sub('', word.lower())
        if not word:
            # Nothing left after stripping (e.g. a bare number or dash).
            continue
        counts[word] = counts.get(word, 0) + 1

print(word_frequencies)
        
    
    

{'BULLOCK': {'thanks': 2, 'dana': 3, 'i': 30, 'come': 1, 'from': 7, 'a': 48, 'state': 4, 'where': 7, 'lot': 1, 'of': 39, 'people': 9, 'voted': 2, 'for': 20, 'donald': 6, 'trump': 8, "let's": 5, 'not': 15, 'kid': 2, 'ourselves': 1, 'he': 1, 'will': 5, 'be': 11, 'hard': 1, 'to': 78, 'beat': 2, 'yet': 1, 'watching': 1, 'that': 53, 'last': 2, 'debate': 1, 'folks': 7, 'seemed': 1, 'more': 3, 'concerned': 1, 'about': 13, 'scoring': 1, 'points': 1, 'or': 8, 'outdoing': 1, 'each': 1, 'other': 1, 'with': 17, 'wishlist': 1, 'economics': 3, 'than': 2, 'making': 3, 'sure': 4, 'americans': 5, 'know': 10, 'we': 66, 'hear': 1, 'their': 5, 'voices': 1, 'and': 44, 'help': 1, 'lives': 3, 'look': 8, "i'm": 5, 'prochoice': 1, 'prounion': 1, 'populist': 1, 'democrat': 1, 'who': 3, 'won': 3, 'three': 2, 'elections': 4, 'in': 29, 'red': 1, 'by': 11, 'compromising': 1, 'our': 13, 'values': 2, 'but': 20, 'getting': 4, 'stuff': 2, 'done': 2, "that's": 9, 'how': 6, 'win': 8, 'back': 6, 'the': 81, 'places': 3, 'l

In the above cell, we parse the HTML of the debate page from the Washington Post and extract all `<p>` tags inside the `<article>` tag. We then throw away tags whose text is in all caps and surrounded by parentheses (e.g. '(APPLAUSE)' and '(LAUGHTER)'), because those don't correspond to any debater's speech and therefore don't add anything of value to our analysis.

After that, we loop through the text of the remaining `<p>` tags to get the name of the person speaking and the contents of what they said. In some cases this is easy: we set up a regular expression to match strings like `"BULLOCK: That's how I win. That's how we can take back the office."` and split them up into `'BULLOCK'` and `"That's how I win. That's how we can take back the office."`. However, there are cases where a candidate's speech is broken up into multiple `<p>` tags, e.g.
```html
<p>WARREN: No. It is my way of talking about I know how to fight and I know how to win. I took on giant banks, and I beat them. I took on Wall Street, and CEOs, and their lobbyists, and their lawyers, and I beat them. I took on a popular Republican incumbent senator, and I beat him.</p>

<p>I remember when people said Barack Obama couldn't get elected. Shoot, I remember when people said Donald Trump couldn't get elected. But here's where we are.</p>
```

In those cases, we need to keep track of the last name we saw, and if the regular expression doesn't pick up a name in the current `<p>`, assign that last name to the current text.

Once we have the name of the current speaker, we can discard what the moderators say, and then count up the words that the candidates say.

In [92]:
import operator
#sorted_x = sorted(x.items(), key=operator.itemgetter(1))
# For each candidate, build a list of (word, count) pairs ordered from the
# most-used word to the least-used one.
sorted_candidate_frequencies = {}
by_count = operator.itemgetter(1)
for candidate, freqs in word_frequencies.items():
    sorted_frequencies = list(freqs.items())
    # Stable in-place sort: words with equal counts keep their original order.
    sorted_frequencies.sort(key=by_count, reverse=True)
    sorted_candidate_frequencies[candidate] = sorted_frequencies

print(sorted_candidate_frequencies)
# f = open('common-words.txt')
# common_words = f.read().split('\n')
# for el in word_frequencies.items():
#     print(el)



{'BULLOCK': [('the', 81), ('to', 78), ('we', 66), ('that', 53), ('a', 48), ('and', 44), ('of', 39), ('i', 30), ('in', 29), ('have', 24), ('this', 23), ('it', 22), ('for', 20), ('but', 20), ('is', 20), ('with', 17), ('can', 17), ('are', 16), ('you', 16), ('not', 15), ('going', 15), ('actually', 14), ('about', 13), ('our', 13), ('if', 13), ('on', 12), ('do', 12), ('as', 12), ('be', 11), ('by', 11), ('get', 11), ('know', 10), ('at', 10), ('people', 9), ("that's", 9), ('president', 9), ('my', 9), ('what', 9), ('trump', 8), ('or', 8), ('look', 8), ('win', 8), ('now', 8), ('just', 8), ('so', 8), ('when', 8), ('there', 8), ('way', 8), ('think', 8), ("it's", 8), ('make', 8), ('from', 7), ('where', 7), ('folks', 7), ('those', 7), ('has', 7), ('no', 7), ('health', 7), ('us', 7), ('like', 7), ("don't", 7), ('too', 7), ('america', 7), ('want', 7), ('donald', 6), ('how', 6), ('back', 6), ('all', 6), ('care', 6), ('had', 6), ('many', 6), ('got', 6), ('part', 6), ('only', 6), ('far', 6), ('climate', 