In [2]:
# Clean unicode and double nested arrays
import unicodedata
import itertools

# Handle Data
import pandas as pd
import numpy as np
from collections import OrderedDict 
from collections import Counter

# CMU dictionary
from nltk.corpus import cmudict

# Time and Progress Bar
from tqdm import tqdm
import time

# Regular Expressions
import re

from pprint import pprint

# Parallel Processing
from joblib import Parallel, delayed
import math
from spacy.util import minibatch
from functools import partial

# Web scaping
import requests
from bs4 import BeautifulSoup
import pickle

# Handle numbers
from text_to_num import text2num
from text_to_num import alpha2digit
from num2words import num2words

# Spacy
import spacy
from spacy.matcher import Matcher
from spacy.lang.en import English

# Custom Tokenizer
import re
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.lang.de.punctuation import _quotes
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex

ModuleNotFoundError: No module named 'text_to_num'

# 1. Preprocessing Functions

## 1.1 Custom Tokenizer <a id='1_1_id'></a>

In [2]:
# https://stackoverflow.com/questions/57295996/is-it-possible-to-change-the-token-split-rules-for-a-spacy-tokenizer

# Custom tokenizer to not split on hyphens
def custom_tokenizer(nlp):
    infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
            r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
            r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            r"(?<=[{a}])([{q}\]\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
            r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
            r"(?<=[0-9])-(?=[0-9])",
        ]
    )

    infix_re = compile_infix_regex(infixes)
    
    updated_tokenizer =  Tokenizer(nlp.vocab, prefix_search=nlp.tokenizer.prefix_search,
                                suffix_search=nlp.tokenizer.suffix_search,
                                infix_finditer=infix_re.finditer,
                                token_match=nlp.tokenizer.token_match,
                                rules=nlp.Defaults.tokenizer_exceptions)
    
    return updated_tokenizer

## 1.2 Matchers <a id='1_3_id'></a>

In [3]:
def pattern_merger(doc):
    """ 
        This will be called on the Doc object in the pipeline 
    """
    matched_spans = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        matched_spans.append(span)
    for span in matched_spans:  # merge into one token after collecting all matches
        span.merge()
    return doc

# 1.3 Pipelines

In [4]:
def remove_excess_spaces_component(doc):
    """ Removes all excess spaces to a single space """
    filtered_text = ' '.join([token.text for token in doc if not token.is_space])
    return pattern_merger(nlp.make_doc(filtered_text))

# 2. Helper Functions

## 2.1 General Functions <a id='2_1_id'></a>

In [5]:
def update_dict(dictionary, before_after_values):
    """ 
        Given a list of tuples (before, after), updates dictionary with 'before' as key and 'after' as value 
        if the pair doesn't already exist in the dictionary and returns it
    """
    
    for before, after in before_after_values:
        if before not in dictionary:
            dictionary[before] = after
    return dictionary

In [6]:
def is_valid_hyphen_word(word):
    """
        Returns True if each word in the hyphenated token is a valid word
    """
    word_tokens = word.split('-')
    words = [is_word(token) is not False for token in word_tokens]
    return sum(words) == len(words)

In [7]:
def is_all_words(token):
    """
        Returns True if all tokens are valid words
    """
    tokens = token.text.split()
    return sum([token.isalnum() for token in tokens]) == len(tokens)


In [8]:
def is_word(word):
    """
        Verifies the word is valid if it exists in merriam-wester's database
    """
    
    url = 'https://www.merriam-webster.com/dictionary/' + word
    
    while True:
        try:
            page = requests.get(url).text
            soup = BeautifulSoup(page, 'lxml')
            try:
                if soup.find_all("div", {"id": "definition-wrapper"})[0].find('h1', {'class': 'hword'}):
                    return soup.find_all("div", {"id": "definition-wrapper"})[0].find('h1', {'class': 'hword'}).text
                elif soup.find_all("div", {"id": "definition-wrapper"})[0].find('h1', {'class': 'mispelled-word'}):
                    return False
            except:
                return False
        except:
            # If there is connection failure, wait 2 seconds then retry
            time.sleep(2)
            continue
        break

In [9]:
def preview_summaries(president):
    """ 
        Prints  all summaries for a given president 
    """
    df = speeches[speeches.President == president].Summary.reset_index()
    for row in df.itertuples(index=False):
        index = row[0]
        summary = row[1]
        if len(summary) > 0:
            print('Index: {}\n{}\n'.format(index, summary))

In [10]:
def get_presidents(ret = False):
    """ 
        Returns list of presidents in alphabetical order 
    """
    
    presidents = sorted(list(set(speeches.President.values)))
    if ret:
        return presidents
    else:
        for i, president in enumerate(presidents):
            print('{}: {}'.format(i, president))
        return presidents

In [11]:
def clean_removable_speaker_text(speaker, speakers):
    """
        Given a speaker and a list of all speakers, remove all texts from the speaker f
    """
    texts = []
    
    pattern = '{}[^:]+'.format(speaker)
    patterns = find_pattern(speeches['Transcript'], pattern, 50, -1, verbose = False)
    
    for pattern in patterns:
        pattern_match = []
        for speaker in speakers:
            speaker = speaker.rstrip(':')
            
            # End of pattern matches a speaker, 
            if speaker == pattern[-len(speaker):]:
                pattern_match += [pattern[:-len(speaker)]]
        if not pattern_match:
            texts += [pattern]
        else:
            texts += pattern_match
    return texts

In [12]:
def text_surrounding_speakers(speakers, single_multiple = 'multiple'):
    """
        Returns all surrounding text around a given speaker
    """
    if single_multiple == 'single':
        speaker = speakers
        print('{}\n| {} |\n{}\n'.format((len(speaker) + 6) * '-', speaker, (len(speaker) + 6) * '-'))

        pattern = '.{,30}' + speaker + '.{,30}'
        pprint(find_pattern(speeches['Transcript'], pattern, 50, -1)[:5])
        print()
    elif single_multiple == 'multiple':
        for i, speaker in enumerate(speakers):
            print('{}\n| {} {} |\n{}\n'.format((len(speaker) + (6 if i < 10 else 7)) * '-', i, speaker, (len(speaker) + (6 if i < 10 else 7)) * '-'))

            pattern = '.{,30}' + speaker + '.{,30}'
            pprint(find_pattern(speeches['Transcript'], pattern, 50, -1)[:5])
            print()

In [13]:
def is_number(text):
    """ Returns boolean on whether token is a number or not """
    try:
        return text2num(text.lower(), "en")
    except:
        return None
    

In [14]:
def is_ordinal(token):
    """ Returns boolean indicating if token is ordinal """
    try:
        ordinal_number = alpha2digit(token, "en")
        ordinal_number_stripped = re.match('[0-9]+', ordinal_number)[0]
        return token == num2words(int(ordinal_number_stripped), to='ordinal')
    except:
        return False

In [15]:
def is_compound(token, p1, p2, select):
    """
        Returns specified pos/tag for each token of the specified format
        
        p: part of speech
        t: tag
    """
    
    if select == 'pp':
        if token[0].pos_ == p1 and token[1].pos_ == p2:
            return True
    if select == 'pt':
        if token[0].pos_ == p1 and token[1].tag_ == p2:
            return True
    if select == 'tp':
        if token[0].tag_ == p1 and token[1].pos_ == p2:
            return True
    if select == 'tt':
        if token[0].tag_ == p1 and token[1].tag_ == p2:
            return True

In [16]:
def clean_transcript(doc):
    """
    Given a doc, replaces all quotes with the correct double quotes token. Then merges the tokens in sentences
    with the correct formatting and correct puncuation placement. Finally removes all unncessary text in 
    parenthese 
    """
    
    # Replace quotation tokens
    transcript_quotes = []
    left_quote = True
    for token in doc:
        if token.text == '"':
            if left_quote:
                transcript_quotes += ['“']
                left_quote = False
            else:
                transcript_quotes += ['”']
                left_quote = True
        else:
            transcript_quotes += [token.text]  
    
    transcript_quotes = nlp(' '.join(transcript_quotes))
    
    punctuation_marks = ['.', ',','?','!',':',';', ',']
    
    # Merge tokens into correct placement
    transcript_punctuation = ""
    for i, token in enumerate(transcript_quotes):
        if i < len(transcript_quotes) - 1:
            if transcript_quotes[i+1].text in punctuation_marks:
                transcript_punctuation += token.text
            else:
                transcript_punctuation += token.text + ' '
    transcript_punctuation
    
    # Merge sentences using quotation boundaries 
    new_transcript = transcript_punctuation.replace(' ..', '.')
    new_transcript = new_transcript.split(' “ ')
    new_transcript = ' “'.join(new_transcript)
    new_transcript = new_transcript.split(' ” ')
    new_transcript = '” '.join(new_transcript)
    
    # Removes all unncessary text in  parenthese 
    new_transcript = new_transcript.replace('(Applause.)', '')
    new_transcript = new_transcript.replace('(Laughs.)', '')
    new_transcript = new_transcript.replace('(Laughter.)', '')
    new_transcript = new_transcript.replace('(LAUGHTER)', '')
    new_transcript = new_transcript.replace('(APPLAUSE)', '')
    new_transcript = new_transcript.replace('(laughter.)', '')
    new_transcript = new_transcript.replace('(TRANSLATION) ', '')
    new_transcript = re.sub(' -{2,}', '', new_transcript)

    return new_transcript

In [17]:
def clean_text_tokens(text, clean_dict):
    """
     Given a dictionary of tokens before and after, cleans the given text
    """
    for token in clean_dict:
        if token in text:
            text = text.replace(token, clean_dict[token])
            
    return text.strip()

## 2.2 Parallel Processing Functions <a id='2_3_id'></a>

### 2.2.1 Calling Functions <a id='2_3_id'></a>

In [18]:
def find_pattern(texts, pattern, batch_size, n_jobs, verbose = False):
    """
        Finds all pattern matches in the given texts using parallel processing
    """
    num_of_texts = len(texts)
    num_of_batches = math.ceil(num_of_texts/batch_size) 
    
    # Processing Texts
    mini_batch_pattern('find_pattern', texts, pattern, num_of_texts, batch_size, num_of_batches, n_jobs, verbose)
    
    path = 'pickles/batch_functions/find_pattern/find_pattern'
    return pickles_to_list(path, num_of_batches)

In [19]:
def get_transcript_tokens(texts, batch_size, n_jobs, verbose = False):
    """
        Finds all tokens in the given texts that are not words or numbers
    """
    
    num_of_texts = len(texts)
    num_of_batches = math.ceil(num_of_texts/batch_size) 
    
    # Processing Texts
    mini_batch('transcript_tokens', texts, num_of_texts, batch_size, num_of_batches, n_jobs, verbose)
    
    path = 'pickles/batch_functions/transcript_tokens/transcript_tokens'
    return pickles_to_list(path, num_of_batches)

### 2.2.2 Parallel Processing Functions <a id='2_3_id'></a>

In [20]:
def mini_batch(method, texts, num_of_texts, batch_size, num_of_batches, n_jobs, verbose):
    """
        Given the function name and texts, executes the specified function using parallel processing
    """
    
    partitions = minibatch(texts, size=batch_size)
    executor = Parallel(n_jobs=n_jobs, backend="multiprocessing", prefer="processes")
    
    path = 'pickles/batch_functions/{}/{}'.format(method, method)
    
    
    if method == 'combine_transcripts':
        do = delayed(partial(combine_transcripts_batch, num_of_texts, num_of_batches, path, verbose))
    if method == 'transcript_tokens':
        do = delayed(partial(transcript_tokens_batch, num_of_texts, num_of_batches, path, verbose))
        
    tasks = (do(i, batch) for i, batch in enumerate(partitions))
    executor(tasks)

In [21]:
def mini_batch_pattern(method, texts, pattern, num_of_texts, batch_size, num_of_batches, n_jobs, verbose):
    """
        Executes find_pattern using parallel processing
    """
    
    partitions = minibatch(texts, size=batch_size)
    executor = Parallel(n_jobs=n_jobs, backend="multiprocessing", prefer="processes")
    
    path = 'pickles/batch_functions/{}/{}'.format(method, method)
    
    if method == 'find_pattern':
        do = delayed(partial(find_pattern_batch, pattern, num_of_texts, num_of_batches, path, verbose))
        
    tasks = (do(i, batch) for i, batch in enumerate(partitions))
    executor(tasks)

### 2.2.3 Called Functions <a id='2_3_id'></a>

In [22]:
def find_pattern_batch(pattern, num_of_texts, num_of_batches, path, verbose, batch_id, texts):
    if verbose:
        print("Processing batch {}/{}".format(batch_id + 1, num_of_batches))
    
    pattern_matches = []
    for text in texts:
        matches = re.finditer(pattern, text)
        for match in matches:
            pattern_matches += [match[0]]
    output_path = '{}_{}.p'.format(path, batch_id + 1)
    pickle.dump(pattern_matches, open(output_path, "wb" ))

In [23]:
def combine_transcripts_batch(num_of_texts, num_of_batches, path, verbose, batch_id, texts):
    if verbose:
        print("Processing batch {}/{}".format(batch_id + 1, num_of_batches))
    
    transcripts = []
    for doc in nlp.pipe(texts):
        transcripts += [doc.text]        
    output_path = '{}_{}.p'.format(path, batch_id + 1)
    pickle.dump(transcripts, open(output_path, "wb" ))  

In [24]:
def transcript_tokens_batch(num_of_texts, num_of_batches, path, verbose, batch_id, texts):
    if verbose:
        print("Processing batch {}/{}".format(batch_id + 1, num_of_batches))
        
    docs = list(nlp.pipe(texts))
    tokens = [[token.text for token in doc if not token.is_alpha and not token.like_num and not token.is_punct and not token.is_space] for doc in docs]
    tokens = list(set(itertools.chain.from_iterable(tokens)))
    
    output_path = '{}_{}.p'.format(path, batch_id + 1)
    pickle.dump(tokens, open(output_path, "wb" ))  

### 2.4 Load Pickles Functions <a id='2_3_id'></a>

In [25]:
def pickles_to_list(path, num_of_batches):
    pickle_list = []
    for batch in range(1, num_of_batches + 1):
        batch_path = '{}_{}.p'.format(path, batch)
        pickle_list += pickle.load(open(batch_path, 'rb'))
    return list(set(pickle_list))

# 3. Setup

## 3.1 Load Data <a id='3_1_id'></a>

In [26]:
# Load speeches
speeches = pickle.load(open('pickles/webscrape/raw_speech_data.p', 'rb'))

# Examine speeches
speeches

Unnamed: 0,Date,President,Speech Title,Summary,Transcript,URL
0,"September 25, 2019",Donald Trump Presidency,"September 25, 2019: Press Conference",President Donald Trump holds a press conferenc...,\nTranscript\nPRESIDENT TRUMP: Thank you very...,https://millercenter.org/the-presidency/presid...
1,"September 24, 2019",Donald Trump Presidency,"September 24, 2019: Remarks at the United Nati...",President Donald Trump speaks to the 74th sess...,\nTranscript\nPRESIDENT TRUMP: Thank you very...,https://millercenter.org/the-presidency/presid...
2,"February 15, 2019",Donald Trump Presidency,"February 15, 2019: Speech Declaring a National...",President Donald Trump declares a national eme...,\nTranscript\nTHE PRESIDENT: Thank you very m...,https://millercenter.org/the-presidency/presid...
3,"February 05, 2019",Donald Trump Presidency,"February 5, 2019: State of the Union Address","In his second State of the Union Address, Pres...","\nTranscript\n\nMadam Speaker, Mr. Vice Presid...",https://millercenter.org/the-presidency/presid...
4,"January 19, 2019",Donald Trump Presidency,"January 19, 2019: Remarks about the US Souther...",President Donald Trump speaks about what he se...,\nTranscript\nTHE PRESIDENT: Just a short time...,https://millercenter.org/the-presidency/presid...
...,...,...,...,...,...,...
987,"December 29, 1790",George Washington Presidency,"December 29, 1790: Talk to the Chiefs and Coun...",The President reassures the Seneca Nation that...,Transcript I the President of the United State...,https://millercenter.org/the-presidency/presid...
988,"December 08, 1790",George Washington Presidency,"December 8, 1790: Second Annual Message to Con...",Washington focuses on commerce in his second a...,Transcript Fellow citizens of the Senate and H...,https://millercenter.org/the-presidency/presid...
989,"January 08, 1790",George Washington Presidency,"January 8, 1790: First Annual Message to Congress","In a wide-ranging speech, President Washington...",TranscriptFellow Citizens of the Senate and Ho...,https://millercenter.org/the-presidency/presid...
990,"October 03, 1789",George Washington Presidency,"October 3, 1789: Thanksgiving Proclamation","At the request of Congress, Washington establi...",TranscriptWhereas it is the duty of all Nation...,https://millercenter.org/the-presidency/presid...


## 3.2 Initialize NLP Model <a id='3_2_id'></a>

In [27]:
# Small English Model
nlp = spacy.load("en_core_web_sm")

# Set tokenizer to the custom tokenizer
nlp.tokenizer = custom_tokenizer(nlp)

# Pipeline
nlp.add_pipe(pattern_merger, first=True)  # add it right after the tokenizer

# Tokens to be cleaned before and after mappings
clean_dict = {}

## 3.3 Initialize Matcher <a id='3_3_id'></a>

In [28]:
# Matcher
matcher = Matcher(nlp.vocab)

# Pattern to match possesion
matcher.add('match_conj_apos_s', None, [{'IS_ALPHA': True}, {'TEXT': '\'s'}])
matcher.add('match_conj_not', None, [{'IS_ALPHA': True}, {'TEXT': 'n\'t'}])
matcher.add('match_conj_apos_d', None, [{'IS_ALPHA': True}, {'TEXT': '\'d'}])
matcher.add('match_conj_apos_ll', None, [{'IS_ALPHA': True}, {'TEXT': '\'ll'}])
matcher.add('match_conj_apos_ll', None, [{'IS_ALPHA': True}, {'TEXT': '\'m'}])
matcher.add('match_conj_apos_re', None, [{'IS_ALPHA': True}, {'TEXT': '\'re'}])
matcher.add('match_conj_apos_ve', None, [{'IS_ALPHA': True}, {'TEXT': '\'ve'}])

 # 4. Cleaning <a id='6_1_id'></a>

## 4.1. Clean Dates <a id='4_1_id'></a>

In [29]:
# Convert Date type into datetime
speeches['Date'] = pd.to_datetime(speeches['Date']) 

## 4.2. Clean President Names <a id='4_2_id'></a>

In [30]:
# Remove 'Presidency' from each president's name
speeches['President'] = [' '.join(president.split()[:-1]) for president in speeches['President']]

## 4.3. Clean Speech Titles <a id='4_3_id'></a>

In [31]:
# Remove date from each speech title
speeches['Speech Title'] = speeches['Speech Title'].apply(lambda x: x.split(':')[-1].strip())

## 4.4. Clean Summaries <a id='4_4_id'></a>

### 4.4.1 Normalize Unicode <a id='4_4_1_id'></a>

In [32]:
speeches['Summary'] = speeches['Summary'].apply(lambda x: unicodedata.normalize('NFKD', x))

### 4.4.2 Initial preprocessing <a id='4_4_2_id'></a>

In [33]:
# Clean newlines and different apostrophe symbols
speeches['Summary'] = speeches['Summary'].apply(lambda x: x.replace('\n', ' '))
speeches['Summary'] = speeches['Summary'].apply(lambda x: x.replace('’', '\''))
speeches['Summary'] = speeches['Summary'].apply(lambda x: x.strip())

### 4.4.3 Remove unnecessary leading and trailing text from summaries <a id='4_4_3_id'></a>

In [34]:
# List of all presidents
presidents = get_presidents()

0: Abraham Lincoln
1: Andrew Jackson
2: Andrew Johnson
3: Barack Obama
4: Benjamin Harrison
5: Bill Clinton
6: Calvin Coolidge
7: Chester A. Arthur
8: Donald Trump
9: Dwight D. Eisenhower
10: Franklin D. Roosevelt
11: Franklin Pierce
12: George H. W. Bush
13: George W. Bush
14: George Washington
15: Gerald Ford
16: Grover Cleveland
17: Harry S. Truman
18: Herbert Hoover
19: James A. Garfield
20: James Buchanan
21: James K. Polk
22: James Madison
23: James Monroe
24: Jimmy Carter
25: John Adams
26: John F. Kennedy
27: John Quincy Adams
28: John Tyler
29: Lyndon B. Johnson
30: Martin Van Buren
31: Millard Fillmore
32: Richard M. Nixon
33: Ronald Reagan
34: Rutherford B. Hayes
35: Theodore Roosevelt
36: Thomas Jefferson
37: Ulysses S. Grant
38: Warren G. Harding
39: William Harrison
40: William McKinley
41: William Taft
42: Woodrow Wilson
43: Zachary Taylor


#### Preview summaries for any unnecessary text at the beginning and end of each summary 

In [35]:
# Preview all summaries for a given president
preview_summaries(presidents[38])

Index: 522
Victor Recording #35718-B (4:40) On November 12,1921, President Harding convened in Washington the most significant arms limitation conference of the inter-war era. Represented were the major political and military powers including Belgium, China, Great Britain, Italy, France, the Netherlands, Portugal, and Japan. Mindful of Wilson's ill-fated experience and inability to gain consent for the Treaty of Versailles and the League of Nations by the Senate, President Harding elected to welcome and challenge the invited diplomats, and then delegate the negotiations to Secretary of State Charles Evans Hughes. He also ensured that the U.S. delegation contained important Congressional Republicans including Senator Henry Cabot Lodge. After the formalities of the negotiations were completed, leaving nothing to chance, Harding spoke in person to the Senate on February 10, 1922 and submitted the treaties for consent. He noted that he was respectful of the Senate's part in contracting for

#### Remove unncessary text

In [36]:
# Obama
speeches.loc[38, 'Summary'] = ' '.join(speeches['Summary'].loc[38].split()[:-10])
speeches.loc[39, 'Summary'] = ' '.join(speeches['Summary'].loc[39].split()[:-10])

#Eisenhower
speeches.loc[408, 'Summary'] = ' '.join(speeches['Summary'].loc[408].split()[:-10])

# Clinton
speeches.loc[117, 'Summary'] = ' '.join(speeches['Summary'].loc[117].split()[11:])

# Kennedy
speeches.loc[376, 'Summary'] = ' '.join(speeches['Summary'].loc[376].split()[:-67])
speeches.loc[377, 'Summary'] = ' '.join(speeches['Summary'].loc[377].split()[:-67])
speeches.loc[378, 'Summary'] = ' '.join(speeches['Summary'].loc[378].split()[:-67])
speeches.loc[380, 'Summary'] = ' '.join(speeches['Summary'].loc[380].split()[:-31])
speeches.loc[381, 'Summary'] = ' '.join(speeches['Summary'].loc[381].split()[:-67])
speeches.loc[384, 'Summary'] = ' '.join(speeches['Summary'].loc[384].split()[:-67])
speeches.loc[385, 'Summary'] = ' '.join(speeches['Summary'].loc[385].split()[:-31])
speeches.loc[387, 'Summary'] = ' '.join(speeches['Summary'].loc[387].split()[:-53])
speeches.loc[388, 'Summary'] = ' '.join(speeches['Summary'].loc[388].split()[:-67])
speeches.loc[393, 'Summary'] = ' '.join(speeches['Summary'].loc[393].split()[:-31])
speeches.loc[394, 'Summary'] = ' '.join(speeches['Summary'].loc[394].split()[:-31])

# Harding
speeches.loc[522, 'Summary'] = ' '.join(speeches['Summary'].loc[522].split()[4:])
speeches.loc[523, 'Summary'] = ' '.join(speeches['Summary'].iloc[523].split()[5:])
speeches.loc[528, 'Summary'] = ' '.join(speeches['Summary'].iloc[528].split()[4:])
speeches.loc[529, 'Summary'] = ' '.join(speeches['Summary'].iloc[529].split()[4:])
speeches.loc[530, 'Summary'] = ' '.join(speeches['Summary'].iloc[530].split()[6:])
speeches.loc[535, 'Summary'] = ' '.join(speeches['Summary'].iloc[535].split()[6:])
speeches.loc[536, 'Summary'] = ' '.join(speeches['Summary'].loc[536].split()[7:-28])
speeches.loc[538, 'Summary'] = ' '.join(speeches['Summary'].loc[538].split()[6:-6])
speeches.loc[545, 'Summary'] = ' '.join(speeches['Summary'].loc[545].split()[:-6])


### 4.4.5 Additional Cleaning <a id='4_4_5_id'></a>

In [37]:
# Get all tokens that are not alphanumeric, numbers, punctuation, or spaces
docs = list(nlp.pipe(speeches.Summary, batch_size = 50, n_process=16))
tokens = [[token.text for token in doc if not token.is_alpha and not token.like_num and not token.is_punct and not token.is_space] for doc in docs]

# Convert lists of lists into a single list
tokens = list(set(itertools.chain.from_iterable(tokens)))
tokens

['Waldorf-Astoria',
 'newly-trained',
 'N.Y.',
 "Lincoln's",
 'ideas—justice',
 'world"--to',
 'nation-wide',
 'pork-barrel',
 'two-hundredth',
 '80th',
 'Asia-first',
 'D.C',
 "Administration's",
 "Grant's",
 'anti-Chinese',
 'W.',
 'and/or',
 'clean-energy',
 'D.C.',
 'cabinet-level',
 'communism—which',
 "Committee's",
 "Revolution's",
 "MacArthur's",
 'non-belligerent',
 'J.P.',
 "Decatur's",
 'future—is',
 'economy—energy',
 'non-communist',
 "France's",
 '20th',
 "world's",
 'post-World',
 'self-made',
 'it—the',
 "Laden's",
 'anti-establishment',
 "America's",
 "Clinton's",
 "Europe's",
 'Kai-shek',
 'gender-based',
 "Institute's",
 "Hoover's",
 'post-Cold',
 'cash-and-carry',
 "act's",
 "speech's",
 'policy—all',
 '72nd',
 '11:32',
 'Webster-Ashburton',
 'still-young',
 'Hay-Buneau-Varilla',
 "Paul's",
 'AFL-CIO',
 'Mexican-American',
 'full-time',
 'de-emphasizes',
 "Japan's",
 "Africa's",
 '73rd',
 'U.S.S.',
 "government's",
 'war-stricken',
 '48-hour',
 "Bush's",
 '25th',
 '

> ##### 1. Hyphens <a id='4_4_5_hyphens_id'></a>

In [38]:
# Examine all tokens with hyphens 
hyphen_tokens = [token for token in tokens if '-' in token]
hyphen_tokens

['Waldorf-Astoria',
 'newly-trained',
 'world"--to',
 'nation-wide',
 'pork-barrel',
 'two-hundredth',
 'Asia-first',
 'anti-Chinese',
 'clean-energy',
 'cabinet-level',
 'non-belligerent',
 'non-communist',
 'post-World',
 'self-made',
 'anti-establishment',
 'Kai-shek',
 'gender-based',
 'post-Cold',
 'cash-and-carry',
 'Webster-Ashburton',
 'still-young',
 'Hay-Buneau-Varilla',
 'AFL-CIO',
 'Mexican-American',
 'full-time',
 'de-emphasizes',
 'war-stricken',
 '48-hour',
 'strategy--',
 'mid-term',
 'Bergen-Belsen',
 'newly-acquired',
 'drought-stricken',
 'non-aggression',
 'self-determination',
 'Kansas-Nebraska',
 'one-year',
 'military-industrial',
 'well-funded',
 'one-third',
 'U.S.-British',
 'ash-heap',
 'R-Ohio',
 'peace-loving',
 'crime-related',
 'Anti-trust',
 'Latin-American',
 'Maine-Brunswick',
 'D-Day',
 'twenty-seven',
 'long-term',
 'hand-in-hand',
 '100-a-plate',
 'twenty-eight',
 'Interstate-Commerce',
 'Iran-Contra',
 'Destroyers-for-Bases',
 'anti-ballistic',
 '

>> #### 1.1 Problem: Tokens with two consecutive hyphens <a id='4_4_5_1_1_id'></a>

In [39]:
pattern = '.*--.*'
hyphens_before = [re.search(pattern, token)[0] for token in hyphen_tokens if re.search(pattern, token)]
hyphens_before

['world"--to', 'strategy--', 'everyone--', 'government--']

>> #### 1.2 Solution: Tokens with two consecutive hyphens

In [40]:
hyphens_after = [token.replace('--', ' ') for token in hyphens_before]
hyphens_after

['world" to', 'strategy ', 'everyone ', 'government ']

>> #### 1.3 Prepare token pairs and update dictionary

In [41]:
# Zip pairs into before and after tuples
hyphens = list(zip(hyphens_before, hyphens_after))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, hyphens)
hyphen_tokens = [token for token in hyphen_tokens if token not in clean_dict]

>> #### 2.1 Problem: Tokens with leading or trailing hyphens <a id='4_4_5_1_2_id'></a>

In [42]:
pattern = '^-+.*|.*-+$'
hyphens_before = [re.search(pattern, token)[0] for token in hyphen_tokens if re.search(pattern, token)]
hyphens_before

['laws-']

>> #### 2.2 Solution: Tokens with two consecutive hyphens

In [43]:
hyphens_after = [token.replace('-', ' ') for token in hyphens_before]
hyphens_after

['laws ']

>> #### 2.3 Prepare token pairs and update dictionary

In [44]:
# Zip pairs into before and after tuples
hyphens = list(zip(hyphens_before, hyphens_after))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, hyphens)
hyphen_tokens = [token for token in hyphen_tokens if token not in clean_dict]

>> #### 3.1 Problem: Determining which tokens are valid words <a id='4_4_5_1_3_id'></a>

In [45]:
# Examine tokens
hyphen_tokens

['Waldorf-Astoria',
 'newly-trained',
 'nation-wide',
 'pork-barrel',
 'two-hundredth',
 'Asia-first',
 'anti-Chinese',
 'clean-energy',
 'cabinet-level',
 'non-belligerent',
 'non-communist',
 'post-World',
 'self-made',
 'anti-establishment',
 'Kai-shek',
 'gender-based',
 'post-Cold',
 'cash-and-carry',
 'Webster-Ashburton',
 'still-young',
 'Hay-Buneau-Varilla',
 'AFL-CIO',
 'Mexican-American',
 'full-time',
 'de-emphasizes',
 'war-stricken',
 '48-hour',
 'mid-term',
 'Bergen-Belsen',
 'newly-acquired',
 'drought-stricken',
 'non-aggression',
 'self-determination',
 'Kansas-Nebraska',
 'one-year',
 'military-industrial',
 'well-funded',
 'one-third',
 'U.S.-British',
 'ash-heap',
 'R-Ohio',
 'peace-loving',
 'crime-related',
 'Anti-trust',
 'Latin-American',
 'Maine-Brunswick',
 'D-Day',
 'twenty-seven',
 'long-term',
 'hand-in-hand',
 '100-a-plate',
 'twenty-eight',
 'Interstate-Commerce',
 'Iran-Contra',
 'Destroyers-for-Bases',
 'anti-ballistic',
 'ill-fated',
 'self-reliance',


>> #### 3.2 Solution: Check validity of token using <span style="color:purple">howmanysyllables.com</span>

In [46]:
# Get all valid words
word_tokens = [token for token in tqdm(hyphen_tokens) if is_word(token)]
word_tokens

100%|██████████| 98/98 [00:48<00:00,  2.01it/s]


['nation-wide',
 'pork-barrel',
 'non-belligerent',
 'non-communist',
 'self-made',
 'anti-establishment',
 'Kai-shek',
 'cash-and-carry',
 'AFL-CIO',
 'full-time',
 'de-emphasizes',
 'mid-term',
 'Bergen-Belsen',
 'non-aggression',
 'self-determination',
 'Kansas-Nebraska',
 'military-industrial',
 'ash-heap',
 'Anti-trust',
 'Latin-American',
 'D-Day',
 'twenty-seven',
 'long-term',
 'hand-in-hand',
 'twenty-eight',
 'Interstate-Commerce',
 'anti-ballistic',
 'ill-fated',
 'self-reliance',
 'Austria-Hungary',
 'war-time',
 'cease-fire',
 'anti-war',
 'high-level',
 'Lend-Lease',
 'inter-war',
 'wide-ranging',
 'do-nothing',
 'legal-tender',
 'Smoot-Hawley',
 're-election',
 'self-help',
 'Taft-Hartley',
 'President-elect',
 'co-existence',
 'Judeo-Christian',
 'seventy-third']

In [47]:
# Zip pairs into before and after tuples
hyphens = list(zip(word_tokens, word_tokens))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, hyphens)
hyphen_tokens = [token for token in hyphen_tokens if token not in clean_dict]

In [48]:
# Non-words
hyphen_tokens = sorted(list(set(hyphen_tokens).difference(set(word_tokens))))
hyphen_tokens

['10-point',
 '100-a-plate',
 '37-day',
 '48-hour',
 'Anglo-United',
 'Asia-first',
 'Asian-Pacific',
 'Bland-Allison',
 'Destroyers-for-Bases',
 'Hay-Buneau-Varilla',
 'Iran-Contra',
 'Jefferson-Jackson',
 'Maine-Brunswick',
 'Mexican-American',
 'R-Ohio',
 'Reagan-endorsed',
 'Texas-Austin',
 'Then-Senator',
 'U.S.-British',
 'U.S.-Soviet',
 'Waldorf-Astoria',
 'Webster-Ashburton',
 'agreed-upon',
 'anti-Chinese',
 'cabinet-level',
 'century-long',
 'clean-energy',
 'conversation-style',
 'crime-related',
 'drought-stricken',
 'eighteen-year',
 'future-Chief',
 'gender-based',
 'late-term',
 'long-established',
 'newly-acquired',
 'newly-trained',
 'one-third',
 'one-year',
 'peace-loving',
 'post-Cold',
 'post-World',
 'self-supportive',
 'silver-purchase',
 'still-young',
 'ten-mile-wide',
 'two-hundredth',
 'two-percent',
 'war-stricken',
 'well-funded',
 'well-received']

In [49]:
# Zip pairs into before and after tuples
hyphens = list(zip(hyphen_tokens, hyphen_tokens))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, hyphens)
hyphen_tokens = [token for token in hyphen_tokens if token not in clean_dict]

> ##### 2. Clean and match remaining tokens <a id='4_4_5_clean_match_id'></a>

>> #### 1.1 Problem: Some tokens have unnecessary hyphens

In [50]:
hyphens_before = [token for token in tokens if '—' in token]
hyphens_before

['ideas—justice',
 'communism—which',
 'future—is',
 'economy—energy',
 'it—the',
 'policy—all',
 'issue—peace',
 'treaty—to',
 'citizens—“never',
 'bust—often',
 'me—and',
 'responsibility—for',
 'war—to',
 'compact—which',
 'union—as',
 'Nations—would',
 'you—ask']

>> #### 1.2 Solution: Remove unneessary hyphens from tokens

In [51]:
hyphens_after = [token.replace('—', ' ') for token in hyphens_before]
hyphens_after

['ideas justice',
 'communism which',
 'future is',
 'economy energy',
 'it the',
 'policy all',
 'issue peace',
 'treaty to',
 'citizens “never',
 'bust often',
 'me and',
 'responsibility for',
 'war to',
 'compact which',
 'union as',
 'Nations would',
 'you ask']

>> #### 1.3 Prepare token pairs and update dictionary

In [52]:
# Zip pairs
hyphens = list(zip(hyphens_before, hyphens_after))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, hyphens)
tokens = [token for token in tokens if not matcher(nlp.make_doc(token)) and token not in clean_dict]

>> #### 2.1 Problem: Tokens with few pattern similarities <a id='4_4_5_2_2_id'></a>

In [53]:
tokens

['N.Y.',
 '80th',
 'D.C',
 'W.',
 'and/or',
 'D.C.',
 'J.P.',
 '20th',
 '72nd',
 '11:32',
 '73rd',
 'U.S.S.',
 '25th',
 '11th',
 '150th',
 'I.',
 'B&O',
 'U.S.',
 'Jr.',
 'détente',
 '100th',
 '50th',
 '21st',
 'J.',
 "'s",
 'H.W.',
 'H.',
 'HIV/AIDS',
 'D.',
 'Mr.',
 '26th',
 'C.',
 "O'Neill",
 'S.',
 'Mrs.',
 'M.',
 '15th',
 'vs.',
 'T.',
 'G.I.',
 '5th',
 'St.',
 '$',
 'U.S.S.R.',
 '19th',
 '74th',
 'L.',
 'labor/management',
 "Mu'ammar",
 'F.',
 'U.S']

>> #### 2.2 Solution: Clean tokens and leave the rest

In [54]:
# D.C should be D.C.
clean_dict['D.C'] = 'D.C.'
clean_dict['labor/management'] = 'labor or management'
clean_dict['HIV/AIDS'] = 'HIV or AIDS'
clean_dict['and/or'] = 'and or'

# Note, "'s" and "$" will be handled by our pattern matcher so we can ignore them

## 4.5. Clean Transcripts <a id='4_5_id'></a>

In [55]:
with open('speeches_modified/speeches_737.txt') as file:
    speeches.iloc[737].Transcript = file.read()

with open('speeches_modified/speeches_738.txt') as file:
    speeches.iloc[738].Transcript = file.read()
    
with open('speeches_modified/speeches_964.txt') as file:
    speeches.iloc[964].Transcript = file.read()
    
with open('speeches_modified/speeches_575.txt') as file:
    speeches.iloc[575].Transcript = file.read()

with open('speeches_modified/speeches_579.txt') as file:
    speeches.iloc[579].Transcript = file.read()
    
with open('speeches_modified/speeches_675.txt') as file:
    speeches.iloc[675].Transcript = file.read()
    
with open('speeches_modified/speeches_673.txt') as file:
    speeches.iloc[673].Transcript = file.read()
    
with open('speeches_modified/speeches_712.txt') as file:
    speeches.iloc[712].Transcript = file.read()
    
with open('speeches_modified/speeches_572.txt') as file:
    speeches.iloc[572].Transcript = file.read()
    
with open('speeches_modified/speeches_507.txt') as file:
    speeches.iloc[507].Transcript = file.read()
    
with open('speeches_modified/speeches_609.txt') as file:
    speeches.iloc[609].Transcript = file.read()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


### 4.5.1 Normalize Unicode <a id='4_5_1_id'></a>

In [56]:
speeches['Transcript'] = speeches['Transcript'].apply(lambda x: unicodedata.normalize('NFKD', x))

### 4.5.2 Initial preprocessing <a id='4_5_2_id'></a>

In [57]:
# Clean newlines and different apostrophe symbol
speeches['Transcript'] = speeches['Transcript'].apply(lambda x: x.replace('\n', ' '))
speeches['Transcript'] = speeches['Transcript'].apply(lambda x: x.replace('’', '\''))
speeches['Transcript'] = speeches['Transcript'].apply(lambda x: x.replace('\r', ''))
speeches['Transcript'] = speeches['Transcript'].apply(lambda x: x.replace('ooo', '000'))
speeches['Transcript'] = speeches['Transcript'].apply(lambda x: x.replace('(Applause.)', '')) 
speeches['Transcript'] = speeches['Transcript'].apply(lambda x: x.replace('(applause.)', ''))
speeches['Transcript'] = speeches['Transcript'].apply(lambda x: x.replace('(Applause)', '')) 
speeches['Transcript'] = speeches['Transcript'].apply(lambda x: x.replace('(applause)', ''))
speeches['Transcript'] = speeches['Transcript'].apply(lambda x: x.strip())
speeches['Transcript'] = speeches['Transcript'].apply(lambda x: x[10:] if x[:10].lower() == 'transcript' else x)

In [58]:
speeches.iloc[575].Transcript = re.sub('Final Estimates for 1912 as of November 29 (.*?) COLLECTION.', '', speeches.iloc[575].Transcript)
speeches.iloc[979].Transcript = speeches.iloc[979].Transcript[:-45]
speeches.iloc[679].Transcript = re.match('(.*?)(?=CHESTER A. ARTHUR)', speeches.iloc[679].Transcript)[0]
speeches.iloc[250].Transcript = speeches.iloc[250].Transcript.replace('CCC [Civilian Conservation Corps]-type program', 'CCC-type program, Civilian Conservation Corps,')

text_to_remove = "The rapid increase in the trade between the two countries is shown in the following table:Philippine exports, fiscal years 1908-1910. [Exclusive of gold and silver.]Fiscal Year To: United States To: Other Countries Total 1908$10,323,233$22,493,334$32,816,567190910,215,33120,778,23230,993,563191018,741,77121,122,39839,864,169NOTE.--Latest monthly returns show exports for the year ending August, 1910, to the United States $20,035,902, or 49 per cent of the $41,075,738 total, against 031,275 to the United States, or 34 per cent of the $32,183,871 total for the year ending August, 1909.Philippine imports, fiscal years 1908-1910. [Exclusive of gold and silver and government supplies.]Fiscal Year From: United States From: Other Countries Total 1908$5,079,487$25,838,870$30,918,35719094,691,77023,100,27027,792,397191010,775,30126,292,32937,067,630NOTE.--Latest monthly returns show imports for the year ending August, 1910, from the United States $11,615,982, or 30 per cent of the $39,025,667 total, against $5,193,419 from the United States, or 18 per cent of the $28,948,011 total for the year ending August, 1909."
speeches.iloc[575].Transcript = speeches.iloc[575].Transcript.replace(text_to_remove, '')

### 4.5.3 Remove non-presidential text <a id='4_5_3_id'></a>

In [59]:
# First Pass
pattern = '([A-Z]+\.\s)*([A-Z]+[A-Z\'-]+[A-Z]\s)*[A-Z]+[A-Z\'-]+[A-Z]:'
potential_speakers = list(set(find_pattern(speeches['Transcript'], pattern, 50, -1)))

speaker_dict = {}
for speaker in potential_speakers:
    if speaker not in speaker_dict:
        speaker_dict[speaker] = speaker
    
print('There are {} potential speakers'.format(len(potential_speakers)))

There are 101 potential speakers


In [60]:
[speaker_dict[speaker] for speaker in sorted(speaker_dict.keys())]

['AGENT DUNN:',
 'AIDE:',
 'AUDIENCE MEMBER:',
 'AUDIENCE MEMBERS:',
 'AUDIENCE:',
 'BROTHER HANDSOME LAKE:',
 'BUSH:',
 'C. GENERAL:',
 'C. LEHRER:',
 'C. SIR:',
 'CONGRESS ASSEMBLED:',
 'CREDENCE. MR. MINISTER:',
 'CROWD:',
 'DC. PRESIDENT CLINTON:',
 'DEAR DOCTOR HERRAN:',
 'DUKAKIS:',
 'EDM:',
 'EITC:',
 'EXCELLENCY:',
 'FELLOW CITIZENS:',
 'FELLOW-CITIZENS:',
 'GEE:',
 'GENERAL:',
 'GENTLEMEN:',
 'GROER:',
 'HAWLEY:',
 'ICE:',
 'JENNINGS:',
 'JIM LEHRER:',
 'JONES:',
 'KENNEDY:',
 'LEADER RYAN:',
 'LEHRER:',
 'MASHEK:',
 'MODERATOR:',
 'MR. CATER:',
 'MR. CHANCELLOR:',
 'MR. CRONKITE:',
 'MR. DRUMMOND:',
 'MR. EDWARDS:',
 'MR. FLEMING:',
 'MR. HOWE:',
 'MR. KALB:',
 'MR. KELLY:',
 'MR. KENNEDY:',
 'MR. KONDRACKE:',
 'MR. LEVY:',
 'MR. MONDALE:',
 'MR. MORGAN:',
 'MR. NEWMAN:',
 'MR. NIVEN:',
 'MR. NIXON:',
 'MR. NOVINS:',
 'MR. SCHWAB:',
 'MR. SHADEL:',
 'MR. SINGISER:',
 'MR. SMITH:',
 'MR. SPIVAK:',
 'MR. TREWHITT:',
 'MR. VANOCUR:',
 'MR. VON FREMD:',
 'MR. WARREN:',
 'MS. GEYE

In [61]:
# Examine 
text_surrounding_speakers([speaker for speaker in sorted(speaker_dict.keys())])

-----------------
| 0 AGENT DUNN: |
-----------------

['ough guys. Come here, Derek.  AGENT DUNN: Just want to say thanks for e']

-----------
| 1 AIDE: |
-----------

[' Mr. President — (inaudible). AIDE:  Yes, we do.  There you go. Q']

----------------------
| 2 AUDIENCE MEMBER: |
----------------------

['we want to stop it now?       AUDIENCE MEMBER:  Four more years! THE PRESIDE',
 's?  Go ahead, say that again. AUDIENCE MEMBER:  Vote yes! THE PRESIDENT:  A ',
 ' in the political process.    AUDIENCE MEMBER:  Because of you!     THE PRES',
 't give her or him some space. AUDIENCE MEMBER:  Hope you have insurance.  (L',
 'dam Hussein, we saw a threat. AUDIENCE MEMBER: (Inaudible.) AUDIENCE: U-S-A!']

-----------------------
| 3 AUDIENCE MEMBERS: |
-----------------------

[' well --  -- the survivors -- AUDIENCE MEMBERS:  We love you, Obama! THE '
 'PRES']

---------------
| 4 AUDIENCE: |
---------------

['nt of the United States.      AUDIENCE:  Four more years!  Four more ',
 

[' FRANK McGEE, MODERATOR: Good evening. This is Frank M',
 " BILL SHADEL, MODERATOR: Good evening. I'm Bill Shadel",
 ' QUINCY HOWE, MODERATOR: I am Quincy Howe of ABC News ',
 ' HOWARD K. SMITH, MODERATOR: Good evening. The television ']

-----------------
| 35 MR. CATER: |
-----------------

[" is for Vice President Nixon. MR. CATER: Mr. Vice President, I'd like ",
 'question for Senator Kennedy. MR. CATER: Senator Kennedy, last week yo',
 'r. Cater for Senator Kennedy. MR. CATER: Uh - Mr. Kennedy, uh - Senato']

----------------------
| 36 MR. CHANCELLOR: |
----------------------

["ion for Vice President Nixon. MR. CHANCELLOR: Sir, I'd like to ask you an- ",
 'question for Senator Kennedy. MR. CHANCELLOR: Senator, another question uh ']

--------------------
| 37 MR. CRONKITE: |
--------------------

['question for Senator Kennedy. MR. CRONKITE: Senator, the charge has been ',
 'ion for Vice President Nixon. MR. CRONKITE: Thank you Quincy. Mr. Vice Pr']

--------------------
| 38 

["didn't.   The President's Age MR. TREWHITT: Mr. Mondale, I'm going to han",
 'step forward in human rights. MR. TREWHITT: Mr. Mondale, could I ask you ',
 "   The President's Leadership MR. TREWHITT: This question of leadership k",
 "Reagan?   The President's Age MR. TREWHITT: Mr. President, I want to rais",
 ' candidate.   Nuclear Weapons MR. TREWHITT: Mr. President, could I take y']

-------------------
| 59 MR. VANOCUR: |
-------------------

['ident Nixon from Mr. Vanocur. MR. VANOCUR: Uh - Mr. Vice President, sinc',
 " Well as you take the bills - MR. VANOCUR: - if you weren't able to get ",
 "tor Kennedy from Mr. Vanocur. MR. VANOCUR: Senator, you've been promisin",
 'ion for Vice President Nixon. MR. VANOCUR: Mr. Vice President uh - in on']

---------------------
| 60 MR. VON FREMD: |
---------------------

['emd for Vice President Nixon. MR. VON FREMD: Mr. Vice President, a two-par',
 'emd for Vice President Nixon. MR. VON FREMD: Mr. Vice President, in the pa',
 "question for

['by the people themselves. MR. SMITH: Senator Kennedy, your conclus',
 "er a government can have. MR. SMITH: Senator Kennedy's comment? MR",
 'f success for the future. MR. SMITH: Mr. Nixon, comment? MR. NIXON',
 'cts which are in surplus. Mr. SMITH: The next question to Vice Pre',
 'g to face in the sixties? MR. SMITH: The next question to Senator ']

--------------------
| 89 SPEAKER RYAN: |
--------------------

["r many, many years. So—right? SPEAKER RYAN: Yep. THE PRESIDENT: That's go"]

-------------
| 90 STATE: |
-------------

['er 11, 1903. MR. SECRETARY OF STATE: I have the very great honor t']

--------------------
| 91 THE CONGRESS: |
--------------------

['GENTLEMEN OF THE CONGRESS: The year that has elapsed sin',
 'GENTLEMEN OF THE CONGRESS:When I addressed myself to per',
 'MEMBERS OF THE CONGRESS:So many problems are calling f',
 'GENTLEMEN OF THE CONGRESS: In fulfilling at this time th',
 'GENTLEMEN OF THE CONGRESS: Since I last had the privileg']

------------------

In [62]:
speaker_dict['BROTHER HANDSOME LAKE:'] = 'TO BROTHER HANDSOME LAKE:'
speaker_dict['C. LEHRER:'] = 'LEHRER:'
speaker_dict['CONGRESS ASSEMBLED:'] = 'TO THE SENATE AND HOUSE OF REPRESENTATIVES OF THE UNITED STATES   IN CONGRESS ASSEMBLED:'
speaker_dict['CREDENCE. MR. MINISTER:'] = 'THE PRESIDENT\'S REPLY TO THE REMARKS MADE BY SENOR BUNAU VARILLA ON THE OCCASION OF THE PRESENTATION OF HIS LETTERS OF CREDENCE. MR. MINISTER:'
speaker_dict['DC. PRESIDENT CLINTON:'] = 'PRESIDENT CLINTON:'
speaker_dict['FELLOW CITIZENS:'] = 'MY FELLOW CITIZENS:'
speaker_dict['GEE:'] = 'MR. McGEE:'
speaker_dict['KENNEDY:'] = 'MR. KENNEDY:'
speaker_dict['N. PRESIDENT CLINTON:'] = 'PRESIDENT CLINTON:'
speaker_dict['N. SENATOR DOLE:'] = 'SENATOR DOLE:'
speaker_dict['NIXON:'] = 'MR. NIXON:'
speaker_dict['O. THE PRESIDENT:'] = 'THE PRESIDENT:'
speaker_dict['PANAMA. MR. PRESIDENT:'] = 'MR. PRESIDENT:'
speaker_dict['PRESIDENT:'] = 'THE PRESIDENT:'
speaker_dict['SM1TH:'] = 'MR. SMITH:'
speaker_dict['STATE:'] = 'MR. SECRETARY OF STATE:'
speaker_dict['THE MANDAN NATION:'] = 'MY CHILDREN, THE WOLF AND PEOPLE OF THE MANDAN NATION:'
speaker_dict['TRUMP:'] = 'PRESIDENT TRUMP:'
speaker_dict['UDIENCE MEMBER:'] = 'AUDIENCE MEMBER:'

# Add SM1TH
speaker_dict['MR. SM1TH:'] = 'MR. SM1TH:'

# 'MODERATORS:'
del speaker_dict['MODERATOR:']
speaker_dict['QUINCY HOWE, MODERATOR:'] = 'QUINCY HOWE, MODERATOR:'
speaker_dict['HOWARD K. SMITH, MODERATOR:'] = 'HOWARD K. SMITH, MODERATOR:'
speaker_dict['FRANK McGEE, MODERATOR:'] = 'FRANK McGEE, MODERATOR:'
speaker_dict['BILL SHADEL, MODERATOR:'] = 'BILL SHADEL, MODERATOR:'

# 'REPRESENTATIVES:''
del speaker_dict['REPRESENTATIVES:']
speaker_dict['TO THE SENATE AND HOUSE OF REPRESENTATIVES:'] = 'TO THE SENATE AND HOUSE OF REPRESENTATIVES:'
speaker_dict['FELLOW CITIZENS OF THE SENATE AND HOUSE OF REPRESENTATIVES:'] = 'FELLOW CITIZENS OF THE SENATE AND HOUSE OF REPRESENTATIVES:'
speaker_dict['TO THE SENATE AND HOUSE OF REPRESENTATIVES OF THE UNITED STATES:'] = 'TO THE SENATE AND HOUSE OF REPRESENTATIVES OF THE UNITED STATES:'

speaker_dict['THE UNITED STATES:'] = 'The PRESIDENT OF THE UNITED STATES:'

speaker_dict['THE CONGRESS:'] = 'GENTLEMEN OF THE CONGRESS:'
speaker_dict['MEMBERS OF THE CONGRESS:'] = 'MEMBERS OF THE CONGRESS:'


In [63]:
all_speakers = list(set([speaker_dict[keys] for keys in speaker_dict.keys()]))

In [64]:
president_speakers = [
    'GRANT:',
    'TO THE SENATE AND HOUSE OF REPRESENTATIVES:',
    'TO BROTHER HANDSOME LAKE:',
    'FELLOW-CITIZENS:',
    'GENTLEMEN:',
    'PRESIDENT CLINTON:'
    'PRESIDENT BUSH:',
    'PRESIDENT REAGAN:',
    'PRESIDENT TRUMP:',
    "THE PRESIDENT'S REPLY TO THE REMARKS MADE BY SENOR BUNAU VARILLA ON THE OCCASION OF THE PRESENTATION OF HIS LETTERS OF CREDENCE. MR. MINISTER:",
    'MR. PRESIDENT:',
    'SIR:',
    'BUSH:',
    'MY CHILDREN, THE WOLF AND PEOPLE OF THE MANDAN NATION:',
    'MY FELLOW CITIZENS:',
    'TO THE SENATE AND HOUSE OF REPRESENTATIVES OF THE UNITED STATES   IN CONGRESS ASSEMBLED:',
    'MEMBERS OF THE CONGRESS:',
    'GENTLEMEN OF THE CONGRESS:',
    'THE PRESIDENT:',
    'The PRESIDENT OF THE UNITED STATES:',
    'FELLOW CITIZENS OF THE SENATE AND HOUSE OF REPRESENTATIVES:'    
]

# Get all non-presidential speakers
non_president_speakers = list(set(all_speakers).difference(set(president_speakers)))
non_president_speakers += ['Audience:']

# Get all sayings for each non-president
non_president_speaker_sayings_dict = {}
for speaker in tqdm(non_president_speakers):
    non_president_speaker_sayings_dict[speaker] = clean_removable_speaker_text(speaker, all_speakers)

100%|██████████| 81/81 [00:25<00:00,  3.18it/s]


In [65]:
# List of non-president sayings
non_president_speaker_sayings = [sayings for sayings in non_president_speaker_sayings_dict.values()]
non_president_speaker_sayings = list(itertools.chain.from_iterable(non_president_speaker_sayings))
non_president_speaker_sayings

["SENATOR KENNEDY: Mr. Smith, Mr. Nixon. In the election of 1860, Abraham Lincoln said the question was whether this nation could exist half-slave or half-free. In the election of 1960, and with the world around us, the question is whether the world will exist half-slave or half-free, whether it will move in the direction of freedom, in the direction of the road that we are taking, or whether it will move in the direction of slavery. I think it will depend in great measure upon what we do here in the United States, on the kind of society that we build, on the kind of strength that we maintain. We discuss tonight domestic issues, but I would not want that to be any implication to be given that this does not involve directly our struggle with Mr. Khrushchev for survival. Mr. Khrushchev is in New York, and he maintains the Communist offensive throughout the world because of the productive power of the Soviet Union itself. The Chinese Communists have always had a large population. But they

In [66]:
# For each transcript, iterate through sayings and remove and if they are present
for row in tqdm(speeches.itertuples()):
    index = row[0]
    transcript = row[5]
    
    for saying in non_president_speaker_sayings:
        if saying in transcript:
            speeches.Transcript[index] = speeches.Transcript[index].replace(saying, ' ')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
992it [00:08, 112.38it/s]


In [67]:
# Examine remaining speakers not cleaned up
remaining_speakers = []
for row in tqdm(speeches.itertuples()):
    transcript = row[5]
    for non_president in non_president_speakers:
        if non_president in transcript and non_president not in remaining_speakers:
            remaining_speakers += [non_president]
remaining_speakers

992it [00:00, 1543.43it/s]


['AUDIENCE MEMBER:', 'LEHRER:', 'DUKAKIS:', 'PRESIDENT. SIR:', 'WAR. SIR:']

In [68]:
# Get remaining non-presidential speakers
remaining_non_president_speakers = []
for speaker in remaining_speakers:
    remaining_non_president_speakers += clean_removable_speaker_text(speaker, all_speakers)

In [69]:
# Removing remaining sayings
for row in tqdm(speeches.itertuples()):
    index = row[0]
    transcript = row[5]
    
    for saying in remaining_non_president_speakers:
        if saying in transcript:
            speeches.Transcript[index] = speeches.Transcript[index].replace(saying, ' ')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
992it [00:00, 8109.82it/s]


In [70]:
# Examine remaining speakers not cleaned up again
remaining_speakers = []
for row in tqdm(speeches.itertuples()):
    transcript = row[5]
    for non_president in non_president_speakers:
        if non_president in transcript and non_president not in remaining_speakers:
            remaining_speakers += [non_president]
            
remaining_speakers

992it [00:00, 1599.67it/s]


['PRESIDENT. SIR:']

In [71]:
# Removing presidents
for row in tqdm(speeches.itertuples()):
    index = row[0]
    transcript = row[5]
    
    for president in president_speakers:
        if president in transcript:
            speeches.Transcript[index] = speeches.Transcript[index].replace(president, ' ')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
992it [00:00, 8055.54it/s]


In [72]:
speeches

Unnamed: 0,Date,President,Speech Title,Summary,Transcript,URL
0,2019-09-25,Donald Trump,Press Conference,President Donald Trump holds a press conferenc...,"Thank you very much. Thank you. Well, tha...",https://millercenter.org/the-presidency/presid...
1,2019-09-24,Donald Trump,Remarks at the United Nations General Assembly,President Donald Trump speaks to the 74th sess...,"Thank you very much. Mr. President, Mr. S...",https://millercenter.org/the-presidency/presid...
2,2019-02-15,Donald Trump,Speech Declaring a National Emergency,President Donald Trump declares a national eme...,"Thank you very much, everybody. Before we...",https://millercenter.org/the-presidency/presid...
3,2019-02-05,Donald Trump,State of the Union Address,"In his second State of the Union Address, Pres...","Madam Speaker, Mr. Vice President, Members o...",https://millercenter.org/the-presidency/presid...
4,2019-01-19,Donald Trump,Remarks about the US Southern Border,President Donald Trump speaks about what he se...,"Just a short time ago, I had the honor of p...",https://millercenter.org/the-presidency/presid...
...,...,...,...,...,...,...
987,1790-12-29,George Washington,Talk to the Chiefs and Counselors of the Senec...,The President reassures the Seneca Nation that...,"I the President of the United States, by my o...",https://millercenter.org/the-presidency/presid...
988,1790-12-08,George Washington,Second Annual Message to Congress,Washington focuses on commerce in his second a...,Fellow citizens of the Senate and House of Re...,https://millercenter.org/the-presidency/presid...
989,1790-01-08,George Washington,First Annual Message to Congress,"In a wide-ranging speech, President Washington...",Fellow Citizens of the Senate and House of Rep...,https://millercenter.org/the-presidency/presid...
990,1789-10-03,George Washington,Thanksgiving Proclamation,"At the request of Congress, Washington establi...",Whereas it is the duty of all Nations to ackno...,https://millercenter.org/the-presidency/presid...


In [73]:
transcript_tokens = list(set(get_transcript_tokens(speeches.Transcript, 50, -1, True)))
print(len(transcript_tokens))

Processing batch 1/20
Processing batch 2/20
Processing batch 3/20
Processing batch 4/20
Processing batch 5/20
Processing batch 6/20
Processing batch 7/20
Processing batch 8/20
Processing batch 9/20
Processing batch 10/20
Processing batch 11/20
Processing batch 12/20
Processing batch 13/20
Processing batch 14/20
Processing batch 15/20
Processing batch 16/20
Processing batch 17/20
Processing batch 18/20
Processing batch 19/20
Processing batch 20/20
11533


#### Clean transcript tokens by replacing all  EM dashes, '—', with a comma followed by a space

In [74]:
tokens_before = [token for token in transcript_tokens if '—' in token]
tokens_after = [token.replace('—', ', ') for token in tokens_before]

# Zip pairs into before and after tuples
em_dash_tokens = list(zip(tokens_before, tokens_after))

# Update dictionary and transcript tokens
clean_dict = update_dict(clean_dict, em_dash_tokens)
transcript_tokens = list(set(transcript_tokens).difference(set([a for (a,b) in set(em_dash_tokens)])))

> ##### 1. Hyphens <a id='4_4_5_hyphens_id'></a>

In [75]:
# Examine all tokens with hyphens 
hyphen_tokens_list = [token for token in transcript_tokens if '-' in token]
hyphen_tokens = hyphen_tokens_list.copy()
print(len(hyphen_tokens))
hyphen_tokens

4747


['Vice-Presidency',
 'self-protection',
 'pass-throughs',
 'employer-provided',
 'lower-paid',
 'control-moves',
 '1920--and',
 'non-Marxist',
 '1865.--Revolution',
 'time-lag',
 'States-a',
 'double-digit',
 'self-confidence',
 'Presidents-President',
 'follow-the-leader',
 'all-black',
 '-Japan',
 'Tsung-li',
 'record-keeping',
 'islands-----------------------------------------------',
 'argument---some',
 'across-the-board',
 'upside-down',
 'all-absorbing',
 '5-ambulatory',
 'half-a',
 'Island----------------------------------------------------------',
 'last-named',
 'that--',
 'otherwise-or',
 'that-nothing',
 'legend-such',
 'dominate-which',
 'common-law',
 'SS-5',
 'vice-admiral',
 'everything-including',
 '1--The',
 'Canada----------',
 'fugitive-slave',
 'shop-worn',
 'Communist-dominated',
 'Total--------------------------',
 'one-twelfth',
 'two-tenths',
 '--Moreover',
 'four-square',
 'eighty-two',
 'medium-sized',
 'save-later',
 'vice-consul',
 'Ten-forties',
 'over-cro

>> #### 1.1 Problem: Tokens with leading or trailing hyphens <a id='4_4_5_1_1_id'></a>

In [76]:
pattern = '^-+.*|.*-+$'
hyphens_before = [re.search(pattern, token)[0] for token in hyphen_tokens if re.search(pattern, token)]
hyphens_before

['-Japan',
 'islands-----------------------------------------------',
 'Island----------------------------------------------------------',
 'that--',
 'Canada----------',
 'Total--------------------------',
 '--Moreover',
 'of-----------------------------------------------',
 '--Our',
 '1,221,956,620-----------------',
 'were--',
 'Total---------------------------',
 'Island------------------------------------------------------------------',
 '1,719,428,069----------',
 'interest--------------------------------------',
 '125,775.00-',
 '--not',
 '1882--',
 's-',
 'Quissibis----------------------------------------------------------------------',
 '--to',
 'condition--',
 'beverages-------------------------------------------------',
 '--I',
 '--during',
 'ahead--',
 'Silks-----------------------------------------------------------------------',
 'secondly--',
 'provide--',
 'judgment--',
 '--As',
 'Sundries---------------------------------------------------',
 'of--',
 'of---------------

>> #### 1.2 Solution: Replace hyphens with spaces

In [77]:
hyphens_after = [token.replace('-', ' ') for token in hyphens_before]
hyphens_after

[' Japan',
 'islands                                               ',
 'Island                                                          ',
 'that  ',
 'Canada          ',
 'Total                          ',
 '  Moreover',
 'of                                               ',
 '  Our',
 '1,221,956,620                 ',
 'were  ',
 'Total                           ',
 'Island                                                                  ',
 '1,719,428,069          ',
 'interest                                      ',
 '125,775.00 ',
 '  not',
 '1882  ',
 's ',
 'Quissibis                                                                      ',
 '  to',
 'condition  ',
 'beverages                                                 ',
 '  I',
 '  during',
 'ahead  ',
 'Silks                                                                       ',
 'secondly  ',
 'provide  ',
 'judgment  ',
 '  As',
 'Sundries                                                   ',
 'of  ',
 'of               

>> #### 1.3 Prepare token pairs and update dictionary

In [78]:
# Zip pairs into before and after tuples
hyphens = list(zip(hyphens_before, hyphens_after))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, hyphens)
hyphen_tokens = [token for token in hyphen_tokens if token not in clean_dict]

>> #### 2.1 Problem: Tokens with 2 or more consecutive inner hyphens

In [79]:
pattern = '[^-]+(-{2,}[^-]+)+'
hyphens_before = [re.search(pattern, token)[0] for token in hyphen_tokens if re.search(pattern, token)]
hyphens_before

['1920--and',
 '1865.--Revolution',
 'argument---some',
 '1--The',
 '1902.--Revolution',
 '31,280,372-----------------------E',
 '1878.--Revolt',
 '1851.--Outbreak',
 'slaves---being',
 '1903--9:15',
 'passion---a',
 'post---the',
 '1851.--Revolution',
 '1885.--Revolutionary',
 '1859.--Riots',
 '1835--a',
 'examinations.--For',
 '581,813---while',
 'Illinois---that',
 'here---that',
 'permit---prohibiting',
 'trade---that',
 'service---the',
 '1791.--Copy',
 'not---cannot',
 'prospective---north',
 'repudiated---discarded',
 'go---is',
 'friend---brother',
 'Missouri---and',
 'Congress--"more',
 'representatives---being',
 'principle---do',
 'it---to',
 'constitution---during',
 'resource"--of',
 '1866.--Unsuccessful',
 '1808---the',
 '60,000,000--a',
 'entered--30',
 'ceased---that',
 'extreme---against',
 '265,093--a',
 'adoption---that',
 '300,965,953-----------------------F',
 '1811--is',
 'cent.--of',
 'trade---that',
 'Texas--"I',
 'Lion---and',
 'II--0.13',
 'congress---that',
 

>> #### 2.2 Solution: Replace hyphens with spaces

In [80]:
hyphens_after = [token.replace('-', ' ') for token in hyphens_before]
hyphens_after

['1920  and',
 '1865.  Revolution',
 'argument   some',
 '1  The',
 '1902.  Revolution',
 '31,280,372                       E',
 '1878.  Revolt',
 '1851.  Outbreak',
 'slaves   being',
 '1903  9:15',
 'passion   a',
 'post   the',
 '1851.  Revolution',
 '1885.  Revolutionary',
 '1859.  Riots',
 '1835  a',
 'examinations.  For',
 '581,813   while',
 'Illinois   that',
 'here   that',
 'permit   prohibiting',
 'trade   that',
 'service   the',
 '1791.  Copy',
 'not   cannot',
 'prospective   north',
 'repudiated   discarded',
 'go   is',
 'friend   brother',
 'Missouri   and',
 'Congress  "more',
 'representatives   being',
 'principle   do',
 'it   to',
 'constitution   during',
 'resource"  of',
 '1866.  Unsuccessful',
 '1808   the',
 '60,000,000  a',
 'entered  30',
 'ceased   that',
 'extreme   against',
 '265,093  a',
 'adoption   that',
 '300,965,953                       F',
 '1811  is',
 'cent.  of',
 'trade   that',
 'Texas  "I',
 'Lion   and',
 'II  0.13',
 'congress   that',
 

>> #### 2.3 Prepare token pairs and update dictionary

In [81]:
# Zip pairs into before and after tuples
hyphens = list(zip(hyphens_before, hyphens_after))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, hyphens)
hyphen_tokens = [token for token in hyphen_tokens if token not in clean_dict]

>> #### 3.1 Problem: Some tokens are valid words while others are not

In [82]:
word_tokens_transcript = [(token, is_word(token)) for token in tqdm(hyphen_tokens) if is_word(token)]
word_tokens_transcript

100%|██████████| 3991/3991 [39:37<00:00,  1.68it/s] 


[('Vice-Presidency', 'vice presidency'),
 ('self-protection', 'self-protection'),
 ('pass-throughs', 'pass-through'),
 ('non-Marxist', 'non-Marxist'),
 ('time-lag', 'time lag'),
 ('double-digit', 'double-digit'),
 ('self-confidence', 'self-confidence'),
 ('follow-the-leader', 'follow-the-leader'),
 ('across-the-board', 'across-the-board'),
 ('upside-down', 'upside down'),
 ('half-a', 'halfa'),
 ('common-law', 'common-law'),
 ('vice-admiral', 'vice admiral'),
 ('fugitive-slave', 'Fugitive Slave Acts'),
 ('shop-worn', 'shopworn'),
 ('four-square', 'foursquare'),
 ('eighty-two', 'eighty-two'),
 ('medium-sized', 'medium-sized'),
 ('vice-consul', 'vice-consul'),
 ('over-crowding', 'overcrowd'),
 ('barbed-wire', 'barbed wire'),
 ('surveyors-general', 'surveyor general'),
 ('trick-or-treat', 'trick or treat'),
 ('down-to-earth', 'down-to-earth'),
 ('paper-thin', 'paper-thin'),
 ('pre-eminent', 'preeminent'),
 ('freedom-the', 'freedom of the seas'),
 ('sugar-cane', 'sugarcane'),
 ('behind-the-

>> #### 3.2 Solution: Remove valid words from tokens to be cleaned

In [83]:
# Remove valid words
hyphen_tokens = sorted(list(set(hyphen_tokens).difference(set(word_tokens_transcript))))
print(len(hyphen_tokens))
hyphen_tokens

3991


['1,500-a-year',
 '1-day',
 '1-in-20',
 '1-yard',
 '1-year',
 '1-year-old',
 '1/2-inch',
 '1/2-percent',
 '10,000-foot',
 '10,000-page',
 '10,000-ton',
 '10-day',
 '10-inch',
 '10-man',
 '10-minute',
 '10-percent',
 '10-round',
 '10-to-1',
 '10-year',
 '10-year-old',
 '100-cent',
 '11-year-old',
 '12-hour',
 '12-inch',
 '12-month',
 '12-year-old',
 '122-foot',
 '122-percent',
 '13-year-old',
 '13Ѕ-inch',
 '14-percent',
 '14-point',
 '14-to-1',
 '14-year',
 '14-year-old',
 '15-percent',
 '15-year-olds',
 '152-millimeter',
 '16-inch',
 '16-inch-type',
 '16-year-old',
 '167-page',
 '17-year',
 '17-year-low',
 '17-year-old',
 '18-hour',
 '18-nation',
 '18-percent',
 '18-wheelers',
 '18-year-old',
 '18-year-olds',
 '180-odd',
 '19-foot',
 '19-year-old',
 '19-year-olds',
 '190-year',
 '1970s-era',
 '19th-century',
 '2,000-foot',
 '2,000-mile',
 '2-in-3',
 '2-is',
 '2-minute',
 '2-percent',
 '2-year',
 '2-year-old',
 '2-year-olds',
 '20-knot',
 '20-percent',
 '20-point',
 '20-year',
 '20-year

>> #### 4.1 Problem: Cardinal numbers do not need to be cleaned

In [84]:
token_numbers = [token for token in hyphen_tokens if is_number(token)]
token_numbers

['Eighty-five',
 'Eighty-one',
 'FIFTY-TWO',
 'Fifty-five',
 'Fifty-nine',
 'Fifty-one',
 'Fifty-six',
 'Fifty-two',
 'Forty-eight',
 'Forty-five',
 'Forty-four',
 'Forty-six',
 'Forty-three',
 'Forty-two',
 'Ninety-five',
 'Ninety-four',
 'Ninety-seven',
 'Ninety-six',
 'Ninety-three',
 'Seventy-four',
 'Seventy-seven',
 'Seventy-six',
 'Seventy-two',
 'Sixty-five',
 'Sixty-seven',
 'Thirty-five',
 'Thirty-four',
 'Thirty-one',
 'Thirty-seven',
 'Thirty-six',
 'Twenty-eight',
 'Twenty-five',
 'Twenty-four',
 'Twenty-nine',
 'Twenty-one',
 'Twenty-seven',
 'Twenty-six',
 'Twenty-three',
 'Twenty-two',
 'eighty-eight',
 'eighty-five',
 'eighty-one',
 'eighty-seven',
 'eighty-two',
 'fifty-eight',
 'fifty-five',
 'fifty-four',
 'fifty-nine',
 'fifty-one',
 'fifty-seven',
 'fifty-six',
 'fifty-three',
 'fifty-two',
 'forty-eight',
 'forty-five',
 'forty-four',
 'forty-one',
 'forty-seven',
 'forty-six',
 'forty-three',
 'forty-two',
 'ninety-eight',
 'ninety-five',
 'ninety-nine',
 'ninet

>> #### 4.2 Solution: Remove cardinal numbers from tokens to be cleaned

In [85]:
hyphen_tokens = sorted(list(set(hyphen_tokens).difference(set(token_numbers))))
print(len(hyphen_tokens))
hyphen_tokens

3889


['1,500-a-year',
 '1-day',
 '1-in-20',
 '1-yard',
 '1-year',
 '1-year-old',
 '1/2-inch',
 '1/2-percent',
 '10,000-foot',
 '10,000-page',
 '10,000-ton',
 '10-day',
 '10-inch',
 '10-man',
 '10-minute',
 '10-percent',
 '10-round',
 '10-to-1',
 '10-year',
 '10-year-old',
 '100-cent',
 '11-year-old',
 '12-hour',
 '12-inch',
 '12-month',
 '12-year-old',
 '122-foot',
 '122-percent',
 '13-year-old',
 '13Ѕ-inch',
 '14-percent',
 '14-point',
 '14-to-1',
 '14-year',
 '14-year-old',
 '15-percent',
 '15-year-olds',
 '152-millimeter',
 '16-inch',
 '16-inch-type',
 '16-year-old',
 '167-page',
 '17-year',
 '17-year-low',
 '17-year-old',
 '18-hour',
 '18-nation',
 '18-percent',
 '18-wheelers',
 '18-year-old',
 '18-year-olds',
 '180-odd',
 '19-foot',
 '19-year-old',
 '19-year-olds',
 '190-year',
 '1970s-era',
 '19th-century',
 '2,000-foot',
 '2,000-mile',
 '2-in-3',
 '2-is',
 '2-minute',
 '2-percent',
 '2-year',
 '2-year-old',
 '2-year-olds',
 '20-knot',
 '20-percent',
 '20-point',
 '20-year',
 '20-year

>> #### 5.1 Problem: Ordinal numbers do not need to be cleaned

In [86]:
valid_token_word_nums = [token for token in hyphen_tokens if is_ordinal(token.lower())]
valid_token_word_nums

['Fifty-first',
 'Fifty-seventh',
 'Fifty-sixth',
 'Fifty-third',
 'Forty-first',
 'Forty-fourth',
 'Forty-seventh',
 'Forty-sixth',
 'Seventy-first',
 'Seventy-third',
 'Sixty-first',
 'Sixty-fourth',
 'Sixty-sixth',
 'Sixty-third',
 'Twenty-first',
 'eighty-second',
 'eighty-seventh',
 'fifty-fourth',
 'fifty-second',
 'fifty-seventh',
 'fifty-sixth',
 'fifty-third',
 'forty-first',
 'forty-second',
 'forty-third',
 'ninety-first',
 'ninety-fourth',
 'ninety-second',
 'ninety-sixth',
 'ninety-third',
 'seventy-first',
 'seventy-seventh',
 'sixty-second',
 'sixty-third',
 'thirty-first',
 'thirty-fourth',
 'thirty-second',
 'thirty-sixth',
 'twenty-first',
 'twenty-fourth',
 'twenty-second',
 'twenty-seventh',
 'twenty-sixth',
 'twenty-third']

>> #### 5.2 Solution: Remove ordinal tokens from tokens to be cleaned

In [87]:
hyphen_tokens = sorted(list(set(hyphen_tokens).difference(set(valid_token_word_nums))))
hyphen_tokens

['1,500-a-year',
 '1-day',
 '1-in-20',
 '1-yard',
 '1-year',
 '1-year-old',
 '1/2-inch',
 '1/2-percent',
 '10,000-foot',
 '10,000-page',
 '10,000-ton',
 '10-day',
 '10-inch',
 '10-man',
 '10-minute',
 '10-percent',
 '10-round',
 '10-to-1',
 '10-year',
 '10-year-old',
 '100-cent',
 '11-year-old',
 '12-hour',
 '12-inch',
 '12-month',
 '12-year-old',
 '122-foot',
 '122-percent',
 '13-year-old',
 '13Ѕ-inch',
 '14-percent',
 '14-point',
 '14-to-1',
 '14-year',
 '14-year-old',
 '15-percent',
 '15-year-olds',
 '152-millimeter',
 '16-inch',
 '16-inch-type',
 '16-year-old',
 '167-page',
 '17-year',
 '17-year-low',
 '17-year-old',
 '18-hour',
 '18-nation',
 '18-percent',
 '18-wheelers',
 '18-year-old',
 '18-year-olds',
 '180-odd',
 '19-foot',
 '19-year-old',
 '19-year-olds',
 '190-year',
 '1970s-era',
 '19th-century',
 '2,000-foot',
 '2,000-mile',
 '2-in-3',
 '2-is',
 '2-minute',
 '2-percent',
 '2-year',
 '2-year-old',
 '2-year-olds',
 '20-knot',
 '20-percent',
 '20-point',
 '20-year',
 '20-year

>> #### 6.1 Problem: Some tokens are two words separated by a hyphen with the second word being a stopword

In [88]:
STOP_WORDS = spacy.lang.en.stop_words.STOP_WORDS

# Pattern that matches all lowercase tokens
pattern = '^[^-0-9A-Z]+-[^-A-Z]+$'
hyphen_pattern_matches = [re.search(pattern, token)[0] for token in hyphen_tokens if re.search(pattern, token)]

# Pattern that matches the last token
pattern = '([^-]*?)$'
hyphen_pattern_matches_last_token = [re.search(pattern, token)[0] for token in hyphen_pattern_matches if re.search(pattern, token)]

# Pair hyphens
hyphen_pairs = list(zip(hyphen_pattern_matches, hyphen_pattern_matches_last_token))

# Get all hyphen tokens where last token is a stopword
hyphens_before = [a for (a,b) in hyphen_pairs if b in STOP_WORDS]
hyphens_before

['abroad-the',
 'abroad-to',
 'accept-and',
 'action-to',
 'administration-and',
 'administration-as',
 'administrations-to',
 'advance-and',
 'advantage-and',
 'advantages-in',
 'afford-along',
 'again,-a',
 'agencies-as',
 'agencies-is',
 'aggression-and',
 'aggression-that',
 'aggression-to',
 'ago-and',
 'ago-many',
 'ago-was',
 'all-and',
 'all-in',
 'all-out',
 'allies-just',
 'alone-and',
 'amendments-it',
 'antagonisms-that',
 'appropriate-and',
 'are-five',
 'arena-an',
 'army-not',
 'aspirations-have',
 'audience-it',
 'available-and',
 'background-not',
 'base-and',
 'battle-front',
 'be-from',
 'be-the',
 'be-then',
 'be-we',
 'beefed-up',
 'beings-might',
 'believe-that',
 'belt-will',
 'better-off',
 'bombed-out',
 'borders-how',
 'bore-down',
 'born-again',
 'breadth-from',
 'break-in',
 'break-through',
 'budget-for',
 'build-up',
 'builders-if',
 'building-up',
 'built-in',
 'burnt-out',
 'by-the',
 'call-in',
 'campaign-and',
 "can't–-in",
 'can-and',
 'can-then',
 'c

>> #### 6.2 Solution: Replace hyphens with spaces

In [89]:
hyphens_after = [token.replace('-', ' ') for token in hyphens_before]
hyphens_after

['abroad the',
 'abroad to',
 'accept and',
 'action to',
 'administration and',
 'administration as',
 'administrations to',
 'advance and',
 'advantage and',
 'advantages in',
 'afford along',
 'again, a',
 'agencies as',
 'agencies is',
 'aggression and',
 'aggression that',
 'aggression to',
 'ago and',
 'ago many',
 'ago was',
 'all and',
 'all in',
 'all out',
 'allies just',
 'alone and',
 'amendments it',
 'antagonisms that',
 'appropriate and',
 'are five',
 'arena an',
 'army not',
 'aspirations have',
 'audience it',
 'available and',
 'background not',
 'base and',
 'battle front',
 'be from',
 'be the',
 'be then',
 'be we',
 'beefed up',
 'beings might',
 'believe that',
 'belt will',
 'better off',
 'bombed out',
 'borders how',
 'bore down',
 'born again',
 'breadth from',
 'break in',
 'break through',
 'budget for',
 'build up',
 'builders if',
 'building up',
 'built in',
 'burnt out',
 'by the',
 'call in',
 'campaign and',
 "can't– in",
 'can and',
 'can then',
 'c

>> #### 6.3 Prepare token pairs and update dictionary

In [90]:
# Zip pairs into before and after tuples
hyphens = list(zip(hyphens_before, hyphens_after))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, hyphens)
hyphen_tokens = [token for token in hyphen_tokens if token not in clean_dict]

In [91]:
print(len(hyphen_tokens))
hyphen_tokens

3316


['1,500-a-year',
 '1-day',
 '1-in-20',
 '1-yard',
 '1-year',
 '1-year-old',
 '1/2-inch',
 '1/2-percent',
 '10,000-foot',
 '10,000-page',
 '10,000-ton',
 '10-day',
 '10-inch',
 '10-man',
 '10-minute',
 '10-percent',
 '10-round',
 '10-to-1',
 '10-year',
 '10-year-old',
 '100-cent',
 '11-year-old',
 '12-hour',
 '12-inch',
 '12-month',
 '12-year-old',
 '122-foot',
 '122-percent',
 '13-year-old',
 '13Ѕ-inch',
 '14-percent',
 '14-point',
 '14-to-1',
 '14-year',
 '14-year-old',
 '15-percent',
 '15-year-olds',
 '152-millimeter',
 '16-inch',
 '16-inch-type',
 '16-year-old',
 '167-page',
 '17-year',
 '17-year-low',
 '17-year-old',
 '18-hour',
 '18-nation',
 '18-percent',
 '18-wheelers',
 '18-year-old',
 '18-year-olds',
 '180-odd',
 '19-foot',
 '19-year-old',
 '19-year-olds',
 '190-year',
 '1970s-era',
 '19th-century',
 '2,000-foot',
 '2,000-mile',
 '2-in-3',
 '2-is',
 '2-minute',
 '2-percent',
 '2-year',
 '2-year-old',
 '2-year-olds',
 '20-knot',
 '20-percent',
 '20-point',
 '20-year',
 '20-year

>> #### 7.1 Problem: Compound adjectives have hyphens and do not need to be cleaned
>> - noun + adjective
- noun + participle
- adjective + participle

In [92]:
pattern = '^[a-z][a-z]+-[^-A-Z0-9]+$'
hyphens_before = [re.search(pattern, token)[0] for token in hyphen_tokens if re.search(pattern, token)]
hyphens_before

['able-bodied',
 'above-cited',
 'above-estimated',
 'above-mentioned',
 'above-named',
 'above-recited',
 'academy-type',
 'add-ons',
 'adjusted-service',
 'administration-helped',
 'administration-laws',
 'administration-tried',
 'advanced-placement',
 "afflicted-they've",
 'after-dinner',
 'after-effect',
 'after-effects',
 'after-school',
 'after-tax',
 'age-long',
 'age-old',
 'ago-ideas',
 'ago-passed',
 'agreements-agreements',
 'agricultural-college',
 'ahead-immediately',
 'air-based',
 'air-landing',
 'air-lane',
 'air-lift',
 'air-mail',
 'air-raid',
 'all-absorbing',
 'all-black',
 'all-embracing',
 'all-engrossing',
 'all-important',
 'all-inclusive',
 'all-male',
 'all-merciful',
 'all-military',
 'all-pervading',
 'all-powerful',
 'all-round',
 'all-solar',
 'all-time',
 'all-volunteer',
 'all-weather',
 'all-wind',
 'all-wise',
 'alliance-building',
 'and-towns',
 'andlaw-abiding',
 'anever-increasing',
 'animal-killers',
 'anniversary-let',
 'ante-revolutionary',
 'ant

>> #### 7.2 Solution: Remove hyphens from words that do not require hyphens

In [93]:
hyphens_after = list(nlp.pipe([token.replace('-', ' ') for token in hyphens_before]))
hyphens_after = [token.text.replace(' ', '-') for token in hyphens_after if is_compound(token, 'NOUN', 'ADJ', 'pp') or is_compound(token, 'NOUN', 'VBG', 'pt') or is_compound(token, 'NOUN', 'VBN', 'pt') or is_compound(token, 'ADJ', 'VGB', 'pt') or is_compound(token, 'ADJ', 'VBN', 'pt')]
hyphens_after

['age-old',
 'air-based',
 'blockade-breaking',
 'blood-thirsty',
 'bond-paying',
 'bone-tired',
 'bread-winning',
 'bullion-producing',
 'cancer-free',
 'centuries-old',
 'combat-equipped',
 'community-based',
 'community-involving',
 'computer-controlled',
 'conflict-torn',
 'consumer-conscious',
 'cost-conscious',
 'cotton-growing',
 'cotton-producing',
 'country-promoting',
 'courts-martial',
 'crime-infested',
 'crop-moving',
 'cross-examining',
 'deal-making',
 'death-defying',
 'debt-free',
 'debt-paying',
 'double-turreted',
 'drug-free',
 'drug-related',
 'drug-resistant',
 'duty-free',
 'economy-wide',
 'emissions-free',
 'employer-based',
 'energy-consuming',
 'energy-efficient',
 'energy-producing',
 'examination-excepted',
 'fact-seeking',
 'faith-based',
 'family-friendly',
 'fees-consular',
 'flag-draped',
 'food-producing',
 'friendship-exercising',
 'fruit-growing',
 'fuel-efficient',
 'habit-forming',
 'half-burned',
 'heart-rending',
 'heart-wrenching',
 'higher-pric

>> #### 7.3 Prepare token pairs

In [94]:
hyphen_tokens = sorted(list(set(hyphen_tokens).difference(set(hyphens_after))))
hyphen_tokens

['1,500-a-year',
 '1-day',
 '1-in-20',
 '1-yard',
 '1-year',
 '1-year-old',
 '1/2-inch',
 '1/2-percent',
 '10,000-foot',
 '10,000-page',
 '10,000-ton',
 '10-day',
 '10-inch',
 '10-man',
 '10-minute',
 '10-percent',
 '10-round',
 '10-to-1',
 '10-year',
 '10-year-old',
 '100-cent',
 '11-year-old',
 '12-hour',
 '12-inch',
 '12-month',
 '12-year-old',
 '122-foot',
 '122-percent',
 '13-year-old',
 '13Ѕ-inch',
 '14-percent',
 '14-point',
 '14-to-1',
 '14-year',
 '14-year-old',
 '15-percent',
 '15-year-olds',
 '152-millimeter',
 '16-inch',
 '16-inch-type',
 '16-year-old',
 '167-page',
 '17-year',
 '17-year-low',
 '17-year-old',
 '18-hour',
 '18-nation',
 '18-percent',
 '18-wheelers',
 '18-year-old',
 '18-year-olds',
 '180-odd',
 '19-foot',
 '19-year-old',
 '19-year-olds',
 '190-year',
 '1970s-era',
 '19th-century',
 '2,000-foot',
 '2,000-mile',
 '2-in-3',
 '2-is',
 '2-minute',
 '2-percent',
 '2-year',
 '2-year-old',
 '2-year-olds',
 '20-knot',
 '20-percent',
 '20-point',
 '20-year',
 '20-year

>> #### 8.1 Problem: Remove hyphens from single hyphen words that do not require hyphens

In [95]:
pattern = '^[a-z]+-[^-A-Z0-9]+$'
hyphens_before = [re.search(pattern, token)[0] for token in hyphen_tokens if re.search(pattern, token)]
hyphens_before.remove('minimum-wage')
hyphens_before

['able-bodied',
 'above-cited',
 'above-estimated',
 'above-mentioned',
 'above-named',
 'above-recited',
 'academy-type',
 'add-ons',
 'adjusted-service',
 'administration-helped',
 'administration-laws',
 'administration-tried',
 'advanced-placement',
 "afflicted-they've",
 'after-dinner',
 'after-effect',
 'after-effects',
 'after-school',
 'after-tax',
 'age-long',
 'ago-ideas',
 'ago-passed',
 'agreements-agreements',
 'agricultural-college',
 'ahead-immediately',
 'air-landing',
 'air-lane',
 'air-lift',
 'air-mail',
 'air-raid',
 'all-absorbing',
 'all-black',
 'all-embracing',
 'all-engrossing',
 'all-important',
 'all-inclusive',
 'all-male',
 'all-merciful',
 'all-military',
 'all-pervading',
 'all-powerful',
 'all-round',
 'all-solar',
 'all-time',
 'all-volunteer',
 'all-weather',
 'all-wind',
 'all-wise',
 'alliance-building',
 'and-towns',
 'andlaw-abiding',
 'anever-increasing',
 'animal-killers',
 'anniversary-let',
 'ante-revolutionary',
 'anti-aircraft',
 'anti-bullyi

>> #### 8.2 Solution: Remove hyphens from single hyphen words that do not require hyphens

In [96]:
hyphens_after = [token.replace('-', '') for token in hyphens_before]
hyphens_after = [token for token in tqdm(hyphens_after) if is_word(token) and token == is_word(token)]
hyphens_after

100%|██████████| 2030/2030 [16:58<00:00,  1.99it/s]


['abovementioned',
 'abovenamed',
 'afterdinner',
 'aftereffect',
 'agelong',
 'airlift',
 'airmail',
 'anterevolutionary',
 'antiaircraft',
 'antibusiness',
 'antipollution',
 'antiretroviral',
 'antislavery',
 'antitrust',
 'banknote',
 'battlefield',
 'battleship',
 'beachhead',
 'bipartisan',
 'bighearted',
 'bombproof',
 'breakwater',
 'businesslike',
 'byword',
 'childcare',
 'coequal',
 'coop',
 'cooperate',
 'cooperation',
 'cooperative',
 'coordination',
 'corelation',
 'coastline',
 'coolheaded',
 'counterguerrilla',
 'counterinvasion',
 'counteroffensive',
 'counterrevolution',
 'counterterrorism',
 'countryman',
 'courthouse',
 'customhouse',
 'cutthroat',
 'demoralize',
 'desegregate',
 'doorkeeper',
 'dryland',
 'evenhanded',
 'everyday',
 'extraterritorial',
 'fainthearted',
 'farseeing',
 'farsighted',
 'fatherland',
 'faultfinding',
 'fieldwork',
 'figurehead',
 'firearm',
 'fireproof',
 'firsthand',
 'flextime',
 'flintlock',
 'foursquare',
 'freeborn',
 'freehanded',

>> #### 8.3 Prepare token pairs and update dictionary

In [97]:
# Zip pairs into before and after tuples
hyphens = list(zip(hyphens_before, hyphens_after))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, hyphens)
hyphen_tokens = [token for token in hyphen_tokens if token not in clean_dict]

>> #### 9.1 Problem: Some hyphen tokens are compound verbs and can be removed

In [98]:
pattern = '^[a-z][a-z]+-[^-A-Z0-9]+$'
hyphens_before = [re.search(pattern, token)[0] for token in hyphen_tokens if re.search(pattern, token)]
hyphens_before

['cross-purposes',
 'cross-section',
 'cure-alls',
 'custom-house',
 'custom-houses',
 'customs-revenue',
 'cut-throat',
 'cutting-edge',
 'cyber-attacks',
 'cyber-threats',
 'danger-filled',
 'dark-stained',
 'day-inspector',
 'de-emphasize',
 'de-escalate',
 'de-escalation',
 'de-moralize',
 'de-nuke',
 'de-segregate',
 'dead-end',
 'dead-level',
 'death-dealing',
 'death-rate',
 'death-wish',
 'debt-bearing',
 'decades-long',
 'decent-living',
 'decision-making',
 'deep-graven',
 'deep-rooted',
 'deep-seated',
 'defense-oriented',
 'defense-related',
 'defied-pledges',
 'democratic-republican',
 'department-store',
 'dependent-pension',
 'desert-land',
 'destruction-constitutes',
 'destruction-meaning',
 'dictatorship-clearly',
 'die-hards',
 'dilly-dallying',
 'diplomacy-protect',
 'diplomatic-service',
 'dis-armed',
 'dis-unionists',
 'disability-pension',
 'disarmament-true',
 'disease-bearing',
 'disease-ridden',
 'dive-bomber',
 'do-gooders',
 'doctor-patient',
 'domestic-slave

>> #### 9.2 Solution: Remove compound from hyphen_tokens

In [99]:
hyphens_compound_verbs = list(nlp.pipe([token.replace('-', ' ') for token in hyphens_before]))
hyphens_compound_verbs = [token.text.replace(' ', '-') for token in hyphens_compound_verbs if is_compound(token, 'ADJ', 'NOUN', 'pp') or is_compound(token, 'NOUN', 'VERB', 'pp') or is_compound(token, 'ADV', 'VERB', 'pp')]
hyphens_compound_verbs

['danger-filled',
 'dead-level',
 'decent-living',
 'deep-rooted',
 'deep-seated',
 'dependent-pension',
 'destruction-constitutes',
 'dilly-dallying',
 'diplomatic-service',
 'dive-bomber',
 'domestic-slavery',
 'double-digit',
 'double-edged',
 'double-talk',
 'double-tracking',
 'double-turret',
 'dry-dock',
 'dry-land',
 'early-childhood',
 'early-stage',
 'eighth-grade',
 'eleventh-hour',
 'employer-provided',
 'equal-rights',
 'even-handed',
 'ever-advancing',
 'ever-broadening',
 'ever-enduring',
 'ever-expanding',
 'ever-growing',
 'ever-increasing',
 'ever-living',
 'ever-mounting',
 'ever-recurring',
 'ever-rising',
 'ever-shifting',
 'ever-varying',
 'ever-widening',
 'evil-doers',
 'exact-percent',
 'excess-profits',
 'extended-family',
 'fair-dealing',
 'fair-share',
 'family-owned',
 'far-famed',
 'far-left',
 'far-reaching',
 'far-removed',
 'far-seeing',
 'far-sighted',
 'fast-changing',
 'fast-food',
 'fast-growing',
 'fast-track',
 'federally-aided',
 'federally-funde

>> #### 9.3 Prepare token pairs

In [100]:
hyphen_tokens = sorted(list(set(hyphen_tokens).difference(set(hyphens_compound_verbs))))
print(len(hyphen_tokens))
hyphen_tokens

2421


['1,500-a-year',
 '1-day',
 '1-in-20',
 '1-yard',
 '1-year',
 '1-year-old',
 '1/2-inch',
 '1/2-percent',
 '10,000-foot',
 '10,000-page',
 '10,000-ton',
 '10-day',
 '10-inch',
 '10-man',
 '10-minute',
 '10-percent',
 '10-round',
 '10-to-1',
 '10-year',
 '10-year-old',
 '100-cent',
 '11-year-old',
 '12-hour',
 '12-inch',
 '12-month',
 '12-year-old',
 '122-foot',
 '122-percent',
 '13-year-old',
 '13Ѕ-inch',
 '14-percent',
 '14-point',
 '14-to-1',
 '14-year',
 '14-year-old',
 '15-percent',
 '15-year-olds',
 '152-millimeter',
 '16-inch',
 '16-inch-type',
 '16-year-old',
 '167-page',
 '17-year',
 '17-year-low',
 '17-year-old',
 '18-hour',
 '18-nation',
 '18-percent',
 '18-wheelers',
 '18-year-old',
 '18-year-olds',
 '180-odd',
 '19-foot',
 '19-year-old',
 '19-year-olds',
 '190-year',
 '1970s-era',
 '19th-century',
 '2,000-foot',
 '2,000-mile',
 '2-in-3',
 '2-is',
 '2-minute',
 '2-percent',
 '2-year',
 '2-year-old',
 '2-year-olds',
 '20-knot',
 '20-percent',
 '20-point',
 '20-year',
 '20-year

>> #### 10.1 Problem: Some hyphenated words are formed from non-words

In [101]:
pattern = '^[a-z]+-[^-A-Z0-9]+$'
hyphens_before = [re.search(pattern, token)[0] for token in hyphen_tokens if re.search(pattern, token)]
hyphens_before

['cross-purposes',
 'cross-section',
 'cure-alls',
 'custom-house',
 'custom-houses',
 'customs-revenue',
 'cut-throat',
 'cutting-edge',
 'cyber-attacks',
 'cyber-threats',
 'dark-stained',
 'day-inspector',
 'de-emphasize',
 'de-escalate',
 'de-escalation',
 'de-moralize',
 'de-nuke',
 'de-segregate',
 'dead-end',
 'death-dealing',
 'death-rate',
 'death-wish',
 'debt-bearing',
 'decades-long',
 'decision-making',
 'deep-graven',
 'defense-oriented',
 'defense-related',
 'defied-pledges',
 'democratic-republican',
 'department-store',
 'desert-land',
 'destruction-meaning',
 'dictatorship-clearly',
 'die-hards',
 'diplomacy-protect',
 'dis-armed',
 'dis-unionists',
 'disability-pension',
 'disarmament-true',
 'disease-bearing',
 'disease-ridden',
 'do-gooders',
 'doctor-patient',
 'door-keeper',
 'draw-backs',
 'e-mails',
 'earned-income',
 'eight-hour',
 'eight-story',
 'eighty-ninth',
 'election-year',
 'employer-employee',
 'energy-poor',
 'energy-related',
 'energy-rich',
 'energ

>> #### 10.2 Solution: Find all tokens that only contain valid words and remove them from consideration to be cleaned

In [102]:
hyphens_after = [token for token in tqdm(hyphens_before) if is_valid_hyphen_word(token)]
hyphens_after

100%|██████████| 1290/1290 [17:54<00:00,  1.20it/s]


['cross-purposes',
 'cross-section',
 'cure-alls',
 'custom-house',
 'custom-houses',
 'customs-revenue',
 'cut-throat',
 'cutting-edge',
 'cyber-attacks',
 'cyber-threats',
 'dark-stained',
 'day-inspector',
 'de-emphasize',
 'de-escalate',
 'de-escalation',
 'de-moralize',
 'de-nuke',
 'de-segregate',
 'dead-end',
 'death-dealing',
 'death-rate',
 'death-wish',
 'debt-bearing',
 'decades-long',
 'decision-making',
 'deep-graven',
 'defense-oriented',
 'defense-related',
 'defied-pledges',
 'democratic-republican',
 'department-store',
 'desert-land',
 'destruction-meaning',
 'dictatorship-clearly',
 'die-hards',
 'diplomacy-protect',
 'dis-armed',
 'dis-unionists',
 'disability-pension',
 'disarmament-true',
 'disease-bearing',
 'disease-ridden',
 'doctor-patient',
 'door-keeper',
 'draw-backs',
 'e-mails',
 'earned-income',
 'eight-hour',
 'eight-story',
 'eighty-ninth',
 'election-year',
 'employer-employee',
 'energy-poor',
 'energy-related',
 'energy-rich',
 'energy-saving',
 'en

>> #### 10.3 Prepare token pairs

In [103]:
hyphen_tokens = sorted(list(set(hyphen_tokens).difference(set(hyphens_after))))
print(len(hyphen_tokens))
hyphen_tokens

1151


['1,500-a-year',
 '1-day',
 '1-in-20',
 '1-yard',
 '1-year',
 '1-year-old',
 '1/2-inch',
 '1/2-percent',
 '10,000-foot',
 '10,000-page',
 '10,000-ton',
 '10-day',
 '10-inch',
 '10-man',
 '10-minute',
 '10-percent',
 '10-round',
 '10-to-1',
 '10-year',
 '10-year-old',
 '100-cent',
 '11-year-old',
 '12-hour',
 '12-inch',
 '12-month',
 '12-year-old',
 '122-foot',
 '122-percent',
 '13-year-old',
 '13Ѕ-inch',
 '14-percent',
 '14-point',
 '14-to-1',
 '14-year',
 '14-year-old',
 '15-percent',
 '15-year-olds',
 '152-millimeter',
 '16-inch',
 '16-inch-type',
 '16-year-old',
 '167-page',
 '17-year',
 '17-year-low',
 '17-year-old',
 '18-hour',
 '18-nation',
 '18-percent',
 '18-wheelers',
 '18-year-old',
 '18-year-olds',
 '180-odd',
 '19-foot',
 '19-year-old',
 '19-year-olds',
 '190-year',
 '1970s-era',
 '19th-century',
 '2,000-foot',
 '2,000-mile',
 '2-in-3',
 '2-is',
 '2-minute',
 '2-percent',
 '2-year',
 '2-year-old',
 '2-year-olds',
 '20-knot',
 '20-percent',
 '20-point',
 '20-year',
 '20-year

>> #### 11.1 Problem: Some hyphen words are not valid words or contain extra words without space

In [104]:
pattern = '^[a-z]+-[^-A-Z0-9]+$'
hyphens_before = [re.search(pattern, token)[0] for token in hyphen_tokens if re.search(pattern, token)]
hyphens_before

['do-gooders',
 'faint-heartedly',
 'farnorth-west',
 'fellow-citizensa',
 'fellow-citizensin',
 'fifteen-foldwithin',
 'gur-reat',
 'one-hoss',
 'post-poned',
 'progress-alianza',
 'pur-rinciple',
 'self-goverment',
 'self-governmentthroughout',
 'self-sacrificingpatriotism',
 'thirty-eightpopulous',
 'twenty-fiveyears',
 'war-makingpower',
 'well-disciplinedand',
 'well-informedpersons',
 'whole-heartedly']

>> #### 11.2 Solution: Remove some words and update clean dictionary

In [105]:
# Update hyphen tokens
clean_dict['farnorth-west'] = 'far north-west'
clean_dict['fellow-citizensa'] = 'fellow-citizens a'
clean_dict['fellow-citizensin'] = 'fellow-citizens in'
clean_dict['fifteen-foldwithin'] = 'fifteen-fold within'
clean_dict['self-governmentthroughout'] = 'self-government throughout'
clean_dict['self-sacrificingpatriotism'] = 'self-sacrificing patriotism'
clean_dict['thirty-eightpopulous'] = 'thirty-eight populous'
clean_dict['twenty-fiveyears'] = 'twenty-five years'
clean_dict['war-makingpower'] = 'war-making power'
clean_dict['well-disciplinedand'] = 'well-disciplined and'
clean_dict['well-informedpersons'] = 'well-informed persons'

>> #### 11.3 Prepare token pairs

In [106]:
hyphen_tokens = sorted(list(set(hyphen_tokens).difference(set(hyphens_before))))
print(len(hyphen_tokens))

1131


>> #### 12.1 Problem: Some tokens have hyphens between capitalized words and stopwords 

In [107]:
pattern = '^[A-Z][a-z]+-[^-A-Z0-9]+$'
hyphens_before = [re.search(pattern, token)[0] for token in hyphen_tokens if re.search(pattern, token)]
hyphens_before

['African-owned',
 'All-merciful',
 'All-powerful',
 'All-time',
 'Allies-in',
 'America-in',
 'America-the',
 'American-born',
 'American-built',
 'American-made',
 'American-mediated',
 'American-owned',
 'Americans-to',
 'Americas-the',
 'An-tung',
 'As-salaamu',
 'Asia-and',
 'Asia-which',
 'Assembly-have',
 'Band-aid',
 'Berlin-and',
 'Big-hearted',
 'Black-owned',
 'Blue-collar',
 'Bosnia-wide',
 'Cabinet-level',
 'Census-we',
 'Chamber-the',
 'Chi-li',
 'Chinese-made',
 'Christ-like',
 'Civil-service',
 'Clear-eyed',
 'Co-operation',
 'Coal-land',
 'Columbia-it',
 'Communist-controlled',
 'Communist-directed',
 'Communist-dominated',
 'Communist-inspired',
 'Communist-supplied',
 'Communist-supported',
 'Congress-and',
 'Congress-by',
 'Congress-through',
 'Convention-my',
 'Day-inspector',
 'Democratic-controlled',
 'Double-digit',
 'Earth-based',
 'East-mark',
 'East-there',
 'Eisenhower-and',
 'Eisenhower-they',
 'English-speaking',
 'Establishment-this',
 'Europe-the',
 'Fai

>> #### 12.2 Solution: Remove hyphens between capitalized words and stopwords

In [108]:
hyphens_after = [token.replace('-', ' ') for token in hyphens_before if token.split('-')[-1] in STOP_WORDS]
hyphens_after

['Allies in',
 'America in',
 'America the',
 'American made',
 'Americans to',
 'Americas the',
 'Asia and',
 'Asia which',
 'Assembly have',
 'Berlin and',
 'Census we',
 'Chamber the',
 'Chinese made',
 'Columbia it',
 'Congress and',
 'Congress by',
 'Congress through',
 'Convention my',
 'East there',
 'Eisenhower and',
 'Eisenhower they',
 'Establishment this',
 'Europe the',
 'Follow up',
 'Germany the',
 'Government its',
 'Government the',
 'Government under',
 'Governor to',
 'Mustering out',
 'Nations and',
 'Nine eleven',
 'One third',
 'Pent up',
 'President and',
 'President as',
 'President some',
 'President that',
 'President with',
 'Sam whoever',
 'Soviet made',
 'State an',
 'States a',
 'States besides',
 'Union for',
 'Vietnam and',
 'War would']

>> #### 12.3 Prepare token pairs and update dictionary

In [109]:
# Zip pairs into before and after tuples
hyphens = list(zip(hyphens_before, hyphens_after))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, hyphens)
hyphen_tokens = [token for token in hyphen_tokens if token not in clean_dict]

In [110]:
len(hyphen_tokens)

1084

>> #### 13.1 Problem: Some tokens both words capitalized are valid words while others are not

In [111]:
pattern = '^[A-Z][a-z]+-[A-Z][a-z]+$'
hyphens_before = [re.search(pattern, token)[0] for token in hyphen_tokens if re.search(pattern, token)]
hyphens_before

['Adjutant-General',
 'Administration-Mr',
 'African-American',
 'African-Americans',
 'Aigner-Clark',
 'Al-Azhar',
 'Alsace-Lorraine',
 'Amazon-Washington',
 'Ambassador-Mr',
 'American-British',
 'American-Hellenic',
 'American-Soviet',
 'Anglo-Saxon',
 'Anglo-Venezuelan',
 'Anti-Trust',
 'Antung-Mukden',
 'Apollo-Gemini',
 'Arab-Israeli',
 'Armenian-Americans',
 'Asia-Pacific',
 'Asian-American',
 'Asian-Americans',
 'Atlantic-Mississippi',
 'Attorney-General',
 'Attorneys-General',
 'Austro-Hungarian',
 'Baltic-Americans',
 'Band-Aid',
 'Boutros-Ghali',
 'Brest-Litovsk',
 'Briand-Kellogg',
 'Brigadier-General',
 'British-American',
 'Bunau-Varilla',
 'Cambodian-Vietnam',
 'Capper-Ketcham',
 'Capper-Volstead',
 'Captain-General',
 'Captain-Generalto',
 'Carter-Mondale',
 'Chinchow-Aigun',
 'Chinese-American',
 'Chinese-Japanese',
 'Civil-Service',
 'Clayton-Bulwer',
 'Coleman-Singleton',
 'Commissary-General',
 'Commissioner-General',
 'Conable-Hanee',
 'Congo-Adoula',
 'Congo-Kinsh

>> #### 13.2 Solution: Remove invalid tokens with both words capitalized

In [112]:
tokens_to_clean_before = [token for token in hyphens_before if 'Mr' in token or 'Bob' in token]
tokens_to_clean_after = [token.replace('-', ' ') for token in tokens_to_clean_before]

>> #### 13.3 Prepare token pairs and update dictionary

In [113]:
# Update dictionary
for tup in list(zip(tokens_to_clean_before, tokens_to_clean_after)):
    clean_dict[tup[0]] = tup[1]
    
# Filter out cleaned tokens
hyphen_tokens = sorted(list(set(hyphen_tokens).difference(set(hyphens_before))))
print(len(hyphen_tokens))

853


>> #### 14.1 Problem: Some tokens with first word capitalized have second word as stopwords

In [114]:
pattern = '^[A-Z][a-z]+-[a-z]+$'
hyphens_before = [re.search(pattern, token)[0] for token in hyphen_tokens if re.search(pattern, token)]
hyphens_before

['Democratic-controlled',
 'Double-digit',
 'Earth-based',
 'East-mark',
 'East-there',
 'Eisenhower-and',
 'Eisenhower-they',
 'English-speaking',
 'Establishment-this',
 'Europe-the',
 'Fair-minded',
 'Faith-based',
 'Far-reaching',
 'Federal-assisted',
 'Federal-highway',
 'Federally-funded',
 'Fellow-citizens',
 'Fifty-eighth',
 'Fifty-fifth',
 'Five-twenties',
 'Flat-head',
 'Follow-up',
 'Forty-fifth',
 'Forty-ninth',
 'Four-fifths',
 'Franklin-leaning',
 'Free-delivery',
 'Free-market',
 'Gall-lard',
 'Germany-the',
 'Germany-voting',
 'God-awful',
 'God-blessed',
 'God-fearing',
 'God-given',
 'God-imposed',
 'Government-improved',
 'Government-its',
 'Government-led',
 'Government-owned',
 'Government-regulated',
 'Government-the',
 'Government-under',
 'Government-wide',
 'Governor-to',
 'Half-hearted',
 'Hard-headedness',
 'Harvard-trained',
 'Heaven-favored',
 'Heavy-armed',
 'High-ranking',
 'High-wage',
 'Hitler-dominated',
 'Hope-renewed',
 'House-business',
 'Hsiao-ping

In [115]:
hyphens_before = [token for token in hyphens_before if token.split('-')[-1] in STOP_WORDS]
hyphens_before

['East-there',
 'Eisenhower-and',
 'Eisenhower-they',
 'Establishment-this',
 'Europe-the',
 'Follow-up',
 'Germany-the',
 'Government-its',
 'Government-the',
 'Government-under',
 'Governor-to',
 'Mustering-out',
 'Nations-and',
 'Nine-eleven',
 'One-third',
 'Pent-up',
 'President-and',
 'President-as',
 'President-some',
 'President-that',
 'President-with',
 'Sam-whoever',
 'Soviet-made',
 'State-an',
 'States-a',
 'States-besides',
 'Union-for',
 'Vietnam-and',
 'War-would']

>> #### 14.2 Solution: Remove tokens with first word capitalized have second word as stopwords

In [116]:
hyphens_after = [token.replace('-', ' ') for token in hyphens_before]
hyphens_after

['East there',
 'Eisenhower and',
 'Eisenhower they',
 'Establishment this',
 'Europe the',
 'Follow up',
 'Germany the',
 'Government its',
 'Government the',
 'Government under',
 'Governor to',
 'Mustering out',
 'Nations and',
 'Nine eleven',
 'One third',
 'Pent up',
 'President and',
 'President as',
 'President some',
 'President that',
 'President with',
 'Sam whoever',
 'Soviet made',
 'State an',
 'States a',
 'States besides',
 'Union for',
 'Vietnam and',
 'War would']

>> #### 14.3 Prepare token pairs and update dictionary

In [117]:
# Zip pairs into before and after tuples
hyphens = list(zip(hyphens_before, hyphens_after))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, hyphens)
hyphen_tokens = [token for token in hyphen_tokens if token not in clean_dict]

>> #### 15.1 Problem: Some hyphen tokens with first word capitalized are not valid sayings

In [118]:
pattern = '^[A-Z][a-z]+-[a-z]+$'
hyphens_before = [re.search(pattern, token)[0] for token in hyphen_tokens if re.search(pattern, token)]
hyphens_before

['Democratic-controlled',
 'Double-digit',
 'Earth-based',
 'East-mark',
 'English-speaking',
 'Fair-minded',
 'Faith-based',
 'Far-reaching',
 'Federal-assisted',
 'Federal-highway',
 'Federally-funded',
 'Fellow-citizens',
 'Fifty-eighth',
 'Fifty-fifth',
 'Five-twenties',
 'Flat-head',
 'Forty-fifth',
 'Forty-ninth',
 'Four-fifths',
 'Franklin-leaning',
 'Free-delivery',
 'Free-market',
 'Gall-lard',
 'Germany-voting',
 'God-awful',
 'God-blessed',
 'God-fearing',
 'God-given',
 'God-imposed',
 'Government-improved',
 'Government-led',
 'Government-owned',
 'Government-regulated',
 'Government-wide',
 'Half-hearted',
 'Hard-headedness',
 'Harvard-trained',
 'Heaven-favored',
 'Heavy-armed',
 'High-ranking',
 'High-wage',
 'Hitler-dominated',
 'Hope-renewed',
 'House-business',
 'Hsiao-ping',
 'Indian-school',
 'Internal-revenue',
 'Iraqi-led',
 'Jefferson-like',
 'Job-killing',
 'Judas-like',
 'Kiao-chow',
 'Klan-minded',
 'Lao-tzu',
 'Laos-regained',
 'Large-scale',
 'Latter-day',


>> #### 15.2 Solution: Remove hyphen tokens with first word capitalized that are not valid sayings

In [119]:
clean_dict['Germany-voting'] = 'Germany, voting'
clean_dict['Medicare-rarely'] = 'Medicare, rarely'
clean_dict['President-tried'] = 'President, tried'
clean_dict['September-immediately'] = 'September, immediately'
clean_dict['The-world'] = 'The world'
clean_dict['To-day'] = 'Today'
clean_dict['To-morrow'] = 'Tomorrow'
clean_dict['Vietnam-apply'] = 'Vietnam, apply'

>> #### 15.3 Prepare token pairs and update dictionary

In [120]:
# Update dictionary
for tup in list(zip(tokens_to_clean_before, tokens_to_clean_after)):
    clean_dict[tup[0]] = tup[1]
    
# Filter out cleaned tokens
hyphen_tokens = sorted(list(set(hyphen_tokens).difference(set(hyphens_before))))
print(len(hyphen_tokens))

662


>> #### 16.1 Problem: Some tokens with multiple hyphens do not need hyphens

In [121]:
pattern = '^([a-z]+-)*[a-z]+$'
hyphens_before = [re.search(pattern, token)[0] for token in hyphen_tokens if re.search(pattern, token)]
hyphens_before

['across-the-board',
 'aid-to-democracies',
 'air-to-air',
 'all-of-the-above',
 'all-or-nothing',
 'anti-drug-trafficking',
 'arm-in-arm',
 'back-and-forth',
 'back-to-back',
 'back-to-school',
 'balance-of-payments',
 'behind-the-scenes',
 'black-and-white',
 'brothers-in-arms',
 'cap-and-trade',
 'case-by-case',
 'catch-and-release',
 'command-and-control',
 'commander-in-chief',
 'comrades-in-arms',
 'cost-of-living',
 'cost-plus-fixed',
 'cross-examination-type',
 'cut-three-quarters',
 'day-by-day',
 'day-to-day',
 'devil-take-the-hind-most',
 'dime-by-dime',
 'do-it-yourself',
 'do-or-die',
 'door-to-door',
 'down-the-middle',
 'down-to-earth',
 'eight-year-old',
 'elbow-to-elbow',
 'end-of-the-year',
 'eye-and-a-half',
 'face-to-face',
 'farm-to-market',
 'five-and-a-half',
 'five-day-old',
 'five-to-four',
 'flat-to-deteriorating',
 'follow-the-leader',
 'four-and-a-half',
 'four-year-old',
 'fourteen-thousand-mile',
 'fox-guarding-the-henhouse',
 'free-for-all',
 'get-rich-qu

>> #### 16.2 Solution: Clean invalid tokens with multiple hyphens

In [122]:
tokens_to_clean_before = [
    'borne-successfully-a',
    'hurry-anything-even',
    'in-between-more',
    'leadership-yes-but',
    'self-determination-requires',
    'self-determination-that',
    'self-government-that',
    'self-government-with',
    'self-interest-of',
    'self-interest-then',
    'self-preservation-as',
    'small-busi-ness',
    'something-anything-to',
    'straight-down-the',
    'teachers-listen-with',
    'tonight-laughter-but',
]

tokens_to_clean_after = [
    'borne-successfully a',
    'hurry anything even',
    'in-between more',
    'leadership, yes, but',
    'self-determination requires',
    'self-determination that',
    'self-government that',
    'self-government with',
    'self-interest of',
    'self-interest then',
    'self-preservation as',
    'small-business',
    'something, anything, to',
    'straight down the',
    'teachers listen with',
    'tonight but',
]

>> #### 16.3 Prepare token pairs and update dictionary

In [123]:
# Update dictionary
for tup in list(zip(tokens_to_clean_before, tokens_to_clean_after)):
    clean_dict[tup[0]] = tup[1]
    
# Filter out cleaned tokens
hyphen_tokens = sorted(list(set(hyphen_tokens).difference(set(hyphens_before))))

# Filter out additional tokens
pattern = '^([a-z]+-)*[A-Z][a-z]+$'
additional_tokens = [re.search(pattern, token)[0] for token in hyphen_tokens if re.search(pattern, token)]
hyphen_tokens = sorted(list(set(hyphen_tokens).difference(set(additional_tokens))))

>> #### 17.1 Problem: The remaining tokens have few pattern similiarities

In [124]:
hyphen_tokens

['1,500-a-year',
 '1-day',
 '1-in-20',
 '1-yard',
 '1-year',
 '1-year-old',
 '1/2-inch',
 '1/2-percent',
 '10,000-foot',
 '10,000-page',
 '10,000-ton',
 '10-day',
 '10-inch',
 '10-man',
 '10-minute',
 '10-percent',
 '10-round',
 '10-to-1',
 '10-year',
 '10-year-old',
 '100-cent',
 '11-year-old',
 '12-hour',
 '12-inch',
 '12-month',
 '12-year-old',
 '122-foot',
 '122-percent',
 '13-year-old',
 '13Ѕ-inch',
 '14-percent',
 '14-point',
 '14-to-1',
 '14-year',
 '14-year-old',
 '15-percent',
 '15-year-olds',
 '152-millimeter',
 '16-inch',
 '16-inch-type',
 '16-year-old',
 '167-page',
 '17-year',
 '17-year-low',
 '17-year-old',
 '18-hour',
 '18-nation',
 '18-percent',
 '18-wheelers',
 '18-year-old',
 '18-year-olds',
 '180-odd',
 '19-foot',
 '19-year-old',
 '19-year-olds',
 '190-year',
 '1970s-era',
 '19th-century',
 '2,000-foot',
 '2,000-mile',
 '2-in-3',
 '2-is',
 '2-minute',
 '2-percent',
 '2-year',
 '2-year-old',
 '2-year-olds',
 '20-knot',
 '20-percent',
 '20-point',
 '20-year',
 '20-year

>> #### 17.2 Solution: Clean remaining tokens have few pattern similiarities

In [125]:
# Update clean dict
clean_dict['2-is'] = '2 is'
clean_dict['4-or'] = '4- or'
clean_dict['5?-percent'] = '5-percent'
clean_dict['81-which'] = '81, which'
clean_dict['F1a.-there'] = 'F1a., there'
clean_dict['FBI-in'] = 'FBI, in'
clean_dict['II-has'] = 'II, has'
clean_dict['Paymaster-General.3'] = 'Paymaster-General. 3'
clean_dict['Post-OfficeDepartment'] = 'Post-Office Department'
clean_dict['Postmaster-General'] = 'Postmaster-General'
clean_dict['act-$350,000,000'] = 'act, $350,000,000'
clean_dict['attack,-dangers'] = 'attack, dangers'
clean_dict['business-I'] = 'business, I'
clean_dict['coal-I'] = 'coal, I'
clean_dict['however-I'] = 'however, I'
clean_dict['involved-I'] = 'involved, I'
clean_dict['percent-I'] = 'percent, I'
clean_dict['politics-I'] = 'politics, I'
clean_dict['problems-I'] = 'problems, I'
clean_dict['that-I'] = 'that, I'
clean_dict['reorganization-I'] = 'reorganization, I'
clean_dict['world-I'] = 'world, I'
clean_dict['theVice-President'] = 'the Vice President'
clean_dict['(Applause.)'] = ''
clean_dict['(applause.)'] = ''
clean_dict['(laughter)'] = ''
clean_dict['(Laughter)'] = ''
clean_dict['Nixon-16 years ago-was'] = 'Nixon, 16 years ago, was'
clean_dict['to-morrow,-the'] = 'tomorrow, the'
clean_dict['to-day."Acting'] = 'today."Acting'
clean_dict['self-government---that'] = 'self-government, that'
clean_dict['slave-trade---that'] = 'slave-trade, that'
clean_dict['only."Attorney-General'] = 'only. "Attorney-General'
clean_dict['theVice-President'] = 'the Vice President'
clean_dict['struggle,-turn'] = 'struggle, turn'
clean_dict['run,-especially'] = 'run, especially'
clean_dict['words-"We'] = 'words, "We'
clean_dict["reorganization-I'd"] = "reorganization, I'd"
clean_dict['necessity-I'] = 'necessity, I'
clean_dict['money,-expenditures'] = 'money, expenditures'
clean_dict['name-J.'] = 'name, J.'
clean_dict['get-11.4'] = 'get, 11.4'
clean_dict['history–-157'] = 'history, 157'

# Patterns to match tokens
patterns = [
    '^[0-9]+-(seconds?|millimeter|knot|minutes?|hours?|days?)$',
    '^[0-9]+-(weeks?|months?|years?|page|percent|point|mile)$',
    '^[0-9]+-(power|pounder|acre|some|billion|nation|star|something)$',
    '^[0-9]+-(odd|inch|yard|bed|million|percent|foot|year-olds?)$',
    '^[0-9]/[0-9]-[a-z]+$',
    '^U.S.-[A-Za-z]+$',
    '^[A-Z]+-[0-9]+$'
]

# Remove all tokens that match the given patterns
for pattern in patterns:
    token_matches =[re.search(pattern, token)[0] for token in hyphen_tokens if re.search(pattern, token)]
    hyphen_tokens = sorted(list(set(hyphen_tokens).difference(set(token_matches))))

# Update hyphen_tokens
hyphen_tokens = [token for token in hyphen_tokens if token not in clean_dict]
hyphen_tokens

['1,500-a-year',
 '1-in-20',
 '10,000-foot',
 '10,000-page',
 '10,000-ton',
 '10-man',
 '10-round',
 '10-to-1',
 '100-cent',
 '13Ѕ-inch',
 '14-to-1',
 '16-inch-type',
 '17-year-low',
 '18-wheelers',
 '1970s-era',
 '19th-century',
 '2,000-foot',
 '2,000-mile',
 '2-in-3',
 '21st-century',
 '22,000-word',
 '28-square-mile',
 '3-to-1',
 '30-cent',
 '3D-print',
 '3rd-grade',
 '4.8-percent',
 '40,000-odd',
 '48-story',
 '5-ambulatory',
 '5-letter',
 '50,000-a-year',
 '50-cent',
 '50-cent-a-gallon',
 '500-per-child',
 '60,000-plus',
 '600,000-man',
 '600-ship',
 '62-year-olders',
 '650-billion-a-year',
 '7-year-long',
 '800-HELPNOW',
 '9,000-men',
 '9.2-percent',
 '900,000-$900,000',
 '91-to-1',
 'A-6F',
 'A-grade',
 'A-plus',
 'ACT-UP',
 'ADJUTANT-GENERAL',
 'AIDS-free',
 'AK-47s',
 'AMERICA-HONDURAS',
 'ANTI-TRUST',
 'ARBITRATIONS-PANAMA',
 'Ah-h-h',
 'Aix-la-Chapelle',
 'Alaskan-Yukon-Pacific',
 'B-1B',
 'B-l',
 'BUNAU-VARILLA',
 'Big-Brother-is-watching-you',
 'Boo-o-o',
 'Bush-McCain',
 

>> #### 17.3 Clean hyphen_tokens and then update transcript_tokens to remove all hyphen_tokens

In [126]:
# Filter hyphen_tokens out of transcript_tokens
transcript_tokens = sorted(list(set(transcript_tokens).difference(set(hyphen_tokens_list))))

>> #### 18.1 Problem: Some tokens have no space between two sentences

In [127]:
pattern = '[0-9]+\.[A-Z]?[a-z][a-z]+'
pattern_before = [re.search(pattern, token)[0] for token in transcript_tokens if re.search(pattern, token) and re.search(pattern, token)[0]==token]
pattern_before

['11.Each',
 '15.All',
 '1800.Again',
 '1812.The',
 '1818.New',
 '1820.The',
 '1824.During',
 '1827.Coinciding',
 '1827.The',
 '1842.The',
 '1846.His',
 '1849.But',
 '1850.Preceding',
 '1860.After',
 '1860.Besides',
 '1860.The',
 '1861.Our',
 '1861.The',
 '1863.Hon',
 '1864.Again',
 '1864.The',
 '1866.There',
 '1868.The',
 '1871.Other',
 '1876.This',
 '1878.The',
 '1881.The',
 '1883.Our',
 '1884.The',
 '1885.Inasmuch',
 '1885.The',
 '1886.Our',
 '1889.The',
 '1890.Under',
 '1891.The',
 '1892.The',
 '1893.During',
 '1893.From',
 '1893.The',
 '1893.There',
 '1893.When',
 '1894.Again',
 '1894.The',
 '1898.Having',
 '1898.The',
 '1899.Government',
 '1899.Our',
 '1899.The',
 '1899.This',
 '1908.The',
 '1909.Many',
 '1909.Philippine',
 '1909.The',
 '1910.The',
 '1911.Therefore',
 '1911.These',
 '22.For',
 '23.Our',
 '30.The',
 '5.The',
 '78.The',
 '83.From']

>> #### 18.2 Solution: Add space between two sentences

In [128]:
pattern_after = [token.replace('.', '. ') for token in pattern_before]
pattern_after

['11. Each',
 '15. All',
 '1800. Again',
 '1812. The',
 '1818. New',
 '1820. The',
 '1824. During',
 '1827. Coinciding',
 '1827. The',
 '1842. The',
 '1846. His',
 '1849. But',
 '1850. Preceding',
 '1860. After',
 '1860. Besides',
 '1860. The',
 '1861. Our',
 '1861. The',
 '1863. Hon',
 '1864. Again',
 '1864. The',
 '1866. There',
 '1868. The',
 '1871. Other',
 '1876. This',
 '1878. The',
 '1881. The',
 '1883. Our',
 '1884. The',
 '1885. Inasmuch',
 '1885. The',
 '1886. Our',
 '1889. The',
 '1890. Under',
 '1891. The',
 '1892. The',
 '1893. During',
 '1893. From',
 '1893. The',
 '1893. There',
 '1893. When',
 '1894. Again',
 '1894. The',
 '1898. Having',
 '1898. The',
 '1899. Government',
 '1899. Our',
 '1899. The',
 '1899. This',
 '1908. The',
 '1909. Many',
 '1909. Philippine',
 '1909. The',
 '1910. The',
 '1911. Therefore',
 '1911. These',
 '22. For',
 '23. Our',
 '30. The',
 '5. The',
 '78. The',
 '83. From']

>> #### 18.3 Prepare tokens and update dictionary

In [129]:
# Zip pairs into before and after tuples
patterns = list(zip(pattern_before, pattern_after))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, patterns)
transcript_tokens = [token for token in transcript_tokens if token not in clean_dict]

>> #### 19.1 Problem: Some tokens have no space between period and double quotes

In [130]:
pattern = "[A-Z]?[a-z]+\.\"[A-Za-z][a-z]+"
pattern_before = [re.search(pattern, token)[0] for token in list(set(transcript_tokens)) if re.search(pattern, token) and re.search(pattern, token)[0]==token]
pattern_before

['Empire."Faithful',
 'man."In',
 'admission."Did',
 'Government."These',
 'violence."In',
 'thereof."Such',
 'restriction."In',
 'Samoa."Arrangements',
 'expedient."But',
 'enterprise."The',
 'independent."That',
 'Corps."An',
 'means."That',
 'country."The',
 'jurisdiction."In',
 'Congress."In',
 'aphorism."This',
 'means."In',
 'use."Seven',
 'Court."The',
 'kind."During',
 'use."Under',
 'States."If',
 'present."In',
 'Indians."The',
 'view."This',
 'America."It',
 'origin."This',
 'cargo."This',
 'charge."Violent',
 'banks."But',
 'avail."Afterwards',
 'happiness."Now',
 'together."Yet',
 'prescribe."An',
 'consideration."Cheap',
 'incorporation."Before',
 'war."These',
 'fetters."The',
 'law."No',
 'case."In',
 'Honduras."Upon',
 'people."To',
 'thereof."In',
 'States."Then',
 'jail."The',
 'improvements."The',
 'color."Moreover',
 'bondage."The',
 'it."As',
 'embarrassed."The',
 'company."If',
 'statute."Generally',
 'reform."Action',
 'governments."For',
 'altogether."With',
 '

>> #### 19.2 Solution: Add space between some tokens and double quotes

In [131]:
pattern_after = [token.replace('.', '. ') for token in pattern_before]
pattern_after

['Empire. "Faithful',
 'man. "In',
 'admission. "Did',
 'Government. "These',
 'violence. "In',
 'thereof. "Such',
 'restriction. "In',
 'Samoa. "Arrangements',
 'expedient. "But',
 'enterprise. "The',
 'independent. "That',
 'Corps. "An',
 'means. "That',
 'country. "The',
 'jurisdiction. "In',
 'Congress. "In',
 'aphorism. "This',
 'means. "In',
 'use. "Seven',
 'Court. "The',
 'kind. "During',
 'use. "Under',
 'States. "If',
 'present. "In',
 'Indians. "The',
 'view. "This',
 'America. "It',
 'origin. "This',
 'cargo. "This',
 'charge. "Violent',
 'banks. "But',
 'avail. "Afterwards',
 'happiness. "Now',
 'together. "Yet',
 'prescribe. "An',
 'consideration. "Cheap',
 'incorporation. "Before',
 'war. "These',
 'fetters. "The',
 'law. "No',
 'case. "In',
 'Honduras. "Upon',
 'people. "To',
 'thereof. "In',
 'States. "Then',
 'jail. "The',
 'improvements. "The',
 'color. "Moreover',
 'bondage. "The',
 'it. "As',
 'embarrassed. "The',
 'company. "If',
 'statute. "Generally',
 'reform. 

>> #### 19.3 Prepare tokens and update dictionary

In [132]:
# Zip pairs into before and after tuples
patterns = list(zip(pattern_before, pattern_after))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, patterns)
transcript_tokens = [token for token in transcript_tokens if token not in clean_dict]

>> #### 20.1 Problem: No space between end of sentence and a number

In [133]:
pattern = "[A-Z]?[a-z]+\.[0-9]"
pattern_before = [re.search(pattern, token)[0] for token in list(set(transcript_tokens)) if re.search(pattern, token) and re.search(pattern, token)[0]==token]
pattern_before

['them.1',
 'prescribe.3',
 'Service.3',
 'each.2',
 'posts.5',
 'war.2',
 'each.3',
 'each.4',
 'disease.2',
 'fever.7',
 'requires.6',
 'purpose.4',
 'loans.1',
 'nations.1',
 'Bundy.6',
 'exist.2']

>> #### 20.2 Solution: Add space between end of sentence and number 

In [134]:
pattern_after = [token.replace('.', '. ') for token in pattern_before]
pattern_after

['them. 1',
 'prescribe. 3',
 'Service. 3',
 'each. 2',
 'posts. 5',
 'war. 2',
 'each. 3',
 'each. 4',
 'disease. 2',
 'fever. 7',
 'requires. 6',
 'purpose. 4',
 'loans. 1',
 'nations. 1',
 'Bundy. 6',
 'exist. 2']

>> #### 20.3 Prepare tokens and update dictionary

In [135]:
# Zip pairs into before and after tuples
patterns = list(zip(pattern_before, pattern_after))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, patterns)
transcript_tokens = [token for token in transcript_tokens if token not in clean_dict]

>> #### 21.1 Problem: Some tokens have no space between semicolon and word

In [136]:
pattern = "[A-Z]?[a-z]+;[a-z]+"
pattern_before = [re.search(pattern, token)[0] for token in list(set(transcript_tokens)) if re.search(pattern, token) and re.search(pattern, token)[0]==token]
pattern_before

['flag;that',
 'basis;to',
 'Federal;and',
 'producer;to',
 'aggression;we',
 'States;and',
 'office;the',
 'reservations;that',
 'war;it',
 'nations;and',
 'unforeseen;many',
 'secure;and',
 'elected;that',
 'latitude;but',
 'circulation;but',
 'exercised;and',
 'intimate;the',
 'learning;that',
 'power;and',
 'race;and',
 'welfare;it',
 'example;but',
 'legislation;and',
 'numbers;the',
 'experiment;today',
 'duty;and',
 'future;many',
 'beauty;filled',
 'monopolies;the',
 'he;d',
 'settlements;to',
 'frontier;to',
 'Randall;that',
 'disruption;it']

>> #### 21.2 Solution: Add space between semicolon and word

In [137]:
pattern_after = [token.replace(';', '; ') for token in pattern_before]
#pattern_after[-1] = 'held'
pattern_after

['flag; that',
 'basis; to',
 'Federal; and',
 'producer; to',
 'aggression; we',
 'States; and',
 'office; the',
 'reservations; that',
 'war; it',
 'nations; and',
 'unforeseen; many',
 'secure; and',
 'elected; that',
 'latitude; but',
 'circulation; but',
 'exercised; and',
 'intimate; the',
 'learning; that',
 'power; and',
 'race; and',
 'welfare; it',
 'example; but',
 'legislation; and',
 'numbers; the',
 'experiment; today',
 'duty; and',
 'future; many',
 'beauty; filled',
 'monopolies; the',
 'he; d',
 'settlements; to',
 'frontier; to',
 'Randall; that',
 'disruption; it']

>> #### 21.3 Prepare tokens and update dictionary

In [138]:
# Zip pairs into before and after tuples
patterns = list(zip(pattern_before, pattern_after))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, patterns)
transcript_tokens = [token for token in transcript_tokens if token not in clean_dict]

>> #### 22.1 Problem: Some tokens have no space between colon and word

In [139]:
pattern = "[A-Z]?[a-z]+:\"[A-Za-z]+"
pattern_before = [re.search(pattern, token)[0] for token in list(set(transcript_tokens)) if re.search(pattern, token) and re.search(pattern, token)[0]==token]
pattern_before

['said:"If',
 'said:"Our',
 'wrote:"I',
 'paragraph:"Such',
 'action:"That',
 'follows:"Whereas',
 'follows:"The',
 'action:"We',
 'further:"I',
 'said:"We',
 'following:"The',
 'Beaupre:"The',
 'instructions:"It',
 'stated:"In',
 'said:"But',
 'declared:"That',
 'recommendation:"The',
 'follows:"Article',
 'said:"Interest',
 'position:"The',
 'said:"The',
 'president:"Congress',
 'follows:"From',
 'Ehrman:"The',
 'said:"It',
 'follows:"Knowing',
 'follows:"It',
 'words:"This',
 'America:"Among',
 'said:"I',
 'currency:"I']

>> #### 22.2 Solution: Add space between colon and word

In [140]:
pattern_after = [token.replace(':', ': ') for token in pattern_before]
pattern_after

['said: "If',
 'said: "Our',
 'wrote: "I',
 'paragraph: "Such',
 'action: "That',
 'follows: "Whereas',
 'follows: "The',
 'action: "We',
 'further: "I',
 'said: "We',
 'following: "The',
 'Beaupre: "The',
 'instructions: "It',
 'stated: "In',
 'said: "But',
 'declared: "That',
 'recommendation: "The',
 'follows: "Article',
 'said: "Interest',
 'position: "The',
 'said: "The',
 'president: "Congress',
 'follows: "From',
 'Ehrman: "The',
 'said: "It',
 'follows: "Knowing',
 'follows: "It',
 'words: "This',
 'America: "Among',
 'said: "I',
 'currency: "I']

>> #### 22.3 Prepare tokens and update dictionary

In [141]:
# Zip pairs into before and after tuples
patterns = list(zip(pattern_before, pattern_after))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, patterns)
transcript_tokens = [token for token in transcript_tokens if token not in clean_dict]

>> #### 23.1 Problem: Tokens like dates or cardinal numbers we can ignore

In [142]:
patterns = [
    "([A-Z]\.)*",
    "[0-9]+(th|nd|rd|st)",
    "[0-9]{1,2}:[0-9]{2}",
    '[A-Za-z]+\'s',
    "[A-Za-z]+\.$",
    "[0-9]{4}s$",
    "^[OM]'$",
    "^p.[0-9]+",
    "^[A-Za-z]+'(d|re|ll|ve|t)$",
    "^[0-9]+d$",
    "^G[0-9]+$",
    "^d'[A-Za-z]+$",
    "^ma'[A-Za-z]+$",
    "^[Oo]'[A-Za-z]+$"
]

>> #### 23.2 Solution: Remove tokens from consideration of being cleaned

In [143]:
for pattern in patterns:
    pattern_match = [re.search(pattern, token)[0] for token in list(set(transcript_tokens)) if re.search(pattern, token) and re.search(pattern, token)[0]==token]
    transcript_tokens = sorted(list(set(transcript_tokens).difference(set(pattern_match))))

>> #### 24.1 Problem: There is not space between large numbers and words

In [144]:
pattern = "([0-9]{1,3},)*[0-9]{1,3}\.([0-9]{2}\.)?[A-Za-z]+"
pattern_before = [re.search(pattern, token)[0] for token in list(set(transcript_tokens)) if re.search(pattern, token) and re.search(pattern, token)[0]==token]
pattern_before

['75,000,000.The',
 '273,437,161.51.For',
 '66,818,292.38.The',
 '80,606,808.40.The',
 '4,557,462.71.The',
 '12,929,690.33.The',
 '560,000.The',
 '57,470,129.59.For',
 '37,951.Among',
 '247,131,549.The',
 '24,175,000,000.There',
 '104,000,000.The',
 '7,063,298.57.The',
 '2,226,876.The',
 '101,000,000.It',
 '196,778,017.The',
 '934,123.11.There',
 '63,000,000.During',
 '101,232,511.66.The',
 '31,414,788.04.Our',
 '40,000,000.For',
 '79,085.On',
 '451,963,981.On',
 '90,000.With',
 '14,381,808.40.The',
 '2,000.I',
 '300,000,000.FACILITIES',
 '218,196.The',
 '2,033,053.09.It',
 '5,125,638.14.The',
 '2,740,854,750.Every',
 '51,576,706.I',
 '58,579,780.08.It',
 '1,285,118.08.From',
 '300,000.The',
 '828,474.43.The',
 '4,140,211,139.There',
 '501,965,778.In',
 '33,801,378.78.The',
 '91.It',
 '290,712.07.The',
 '94,480,189.If',
 '427,834.62.The',
 '86,856,710.If',
 '16,838,240.There',
 '70,000,000.The',
 '70,129,195.56.The',
 '18,420.There',
 '6,398,316.10.The',
 '93,047,373.15.It',
 '1,417,55

>> #### 24.2 Solution: Add space between large numbers and words

In [145]:
pattern_after = []
for token in pattern_before:
    decimals = len(token.replace('.', '. ').split()) == 3
    if decimals:
        tokens = token.split('.')
        token_cleaned = '.'.join(tokens[:-1]) + '. ' + tokens[-1]
    else:
        token_cleaned = token.replace('.', '. ')
    pattern_after += [token_cleaned]
pattern_after

['75,000,000. The',
 '273,437,161.51. For',
 '66,818,292.38. The',
 '80,606,808.40. The',
 '4,557,462.71. The',
 '12,929,690.33. The',
 '560,000. The',
 '57,470,129.59. For',
 '37,951. Among',
 '247,131,549. The',
 '24,175,000,000. There',
 '104,000,000. The',
 '7,063,298.57. The',
 '2,226,876. The',
 '101,000,000. It',
 '196,778,017. The',
 '934,123.11. There',
 '63,000,000. During',
 '101,232,511.66. The',
 '31,414,788.04. Our',
 '40,000,000. For',
 '79,085. On',
 '451,963,981. On',
 '90,000. With',
 '14,381,808.40. The',
 '2,000. I',
 '300,000,000. FACILITIES',
 '218,196. The',
 '2,033,053.09. It',
 '5,125,638.14. The',
 '2,740,854,750. Every',
 '51,576,706. I',
 '58,579,780.08. It',
 '1,285,118.08. From',
 '300,000. The',
 '828,474.43. The',
 '4,140,211,139. There',
 '501,965,778. In',
 '33,801,378.78. The',
 '91. It',
 '290,712.07. The',
 '94,480,189. If',
 '427,834.62. The',
 '86,856,710. If',
 '16,838,240. There',
 '70,000,000. The',
 '70,129,195.56. The',
 '18,420. There',
 '6,

>> #### 24.3 Prepare tokens and update dictionary

In [146]:
# Zip pairs into before and after tuples
patterns = list(zip(pattern_before, pattern_after))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, patterns)
transcript_tokens = [token for token in transcript_tokens if token not in clean_dict]

>> #### 25.1 Problem: Some tokens have a question mark following a comma

In [147]:
pattern_before = [token for token in transcript_tokens if ',?' in token]
pattern_before

['everywhere,?the',
 'happiness,?15,000,000',
 'hostile,?all',
 'indispensable,?because',
 'it,?and',
 'moreover,?who',
 'power,?the',
 'privileges,?for',
 'resources,?some',
 'return,?unless',
 'seeking,?blindly',
 'vision,?the',
 'will,?any']

>> #### 25.2 Solution: Remove question mark

In [148]:
pattern_after = [token.replace(',?', ', ') for token in pattern_before]
pattern_after

['everywhere, the',
 'happiness, 15,000,000',
 'hostile, all',
 'indispensable, because',
 'it, and',
 'moreover, who',
 'power, the',
 'privileges, for',
 'resources, some',
 'return, unless',
 'seeking, blindly',
 'vision, the',
 'will, any']

>> #### 25.3 Prepare tokens and update dictionary

In [149]:
# Zip pairs into before and after tuples
patterns = list(zip(pattern_before, pattern_after))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, patterns)
transcript_tokens = [token for token in transcript_tokens if token not in clean_dict]

>> #### 26.1 Problem: There needs to be a space between periods and Capitals or remove period

In [150]:
pattern = "^\.(.*?)"
pattern_before = [token for token in list(set(transcript_tokens)) if re.search(pattern, token)]
pattern_before

['.decisions',
 '.If',
 '.limited',
 '.He',
 '.enhancement',
 '.I',
 '.The',
 '.Affairs',
 '.Our',
 '.Negotiations',
 '.outside',
 '.position',
 '.power',
 '.recovery',
 '.place',
 '.As',
 '.knowledge',
 '.promoted',
 '.nuclear',
 '.every',
 '.A',
 '.important',
 '.that',
 '.and',
 '.for',
 '.expenditures',
 '.with',
 '.at',
 '.pause',
 '.merely',
 '.In',
 '.own',
 '.By',
 '.past',
 '.the',
 '.There',
 '.upon',
 '.America',
 '.put',
 '.commends',
 '.is',
 '.With',
 '.Pursuant',
 '.by',
 '.During',
 '.this']

>> #### 26.2 Solution: Insert space between periods and capitals or remove

In [151]:
pattern_after = [token.replace('.', '. ') if token[1].isupper() else token[1:] for token in pattern_before]
pattern_after

['decisions',
 '. If',
 'limited',
 '. He',
 'enhancement',
 '. I',
 '. The',
 '. Affairs',
 '. Our',
 '. Negotiations',
 'outside',
 'position',
 'power',
 'recovery',
 'place',
 '. As',
 'knowledge',
 'promoted',
 'nuclear',
 'every',
 '. A',
 'important',
 'that',
 'and',
 'for',
 'expenditures',
 'with',
 'at',
 'pause',
 'merely',
 '. In',
 'own',
 '. By',
 'past',
 'the',
 '. There',
 'upon',
 '. America',
 'put',
 'commends',
 'is',
 '. With',
 '. Pursuant',
 'by',
 '. During',
 'this']

>> #### 26.3 Prepare tokens and update dictionary

In [152]:
# Zip pairs into before and after tuples
patterns = list(zip(pattern_before, pattern_after))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, patterns)
transcript_tokens = [token for token in transcript_tokens if token not in clean_dict]

>> #### 27.1 Problem: Some specified token patterns need to be cleaned

In [153]:
pattern_before = []
pattern_after = []

pattern = "^[0-9]{4}[.]"
pattern_before += [token for token in list(set(transcript_tokens)) if re.search(pattern, token)]
pattern_after += [token.replace('.', '. ') for token in list(set(transcript_tokens)) if re.search(pattern, token)]

pattern = "^[0-9]{4}[,]"
pattern_before += [token for token in list(set(transcript_tokens)) if re.search(pattern, token)]
pattern_after += [token.replace(',', ', ') for token in list(set(transcript_tokens)) if re.search(pattern, token)]

pattern_before += [token for token in transcript_tokens if '\\' in token]
pattern_after += [token.replace('\\', '') for token in transcript_tokens if '\\' in token]

pattern_before

['1858.It',
 '1921.DEPARTMENT',
 '1888.A',
 '1911.To',
 '1896.I',
 '1894.As',
 '1908.In',
 '1874.I',
 '1812.JAMES',
 '1912.CIVIL',
 '1818.In',
 '1897.I',
 '1812.I',
 '1912.To',
 '1876.It',
 '1872.It',
 '1827."We',
 '1890.I',
 '1890.In',
 '1808.As',
 '1898.In',
 '1833.I',
 '1813.JAMES',
 '1888.In',
 '1916.As',
 '1877.In',
 '1832.At',
 '1889.I',
 '1909.PORTO',
 '1884,as',
 '1928,"And',
 '1926,Amendments',
 '1850,providing',
 '1865,numbered',
 '1855,and']

>> #### 27.2 Solution: Clean specified token patterns

In [154]:
pattern_after

['1858. It',
 '1921. DEPARTMENT',
 '1888. A',
 '1911. To',
 '1896. I',
 '1894. As',
 '1908. In',
 '1874. I',
 '1812. JAMES',
 '1912. CIVIL',
 '1818. In',
 '1897. I',
 '1812. I',
 '1912. To',
 '1876. It',
 '1872. It',
 '1827. "We',
 '1890. I',
 '1890. In',
 '1808. As',
 '1898. In',
 '1833. I',
 '1813. JAMES',
 '1888. In',
 '1916. As',
 '1877. In',
 '1832. At',
 '1889. I',
 '1909. PORTO',
 '1884, as',
 '1928, "And',
 '1926, Amendments',
 '1850, providing',
 '1865, numbered',
 '1855, and']

>> #### 27.3 Prepare tokens and update dictionary

In [155]:
# Zip pairs into before and after tuples
patterns = list(zip(pattern_before, pattern_after))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, patterns)
transcript_tokens = [token for token in transcript_tokens if token not in clean_dict]

>> #### 28.1 Problem: Remaining tokens need to be cleaned

In [156]:
pattern_before = transcript_tokens[5:]
pattern_before

['0.9244.The',
 '001,856Schedule',
 '1,114,382,while',
 '1,150Attempts',
 '1.95M.',
 '1/2per',
 '100,000,000In',
 '104,706,922,but',
 '106,742,646M',
 '106D',
 '107,gig',
 '11,105,820B',
 '11,105,820H.',
 '11.4M.',
 '110,400.77The',
 '110M.',
 '111juries',
 '112C',
 '11⁄2',
 '129B',
 '14(b',
 '148,480.00State',
 '14E',
 '154H',
 '15A',
 '16,000,000which',
 '166,281,505.55The',
 '16o,657',
 '17,134,944It',
 '178,672miles',
 '17M.',
 '1819–20',
 '1825–07',
 '186118,000.00Of',
 '1861266,600.00Of',
 '186210,300.00Of',
 '1863116,850.00Of',
 '1864133,550.00Of',
 '18647,050.00Of',
 '186540,800.00Of',
 '18659,600.00Of',
 '1867235,700.00Of',
 '1868154,650.00Of',
 '186J',
 '1870$87,134,782.84From',
 '1871117,619,630.25From',
 '1872106,564,356.94Total',
 '187294,895,348.94From',
 '1874$8,883,940.933.65',
 '18742,088,168.73Certificates',
 '1881719,150.00Of',
 '188247,650.00Of',
 '1882were',
 '1895:"The',
 '1899:I',
 '18gi',
 '19,038,665.81on',
 '1900;(2',
 '19071,418,850.00Of',
 '1911To',
 '1913OU

>> #### 28.2 Solution: Clean tokens

In [157]:
pattern_after = [
'0.9244. The',
'1,114,382, while',
'1.95M.',
'1/2 per',
'100,000,000 In',
'104,706,922, but',
'11.4M.',
'110,400.77 The',
'110M.',
'11⁄2',
'14(b',
'15A',
'16,000,000 which',
'166,281,505.55 The',
'160,657',
'17,134,944 It',
'178,672 miles',
'17M.',
'1819–20',
'1825–07',
'1874 $8,883,940.933.65',
'1874 2,088,168.73 Certificates',
'1882 were',
'1895: "The',
'1899: I',
'19,038,665.81 on',
'1913 OUR',
'191:2',
'1925=100',
'1929; "Now',
'193,000) could',
'198,159,676.02, an',
'19M.',
'1908',
'1>e',
'See',
'1a',
'longitudes',
'1⁄2',
'20s',
'22]. I',
'23M.',
'25,902,683 During',
'25,902,683 This',
'257,981,439.57 Leaving',
'25 By',
'25M.',
'26M.',
'2M.',
'2 Secretary',
'20th',
'300 million',
'30s',
'35–$40',
'39, applied',
'3D',
'30th',
'4,345,521; of',
'400,000,000 and',
'401(k)s',
'401k',
'401ks',
'403,525,250.28 The',
'40s',
'41) 1,406 The',
'45пїЅ',
'45пїЅ.',
'46пїЅ',
'4 years',
'5,000,000 people',   
'50?” For',
'50s',
'51,000 men',
'512,977,32677,044,257 The',
'516,240,131 to',
'5M.',
'5s',
'6,038,091; and',
'60s',
'675 clerks',
'6:8',
'6e',
'6rst',
'70s',
'712,882.20, an',
'74,480,201.05',
'741.69 From',
'757s',
'767,111,964 Excess',
'799,959,736 Imports',
'7M.',
'8)',
'80s',
'90,786,064.02, which',
'90s',
'9M.',
'=',
'>',
'A&M',
'A&M.',
'A.: Your',
'A. By',
'AT&T',
'Army. "I',
'BUREN. By',
"Ba'athist",
'Ball, 10',
'Builders, 7',
'C3, 100,060,000',
'COMMUNIQUÉ',
'Clinton/Gore',
'Coast 190 The',
'Columbia 2 Total',
'Communiqué',
'Congress." 6',
'CrisisNextDoor.gov',
"D'AFFAIRES",
"D'Alesandro",
"D'Avino",
"Dara'a",
'Doré',
'E. Lee',
'E1',
'Europe and or Radio',
'FARLEY. It',
'February, 1899',
'GOVERNED.” I',
'Gardner',
'GeorgeWBush.com',
'Government. "A',
'Government. "I',
'Guaidó',
'Guantánamo',
'HIV/AIDS',
'Harriman, 1',
"I'm",
'I, do',
'JEFFERSON. By',
'José',
'June 30',
'Kaléo',
"Ku'damm",
"L'Enfant",
'López',
"M'Henry",
'MADISON. By',
'MacNeil and or Lehrer',
'Medicare and or prescription',
'Moyers',
"Mu'ammar",
'NECESSITY. But',
'Northwest 435 District',
'OB/GYNS',
'PIERCE. By',
'Ph.D.',
"Presqu'isle",
'R&D',
'Recovery.gov',
'RedCross.org',
'SS20',
"Sana'a",
'Schlesinger, 2',
"Shi'a",
'Solomon, 12',
'States. "I',
'States 181 Southern',
"Ty'Sheoma",
'U.S',
'U.S.A',
'U235',
'VII. Since',
'Vaughn, 6',
'Washington, "And',
'Y2',
'[America',
"'",
'a)round',
'a.m.',
'a14',
'act. ”Now',
'adaptation. “It',
'air)planes',
'and $38,336,450',
'and $4,575,397.97',
'and $6,469,643.04',
'and) to',
'and/or',
'aphorism. "This',
'approval: 1',
'argument. "I',
'asociations. "This',
'assesments (chargeable',
'assessments $1,614,054.37 Less',
'assets 13,994,613.24 In',
'assurance(s',
'at/east',
'audit 4,770,558.4515,742,667.61 Less',
'awards. "I',
'a–work',
'banker. "I',
'be: (1',
'believe that',
'bondage. "The',
'bonds 59,000.00 In',
'bonds 75,000.00 And',
'both/and',
'by $58,485,517. The',
'cal1ed',
'celebrat(e)ing',
'cents 13.1 From',
'cents 93.0 From',
'claims, 227,040',
'committee 1',
'committees. "I',
'competition. "I',
'constructively',
'crops. (2',
'crops. (3',
'customs $223',
'day! "What',
"didn't he",
'die, "To',
'disturb. ”I',
'do 15.6Of',
'duty; 2',
'effect.',
'either/or',
'equal. "This',
'etc, "a',
'etc 65,337,343$60,624,4642',
'execution. (Signed',
'fiancée',
"folks' pensions",
"follows: 'The",
'follows: "In',
'follows; "It',
'fund 1,748,054.37 Leaving',
'gathering(ed',
'government. But',
'granted. "I',
'great‐granddaughter',
'him! "While',
'him! "While',
'i.e',
'in 1881',
'injury. "A',
'instance(s',
"it'[s",
'it. "I',
'it. ”It',
"it's",
'it: “The',
'job‐crushing',
'justice(s',
'l500',
'l933',
'labor(s',
'length 3,949',
'libertyunites.org',
'loyalt(y)ies',
'manufactures; $21,462,534.34',
'mechanical) double',
'mills 8.5 From',
'naïve',
'negroes!! ”Well',
'not. "SUBSTITUTE',
'of the',
'of $425,000',
'of 250',
'of ____ for',
'office. "I',
'ones) are',
'orphans. "DEPARTMENT',
'other. "Mr',
'out.',
'overtures 92',
'p.m.',
'parent/teacher',
'passed. "A',
'pay) that',
'people; "And',
'points: 1',
'pound 19,950,1873,129,321 Total',
'pound 348,988,6482,996,4033',
'pound 78,701,14810,324,0694',
'powers. "I',
'present 45',
'proof 7',
'protect(ion',
're)sources',
'receipts $112,194,947',
'recovery.gov',
'revenue 130,881,513.92 From',
'revita1ize',
'régime',
'résumé',
'said, "About',
'said, "Sammy',
"said: 'All",
'says: “We',
'seas. "I',
'since 1860',
'slavery. ”This',
'sources 32,335,803.23 The',
'stand. "I',
'stringency. "I',
'subject. "I',
'the',
'tax‐cut',
'the 1st',
'there"? if',
'through(out',
'time.' "Henry'",
'time; I',
'to $25,013,650',
'to $443,889,495.88',
'tread. ”At',
'trying???with',
'up)on',
'usafreedomcorps.gov',
'vessels. "A',
'wage/pricing',
'war. "I',
'war] 1',
'well‐being',
"wisdom's no",
"world's peace",
'wrote: "I',
' To',
' more',
'does',
'°'
]

>> #### 28.3 Prepare tokens and update dictionary

In [158]:
# Zip pairs into before and after tuples
patterns = list(zip(pattern_before, pattern_after))

# Update dictionary and hyphen tokens
clean_dict = update_dict(clean_dict, patterns)
transcript_tokens = [token for token in transcript_tokens if token not in clean_dict]

## Clean Tokens

In [159]:
speeches.Summary = speeches.Summary.apply(lambda x: clean_text_tokens(x, clean_dict))
speeches.Transcript = speeches.Transcript.apply(lambda x: clean_text_tokens(x, clean_dict))

In [160]:
# Add pipeline
nlp.add_pipe(remove_excess_spaces_component)

# Convert Transcripts to doc to create tokens with hyphens
speeches.Transcript = list(nlp.pipe(speeches.Transcript, batch_size = 50, n_process=-1,  disable=["tagger", "parser", "ner"]))
speeches.Transcript = speeches.Transcript.apply(lambda x: x.text.replace(' — ', '—'))

# Convert Transcripts again to clean transcripts into final form
speeches.Transcript = list(nlp.pipe(speeches.Transcript, batch_size = 50, n_process=-1,  disable=["tagger", "parser", "ner"]))
speeches.Transcript = speeches.Transcript.apply(lambda x: clean_transcript(x))
speeches.Transcript = speeches.Transcript.apply(lambda x: x.strip())

## 4.6 Add Political Party

In [197]:
# Poltical Parties
president_party = {}
president_party['Abraham Lincoln'] = 'Republican'
president_party['Andrew Jackson'] = 'Democratic'
president_party['Andrew Johnson'] = 'Democratic'
president_party['Barack Obama'] = 'Democratic'
president_party['Benjamin Harrison'] = 'Republican'
president_party['Bill Clinton'] = 'Democratic'
president_party['Calvin Coolidge'] = 'Republican'
president_party['Chester A. Arthur'] = 'Republican'
president_party['Donald Trump'] = 'Republican'
president_party['Dwight D. Eisenhower'] = 'Republican'
president_party['Franklin D. Roosevelt'] = 'Democratic'
president_party['Franklin Pierce'] = 'Democratic'
president_party['George H. W. Bush'] = 'Republican'
president_party['George W. Bush'] = 'Republican'
president_party['George Washington'] = 'Unaffiliated'
president_party['Gerald Ford'] = 'Republican'
president_party['Grover Cleveland'] = 'Democratic'
president_party['Harry S. Truman'] = 'Democratic'
president_party['Herbert Hoover'] = 'Republican'
president_party['James A. Garfield'] = 'Republican'
president_party['James Buchanan'] = 'Democratic'
president_party['James K. Polk'] = 'Democratic'
president_party['James Madison'] = 'Democratic-Republican'
president_party['James Monroe'] = 'Democratic-Republican'
president_party['Jimmy Carter'] = 'Democratic'
president_party[ 'John Adams'] = 'Federalist'
president_party['John F. Kennedy'] = 'Democratic'
president_party['John Quincy Adams'] = 'Democratic-Republican'
president_party['John Tyler'] = 'Unaffiliated'
president_party['Lyndon B. Johnson'] = 'Democratic'
president_party[ 'Martin Van Buren'] = 'Democratic'
president_party['Millard Fillmore'] = 'Whig'
president_party['Richard M. Nixon'] = 'Republican'
president_party['Ronald Reagan'] = 'Republican'
president_party['Rutherford B. Hayes'] = 'Republican'
president_party[ 'Theodore Roosevelt'] = 'Republican'
president_party['Thomas Jefferson'] = 'Democratic-Republican'
president_party['Ulysses S. Grant'] = 'Republican'
president_party['Warren G. Harding'] = 'Republican'
president_party['William Harrison'] = 'Whig'
president_party['William McKinley'] = 'Republican'
president_party['William Taft'] = 'Republican'
president_party['Woodrow Wilson'] = 'Democratic'
president_party['Zachary Taylor'] = 'Whig'

In [198]:
# Add political party to speeches and rearrange column order
speeches['Party'] = speeches.President.apply(lambda x: president_party[x])
speeches = speeches[['Date', 'President', 'Party', 'Speech Title', 'Summary', 'Transcript', 'URL']]

# Reverse index ordering 
speeches = speeches.iloc[::-1]
speeches = speeches.reset_index(drop=True)

In [54]:
# Chester A. Arthur's date for his speech if off by a century so it must be corrected
speeches.iloc[774, speeches.columns.get_loc('Date')] = pd.Timestamp(1881, 12, 6)

In [56]:
# Save Speeches
pickle.dump(speeches, open('pickles/speeches.p', "wb" ))