In [1]:
from pathlib import Path

import pdfplumber
import re

from pdf_processing import *

TODO: 
 - Q&As / interviews
 - check more PDFs for differences in format

Get the path to the directory in which the PDFs are stored.

In [2]:
pdf_dir = Path.cwd().parent / "pdfs"

Filepaths of all the PDFs in the folder `pdf_dir`, can be used to iterate over all the PDFs to store the extracted speeches in a dataframe.

In [3]:
pdfs = list(pdf_dir.glob('*.pdf'))  
print("current number of PDFs:", len(pdfs))

current number of PDFs: 4


Get filepath of the PDF you want to process.

In [4]:
filepath = pdfs[2]

Make a PDFHandler object for the given filepath.

In [5]:
pdf = PDFHandler(filepath)

Print the first page of the PDF before it has been processed.

In [6]:
print(pdf.original_page(0))

  
AA RR
mmeerriiccaann hheettoorriicc..ccoomm  
 
B O
arack  bama 
Farewell Remarks at Joint Base Andrews 
 
delivered 20 January 2017, Prince George County, Maryland
 
 
AUTHENTICITY CERTIFIED: Text version below transcribed directly from audio 
Michelle and I, we've really been milking this goodbye thing, so it behooves me to be very 
 
brief.
Audience Members: No, no!  
President Obama: Yes, yes.  
You know, I said before and I will say again, that when we started on this journey we did so 
with an abiding faith in the American people and their ability, out ability, to join together to 
change the country in ways that would make life better for our kids and our grandkids, that 
 
change didn’t happen from the top down, but it happened from the bottom up.
It was met sometimes with skepticism and doubt. Some folks didn’t think we could pull it off. 
There were those who felt that the institutions of power and privilege in this country were too 
 
deeply entrenched.
Property of Americ

Define a regular expression to get the date, location, and content of the speech. Extract the entire speech from the PDF.

In [7]:
start = r"(?:hheettoorriicc\.\.ccoomm)"
date = r"(.*delivered\s+(?P<day>[0-9]{1,2})\s+(?P<mon>[A-Z][a-z]+)\s+(?P<year>[0-9]{2,4}),"
loc = r"\s+(?P<location_small>[A-Za-z0-9. ]+),\s+(?P<location_big>[A-Za-z0-9., ]+)"
auth = r"(?:\s+AUTHENTICITY CERTIFIED: Text version below transcribed directly from audio))?"
content = r"\s+(?P<content>.*)\n+"
end = r"(?:(Property\s+of\s+)?AmericanRhetoric\.com)"

pat = re.compile(start + date + loc + auth + content + end, re.DOTALL)

speech = pdf.extract_speech(pat)
print(speech)

Michelle and I, we've really been milking this goodbye thing, so it behooves me to be very 
 
brief.
Audience Members: No, no!  
President Obama: Yes, yes.  
You know, I said before and I will say again, that when we started on this journey we did so 
with an abiding faith in the American people and their ability, out ability, to join together to 
change the country in ways that would make life better for our kids and our grandkids, that 
 
change didn’t happen from the top down, but it happened from the bottom up.
It was met sometimes with skepticism and doubt. Some folks didn’t think we could pull it off. 
There were those who felt that the institutions of power and privilege in this country were too 
 
deeply entrenched. And yet, all of you came together, in small towns and big cities, a whole bunch of you really 
young, and you decided to believe. And you knocked on doors and you made phone calls, and 
you talked to your parents who didn’t know how to pronounce Barack Obama. And yo

Print the relevant info of the PDF.

In [8]:
pdf.print_info()

Title: Farewell_to_Staff_and_Supporters
Number of pages: 3
Date: ['20', 'January', '2017']
Location: ['Prince George County', 'Maryland']


Replace or delete some characters to clean the speech.

In [9]:
old = [r"-+", r"\.{2,}", r"\s+", r'[’‘]', r'"+']
new = [r" ",  r" ",      r" ",   r"'",    r""]

clean_speech = pdf.replace(speech, old, new)
print(clean_speech)

Michelle and I, we've really been milking this goodbye thing, so it behooves me to be very brief. Audience Members: No, no! President Obama: Yes, yes. You know, I said before and I will say again, that when we started on this journey we did so with an abiding faith in the American people and their ability, out ability, to join together to change the country in ways that would make life better for our kids and our grandkids, that change didn't happen from the top down, but it happened from the bottom up. It was met sometimes with skepticism and doubt. Some folks didn't think we could pull it off. There were those who felt that the institutions of power and privilege in this country were too deeply entrenched. And yet, all of you came together, in small towns and big cities, a whole bunch of you really young, and you decided to believe. And you knocked on doors and you made phone calls, and you talked to your parents who didn't know how to pronounce Barack Obama. And you got to know each

Old code, used for debugging.

In [None]:
pdfs = ["Farewell_to_Staff_and_Supporters", "Flint_Michigan_Community", "Guantanamo_Bay_Closing", "Post_G7_Presser_Japan"]

pdf_dir = Path.cwd().parent / "pdfs"
file_to_open = pdfs[2] + ".pdf" 
pdf = pdfplumber.open(pdf_dir / file_to_open)

print('Title:', pdf.metadata['Title'])
print("number of pages:", len(pdf.pages))

In [None]:
text = pdf.pages[0].extract_text()
    
#start = r"hheettoorriicc\.\.ccoomm(?:.*AUTHENTICITY CERTIFIED: Text version below transcribed directly from audio)?"
start = r"(?:hheettoorriicc\.\.ccoomm)"
date = r"(.*delivered\s+(?P<day>[ 123][0-9])\s+(?P<mon>[A-Z][a-z]+)\s+(?P<year>[0-9][0-9][0-9][0-9]),"
loc = r"\s+(?P<location_small>[A-Za-z0-9 ]+),\s+(?P<location_big>.*)\n+"
auth = r"(?:.*AUTHENTICITY CERTIFIED: Text version below transcribed directly from audio))?"
mid = r"\s*\n+(?P<content>.*)\s*\n+"
end = r"(?:(?:Property\s+of\s+)?AmericanRhetoric\.com)"

core_pat = re.compile(start + date+loc+auth + mid + end, re.DOTALL)

print("ORIGINAL:\n")
print(text)
print("\n\n" + 100*"-" + "\n\n") 

search = re.search(core_pat, text)
core = search.group("content")

dash = re.compile(r"-+")
no_dash_core = dash.sub(r" ", core)

dots = re.compile(r"\.{2,}")
no_dots_core = dots.sub(r" ", no_dash_core)

spaces = re.compile(r"\s+")
single_space_core = spaces.sub(r" ", no_dots_core)

print("PROCESSED:\n")
print(single_space_core)

In [None]:
def extract_speech(pages):
    full_text = ""
    
    start = r"hheettoorriicc\.\.ccoomm(?:.*AUTHENTICITY CERTIFIED: Text version below transcribed directly from audio)?"
    mid = r"\s+(?P<content>.*)\s+"
    end = r"(?:Property of )?AmericanRhetoric\.com"
    core_pat = re.compile(start+mid+end, re.DOTALL)
        
    for i in range(len(pages)):
        text = pages[i].extract_text()
        
        core = re.search(core_pat, text).group("content")
        
        core = replace(core, [r"-+", r"\.{2,}", r"\s+"], r" ")
        
        full_text += core + " "
        
    return full_text


def replace(text, old, new):
    for i in range(len(old)):
        pat = re.compile(old[i])
        text = pat.sub(new, text)
    return text


In [None]:
full_speech = extract_speech(pdf.pages)
print(full_speech)

In [None]:
pdf.close()