In [1]:
from pathlib import Path

import pdfplumber
import re
from pprint import pprint

import sys
src_path = str(Path.cwd().parent / "src")
sys.path.append(src_path)

In [2]:
from pdf_processing import *

TODO: 
 - Q&As / interviews
 - check more PDFs for differences in format
 - write documentation
 - locations not included after the date (might need to be taken care of manually)
 - use translate() instead of pdf.replace()



DONE: 
 - seems to work now:
     - footnotes (like in People_of_Greece.pdf and Senate_Floor_Immigration_Reform.pdf)
     
     
 - the location is no longer split up; it is kept as a single element (which will be easier to clean up afterwards)
     - (locations containing multiple commas, e.g. when "Washington, D.C." is included, are better stored as one element instead of being split up; the location can be cleaned up afterwards)

In [3]:
# PDFs in which more than one person is talking (interviews, press conferences,
# town halls, ...). Files with any of these in the title might have to be
# excluded: interview, presser, Press_Conference, press.
multiple_people_talking = [
    'Barack_Obama_-_Al-Arabiya_Interview.pdf',
    'CGI_2013.pdf',
    'Midterm_Elections_Presser_2014.pdf',
    'YSEALI_Fellows.pdf',
    'PM_Abe_of_Japan_Joint_Presser.pdf',
    'Paris_Press_Conference_2015.pdf',
    'Gun_Violence_Presser.pdf',
    'Associated_Press_Luncheon.pdf',
    'Final_Press_Conference.pdf',
    'CNN_Guns_in_America_Town_Hall.pdf',
    'START_Treaty_Presser.pdf',
    'Midterm_Elections_Presser_2010.pdf',
    'Joint_Presser_with_President_Benigno_Aquino.pdf',
    'Worldwide_Troop_Talk.pdf',
    'Post_ASEAN_Presser_2016.pdf',
    'Post_G7_Presser_Japan.pdf',
    'Presser_Unannounced_Heath_Care.pdf',
]


Get the path to the directory in which the PDFs are stored.

In [4]:
# Resolved relative to the current working directory: the "pdfs" folder is
# expected next to the directory the notebook is run from.
pdf_dir = Path.cwd().parent / "pdfs"

Collect the filepaths of all the PDFs in the folder `pdf_dir`. This list can be used to iterate over all the PDFs and store the extracted speeches in a dataframe.

In [5]:
# NOTE(review): glob() does not guarantee any particular ordering, so the
# position of a given file inside `pdfs` may differ between machines/runs.
pdfs = list(pdf_dir.glob('*.pdf'))  
print("current number of PDFs:", len(pdfs))

current number of PDFs: 436


Get filepath of the PDF you want to process.

In [6]:
# Select the PDF by name instead of by position in `pdfs`: the glob order is
# not guaranteed, so a positional index would not reliably pick the same file.
# This is the file the recorded outputs below were produced from.
filepath = pdf_dir / "Afghanistan_Troop_Reduction.pdf"
print("File:", filepath.name)

File: Afghanistan_Troop_Reduction.pdf


Make a PDFHandler object for the given filepath.

In [7]:
# PDFHandler is not defined in this notebook; presumably it is provided by the
# star import from `pdf_processing` above. The following cells call its methods.
pdf = PDFHandler(filepath)

Print the last page of the PDF before it has been processed.

In [8]:
# Index -1 addresses the last page; the text is shown as-is, including the
# site banner and the "Property of ..." footer that the regex below strips.
print(pdf.original_page(-1))

  
AAmmeerriiccaannRRhheettoorriicc..ccoomm  
 
That’s a lesson worth remembering -- that we are all a part of one American family. Though 
we have known disagreement and division, we are bound together by the creed that is written 
into our founding documents, and a conviction that the United States of America is a country 
that can achieve whatever it sets out to accomplish. Now, let us finish the work at hand. Let 
us responsibly end these wars, and reclaim the American Dream that is at the center of our 
story. With confidence in our cause, with faith in our fellow citizens, and with hope in our 
hearts, let us go about the work of extending the promise of America -- for this generation, 
and the next. 
May God bless our troops. 
And may God bless the United States of America. 
Property of AmericanRhetoric.com  Copyright ©2012. All rights reserved.     Page 5 


Print the first page of the PDF before it has been processed.

In [9]:
# Index 0 addresses the first page, which additionally carries the title,
# the "Delivered <date>, <location>" line and the authenticity notice.
print(pdf.original_page(0))

  
AAmmeerriiccaannRRhheettoorriicc..ccoomm  
 
Barack Obama 
Afghanistan Troop Reduction Address to the Nation 
 
Delivered 22 June 2011, White House, Washington, D.C. 
 
AUTHENTICITY CERTIFIED: Text version below transcribed directly from audio 
Good evening. Nearly 10 years ago, America suffered the worst attack on our shores since 
Pearl Harbor. This mass murder was planned by Osama bin Laden and his al Qaeda network in 
Afghanistan, and signaled a new threat to our security -- one in which the targets were no 
longer soldiers on a battlefield, but innocent men, women and children going about their daily 
lives. 
In the days that followed, our nation was united as we struck at al Qaeda and routed the 
Taliban in Afghanistan. Then, our focus shifted. A second war was launched in Iraq, and we 
spent enormous blood and treasure to support a new government there. By the time I took 
office, the war in Afghanistan had entered its seventh year. But al Qaeda’s leaders had 
escaped into Pa

Define a regular expression to get the date, location, and content of the speech. Extract the entire speech from the PDF.

In [10]:
# Tail of the doubled-letter site banner shown at the top of the printed pages.
start = r"(?:hheettoorriicc\.\.ccoomm)"

# Delivery line: "<day> <month> <year>" (month matched via the IGNORECASE flag),
# optionally followed by ", <location>".
date = r"(?P<day>[0-9]{1,2})\s+(?P<mon>[a-z]+)\s+(?P<year>[0-9]{2,4})"
# The location is captured lazily as ONE group rather than split on commas.
loc = r"(?P<location>.+?)"
# Optional authenticity notice, with or without brackets / continuity suffix.
auth = r"(?:\[?AUTHENTICITY CERTIFIED: Text version below transcribed directly from audio( and edited for continuity)?\]?)?"

# Speech body, matched lazily so it stops at the first footer that follows.
content = r"(?P<content>.*?)"

# Optional pieces that may precede the site name in the page footer.
footnotes = r"(([1-9][0-9]* [a-z].*?)+\n+)?"
tr = r"(Transcription\s+by\s+.*?)?"
pr = r"(Property\s+of\s+)?"
end = r"AmericanRhetoric\.com"

# banner, optional delivery header, body, footer -- concatenated in that order.
full_patern = "".join([
    start,
    r"(\s+.*?delivered\s+", date, r"(,\s+", loc, r")?", r"\s+", auth, r")?",
    r"\s+", content, r"\n+",
    r"(?:", footnotes, tr, pr, end, r")",
])
pat = re.compile(full_patern, flags=re.IGNORECASE | re.DOTALL)

speech = pdf.extract_speech(pat)
print(speech)

Good evening. Nearly 10 years ago, America suffered the worst attack on our shores since 
Pearl Harbor. This mass murder was planned by Osama bin Laden and his al Qaeda network in 
Afghanistan, and signaled a new threat to our security -- one in which the targets were no 
longer soldiers on a battlefield, but innocent men, women and children going about their daily 
lives. 
In the days that followed, our nation was united as we struck at al Qaeda and routed the 
Taliban in Afghanistan. Then, our focus shifted. A second war was launched in Iraq, and we 
spent enormous blood and treasure to support a new government there. By the time I took 
office, the war in Afghanistan had entered its seventh year. But al Qaeda’s leaders had 
escaped into Pakistan and were plotting new attacks, while the Taliban had regrouped and 
gone on the offensive. Without a new strategy and decisive action, our military commanders 
warned that we could face a resurgent al Qaeda and a Taliban taking over large pa

Print the relevant info of the PDF.

In [11]:
# Prints title, page count, date and location (see the output below).
# NOTE(review): date/location are presumably filled in by the
# extract_speech() call in the previous cell -- confirm in pdf_processing.
pdf.print_info()

Title: Afghanistan_Troop_Reduction
Number of pages: 5
Date: ['22', 'June', '2011']
Location: ['White House, Washington, D.C.']


Check whether there are multiple speakers. (If there is a high "Obama:" count, this might indicate he is answering questions, i.e., it is an interview, press conference, debate,...)

In [12]:
# multiple_speakers() prints the per-label counts shown below. The returned
# list (pretty-printed here) appears to hold the remaining colon-terminated
# words found in the speech -- TODO confirm against pdf_processing.
speakers = pdf.multiple_speakers(speech)
pprint(speakers)

"Obama:" count: 0
"President:" count: 0
"Question:" count: 0
"Audience:" count: 0
"Member:" count: 0
['objectives:', 'clear:', 'simply:']


Replace or delete some characters to clean the speech.

In [13]:
# (pattern, replacement) pairs, kept together so that a pattern can never get
# out of step with its replacement. Order matters, assuming pdf.replace()
# applies the patterns one after another (as the notebook-local replace() does
# -- confirm in pdf_processing):
#   * doubled curly single quotes (used as double quotes) are removed BEFORE
#     single curly quotes are normalised, otherwise they can never match;
#   * the "[Source: http...]" remover stops at the first closing bracket
#     instead of greedily running to the last "]" in the speech;
#   * it runs before the generic bracket stripper, which would otherwise
#     destroy the brackets it anchors on;
#   * whitespace is collapsed last so that removals cannot leave double spaces.
replacements = [
    (r'-+', r' '),
    (r'\.{2,}', r' '),
    (r'’’', r''),
    (r'‘‘', r''),
    (r'[’‘]', r"'"),
    (r'"', r''),
    (r'“', r''),
    (r'”', r''),
    (r'\[sic\]', r' '),
    (r'\[Source:\s*http[^\]]*\]', r''),
    (r'[\[\]\(\)]', r''),
    (r'\s+', r' '),
]
old = [pattern for pattern, _ in replacements]
new = [replacement for _, replacement in replacements]

clean_speech = pdf.replace(speech, old, new)
print(clean_speech)

Good evening. Nearly 10 years ago, America suffered the worst attack on our shores since Pearl Harbor. This mass murder was planned by Osama bin Laden and his al Qaeda network in Afghanistan, and signaled a new threat to our security one in which the targets were no longer soldiers on a battlefield, but innocent men, women and children going about their daily lives. In the days that followed, our nation was united as we struck at al Qaeda and routed the Taliban in Afghanistan. Then, our focus shifted. A second war was launched in Iraq, and we spent enormous blood and treasure to support a new government there. By the time I took office, the war in Afghanistan had entered its seventh year. But al Qaeda's leaders had escaped into Pakistan and were plotting new attacks, while the Taliban had regrouped and gone on the offensive. Without a new strategy and decisive action, our military commanders warned that we could face a resurgent al Qaeda and a Taliban taking over large parts of Afghani

Old code, used for debugging.

In [None]:
# Debugging leftovers. NOTE: this cell rebinds `pdfs`, `pdf_dir` and `pdf`
# from the cells above -- `pdfs` becomes a list of title strings and `pdf`
# becomes a raw pdfplumber document instead of a PDFHandler.
pdfs = [
    "Farewell_to_Staff_and_Supporters",
    "Flint_Michigan_Community",
    "Guantanamo_Bay_Closing",
    "Post_G7_Presser_Japan",
]

pdf_dir = Path.cwd().parent / "pdfs"
file_to_open = f"{pdfs[2]}.pdf"
pdf = pdfplumber.open(pdf_dir.joinpath(file_to_open))

print('Title:', pdf.metadata['Title'])
print("number of pages:", len(pdf.pages))

In [None]:
# Raw text of the first page of the pdfplumber document opened above.
text = pdf.pages[0].extract_text()

# Pattern pieces. `date` opens a capturing group that is only closed at the
# end of `auth`, so the whole "delivered ... AUTHENTICITY CERTIFIED" header is
# one optional unit.
start = r"(?:hheettoorriicc\.\.ccoomm)"
date = r"(.*delivered\s+(?P<day>[ 123][0-9])\s+(?P<mon>[A-Z][a-z]+)\s+(?P<year>[0-9][0-9][0-9][0-9]),"
loc = r"\s+(?P<location_small>[A-Za-z0-9 ]+),\s+(?P<location_big>.*)\n+"
auth = r"(?:.*AUTHENTICITY CERTIFIED: Text version below transcribed directly from audio))?"
mid = r"\s*\n+(?P<content>.*)\s*\n+"
end = r"(?:(?:Property\s+of\s+)?AmericanRhetoric\.com)"

core_pat = re.compile("".join((start, date, loc, auth, mid, end)), re.DOTALL)

print("ORIGINAL:\n")
print(text)
print("\n\n" + "-" * 100 + "\n\n")

# Pull the speech body out of the page text.
search = core_pat.search(text)
core = search.group("content")

# Clean-up passes, applied in order: dash runs, dot runs, whitespace runs.
dash = re.compile(r"-+")
no_dash_core = dash.sub(" ", core)

dots = re.compile(r"\.{2,}")
no_dots_core = dots.sub(" ", no_dash_core)

spaces = re.compile(r"\s+")
single_space_core = spaces.sub(" ", no_dots_core)

print("PROCESSED:\n")
print(single_space_core)

In [None]:
def extract_speech(pages):
    """Extract and lightly clean the speech text from a sequence of PDF pages.

    For every page, the text between the site banner (optionally followed by
    the authenticity notice) and the "AmericanRhetoric.com" footer is taken,
    runs of dashes, dots and whitespace are collapsed to a single space, and
    the per-page results are concatenated, each followed by one space.

    Parameters
    ----------
    pages : iterable
        Objects exposing ``extract_text()`` (e.g. ``pdf.pages`` of a
        pdfplumber document).

    Returns
    -------
    str
        The concatenated, cleaned page contents. Pages without extractable
        text or without the banner/footer markers contribute nothing.
    """
    start = r"hheettoorriicc\.\.ccoomm(?:.*AUTHENTICITY CERTIFIED: Text version below transcribed directly from audio)?"
    # Lazy match: a greedy ".*" backtracks only as far as the space before
    # "AmericanRhetoric.com" and therefore swallows the footer's "Property of".
    mid = r"\s+(?P<content>.*?)\s+"
    end = r"(?:Property of )?AmericanRhetoric\.com"
    core_pat = re.compile(start + mid + end, re.DOTALL)

    cleaned_pages = []
    for page in pages:
        # Guard against pages for which no text could be extracted (assumes
        # extract_text() may then give None or an empty string).
        text = page.extract_text() or ""

        match = core_pat.search(text)
        if match is None:
            # No banner/footer on this page: skip it instead of failing with
            # an AttributeError on `None.group`.
            continue

        core = replace(match.group("content"), [r"-+", r"\.{2,}", r"\s+"], r" ")
        cleaned_pages.append(core + " ")

    return "".join(cleaned_pages)


def replace(text, old, new):
    """Apply a series of regex substitutions to ``text``, in order.

    Parameters
    ----------
    text : str
        The text to clean.
    old : sequence of str
        Regular-expression patterns, applied one after another.
    new : str or sequence of str
        Either a single replacement used for every pattern, or one
        replacement per pattern (same length as ``old``).

    Returns
    -------
    str
        The text after all substitutions.

    Raises
    ------
    ValueError
        If ``new`` is a sequence whose length differs from ``old``.
    """
    if isinstance(new, str):
        replacements = [new] * len(old)
    else:
        replacements = list(new)
        if len(replacements) != len(old):
            raise ValueError(
                "old and new must have the same length "
                f"(got {len(old)} patterns and {len(replacements)} replacements)"
            )

    for pattern, replacement in zip(old, replacements):
        text = re.sub(pattern, replacement, text)
    return text


In [None]:
# Calls the notebook-local extract_speech() defined in the cell above (not the
# PDFHandler method) on the pages of the pdfplumber document opened earlier.
full_speech = extract_speech(pdf.pages)
print(full_speech)

In [None]:
# Release the file handle held by the pdfplumber document opened in the
# debugging cell above.
pdf.close()