In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import re, regex, csv, sys
from pprint import pprint

# add project folders to the system path
# (guarded, so re-running this cell does not append the same entry again)
src_path = str(Path.cwd().parent / "src")
if src_path not in sys.path:
    sys.path.append(src_path)
# python file for the PDFHandler class (located in the src folder)
from pdf_processing import PDFHandler

In [2]:
# PDFs with one of these words in the title might have to be excluded:
# interview, presser, Press_Conference, press, meeting, debate, hall (maybe)
multiple_people_talking = [
    'Barack_Obama_-_Al-Arabiya_Interview.pdf',
    'CGI_2013.pdf',
    'Midterm_Elections_Presser_2014.pdf',
    'YSEALI_Fellows.pdf',
    'PM_Abe_of_Japan_Joint_Presser.pdf',
    'Paris_Press_Conference_2015.pdf',
    'Gun_Violence_Presser.pdf',
    'Associated_Press_Luncheon.pdf',
    'Final_Press_Conference.pdf',
    'CNN_Guns_in_America_Town_Hall.pdf',
    'START_Treaty_Presser.pdf',
    'Midterm_Elections_Presser_2010.pdf',
    'Joint_Presser_with_President_Benigno_Aquino.pdf',
    'Worldwide_Troop_Talk.pdf',
    'Post_ASEAN_Presser_2016.pdf',
    'Post_G7_Presser_Japan.pdf',
    'Presser_Unannounced_Heath_Care.pdf',
    'Kenya_Civil_Society_Meeting.pdf',
    'Obama-Romney_-_Second_Live_Debate.pdf',
    'Jamaica_Town_Hall.pdf',
    'YSEALI_Town_Hall.pdf',
    'BP_Oil_Spill_Presser.pdf',
    'VA_Misconduct_Allegations.pdf',
    'ISIL_Pentagon_Update.pdf',
    'ISIL_Presser_Post_Security_Council_Meeting.pdf',
]
# further candidates (undecided):
# 'Alabama_Tornado_Devestation.pdf', 'NAACP_Coference_2015.pdf'

# Known problems in the source PDFs:
# - the year for Community_College_Plan.pdf is 20105 but should be 2015
# - Recovery_and_Reinvestment_Act_2016.pdf has the year missing from the date

Get the path to the directory in which the PDFs are stored.

In [3]:
# the "pdfs" folder sits next to the current working directory
pdf_dir = Path.cwd().parent.joinpath("pdfs")

Collect the filepaths of all the PDFs in the folder `pdf_dir`. This list can be used to iterate over all the PDFs and store the extracted speeches in a dataframe.

In [4]:
# Sort the paths: the order in which glob yields files is not guaranteed, and
# later cells select PDFs by position (pdfs[0], index ranges), so a stable
# order is needed to get the same file for the same index on every run.
pdfs = sorted(pdf_dir.glob('*.pdf'))
print("current number of PDFs:", len(pdfs))

current number of PDFs: 4


Get filepath of the PDF you want to process.

In [5]:
# use the first PDF in the list as the example file for the following cells
filepath = pdfs[0]
print("File:", filepath.name)

File: Post_G7_Presser_Japan.pdf


Make a PDFHandler object for the given filepath.

In [6]:
# handler for the selected PDF; the following cells call its methods
# (original_page, extract_speech, print_info, ...)
pdf = PDFHandler(filepath)

Print the last page of the PDF before it has been processed.

In [7]:
# index -1 requests the last page; the text is shown as it is before any processing
print(pdf.original_page(-1))

  
AA RR
mmeerriiccaann hheettoorriicc..ccoomm  
 
President Obama:  Okay, guys.  I gave you a couple -- I already gave you bonuses.  I gave 
 
you a bunch of bonuses.  Thank you, guys.
 
AmericanRhetoric.com         Page 11 


Print the first page of the PDF before it has been processed.

In [8]:
# index 0 requests the first page, which holds the title, date and location lines
print(pdf.original_page(0))

  
AA RR
mmeerriiccaann hheettoorriicc..ccoomm  
 
Barack Obama 
Post G7 Press Conference in Japan 
 
delivered 26 May 2016, Shima City, Japan
 
 
AUTHENTICITY CERTIFIED: Text version below transcribed directly from audio 
Hello, everybody.  So, as all of you know, we're going to Hiroshima tomorrow.  And in the 
interest of getting you all home at a reasonable hour, we're not going to be doing a press 
 
conference after, so I thought I'd give you guys a chance to fire off some questions now.
Just a quick comment on the G7 meeting so far.  It's been extremely productive.  I think that 
one of the benefits of the G7 is that you have likeminded countries who are committed to 
democracy and free markets, and international law and international norms.  And so for us to 
be able to get together and focus on critical issues that not only affect individual countries but 
affect the international order I think is vitally important.  And we very much appreciate the 
 
work that the Japanese and

Define a regular expression to get the date, location, and content of the speech. Extract the entire speech from the PDF.

In [9]:
# Optional site banner at the start of the text. Three spellings are accepted:
# the doubled-letter form spread over two lines ("AA RR" followed by
# "mmeerriiccaann hheettoorriicc..ccoomm"), the plain "AmericanRhetoric.com",
# and the doubled-letter form written as one word.
start = r"^(?:\s*AA *RR\s+mmeerriiccaann *hheettoorriicc\.\.ccoomm|\s*AmericanRhetoric\.com|\s*AAmmeerriiccaannRRhheettoorriicc\.\.ccoomm)?"

# Optional keyword in front of the date ("delivered", "First Broadcast", ...).
datemarker =r"\s*(delivered|First *Broadcast|Uploaded[ a-z]*|Published)?"
# Date written as "<day> <month>" or "<month> <day>", an optional comma, then
# the year. The year is [0-9]* and may therefore be empty. The group names
# day/mon are used in both alternatives, which the `regex` module permits.
date = r"(((?P<day>[0-9]{1,2}) *(?P<mon>[a-z]{1,10}))|((?P<mon>[a-z]{1,10}) *(?P<day>[0-9]{1,2}))),? +(?P<year>[0-9]* *)"
# Optional location after a "," or "." (optionally followed by the word
# "location"); the location text itself is captured lazily.
loc = r"(([,.] *location *)?[,.] *(?P<location>.+?))?"
# Optional authenticity notice, with or without "and edited for continuity".
auth = r"(?:.?AUTHENTICITY CERTIFIED: Text version below transcribed directly from audio( *and edited for continuity)?.?)?"

# The speech: a greedy capture of everything between the header and the footer.
content = r"(?P<content>.*)"

# Footer pieces: optional "Transcription by <name>", optional site name
# (optionally preceded by "Property of"), and a required "Page <number>"
# that has to end its line.
tr = r"(Transcription *by *[a-z.\- ]+\s*)?"
url = r"((Property *of *)?AmericanRhetoric\.com)?"
page = r"(page *[0-9]+\s*$)"

# Assembly order: banner, then an optional header block that lazily skips
# ahead to a line starting with the (marker +) date and continues with the
# location and the authenticity notice, then the content, then the footer.
full_pat = start + \
r"(\s+.*?" + r"^(" + datemarker + " *" + date + r")" + loc + r"\s+" + auth + r")?" + \
r"\s+" + content + \
r"^(?:\s*" + tr + url + r".*?" + page + r")"
# Flags: I = ignore case, DOTALL = "." also matches newlines,
# M = "^"/"$" match at line boundaries, V1 = the regex module's version-1 behaviour.
pat = regex.compile(full_pat, regex.I | regex.DOTALL | regex.M | regex.V1)

# apply the pattern through the handler and show the extracted speech
speech = pdf.extract_speech(pat)
print(speech)

Hello, everybody.  So, as all of you know, we're going to Hiroshima tomorrow.  And in the 
interest of getting you all home at a reasonable hour, we're not going to be doing a press 
 
conference after, so I thought I'd give you guys a chance to fire off some questions now.
Just a quick comment on the G7 meeting so far.  It's been extremely productive.  I think that 
one of the benefits of the G7 is that you have likeminded countries who are committed to 
democracy and free markets, and international law and international norms.  And so for us to 
be able to get together and focus on critical issues that not only affect individual countries but 
affect the international order I think is vitally important.  And we very much appreciate the 
 
work that the Japanese and Prime Minister Abe have done in organizing an excellent meeting.
So far, we've discussed issues of the global economy and the need to continue to accelerate 
growth, to use all the tools at our disposal to make sure that w

Print the relevant info of the PDF.

In [10]:
# prints the handler's summary of the PDF (the output below lists title, number of pages, date and location)
pdf.print_info()

Title: Post_G7_Presser_Japan
Number of pages: 11
Date: 26 May 2016
Location: Shima City, Japan


Check whether there are multiple speakers. (If there is a high "Obama:" count, this might indicate he is answering questions, i.e., it is an interview, press conference, debate,...)

In [11]:
# speaker labels to look for in the extracted speech
count = ('obama:', 'president:', 'question:', 'audience:', 'member:')
counts, speakers = pdf.multiple_speakers(speech, count)

# totals per label, an empty line, then the returned list of speaker labels
print(counts, end='\n\n')
pprint(speakers)

{'obama:': 10, 'president:': 0, 'question:': 12, 'audience:': 0, 'member:': 0}

['question:',
 'obama:',
 'question:',
 'obama:',
 'question:',
 'obama:',
 'question:',
 'obama:',
 'question:',
 'obama:',
 'question:',
 'obama:',
 'question:',
 'question:',
 'obama:',
 'question:',
 'question:',
 'obama:',
 'question:',
 'obama:',
 'question:',
 'obama:']


Replace substrings.

In [12]:
# (substring to replace, replacement) pairs: the quote characters are dropped
# and '[sic]' is turned into a single space
substring_replacements = [
    (r'"', r''),
    (r'’’', r''),
    (r'‘‘', r''),
    (r'“', r''),
    (r'”', r''),
    (r'[sic]', r' '),
]
# split the pairs into the two parallel lists that substring_replace receives
rep_old, rep_new = (list(column) for column in zip(*substring_replacements))

clean_speech = pdf.substring_replace(speech, rep_old, rep_new)

Replace or delete some characters to clean the speech (only the ones that need to be matched with a regular expression).

In [13]:
# (regular expression, replacement) pairs:
#   - any run of whitespace becomes one space
#   - bracketed source links ("[Source: http...]" or "[http...]") are removed
#   - remaining square brackets and parentheses are removed
regex_replacements = [
    (r'\s+', r' '),
    (r'\[(Source:)?\s*http.*?\]', r''),
    (r'[\[\]\(\)]', r''),
]
# split the pairs into the two parallel lists that match_replace receives
match_old, match_new = (list(column) for column in zip(*regex_replacements))

clean_speech = pdf.match_replace(clean_speech, match_old, match_new)
print(clean_speech)

Hello, everybody. So, as all of you know, we're going to Hiroshima tomorrow. And in the interest of getting you all home at a reasonable hour, we're not going to be doing a press conference after, so I thought I'd give you guys a chance to fire off some questions now. Just a quick comment on the G7 meeting so far. It's been extremely productive. I think that one of the benefits of the G7 is that you have likeminded countries who are committed to democracy and free markets, and international law and international norms. And so for us to be able to get together and focus on critical issues that not only affect individual countries but affect the international order I think is vitally important. And we very much appreciate the work that the Japanese and Prime Minister Abe have done in organizing an excellent meeting. So far, we've discussed issues of the global economy and the need to continue to accelerate growth, to use all the tools at our disposal to make sure that we're not only putt

Output to csv file.

In [14]:
# visually check the row that will be written and compare the first and last
# characters of the extracted contents with the original page
def visual_check(nb, original_page, row):
    """Print a PDF's original page next to the row extracted from it.

    nb            -- number printed after "pdf" to identify the file
    original_page -- text of the page to compare against
    row           -- sequence whose last item is the speech text; all items
                     before it are printed as they are
    """
    short_rule = 50 * '-'
    long_rule = 100 * '-'
    content = row[-1]

    # each tuple holds the arguments of one print call
    sections = (
        ("pdf", nb, '\n\n'),
        (original_page, '\n\n' + short_rule + '\n'),
        (row[:-1], '\n\n'),
        (content[:500], '\n\n' + short_rule + '\n'),
        (content[-500:], '\n\n' + long_rule + '\n\n'),
    )
    for section in sections:
        print(*section)

In [None]:
csv_dir = Path.cwd().parent / "speeches_csv"
csv_filename = csv_dir / "all_speeches.txt"

# column names
fields = ["title", "pages", "date", "location", "highest_speaker_count", "content"]

# make sure the output folder exists before the file is opened
csv_dir.mkdir(parents=True, exist_ok=True)

# use 'a+' instead of 'w' if you want to append at the bottom of the file instead of replacing the file
# newline='' is what the csv module requires for files handed to csv.writer
# (without it extra blank rows can appear on Windows); the encoding is set
# explicitly so the typographic characters in the speeches are written the
# same way on every platform
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    # the column names are written once, at the top of the file
    csvwriter.writerow(fields)

    for i, filepath in enumerate(pdfs):
        pdf = PDFHandler(filepath)
        try:
            # full_extract receives the pattern, the speaker labels and both
            # pairs of replacement lists defined in the earlier cells
            row = np.array(pdf.full_extract(pat, count, rep_old, rep_new, match_old, match_new))
            visual_check(i, pdf.original_page(0), row)
            csvwriter.writerow(row)
        finally:
            # close the PDF even when extraction or writing fails
            pdf.close_file()

The speeches were originally written to csv files in batches instead of all at once, and the separate csv files were merged.

In [None]:
csv_dir = Path.cwd().parent / "speeches_csv"
csv_filename = csv_dir / "speeches_436.txt"

fields = ["title", "pages", "date", "location", "highest_speaker_count", "content"]

# newline='' is what the csv module requires for files handed to csv.writer;
# the encoding is set explicitly so the output does not depend on the platform default
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    # positions in `pdfs` that belong to this batch
    for i in range(420,436):
        # only a batch that starts at index 0 gets the column names
        if i==0:
            csvwriter.writerow(fields)

        pdf = PDFHandler(pdfs[i])
        try:
            # match_old / match_new are the regex replacement lists defined in an
            # earlier cell; the names re_old / re_new that were used here are not
            # defined anywhere in the visible notebook and raised a NameError
            row = np.array(pdf.full_extract(pat, count, rep_old, rep_new, match_old, match_new))
            visual_check(i, pdf.original_page(0), row)
            csvwriter.writerow(row)
        finally:
            # close the PDF even when extraction or writing fails
            pdf.close_file()

In [None]:
# column layout used by the cells that write the csv files
fields = ["title", "pages", "date", "location", "highest_speaker_count", "content"]

path_merged = csv_dir / "all_speeches.txt"
# The merged file lives in the same folder and also ends in .txt, so it is
# left out of the inputs; otherwise it would be read back into itself.
csvs = sorted(path for path in csv_dir.glob('*.txt') if path != path_merged)

frames = []
for path in csvs:
    # Read every row as plain text and without header inference: batch files
    # that carry no header line would otherwise lose their first speech.
    # keep_default_na=False keeps empty fields as empty strings.
    frame = pd.read_csv(path, header=None, names=fields, dtype=str, keep_default_na=False)
    # drop the header line of the files that do have one
    if not frame.empty and frame.iloc[0].tolist() == fields:
        frame = frame.iloc[1:]
    frames.append(frame)

# Write the result once, with a single header, replacing any earlier merged
# file so that re-running the cell does not duplicate rows.
if frames:
    pd.concat(frames, ignore_index=True).to_csv(path_merged, index=False, header=True, mode='w')