In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import re
import regex
from pprint import pprint

import sys
src_path = str(Path.cwd().parent / "src")
sys.path.append(src_path)

import csv 

# Load data

Read the csv file (first row contains the column names), specify the data types.

In [2]:
csv_dir = Path.cwd().parent / "speeches_csv" 
speeches_path = csv_dir / "all_speeches.txt"
dtypes={'title':'string', 'pages':'int64', 'date':'string', 'location':'string', 
        'highest_speaker_count':'int64', 'content':'string'}
df = pd.read_csv(speeches_path, header=0, dtype=dtypes)
df.head()

Unnamed: 0,title,pages,date,location,highest_speaker_count,content
0,CGI_2013,19,24 September 2013,"Sheraton New York Hotel and Towers, New York, ...",10,Hillary Clinton: Thank you very much. I have t...
1,Prayer_Breakfast_2016,7,4 February 2016,"Washington Hilton, Washington, D.C.",0,"Well, good morning. Giving all praise and hono..."
2,Security_Team_Announcement,5,1 December 2008,"Chicago, Illinois",0,"Good morning, everybody. I hope you all had a ..."
3,Cairo_University,14,4 June 2009,"Cairo, Egypt",0,Thank you so much. Good afternoon. I am honore...
4,Umpqua_Community_College_Shootings,4,1 October 2015,"Washington, D.C.",0,There's been another mass shooting in America ...


In [3]:
df.dtypes

title                    string
pages                     int64
date                     string
location                 string
highest_speaker_count     int64
content                  string
dtype: object

# Dates

Some dates had the year missing. The years for several speeches were wrong due to typos.

In [4]:
temp = df.loc[:, ['title','date']]
temp['has_year'] = temp.apply(lambda row: row['date'][-4:].isnumeric(), axis=1)
temp.loc[temp.has_year==False, :]

Unnamed: 0,title,date,has_year
256,Recovery_and_Reinvestment_Act_2016,26 February,False
265,Post_Iran_Nuclear_Accord_Presser,15 July,False


Edit the dates that need to be corrected.

In [5]:
titles = ['Community_College_Plan', 'Recovery_and_Reinvestment_Act_2016', 'Post_Iran_Nuclear_Accord_Presser', 
          'Hispanic_Chamber_of_Commerce', 'Iraq_and_Ukraine', 'Barack_Obama_-_Weekly_Address_01-01-16']
dates = ['9 January 2015', '26 February 2016', '15 July 2015', '10 March 2009', '28 August 2014', 
         '1 January 2016']

for i in range(len(titles)):
    print(df.loc[df.title==titles[i],'date'])
    df.loc[df.title==titles[i],'date'] = dates[i]
    print(df.loc[df.title==titles[i],'date'], '\n')

78    9 January 20105
Name: date, dtype: string
78    9 January 2015
Name: date, dtype: string 

256    26 February 
Name: date, dtype: string
256    26 February 2016
Name: date, dtype: string 

265    15 July 
Name: date, dtype: string
265    15 July 2015
Name: date, dtype: string 

352    10 March 2010
Name: date, dtype: string
352    10 March 2009
Name: date, dtype: string 

217    29 August 2014
Name: date, dtype: string
217    28 August 2014
Name: date, dtype: string 

335    1 January 2015
Name: date, dtype: string
335    1 January 2016
Name: date, dtype: string 



Parse the dates.

In [6]:
df['date'] = pd.to_datetime(df['date'], dayfirst=True, format="%d %B %Y")
df.head()

Unnamed: 0,title,pages,date,location,highest_speaker_count,content
0,CGI_2013,19,2013-09-24,"Sheraton New York Hotel and Towers, New York, ...",10,Hillary Clinton: Thank you very much. I have t...
1,Prayer_Breakfast_2016,7,2016-02-04,"Washington Hilton, Washington, D.C.",0,"Well, good morning. Giving all praise and hono..."
2,Security_Team_Announcement,5,2008-12-01,"Chicago, Illinois",0,"Good morning, everybody. I hope you all had a ..."
3,Cairo_University,14,2009-06-04,"Cairo, Egypt",0,Thank you so much. Good afternoon. I am honore...
4,Umpqua_Community_College_Shootings,4,2015-10-01,"Washington, D.C.",0,There's been another mass shooting in America ...


The `date` column now has type `datetime`.

In [7]:
df.dtypes

title                            string
pages                             int64
date                     datetime64[ns]
location                         string
highest_speaker_count             int64
content                          string
dtype: object

# Locations

Locations that specify a specific place in the White House can be replaced by `White House, Washington D.C.`.

In [8]:
contains_WH = df.location.str.contains("White House", flags=re.I)
df.loc[contains_WH, 'location'] = "White House, Washington D.C."

Make a `country`column, values for `White House` can already be filled.

In [9]:
df.loc[contains_WH, 'country'] = "USA"
df.loc[~contains_WH, 'country'] = ""

Set country to `USA` for locations that contain state names or state abbreviations. In case it contains the abbreviation, replace it by the full state name.

In [10]:
states_full = ['Alabama','Alaska','Arizona','Arkansas','California','Colorado','Connecticut','Delaware','Florida','Georgia','Hawaii','Idaho','Illinois','Indiana','Iowa','Kansas','Kentucky','Louisiana','Maine','Maryland','Massachusetts','Michigan','Minnesota','Mississippi','Missouri','Montana','Nebraska','Nevada','New Hampshire','New Jersey','New Mexico','New York','North Carolina','North Dakota','Ohio','Oklahoma','Oregon','Pennsylvania','Rhode Island','South Carolina','South Dakota','Tennessee','Texas','Utah','Vermont','Virginia','Washington','West Virginia','Wisconsin','Wyoming']
states_abbr = ['AL','AK','AZ','AR','CA','CO','CT','DE','FL','GA','HI','ID','IL','IN','IA','KS','KY','LA','ME','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','OH','OK','OR','PA','RI','SC','SD','TN','TX','UT','VT','VA','WA','WV','WI','WY']

for state in states_full:
    contains = df.location.str.contains(state, flags=re.I)
    df.loc[contains, 'country'] = "USA"
    
for i in range(len(states_abbr)):
    contains = df.location.str.contains(r", \b"+states_abbr[i]+r"\b", flags=re.I)
    df.loc[contains, 'country'] = "USA"
    df['location'] = df.location.str.replace(
        r", \b"+states_abbr[i]+r"\b", repl=", "+states_full[i], flags=re.I, regex=True)

In [11]:
df.loc[df.country=="USA", :]

Unnamed: 0,title,pages,date,location,highest_speaker_count,content,country
0,CGI_2013,19,2013-09-24,"Sheraton New York Hotel and Towers, New York, ...",10,Hillary Clinton: Thank you very much. I have t...,USA
1,Prayer_Breakfast_2016,7,2016-02-04,"Washington Hilton, Washington, D.C.",0,"Well, good morning. Giving all praise and hono...",USA
2,Security_Team_Announcement,5,2008-12-01,"Chicago, Illinois",0,"Good morning, everybody. I hope you all had a ...",USA
4,Umpqua_Community_College_Shootings,4,2015-10-01,"Washington, D.C.",0,There's been another mass shooting in America ...,USA
5,White_House_Correspondent_Dinner_2013,6,2013-04-27,"Washington Hilton Hotel, Washington, D.C.",0,"Thank you. Thank you, everybody. How do you li...",USA
...,...,...,...,...,...,...,...
421,VFW_2012,9,2012-07-23,"Reno, Nevada",0,Thank you! Hello VFW! Thank you so much. Pleas...,USA
423,Wakefield_Back_to_High_School,7,2009-09-08,"Arlington, Virginia",0,"Hello, everybody! Thank you. Thank you. Thank ...",USA
424,CIA_First_Speech,5,2009-04-20,"Langley, Virginia",1,Thank you for the extraordinary welcome. And t...,USA
429,Second_Democratic_Nomination_Acceptance,9,2012-09-06,"Charlotte, North Carolina",0,"Thank you, so much. Thank you. Thank you very ...",USA


In [12]:
df.loc[(df.country!="USA") & (df.location!="unknown_location"), :]

Unnamed: 0,title,pages,date,location,highest_speaker_count,content,country
3,Cairo_University,14,2009-06-04,"Cairo, Egypt",0,Thank you so much. Good afternoon. I am honore...,
14,PM_Abe_of_Japan_Joint_Presser,14,2014-04-24,"Akasaka Palace, Tokyo, Japan",10,Prime Minister Abe: As interpreted. On behalf ...,
15,UN_Copenhagen_Climate_Change,3,2009-12-18,"Copenhagen, Denmark",0,Good morning. It is an honor for me to join th...,
16,People_of_Greece,15,2016-11-16,"Stavros Niarchos Foundation Cultural Center, A...",0,"Hello, Greece! Yia sas! Kalispera! Good evenin...",
17,India_Speech_at_New_Delhi,9,2015-01-27,"Siri Fort Auditorium, New Delhi, India",0,"Thank you so much. Thank you so much, Neha, fo...",
...,...,...,...,...,...,...,...
409,Saudi_Arabia_Presser,6,2005-04-21,"Diriyah Palace, Riyadh, Saudi Arabia",5,President Obama: You already heard my statemen...,
426,Hiroshima_Peace_Memorial_Address,5,2016-05-27,"Hiroshima, Japan",0,"Seventy-one years ago, on a bright, cloudless ...",
430,Shimon_Peres_Memorial,6,2016-09-30,"Mount Herzl, Jerusalem, Israel",0,"Zvia, Yoni, Chemi and generations of the Peres...",
431,ASEAN_Business_2015,9,2015-11-21,"Shangri-La Hotel, Kuala Lumpur, Malaysia",0,"Thank you so much. Please be seated. Well, goo...",


Change `Washington, D.C.` (and some variations) to `Washington D.C.`.

In [13]:
df['location'] = df.location.str.replace("Washington, D.?C.?", repl="Washington D.C.", flags=re.I, regex=True)

If `country=='USA'`: We assume the last substring to be the state, the second to last the city and everything before that a more specific locations.


If `country!='USA'`: We assume the last substring to be the country, the second to last the city and everything before that a more specific locations.

In [14]:
df.loc[:, 'count_commas'] = df.loc[:, 'location'].str.count(',')
df.loc[:,['location','country','count_commas']].sort_values(by='count_commas')

Unnamed: 0,location,country,count_commas
217,unknown_location,,0
181,Washington D.C.,USA,0
189,unknown_location,,0
198,unknown_location,,0
199,James L. Knight International Center,,0
...,...,...,...
246,"Phillips Center for the Performing Arts, Orlan...",USA,2
245,"JFK Space Center, Merritt Island, Florida",USA,2
316,"Gostinny Dvor, Moscow, Russia",,2
0,"Sheraton New York Hotel and Towers, New York, ...",USA,2


## USA

### No commas

In [15]:
print(df.loc[(df.country=="USA") & (df.count_commas == 0), ['title','location']].sort_values(by='location'), '\n')

df.loc[df.title=='Ebola_CDC', ['state','city','specific_location']] = ['Georgia', 'Atlanta', 'no_specific_location']
df.loc[df.location=='Washington D.C.', ['state','city','specific_location']] = ['no_state', 'Washington D.C.', 
                                                                                'no_specific_location']

                                      title         location
308                               Ebola_CDC  Atlanta Georgia
258                 State_of_the_Union_2012  Washington D.C.
278                 Health_Care_Law_Signing  Washington D.C.
284  White_House_Correspondents_Dinner_2015  Washington D.C.
285                 Paris_Terrorist_Attacks  Washington D.C.
288                     Brookings_Institute  Washington D.C.
293   Go_Presidential_Election_Outcome_2016  Washington D.C.
298          Howard_University_Commencement  Washington D.C.
4        Umpqua_Community_College_Shootings  Washington D.C.
314      National_Holocaust_Memorial_Museum  Washington D.C.
321     Iftar_Dinner_on_Religious_Tolerance  Washington D.C.
322   Second_Presidential_Inaugural_Address  Washington D.C.
330   White_House_Correspondent_Dinner_2014  Washington D.C.
356              Syria_Speech_to_the_Nation  Washington D.C.
370        Final_State_of_the_Union_Address  Washington D.C.
371                     

### One comma + `Washington D.C.`

In [16]:
contains_WDC = df.location.str.contains("Washington D.C.", flags=re.I)
select = contains_WDC & (df.count_commas == 1)
print(df.loc[select, 'location'])

locations = df.loc[select, 'location'].str.extract(r"(.+), Washington D.C. *", flags=re.I)

df.loc[select, ['state','city']]  = ['no_state', 'Washington D.C.']
df.loc[select, 'specific_location'] = locations.values

1               Washington Hilton, Washington D.C.
5         Washington Hilton Hotel, Washington D.C.
8                     White House, Washington D.C.
11                    White House, Washington D.C.
12                    White House, Washington D.C.
                          ...                     
414                   White House, Washington D.C.
415              State Department, Washington D.C.
416           Eisenhower Building, Washington D.C.
418    U.S. Capitol Western Front, Washington D.C.
434                   White House, Washington D.C.
Name: location, Length: 106, dtype: string


### One comma + other

In [17]:
select = (df.country=="USA") & ~contains_WDC & (df.count_commas == 1)
print(df.loc[select, 'location'])

states = df.loc[select, 'location'].str.extract(r".+?, *(.+)", flags=re.I)
cities = df.loc[select, 'location'].str.extract(r"(.+?), *.+", flags=re.I)

df.loc[select, 'state']  = states.values
df.loc[select, 'city']  = cities.values
df.loc[select, 'specific_location'] = 'no_specific_location'

2              Chicago, Illinois
10          San Jose, California
20             Fairfax, Virginia
28             Beaverton, Oregon
31                Kailua, Hawaii
                 ...            
420          Arlington, Virginia
421                 Reno, Nevada
423          Arlington, Virginia
424            Langley, Virginia
429    Charlotte, North Carolina
Name: location, Length: 73, dtype: string


In [18]:
df.loc[select, ['location','country','state','city','specific_location']].sort_values(
    by= ['state','city','specific_location'])

Unnamed: 0,location,country,state,city,specific_location
61,"Alberta, Alabama",USA,Alabama,Alberta,no_specific_location
227,"Hilton San Francisco Union Square, California",USA,California,Hilton San Francisco Union Square,no_specific_location
163,"Rancho Mirage, California",USA,California,Rancho Mirage,no_specific_location
224,"Rancho Mirage, California",USA,California,Rancho Mirage,no_specific_location
10,"San Jose, California",USA,California,San Jose,no_specific_location
...,...,...,...,...,...
153,"Joint Base Myer-Henderson Hall, Virginia",USA,Virginia,Joint Base Myer-Henderson Hall,no_specific_location
242,"Langley, Virginia",USA,Virginia,Langley,no_specific_location
424,"Langley, Virginia",USA,Virginia,Langley,no_specific_location
204,"McLean, Virginia",USA,Virginia,McLean,no_specific_location


Some cities need corrections. Cities that don't have a space in them are all ok, we only need to look at ones with spaces.

In [19]:
contains = df.loc[select, 'city'].str.contains(" ", flags=re.I)
df.loc[select & contains, ['title', 'location','state','city','specific_location']].sort_values(
    by= ['state','city','specific_location'])

Unnamed: 0,title,location,state,city,specific_location
227,Mayors_Conference_2015,"Hilton San Francisco Union Square, California",California,Hilton San Francisco Union Square,no_specific_location
163,ASEAN_Summit_2016,"Rancho Mirage, California",California,Rancho Mirage,no_specific_location
224,Antonin_Scalia_Passing,"Rancho Mirage, California",California,Rancho Mirage,no_specific_location
10,Affordable_Care_Act_and_Domestic_Surveillance,"San Jose, California",California,San Jose,no_specific_location
174,White_House_Correspondents_Dinner_First,"Washington Hilton Hotel, D.C.",D.C.,Washington Hilton Hotel,no_specific_location
400,Beau_Biden_Eulogy,"St. Anthony of Padua Church Wilmington, Delaware",Delaware,St. Anthony of Padua Church Wilmington,no_specific_location
191,Gun_Violence_Denver,"Denver Police Academy, Denver. Colorado",Denver. Colorado,Denver Police Academy,no_specific_location
241,Second_Presidential_Election_Victory_Speech,"McCormick Place Chicago, Illinois",Illinois,McCormick Place Chicago,no_specific_location
247,Iowa_Caucus_Victory_Speech,"Des Moines, Iowa",Iowa,Des Moines,no_specific_location
319,Joplin_Tornado_Victims_Memorial,"Missouri Southern University, Joplin Missouri",Joplin Missouri,Missouri Southern University,no_specific_location


In [20]:
need_corrections = ['Mayors_Conference_2015','White_House_Correspondents_Dinner_First','Beau_Biden_Eulogy',
                   'Gun_Violence_Denver','Second_Presidential_Election_Victory_Speech','Joplin_Tornado_Victims_Memorial',
                   'American_Legion_Conference','Iraq_War_Camp_Lejeune','Martin_Dempsey_Retirement']

states = ['California', 'no_state', 'Delaware', 'Colorado', 'Illinois', 'Missouri', 'Minnesota', 'North Carolina', 
          'Virginia']
cities = ['San Francisco', 'Washington D.C.', 'Wilmington', 'Denver', 'Chicago', 'Joplin', 'Minneapolis', 'Jacksonville', 
          'Arlington']
locations = ['Hilton San Francisco Union Square Hotel', 'Washington Hilton Hotel', 'St. Anthony of Padua Church', 
             'Denver Police Academy', 'McCormick Place', 'Missouri Southern University', 'Minneapolis Convention Center', 
             'Camp Lejeune', 'Joint Base Myer-Henderson Hall']

df.loc[df.title.isin(need_corrections), 'state'] = states
df.loc[df.title.isin(need_corrections), 'city'] = cities
df.loc[df.title.isin(need_corrections), 'specific_location'] = locations
df.loc[df.title.isin(need_corrections), ['location','state','city','specific_location']].sort_values(
    by= ['state','city','specific_location'])

Unnamed: 0,location,state,city,specific_location
77,"Minneapolis Convention Center, Minnesota",California,San Francisco,Hilton San Francisco Union Square Hotel
191,"Denver Police Academy, Denver. Colorado",Colorado,Denver,Denver Police Academy
174,"Washington Hilton Hotel, D.C.",Delaware,Wilmington,St. Anthony of Padua Church
227,"Hilton San Francisco Union Square, California",Illinois,Chicago,McCormick Place
319,"Missouri Southern University, Joplin Missouri",Minnesota,Minneapolis,Minneapolis Convention Center
241,"McCormick Place Chicago, Illinois",Missouri,Joplin,Missouri Southern University
354,"Camp Lejeune, North Carolina",North Carolina,Jacksonville,Camp Lejeune
400,"St. Anthony of Padua Church Wilmington, Delaware",Virginia,Arlington,Joint Base Myer-Henderson Hall
153,"Joint Base Myer-Henderson Hall, Virginia",no_state,Washington D.C.,Washington Hilton Hotel


### Two commas

In [21]:
select = (df.country=="USA") & (df.count_commas == 2)
print(df.loc[select, 'location'])

states = df.loc[select, 'location'].str.extract(r".+?,.+?, *(.+)", flags=re.I)
cities = df.loc[select, 'location'].str.extract(r".+?, *(.+?),.+", flags=re.I)
locations = df.loc[select, 'location'].str.extract(r" *(.+?),.+?,.+", flags=re.I)

df.loc[select, 'state']  = states.values
df.loc[select, 'city']  = cities.values
df.loc[select, 'specific_location'] = locations.values

0      Sheraton New York Hotel and Towers, New York, ...
24          Harborside Event Center, Fort Myers, Florida
26              Kaneohe Bay Marine Base, Kaneohe, Hawaii
38        Veterans Memorial Auditorium, Des Moines, Iowa
53     Newport News Shipbuilding, Newport News, Virginia
56                    McCormick Place, Chicago, Illinois
65               Hofstra University, Hempstead, New York
71                         Del Sol HS, Las Vegas, Nevada
78     Pellissippi State Community College, Knoxville...
80          Lyndon Baines Johnson Library, Austin, Texas
92                    Lincoln Center, New York, New York
99                         Pentagon, Arlington, Virginia
102       Joint Base Myer-Henderson, Fort Myer, Virginia
106    Philadelphia Convention Center, Philadelphia, ...
107                    Sun Devil Stadium, Tempe, Arizona
109        Illinois State Capitol, Springfield, Illinois
111          Hyde Park Career Academy, Chicago, Illinois
137     Arlington National Ceme

In [22]:
df.loc[select, ['title','location','state','city','specific_location']].sort_values(
    by= ['state','city','specific_location'])

Unnamed: 0,title,location,state,city,specific_location
367,Selma_50_March,"Edmund Pettus Bridge, Selma, Alabama",Alabama,Selma,Edmund Pettus Bridge
107,ASU_Commencement,"Sun Devil Stadium, Tempe, Arizona",Arizona,Tempe,Sun Devil Stadium
381,Stanford_Cybersecurity_Summit,"Stanford University, Stanford, California",California,Stanford,Stanford University
220,US_Air_Force_Commencement_Speech,"United States Air Force Academy, Colorado Spri...",Colorado,Colorado Springs,United States Air Force Academy
237,Obama-Romney_-_First_Live_Debate,"Magness Arena, University of Denver, Colorado",Colorado,University of Denver,Magness Arena
190,Obama-Romney_-_Third_Live_Debate,"Lynn University, Boca Raton, Florida",Florida,Boca Raton,Lynn University
24,Aurora_Tragedy,"Harborside Event Center, Fort Myers, Florida",Florida,Fort Myers,Harborside Event Center
245,NASA_21st_Century,"JFK Space Center, Merritt Island, Florida",Florida,Merritt Island,JFK Space Center
246,Orlando_Community,"Phillips Center for the Performing Arts, Orlan...",Florida,Orlando,Phillips Center for the Performing Arts
259,On_Counterterrorism_Approach_2016,"MacDill Air Force Base, Tampa, Florida",Florida,Tampa,MacDill Air Force Base


Some cities need corrections.

In [23]:
need_corrections = ['Obama-Romney_-_First_Live_Debate', 'NY_NJ_Explosions', 'Afghanistan_War_Troop_Surge', 
                    'Tucson_Memorial_Address', 'Armed_Forces_Farewell']

states = ['Colorado', 'New York', 'New York', 'Arizona', 'Virginia']
cities = ['Denver', 'New York', 'West Point', 'Tucson', 'Arlington']
locations = ['University of Denver Magness Arena', 'Lotte New York Palace Hotel', 'Military Academy Eisenhower Hall', 
             'University of Arizona McKale Memorial Center', 'Fort Myer Joint Base Myer-Henderson']

df.loc[df.title.isin(need_corrections), 'state'] = states
df.loc[df.title.isin(need_corrections), 'city'] = cities
df.loc[df.title.isin(need_corrections), 'specific_location'] = locations
df.loc[df.title.isin(need_corrections), ['location','state','city','specific_location']].sort_values(
    by= ['state','city','specific_location'])

Unnamed: 0,location,state,city,specific_location
326,"Eisenhower Hall, West Point Military Academy, ...",Arizona,Tucson,University of Arizona McKale Memorial Center
102,"Joint Base Myer-Henderson, Fort Myer, Virginia",Colorado,Denver,University of Denver Magness Arena
237,"Magness Arena, University of Denver, Colorado",New York,New York,Lotte New York Palace Hotel
289,"Lotte New York Palace Hotel, New York City, Ne...",New York,West Point,Military Academy Eisenhower Hall
376,"McKale Memorial Center, University of Arizona,...",Virginia,Arlington,Fort Myer Joint Base Myer-Henderson


### Result for locations in USA

In [24]:
df.loc[df.country=="USA", ['location','country','state','city','specific_location']].sort_values(
    by=['state','city','specific_location'])

Unnamed: 0,location,country,state,city,specific_location
61,"Alberta, Alabama",USA,Alabama,Alberta,no_specific_location
367,"Edmund Pettus Bridge, Selma, Alabama",USA,Alabama,Selma,Edmund Pettus Bridge
107,"Sun Devil Stadium, Tempe, Arizona",USA,Arizona,Tempe,Sun Devil Stadium
326,"Eisenhower Hall, West Point Military Academy, ...",USA,Arizona,Tucson,University of Arizona McKale Memorial Center
163,"Rancho Mirage, California",USA,California,Rancho Mirage,no_specific_location
...,...,...,...,...,...
356,Washington D.C.,USA,no_state,Washington D.C.,no_specific_location
370,Washington D.C.,USA,no_state,Washington D.C.,no_specific_location
371,Washington D.C.,USA,no_state,Washington D.C.,no_specific_location
401,Washington D.C.,USA,no_state,Washington D.C.,no_specific_location


## Not USA, but known location

Note: some US locations don't have `country=='USA'` yet

### Zero commas

In [25]:
select = (df.country!="USA") & (df.location!="unknown_location") & (df.count_commas == 0)
df.loc[select, ['title','location']]

Unnamed: 0,title,location
44,Joint_Presser_with_President_Benigno_Aquino,Manila
114,Benghazi_Remains_Transfer,James S. Brady Press Briefing Room
168,Berlin_Address,Victory Column
199,Miami_Dade_College_Commencement,James L. Knight International Center
200,Strasbourg_Town_Hall,Rhenus Sports Arena
273,Hradany_Square_Prague,Czech Republic
348,Hurricane_Sandy_ERT,James S. Brady Press Briefing Room


In [26]:
titles = df.loc[select, 'title']
df.loc[df.title=='Joint_Presser_with_President_Benigno_Aquino', 
       ['country','state','city','specific_location']] = ['Philippines', 'no_state', 'Manila', 'no_specific_location']
df.loc[df.title=='Benghazi_Remains_Transfer', 
       ['country','state','city','specific_location']] = ['USA', 'no_state', 'Washington D.C.', 'White House']
df.loc[df.title=='Berlin_Address', 
       ['country','state','city','specific_location']] = ['Germany', 'no_state', 'Berlin', 'Victory Column']
df.loc[df.title=='Miami_Dade_College_Commencement', 
       ['country','state','city','specific_location']] = ['USA', 'Florida', 'Miami', 'James L. Knight International Center']
df.loc[df.title=='Strasbourg_Town_Hall', 
       ['country','state','city','specific_location']] = ['France', 'no_state', 'Strasbourg', 'Rhenus Sports Arena']
df.loc[df.title=='Hradany_Square_Prague', 
       ['country','state','city','specific_location']] = ['Czech Republic', 'no_state', 'Prague', 'Hradany Square']
df.loc[df.title=='Hurricane_Sandy_ERT', 
       ['country','state','city','specific_location']] = ['USA', 'no_state', 'Washington D.C.', 'White House']

df.loc[df.title.isin(titles), ['location','country','state','city','specific_location']].sort_values(
    by= ['country','state','city','specific_location'])

Unnamed: 0,location,country,state,city,specific_location
273,Czech Republic,Czech Republic,no_state,Prague,Hradany Square
200,Rhenus Sports Arena,France,no_state,Strasbourg,Rhenus Sports Arena
168,Victory Column,Germany,no_state,Berlin,Victory Column
44,Manila,Philippines,no_state,Manila,no_specific_location
199,James L. Knight International Center,USA,Florida,Miami,James L. Knight International Center
114,James S. Brady Press Briefing Room,USA,no_state,Washington D.C.,White House
348,James S. Brady Press Briefing Room,USA,no_state,Washington D.C.,White House


### One comma

In [27]:
select = (df.country!="USA") & (df.location!="unknown_location") & (df.count_commas == 1)
titles = df.loc[select, 'title']
df.loc[select, ['title','location']]

Unnamed: 0,title,location
3,Cairo_University,"Cairo, Egypt"
15,UN_Copenhagen_Climate_Change,"Copenhagen, Denmark"
21,Paris_Press_Conference_2015,"Issy-les-Moulineaux, France"
25,2004_DNC_Address,"Fleet Center, Boston"
39,Fort_Bonifacio,"Manila, Philippines"
51,Post_ASEAN_Presser_2016,"Vientiane, Laos"
52,Post_G7_Presser_Japan,"Shima City, Japan"
69,Chile_Latin_America,"Santiago, Chile"
79,Jamaica_Town_Hall,"Kingston, Jamaica"
94,Yangon_University,"Yangon, Myanmar"


In [28]:
countries = df.loc[df.title.isin(titles), 'location'].str.extract(r".+?, *(.+)", flags=re.I)
cities = df.loc[df.title.isin(titles), 'location'].str.extract(r" *(.+?),.+", flags=re.I)

df.loc[df.title.isin(titles), 'country']  = countries.values
df.loc[df.title.isin(titles), 'state']  = 'no_state'
df.loc[df.title.isin(titles), 'city']  = cities.values
df.loc[df.title.isin(titles), 'specific_location'] = 'no_specific_location'

some corrections

In [29]:
df.loc[df.title=='2004_DNC_Address', 
       ['country','state','city','specific_location']] = ['USA', 'New York', 'Boston', 'no_specific_location']
df.loc[df.title=='Afghanistan_US_Troops_Bagram', 
       ['country','state','city','specific_location']] = ['Afghanistan', 'no_state', 'Bagram', 'Bagram Air Field']
df.loc[df.title=='Peru_Press_Conference', 
       ['country','state','city','specific_location']] = ['Peru', 'no_state', 'Lima', 'Lima Convention Center']
df.loc[df.title=='Rio_de_Janeiro', 
       ['country','state','city','specific_location']] = ['Brazil', 'no_state', 'Rio de Janeiro', 'Teatro Municipal']
df.loc[df.title=='Bagram_Air_Base_December_2010', 
       ['country','state','city','specific_location']] = ['Afghanistan', 'no_state', 'Bagram', 'Bagram Air Field']
df.loc[df.title=='Ebenezer_Baptist', 
       ['country','state','city','specific_location']] = ['USA', 'Georgia', 'Atlanta', 'Ebenezer Baptist Church']
df.loc[df.title=='British_Parliament', 
       ['country','state','city','specific_location']] = ['England', 'no_state', 'London', 'Westminster Hall']
df.loc[df.title=='Ted_Kennedy_Eulogy', 
       ['country','state','city','specific_location']] = ['USA', 'New York', 'Boston', 'Our Lady of Perpetual Help Basilica']
df.loc[df.title=='Post_G7_Conference_Presser_2015', 
       ['country','state','city','specific_location']] = ['Germany', 'no_state', 'Krun', 'Elmau Briefing Center']
df.loc[df.title=='Mexico_Address', 
       ['country','state','city','specific_location']] = ['Mexico', 'no_state', 'Mexico City', 'Anthropological Museum']
df.loc[df.title=='Estonia_People', 
       ['country','state','city','specific_location']] = ['Estonia', 'no_state', 'Tallinn', 'Nordea Concert Hall']

In [30]:
df.loc[df.title.isin(titles), ['location','country','state','city','specific_location']].sort_values(
    by= ['country','state','city','specific_location'])

Unnamed: 0,location,country,state,city,specific_location
133,"Clamshell, Bagram Air Field",Afghanistan,no_state,Bagram,Bagram Air Field
166,"Parwan Province, Afghanistan",Afghanistan,no_state,Bagram,Bagram Air Field
192,"Brisbane, Australia",Australia,no_state,Brisbane,no_specific_location
304,"Canberra, Australia",Australia,no_state,Canberra,no_specific_location
161,"Teatro Municipal, Rio de Janeiro",Brazil,no_state,Rio de Janeiro,Teatro Municipal
309,"Rangoon, Burma",Burma,no_state,Rangoon,no_specific_location
69,"Santiago, Chile",Chile,no_state,Santiago,no_specific_location
333,"Hangzhou, China",China,no_state,Hangzhou,no_specific_location
15,"Copenhagen, Denmark",Denmark,no_state,Copenhagen,no_specific_location
3,"Cairo, Egypt",Egypt,no_state,Cairo,no_specific_location


### Two commas

In [31]:
select = (df.country!="USA") & (df.location!="unknown_location") & (df.count_commas == 2)
titles = df.loc[select, 'title']
df.loc[select, ['title','location']]

Unnamed: 0,title,location
14,PM_Abe_of_Japan_Joint_Presser,"Akasaka Palace, Tokyo, Japan"
16,People_of_Greece,"Stavros Niarchos Foundation Cultural Center, A..."
17,India_Speech_at_New_Delhi,"Siri Fort Auditorium, New Delhi, India"
18,Northern_Ireland_Speech,"Waterfront, Belfast, Northern Ireland"
58,Kenya_Civil_Society_Meeting,"Kenyatta University, Nairobi, Kenya"
82,YSEALI_Town_Hall,"Yangon University, Rangoon/Yangon, Burma/Myanmar"
98,COP21,"Le Bourget, Paris, France"
108,Nobel_Lecture,"Oslo City Hall, Oslo, Norway"
125,Brandenburg_Gate_Speech,"Pariser Platz, Berlin, Germany"
128,Joint_Presser_with_Raul_Castro,"Palace of the Revolution, Havana, Cuba"


In [32]:
countries = df.loc[df.title.isin(titles), 'location'].str.extract(r".+?,.+?, *(.+)", flags=re.I)
cities = df.loc[df.title.isin(titles), 'location'].str.extract(r".+?, *(.+?),.+", flags=re.I)
locations = df.loc[df.title.isin(titles), 'location'].str.extract(r" *(.+?),.+?,.+", flags=re.I)

df.loc[df.title.isin(titles), 'country']  = countries.values
df.loc[df.title.isin(titles), 'state']  = 'no_state'
df.loc[df.title.isin(titles), 'city']  = cities.values
df.loc[df.title.isin(titles), 'specific_location'] = locations.values

df.loc[df.title=='YSEALI_Town_Hall', 
       ['country','state','city','specific_location']] = ['Myanmar', 'no_state', 'Yangon', 'Yangon University']

In [33]:
df.loc[df.title.isin(titles), ['title','location','country','state','city','specific_location']].sort_values(
    by= ['country','state','city','specific_location'])

Unnamed: 0,title,location,country,state,city,specific_location
433,YLAI_Town_Hall,"Usina Del Arte, Buenos Aires, Argentina",Argentina,no_state,Buenos Aires,Usina Del Arte
239,European_Youth,"Palais des Beaux Arts, Brussels, Belgium",Belgium,no_state,Brussels,Palais des Beaux Arts
345,Odyssey_Dawn,"The Convention Center Brasil 21, Brasilia, Brazil",Brazil,no_state,Brasilia,The Convention Center Brasil 21
374,Canada_Parliament,"House of Commons Chamber, Ottawa, Canada",Canada,no_state,Ottawa,House of Commons Chamber
306,Cuba_People_Speech,"Gran Teatro de la Habana, Havana, Cuba",Cuba,no_state,Havana,Gran Teatro de la Habana
128,Joint_Presser_with_Raul_Castro,"Palace of the Revolution, Havana, Cuba",Cuba,no_state,Havana,Palace of the Revolution
193,G20_Remarks_and_Presser_2009,"Excel Center, London, England",England,no_state,London,Excel Center
195,Toomas_Ilves_Joint_Presser,"Bank of Estonia, Tallinn, Estonia",Estonia,no_state,Tallinn,Bank of Estonia
185,African_Union_Address,"African Union Headquarters, Addis Ababa, Ethiopia",Ethiopia,no_state,Addis Ababa,African Union Headquarters
343,DDay_70,"Omaha Beach, Normandy, France",France,no_state,Normandy,Omaha Beach


### Three commas

In [34]:
select = (df.country!="USA") & (df.location!="unknown_location") & (df.count_commas == 3)
df.loc[select, ['title','location']]

Unnamed: 0,title,location
261,UK_Young_Leaders,"Lindley Hall, Royal Horticulture Halls, London..."


In [35]:
df.loc[df.title=='UK_Young_Leaders', 
       ['country','state','city','specific_location']] = ['England', 'no_state', 'London', 'Lindley Hall, Royal Horticulture Halls']

### Result for locations not in USA

In [36]:
df.loc[(df.country!="USA") & (df.location!="unknown_location"), ['location','country','state','city','specific_location']].sort_values(
    by=['country','state','city','specific_location'])

Unnamed: 0,location,country,state,city,specific_location
133,"Clamshell, Bagram Air Field",Afghanistan,no_state,Bagram,Bagram Air Field
166,"Parwan Province, Afghanistan",Afghanistan,no_state,Bagram,Bagram Air Field
433,"Usina Del Arte, Buenos Aires, Argentina",Argentina,no_state,Buenos Aires,Usina Del Arte
192,"Brisbane, Australia",Australia,no_state,Brisbane,no_specific_location
304,"Canberra, Australia",Australia,no_state,Canberra,no_specific_location
...,...,...,...,...,...
378,"Ankara, Turkey",Turkey,no_state,Ankara,no_specific_location
187,"Kaya Palazzo Resort, Antalya, Turkey",Turkey,no_state,Antalya,Kaya Palazzo Resort
283,"National Convention Center, Hanoi, Vietnam",Vietnam,no_state,Hanoi,National Convention Center
408,"GEM Convention Center, Ho Chi Minh City, Vietnam",Vietnam,no_state,Ho Chi Minh City,GEM Convention Center


## Unknown locations

In [37]:
print('There are %i unknown locations.' % len(df.loc[df.location=="unknown_location", :]))

There are 89 unknown locations.


In [38]:
df.loc[df.location=="unknown_location", ['title', 'date', 'content']].sort_values(by='date')

Unnamed: 0,title,date,content
173,Knox_College_Commencement,2005-06-04,"Good morning President Taylor, Board of Truste..."
279,Senate_Floor_Speech_on_the_Patriot_Act,2005-12-15,"Thank you very much, Mr. President. You know, ..."
123,Senate_Floor_Speech_on_Voting_Rights_Act_Renewal,2006-07-20,"Mr. President, I rise today both humbled and h..."
105,Candidate_Exploratory_Announcement,2007-01-16,"As many of you know, over the last few months ..."
344,Senate_Speech_Iraq_Federalism_Amendment,2007-03-13,"Mr. President, I rise, first, to offer strong ..."
...,...,...,...
141,US_Detainees_Released,2016-01-17,"This is a good day, because, once again, we're..."
66,Weekly_Address_on_Cuba,2016-02-20,"Hi, everybody. This week, we made it official ..."
248,Barack_Obama_-_ISIL_Update_02-25-16,2016-02-25,"Good evening, everybody. I just met with my Na..."
238,Weekly_Address_Opioid_Addiction_Macklemore,2016-05-14,"President Obama: Hi, everybody. I've got a spe..."


A lot of them seem to be in Washington D.C., so set everything to these values and make the necessary corrections.

In [39]:
titles = df.loc[df.location=="unknown_location", 'title'].values
df.loc[df.title.isin(titles), 'country'] = 'USA'
df.loc[df.title.isin(titles), 'state'] = 'no_state'
df.loc[df.title.isin(titles), 'city'] = 'Washington D.C.'
df.loc[df.title.isin(titles), 'specific_location'] = 'no_specific_location'

In [40]:
temp = df.loc[df.location=="unknown_location", ['title', 'date']].sort_values(by='date')
temp.iloc[80:,:]

Unnamed: 0,title,date
131,Iran_Nuclear_Accord,2015-07-14
84,Burundi_People_Message,2015-11-13
311,Counter_ISIL_Campaign_Update,2015-12-14
335,Barack_Obama_-_Weekly_Address_01-01-16,2016-01-01
141,US_Detainees_Released,2016-01-17
66,Weekly_Address_on_Cuba,2016-02-20
248,Barack_Obama_-_ISIL_Update_02-25-16,2016-02-25
238,Weekly_Address_Opioid_Addiction_Macklemore,2016-05-14
270,Hillary_Clinton_Endorsement,2016-06-09


Need corrections.

In [41]:
need_corrections = ['Knox_College_Commencement','Notre_Dame_Commencement','Buchenwald_Concentration_Camp',
                    'Health_Care_George_Mason','START_Treaty_Presser','Michigan_University_Commencement',
                    'Kalamazoo_HS_Commencement','Pensacola_Military_Personnel','Nurses_Association',
                    'Disabled_Veterans_Conference','GW_Fiscal_Policy','Fort_Campbell',
                    'Clinton_Global_Initiative_Human_Trafficking']

countries = ['USA','USA','Germany','USA','Czech Republic','USA','USA','USA','USA','USA','USA','USA','USA']
states = ['Illinois','Indiana','no_state','Virginia','no_state','Michigan','Michigan','Florida','Maryland',
          'Georgia','no_state','Kentucky','New York']
cities = ['Galesburg','Notre Dame','Thuringia','Fairfax County','Prague','Ann Arbor','Kalamazoo','Pensacola',
          'Silver Spring','Atlanta','Washington D.C.','Hopkinsville','New York']
locations = ['Knox College','Notre Dame University','Buchenwald Concentration Camp','George Mason University',
             'no_specific_location','Michigan University','Kalamazoo Central High School',
             'Pensacola Military Station','American Nurses Association',
             'Disabled Veterans of America Conference','George Washington University','Fort Campbell',
             'Clinton Global Initiative on Human Trafficking']

df.loc[df.title.isin(need_corrections), 'country'] = countries
df.loc[df.title.isin(need_corrections), 'state'] = states
df.loc[df.title.isin(need_corrections), 'city'] = cities
df.loc[df.title.isin(need_corrections), 'specific_location'] = locations
df.loc[df.title.isin(need_corrections), ['country','state','city','specific_location']].sort_values(
    by= ['country','state','city','specific_location'])

Unnamed: 0,country,state,city,specific_location
119,Czech Republic,no_state,Prague,no_specific_location
113,Germany,no_state,Thuringia,Buchenwald Concentration Camp
206,USA,Florida,Pensacola,Pensacola Military Station
365,USA,Georgia,Atlanta,Disabled Veterans of America Conference
36,USA,Illinois,Galesburg,Knox College
85,USA,Indiana,Notre Dame,Notre Dame University
427,USA,Kentucky,Hopkinsville,Fort Campbell
210,USA,Maryland,Silver Spring,American Nurses Association
136,USA,Michigan,Ann Arbor,Michigan University
173,USA,Michigan,Kalamazoo,Kalamazoo Central High School


Were correct, add specific location where possible.

In [42]:
whitehouse = ['Barack_Obama_-_Al-Arabiya_Interview','Third_Major_Press_Conference','Dignity_and_Courage_of_Iranians',
              'Regina_Benjamin_Nomination','Kennedy_Center_Honorees_2009','Aviation_Security','Finance_Crisis_Fee',
              'Auschwitz_and_Birkenau_Liberation_65th','Prayer_Breakfast_2010','Presser_Unannounced_Heath_Care',
              'No_Child_Left_Behind_Blueprint','Health_Care_Bill_Passage',
              'Justice_Stevens_and_West_Virginia_Mining_Tragedy','Elena_Kagan_USSC_Nomination','UN_Iran_Sanctions',
              'General_McChrystal_Afghanistan','Midterm_Elections_Presser_2010','Tax_Presser','Egypt_History_Revolution',
              'Libya_Violence_Must_Stop','Address_on_Libya','Osama_bin_Laden_Death','David_Petraeus_Etc_Presser',
              'Sandy_Hook_Elementary_School','Final_First_Term_Presser','Gun_Violence_Exec_Order_23',
              'Boston_Marathon_Bombing_First_Statement','Boston_Marathon_Bombing_Second_Statement',
              'Boston_Marathon_Bombing_Third_Statement','Trayvon_Martin','Intelligence_Reform_Presser',
              'Affordable_Care_Act_Modifications_Presser','Iran_Accord_Statement','On_the_Passing_of_Nelson_Mandela',
              'Year_End_Presser_2013','Border_Security_and_Immigration_Reform','MH_Flight_17_Downing',
              'SSG_Bryan_Pitts_MOH','Presser_on_the_Economy_and_Foreign_Policy','Iraq_and_Ukraine','ISIL_',
              'Fixing_Immigration_Policy','Cuba_Policy_Changes','Final_Presser_of_2014',
              'Barack_Obama_-_Weekly_Address_01-01-16','State_of_the_Union_2015','ISIL_Force_Authorization_Request',
              'Iran_Nuclear_Accord','Burundi_People_Message','US_Detainees_Released','Weekly_Address_on_Cuba',
              'Weekly_Address_Opioid_Addiction_Macklemore']

add_loc = ['Senate_Floor_Speech_on_the_Patriot_Act','Senate_Floor_Speech_on_Voting_Rights_Act_Renewal',
            'Senate_Speech_Iraq_Federalism_Amendment','Senate_Floor_Speech_on_Wall_Street_Bailout_Bill',
            'State_of_the_Nation_2009','To_Congress_on_Health_Care','2010_Budget_to_Congress',
            'Hispanic_Chamber_of_Commerce','Democrat_Caucus_Health_Care','911_Pentagon_Memorial_2010',
            'Counter_ISIL_Campaign_Update']

locations = ['Senate','Senate','Senate','Senate','Congress','Congress','Congress','Hispanic Chamber of Commerce',
             'The Capitol','Pentagon','Pentagon']

df.loc[df.title.isin(whitehouse), 'specific_location'] = 'White House'
df.loc[df.title.isin(add_loc), 'specific_location'] = locations
df.loc[df.title.isin(add_loc), ['country','state','city','specific_location']].sort_values(
    by= ['country','state','city','specific_location'])

Unnamed: 0,country,state,city,specific_location
281,USA,no_state,Washington D.C.,Congress
295,USA,no_state,Washington D.C.,Congress
303,USA,no_state,Washington D.C.,Congress
311,USA,no_state,Washington D.C.,Hispanic Chamber of Commerce
352,USA,no_state,Washington D.C.,Pentagon
411,USA,no_state,Washington D.C.,Pentagon
70,USA,no_state,Washington D.C.,Senate
123,USA,no_state,Washington D.C.,Senate
138,USA,no_state,Washington D.C.,Senate
279,USA,no_state,Washington D.C.,Senate


Assumed to be correct, but not entirely known: Candidate_Exploratory_Announcement, Global_Climate_Message, Regina_Benjamin_Nomination, Kennedy_Center_Honorees_2009, Finance_Crisis_Fee,Health_Care_Bill_Passage, Justice_Stevens_and_West_Virginia_Mining_Tragedy, Elena_Kagan_USSC_Nomination, Nurses_Association, Egypt_History_Revolution, Libya_Violence_Must_Stop, Hillary_Clinton_Endorsement

In [43]:
df.loc[df.title.isin(titles), ['country','state','city','specific_location']].sort_values(
    by= ['country','state','city','specific_location'])

Unnamed: 0,country,state,city,specific_location
119,Czech Republic,no_state,Prague,no_specific_location
113,Germany,no_state,Thuringia,Buchenwald Concentration Camp
206,USA,Florida,Pensacola,Pensacola Military Station
365,USA,Georgia,Atlanta,Disabled Veterans of America Conference
36,USA,Illinois,Galesburg,Knox College
...,...,...,...,...
270,USA,no_state,Washington D.C.,no_specific_location
275,USA,no_state,Washington D.C.,no_specific_location
277,USA,no_state,Washington D.C.,no_specific_location
282,USA,no_state,Washington D.C.,no_specific_location


# Completely cleaned data

In [44]:
df.loc[:, ['title','date','country','state','city','specific_location']].sort_values(
    by= ['country','state','city','specific_location','date'])

Unnamed: 0,title,date,country,state,city,specific_location
133,Afghanistan_US_Troops_Bagram,2010-03-28,Afghanistan,no_state,Bagram,Bagram Air Field
166,Bagram_Air_Base_December_2010,2010-12-03,Afghanistan,no_state,Bagram,Bagram Air Field
433,YLAI_Town_Hall,2016-03-23,Argentina,no_state,Buenos Aires,Usina Del Arte
192,Queensland_University,2014-11-15,Australia,no_state,Brisbane,no_specific_location
304,Australian_Parliament,2011-11-16,Australia,no_state,Canberra,no_specific_location
...,...,...,...,...,...,...
139,VOX_Interview,2017-01-06,USA,no_state,Washington D.C.,no_specific_location
93,Chicago_Cubs_WH_Visit,2017-01-17,USA,no_state,Washington D.C.,no_specific_location
283,Speech_to_the_Vietnam_People,2016-05-24,Vietnam,no_state,Hanoi,National Convention Center
408,Vietnam_YSEALI,2016-05-25,Vietnam,no_state,Ho Chi Minh City,GEM Convention Center


# Make new csv

Only known locations

In [45]:
path_only_known = csv_dir / "speeches_loc_known_cleaned.txt"
only_known = df.loc[df.location!="unknown_location", :]
#only_known.to_csv(path_only_known, index=False, header=True, mode='w')

Fully cleaned

In [46]:
path = csv_dir / "all_speeches_cleaned.txt"
#df.to_csv(path, index=False, header=True, mode='w')