In [41]:
import pandas as pd

In [42]:
data = pd.read_csv('presidents.csv')
data.head()

Unnamed: 0,President Name,Years In Office,Number,Party
0,George Washington,1789-1797,1st,Federalist
1,John Adams,1797-1801,2nd,Federalist
2,Thomas Jefferson,1801-1809,3rd,Democratic Republican
3,James Madison,1809-1817,4th,Democratic Republican
4,James Monroe,1817-1825,5th,Democratic Republican


In [43]:
# data cleaning using regex. let's start with Names
# make a copy of the President column
data['First'] = data['President Name']
data['First'] = data['First'].replace("[ ].*", "", regex=True)
data.head()

# This is slow

Unnamed: 0,President Name,Years In Office,Number,Party,First
0,George Washington,1789-1797,1st,Federalist,George
1,John Adams,1797-1801,2nd,Federalist,John
2,Thomas Jefferson,1801-1809,3rd,Democratic Republican,Thomas
3,James Madison,1809-1817,4th,Democratic Republican,James
4,James Monroe,1817-1825,5th,Democratic Republican,James


In [44]:
del data['First']

In [45]:
def splitname(row):
    row['First'] = row['President Name'].split(" ")[0]
    row['Last'] = row['President Name'].split(" ")[-1]
    return row

In [46]:
# apply it to the dataframe using apply()
data = data.apply(splitname, axis=1)
data.head()

# this is much better

Unnamed: 0,President Name,Years In Office,Number,Party,First,Last
0,George Washington,1789-1797,1st,Federalist,George,Washington
1,John Adams,1797-1801,2nd,Federalist,John,Adams
2,Thomas Jefferson,1801-1809,3rd,Democratic Republican,Thomas,Jefferson
3,James Madison,1809-1817,4th,Democratic Republican,James,Madison
4,James Monroe,1817-1825,5th,Democratic Republican,James,Monroe


In [47]:
del data['First']
del data['Last']

In [48]:
# extract() takes a regex as input and specifically requires you to set capture groups that correspond to the output columns
# you are interested in.

pattern = "(?P<First>^[\w]*)(?:.*\s)(?P<Last>[\w]*$)"
names = data['President Name'].str.extract(pattern)
names.head()

Unnamed: 0,First,Last
0,George,Washington
1,John,Adams
2,Thomas,Jefferson
3,James,Madison
4,James,Monroe


In [49]:
# and we can just copy these into our main dataframe
data['First'] = names['First']
data['Last'] = names['Last']
data.head()

Unnamed: 0,President Name,Years In Office,Number,Party,First,Last
0,George Washington,1789-1797,1st,Federalist,George,Washington
1,John Adams,1797-1801,2nd,Federalist,John,Adams
2,Thomas Jefferson,1801-1809,3rd,Democratic Republican,Thomas,Jefferson
3,James Madison,1809-1817,4th,Democratic Republican,James,Madison
4,James Monroe,1817-1825,5th,Democratic Republican,James,Monroe


In [50]:
dates = pd.read_csv('dates.csv')
dates.head()

Unnamed: 0,Born
0,Feb 1 2012sd
1,Mar 2 2012d
2,Dec 3 2012d
3,Jan 2 2012as
4,Sep 2 2012d


In [51]:
# get rid of anything that isn't in the pattern of month day and year
dates['Born'] = dates['Born'].str.extract("([\w]{3}\s[\w]{1,2}\s[\w]{4})")
dates.head()

Unnamed: 0,Born
0,Feb 1 2012
1,Mar 2 2012
2,Dec 3 2012
3,Jan 2 2012
4,Sep 2 2012


In [52]:
# date/time features
dates['Born'] = pd.to_datetime(dates['Born'])
dates.head()

Unnamed: 0,Born
0,2012-02-01
1,2012-03-02
2,2012-12-03
3,2012-01-02
4,2012-09-02


In [53]:
data.head()

Unnamed: 0,President Name,Years In Office,Number,Party,First,Last
0,George Washington,1789-1797,1st,Federalist,George,Washington
1,John Adams,1797-1801,2nd,Federalist,John,Adams
2,Thomas Jefferson,1801-1809,3rd,Democratic Republican,Thomas,Jefferson
3,James Madison,1809-1817,4th,Democratic Republican,James,Madison
4,James Monroe,1817-1825,5th,Democratic Republican,James,Monroe


In [57]:
#  separate 'Years In Office' into two, 'Started', 'Ended'
pattern_SE = "(?P<Started>[\w]{4})-(?P<Ended>[\w]{4})"
ss = data['Years In Office'].str.extract(pattern_SE)
ss.head()

Unnamed: 0,Started,Ended
0,1789,1797
1,1797,1801
2,1801,1809
3,1809,1817
4,1817,1825


In [59]:
del data['Years In Office']
data['Started'] = ss['Started']
data['Ended'] = ss['Ended']
data.head()

Unnamed: 0,President Name,Number,Party,First,Last,Started,Ended
0,George Washington,1st,Federalist,George,Washington,1789,1797
1,John Adams,2nd,Federalist,John,Adams,1797,1801
2,Thomas Jefferson,3rd,Democratic Republican,Thomas,Jefferson,1801,1809
3,James Madison,4th,Democratic Republican,James,Madison,1809,1817
4,James Monroe,5th,Democratic Republican,James,Monroe,1817,1825


In [73]:
data['Party'].replace({'Federalist': 'F',
                       'Democratic Republican': 'DR',
                       'Democratic': 'D',
                       'Whig': 'W',
                       'Democratic/Union': 'D/U',
                       'Republican': 'R'}, inplace=True)
data.head()

Unnamed: 0,President Name,Number,Party,First,Last,Started,Ended
0,George Washington,1st,F,George,Washington,1789,1797
1,John Adams,2nd,F,John,Adams,1797,1801
2,Thomas Jefferson,3rd,DR,Thomas,Jefferson,1801,1809
3,James Madison,4th,DR,James,Madison,1809,1817
4,James Monroe,5th,DR,James,Monroe,1817,1825


In [77]:
data['Number'] = data['Number'].str.extract("([\w]{1})")
data.head()

Unnamed: 0,President Name,Number,Party,First,Last,Started,Ended
0,George Washington,1,F,George,Washington,1789,1797
1,John Adams,2,F,John,Adams,1797,1801
2,Thomas Jefferson,3,DR,Thomas,Jefferson,1801,1809
3,James Madison,4,DR,James,Madison,1809,1817
4,James Monroe,5,DR,James,Monroe,1817,1825


In [83]:
data.sort_values('Number')
data.head()

Unnamed: 0,President Name,Number,Party,First,Last,Started,Ended
0,George Washington,1,F,George,Washington,1789,1797
16,Andrew Johnson,1,D/U,Andrew,Johnson,1865,1869
15,Abraham Lincoln,1,R,Abraham,Lincoln,1861,1865
14,James Buchanan,1,D,James,Buchanan,1857,1861
13,Franklin Pierce,1,D,Franklin,Pierce,1853,1857
