# Academy Award for Best Actor

---

## Setup

In [1]:
import re
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from requests import get

---

## Request and parse HTML

In [2]:
url = 'https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actor'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status stops the notebook here instead of parsing an error page later.
r = get(url, timeout=30)
r.raise_for_status()
print(r.status_code)

200


In [3]:
# Parse the downloaded page with the built-in HTML parser
soup = BeautifulSoup(markup=r.content, features='html.parser')

# Collect every <table> carrying the class string 'wikitable sortable'
table = soup.find_all(name='table', class_='wikitable sortable')

---

## HTML to DataFrame

In [4]:
# str(table) renders the whole result set as one HTML string; read_html then
# returns a list holding one DataFrame per <table> element it finds.
df = pd.read_html(io=StringIO(str(table)))
print(f'Tables: {len(df)}', f'Tables type: {type(df[0])}', sep='\n')
df[0].head()

Tables: 11
Tables type: <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Year,Actor,Role(s),Film,Ref.
0,1927/28 (1st),Emil Jannings ‡[A],Grand Duke Sergius Alexander,The Last Command,[7]
1,1927/28 (1st),Emil Jannings ‡[A],August Schilling,The Way of All Flesh,[7]
2,1927/28 (1st),Richard Barthelmess,Nickie Elkins,The Noose,[7]
3,1927/28 (1st),Richard Barthelmess,Patent Leather Kid,The Patent Leather Kid,[7]
4,1927/28 (1st),Charlie Chaplin[B],The Tramp,The Circus,[8]


In [5]:
# Remove the 'Ref.' column from every parsed table (the list is updated in place)
df[:] = [frame.drop(columns='Ref.') for frame in df]

df[0].columns

Index(['Year', 'Actor', 'Role(s)', 'Film'], dtype='object')

In [6]:
winners = []

# The loop variable is deliberately not called `table`, so the BeautifulSoup
# result set of that name defined earlier is not overwritten by a DataFrame.
for award_table in df:
    # Rows whose actor cell carries ‡ or § are kept wherever they appear.
    # na=False treats a missing actor cell as "no marker" instead of producing a NaN mask.
    mask1 = award_table['Actor'].str.contains('‡|§', regex=True, na=False)
    # Rows carrying † are kept only at row labels 0, 5, 10, ..., 45
    # NOTE(review): presumably the first row of each five-row year group — confirm against the page layout
    mask2 = award_table['Actor'].str.contains('†', na=False) & award_table.index.isin(list(range(0, 50, 5)))
    winners.append(award_table.loc[mask1 | mask2])

# From here on `df` is a single DataFrame of the selected rows rather than a list of tables
df = pd.concat(winners, ignore_index=True)
df.head()

Unnamed: 0,Year,Actor,Role(s),Film
0,1927/28 (1st),Emil Jannings ‡[A],Grand Duke Sergius Alexander,The Last Command
1,1927/28 (1st),Emil Jannings ‡[A],August Schilling,The Way of All Flesh
2,1928/29 (2nd) [note 1],Warner Baxter ‡,The Cisco Kid,In Old Arizona
3,1929/30 (3rd),George Arliss ‡[C],Benjamin Disraeli,Disraeli
4,1930/31 (4th),Lionel Barrymore ‡,Stephen Ashe,A Free Soul


### Cleaning

In [7]:
# Clean every (text) column:
#   1. remove annotations in square brackets or parentheses, e.g. "[A]" or "(1st)";
#   2. remove the legend symbols ‡ § † taken over from the website;
#   3. strip leading/trailing white space.
# Each bracket pair is matched on its own ([^\]]* and [^)]*), so text that sits
# between two separate annotations is kept instead of being swallowed by one greedy match.
for col in df:
    df[col] = (
        df[col]
        .str.replace(r'\[[^\]]*\]|\([^)]*\)', '', regex=True)
        .str.replace(r'‡|§|†', '', regex=True)
        .str.strip()
    )

df.head()

Unnamed: 0,Year,Actor,Role(s),Film
0,1927/28,Emil Jannings,Grand Duke Sergius Alexander,The Last Command
1,1927/28,Emil Jannings,August Schilling,The Way of All Flesh
2,1928/29,Warner Baxter,The Cisco Kid,In Old Arizona
3,1929/30,George Arliss,Benjamin Disraeli,Disraeli
4,1930/31,Lionel Barrymore,Stephen Ashe,A Free Soul


In [8]:
# Number the actors in order of first appearance, so every row of the same
# actor receives the same ActorID.
actors = df['Actor'].unique()
actor_codes = {name: code for code, name in enumerate(actors)}
df['ActorID'] = df['Actor'].map(actor_codes)

df

Unnamed: 0,Year,Actor,Role(s),Film,ActorID
0,1927/28,Emil Jannings,Grand Duke Sergius Alexander,The Last Command,0
1,1927/28,Emil Jannings,August Schilling,The Way of All Flesh,0
2,1928/29,Warner Baxter,The Cisco Kid,In Old Arizona,1
3,1929/30,George Arliss,Benjamin Disraeli,Disraeli,2
4,1930/31,Lionel Barrymore,Stephen Ashe,A Free Soul,3
...,...,...,...,...,...
93,2019,Joaquin Phoenix,Arthur Fleck / Joker,Joker,82
94,2020/21,Anthony Hopkins,Anthony,The Father,59
95,2021,Will Smith,Richard Williams,King Richard,83
96,2022,Brendan Fraser,Charlie,The Whale,84


In [9]:
# Move ActorID next to the year column
column_order = ['Year', 'ActorID', 'Actor', 'Role(s)', 'Film']
df = df.reindex(columns=column_order)
df.head()

Unnamed: 0,Year,ActorID,Actor,Role(s),Film
0,1927/28,0,Emil Jannings,Grand Duke Sergius Alexander,The Last Command
1,1927/28,0,Emil Jannings,August Schilling,The Way of All Flesh
2,1928/29,1,Warner Baxter,The Cisco Kid,In Old Arizona
3,1929/30,2,George Arliss,Benjamin Disraeli,Disraeli
4,1930/31,3,Lionel Barrymore,Stephen Ashe,A Free Soul


In [10]:
df.loc[0, 'Role(s)']

'Grand Duke Sergius Alexander'

In [11]:
# Rows 0 and 1 belong to the same actor and year: fold row 1's role and film
# into row 0 as comma-separated lists.
for merged_col in ('Role(s)', 'Film'):
    df.loc[0, merged_col] = ', '.join(df.loc[[0, 1], merged_col])
df.head(2)

Unnamed: 0,Year,ActorID,Actor,Role(s),Film
0,1927/28,0,Emil Jannings,"Grand Duke Sergius Alexander, August Schilling","The Last Command, The Way of All Flesh"
1,1927/28,0,Emil Jannings,August Schilling,The Way of All Flesh


In [12]:
# Row 1 is now redundant (its content was merged into row 0)
df = df.drop(index=1)
df.head(2)

Unnamed: 0,Year,ActorID,Actor,Role(s),Film
0,1927/28,0,Emil Jannings,"Grand Duke Sergius Alexander, August Schilling","The Last Command, The Way of All Flesh"
2,1928/29,1,Warner Baxter,The Cisco Kid,In Old Arizona


In [13]:
# Renumber the rows 0..n-1 after the drop; the old index is discarded
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Year,ActorID,Actor,Role(s),Film
0,1927/28,0,Emil Jannings,"Grand Duke Sergius Alexander, August Schilling","The Last Command, The Way of All Flesh"
1,1928/29,1,Warner Baxter,The Cisco Kid,In Old Arizona
2,1929/30,2,George Arliss,Benjamin Disraeli,Disraeli
3,1930/31,3,Lionel Barrymore,Stephen Ashe,A Free Soul
4,1931/32,4,Wallace Beery,"Andy ""Champ"" Purcell",The Champ


---

## Extracting links

In [14]:
# Raw HTML tables of the award page; get_link searches these for the actor links
tables = soup.find_all(name='table', class_='wikitable sortable')

In [15]:
def get_link(actor, search_tables=None):
    """Return the href of the first <a> whose text is exactly the actor's name.

    Parameters
    ----------
    actor : str
        Link text to look for.
    search_tables : iterable of bs4 tags, optional
        Tables to search, in order. Defaults to the notebook-level ``tables``.

    Returns
    -------
    str or None
        The ``href`` of the first matching link, or None when no table
        contains such a link.
    """
    if search_tables is None:
        search_tables = tables
    for candidate in search_tables:
        # href=True restricts the match to anchors that actually have a target,
        # so the lookup below cannot raise KeyError.
        a = candidate.find('a', string=actor, href=True)
        if a is not None:
            return a['href']
    return None

In [16]:
# Look up the Wikipedia path of every row's actor
df['Link'] = df['Actor'].map(get_link)
df['Link']

0        /wiki/Emil_Jannings
1        /wiki/Warner_Baxter
2        /wiki/George_Arliss
3     /wiki/Lionel_Barrymore
4        /wiki/Wallace_Beery
               ...          
92     /wiki/Joaquin_Phoenix
93     /wiki/Anthony_Hopkins
94          /wiki/Will_Smith
95      /wiki/Brendan_Fraser
96      /wiki/Cillian_Murphy
Name: Link, Length: 97, dtype: object

In [17]:
# Sanity check: as many distinct links as distinct actors (missing values count as a value)
df['Link'].nunique(dropna=False) == df['Actor'].nunique(dropna=False)

True

In [18]:
df.head()

Unnamed: 0,Year,ActorID,Actor,Role(s),Film,Link
0,1927/28,0,Emil Jannings,"Grand Duke Sergius Alexander, August Schilling","The Last Command, The Way of All Flesh",/wiki/Emil_Jannings
1,1928/29,1,Warner Baxter,The Cisco Kid,In Old Arizona,/wiki/Warner_Baxter
2,1929/30,2,George Arliss,Benjamin Disraeli,Disraeli,/wiki/George_Arliss
3,1930/31,3,Lionel Barrymore,Stephen Ashe,A Free Soul,/wiki/Lionel_Barrymore
4,1931/32,4,Wallace Beery,"Andy ""Champ"" Purcell",The Champ,/wiki/Wallace_Beery


---

## Retrieving and cleaning relevant data

In [19]:
url = 'https://en.wikipedia.org/'

# Per-actor results (one entry per visited page)
actor_list = []
bday_list = []
death_list = []
birthplace_list = []
deathplace_list = []
n_children_list = []

# Per-marriage results (one entry per spouse found; actor_id_list links them back to the actor)
actor_id_list = []
spouses_list = []
wedding_date_list = []
divorce_date_list = []

pat = re.compile(r'(\d{4})') # Matches groups of 4 digits


def _text_or_none(tag):
    """Return the text of a BeautifulSoup tag, or None when the lookup found nothing."""
    return None if tag is None else tag.text


# Visit every distinct actor page once; rows without a link are skipped
for link in df['Link'].dropna().unique():
    # Links already start with '/', so drop the base URL's trailing slash
    r = get(url.rstrip('/') + link, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    actor_id = df.loc[df['Link'] == link, 'ActorID'].values[0]
    actor = df.loc[df['ActorID'] == actor_id, 'Actor'].values[0]
    actor_list.append(actor)

    # BIRTHDAY ---------------------
    bday_tag = soup.find('span', class_='bday')
    bday = None if bday_tag is None else bday_tag.string
    bday_list.append(bday)

    print(f'Actor: {actor}.\nBirthday: {bday}')

    # DEATH ------------------------
    # The date is read from the first hidden span after the "Died" header cell
    death_tag = soup.find('th', string='Died')
    death_span = None if death_tag is None else death_tag.find_next('span', style='display:none')
    death = _text_or_none(death_span)

    death_list.append(death)
    print(f'Died: {death}')

    # BIRTHPLACE -------------------
    birthplace = _text_or_none(soup.find('div', class_='birthplace'))

    print(f'Birthplace: {birthplace}')
    birthplace_list.append(birthplace)

    # DEATHPLACE -------------------
    deathplace = _text_or_none(soup.find('div', class_='deathplace'))

    print(f'Deathplace: {deathplace}')
    deathplace_list.append(deathplace)

    # SPOUSES ----------------------
    for spouse in soup.find_all('div', class_='marriage-display-ws'):
        # Name = text before the first '(' ; computed outside the f-string because
        # a backslash inside an f-string expression is a SyntaxError before Python 3.12
        spouse_name = spouse.text.split('(')[0].replace('\n', '').strip()

        # First 4-digit group is taken as the wedding year, the second (if any) as the end year
        years = pat.findall(spouse.text)
        wedding_year = years[0] if years else None
        divorce_year = years[1] if len(years) > 1 else None

        # Append to all four lists together so they always stay the same length
        actor_id_list.append(actor_id)
        spouses_list.append(spouse_name)
        wedding_date_list.append(wedding_year)
        divorce_date_list.append(divorce_year)

        print(f'Spouse: {spouse_name}')
        if divorce_year is None:
            print(f'Married in {wedding_year}')
        else:
            print(f'Married in {wedding_year}, divorced in {divorce_year}.')

    # CHILDREN ---------------------
    # Raw cell text is stored as-is (it may be a count or a list of names); 0 when the row is absent
    children_tag = soup.find('th', string='Children')
    children_cell = None if children_tag is None else children_tag.find_next('td')
    n_children = 0 if children_cell is None else children_cell.text

    n_children_list.append(n_children)
    print(f'Number of children: {n_children}')

    print('------------------------------------------------')

Actor: Emil Jannings.
Birthday: 1884-07-23
Died: (1950-01-02)
Birthplace: Rorschach, Switzerland
Deathplace: Strobl, Allied-occupied Austria
Spouse: Hanna Ralph​ ​
Married in 1919, divorced in 1921.
Spouse: Lucie Höflich​ ​
Married in 1921, divorced in 1921.
Spouse: Gussy Holl ​
Married in 1923
Number of children: 1
------------------------------------------------
Actor: Warner Baxter.
Birthday: 1889-03-29
Died: (1951-05-07)
Birthplace: Columbus, Ohio, U.S.
Deathplace: Beverly Hills, California, U.S.
Spouse: Viola Caldwell​ ​
Married in 1911, divorced in 1913.
Spouse: Winifred Bryson ​
Married in 1918
Number of children: 0
------------------------------------------------
Actor: George Arliss.
Birthday: 1868-04-10
Died: (1946-02-05)
Birthplace: London, England
Deathplace: London, England
Spouse: Florence Montgomery ​
Married in 1899
Number of children: 0
------------------------------------------------
Actor: Lionel Barrymore.
Birthday: 1878-04-28
Died: (1954-11-15)
Birthplace: Philadel

In [20]:
# Copy the scraped per-actor values onto every row of that actor
scraped = zip(actor_list, bday_list, death_list, birthplace_list, deathplace_list, n_children_list)
for actor, bday, death, birthplace, deathplace, n_children in scraped:
    actor_rows = df['Actor'] == actor
    new_values = (
        ('Born', bday),
        ('Died', death),
        ('Birthplace', birthplace),
        ('Deathplace', deathplace),
        ('Children', n_children),
    )
    for column, value in new_values:
        df.loc[actor_rows, column] = value

# The scraped death date comes wrapped in parentheses, e.g. "(1950-01-02)"
df['Died'] = df['Died'].str.replace(r'[\(\)]', '', regex=True)

In [21]:
df

Unnamed: 0,Year,ActorID,Actor,Role(s),Film,Link,Born,Died,Birthplace,Deathplace,Children
0,1927/28,0,Emil Jannings,"Grand Duke Sergius Alexander, August Schilling","The Last Command, The Way of All Flesh",/wiki/Emil_Jannings,1884-07-23,1950-01-02,"Rorschach, Switzerland","Strobl, Allied-occupied Austria",1
1,1928/29,1,Warner Baxter,The Cisco Kid,In Old Arizona,/wiki/Warner_Baxter,1889-03-29,1951-05-07,"Columbus, Ohio, U.S.","Beverly Hills, California, U.S.",0
2,1929/30,2,George Arliss,Benjamin Disraeli,Disraeli,/wiki/George_Arliss,1868-04-10,1946-02-05,"London, England","London, England",0
3,1930/31,3,Lionel Barrymore,Stephen Ashe,A Free Soul,/wiki/Lionel_Barrymore,1878-04-28,1954-11-15,"Philadelphia, Pennsylvania, U.S.","Los Angeles, California, U.S.",2
4,1931/32,4,Wallace Beery,"Andy ""Champ"" Purcell",The Champ,/wiki/Wallace_Beery,1885-04-01,1949-04-15,"Clay County, Missouri, U.S.","Beverly Hills, California, U.S.",1
...,...,...,...,...,...,...,...,...,...,...,...
92,2019,82,Joaquin Phoenix,Arthur Fleck / Joker,Joker,/wiki/Joaquin_Phoenix,1974-10-28,,"San Juan, Puerto Rico",,1
93,2020/21,59,Anthony Hopkins,Anthony,The Father,/wiki/Anthony_Hopkins,1937-12-31,,"Port Talbot, Glamorgan, Wales",,1
94,2021,83,Will Smith,Richard Williams,King Richard,/wiki/Will_Smith,1968-09-25,,"Philadelphia, Pennsylvania, U.S.",,"3, including Jaden and Willow"
95,2022,84,Brendan Fraser,Charlie,The Whale,/wiki/Brendan_Fraser,1968-12-03,,"Indianapolis, Indiana, U.S.",,3


In [22]:
df['Children'].unique()

array(['1', 0, '2', '2, including Judy Lewis', '3', '4',
       'GaryDennisPhillipLindsay (with Dixie)Harry IIIMaryNathaniel (with Kathryn)',
       '6, including Miguel Ferrer', '2, including Stephen Humphrey',
       '11, including Christian and Cheyenne', '5', 'Matthew Guinness',
       '4, including David Jr.', '2, including Fraser Clarke Heston',
       '5; including Bill', '5, including Cecilia Peck',
       '6, including Sydney Tamiia', '\nNoel\nCarey\n',
       '7, including Michael, Patrick, and Ethan',
       '7, including Devon and Campbell Scott',
       '2, including Chris Lemmon', '6, including Lorraine and Ray',
       '4; including Charles Finch', '3, including Emily and Ben',
       'James HavenAngelina Jolie', '6, including Jake',
       '7, including Drena and Raphael', '3, including Jane and Peter',
       '4, including Ferdinand Kingsley',
       '6, including Scott, Nell, and Melissa',
       '3, including Cameron Douglas', '2, including Max Irons',
       '4, inc

In [23]:
# Entries longer than one character are not a plain single-digit count and need parsing
needs_parsing = df['Children'].str.len() > 1
children = df.loc[needs_parsing, ['ActorID', 'Children']].set_index('ActorID')
children

Unnamed: 0_level_0,Children
ActorID,Unnamed: 1_level_1
7,"2, including Judy Lewis"
16,GaryDennisPhillipLindsay (with Dixie)Harry III...
21,"6, including Miguel Ferrer"
22,"2, including Stephen Humphrey"
24,"11, including Christian and Cheyenne"
27,Matthew Guinness
28,"4, including David Jr."
29,"2, including Fraser Clarke Heston"
30,5; including Bill
32,"5, including Cecilia Peck"


In [24]:
def extract_number(text):
    """Return the first run of digits found in ``text`` (as a string).

    When ``text`` contains no digit at all it is returned unchanged.
    """
    found = re.search(r'\d+', text)
    if found is None:
        return text
    return found.group()

In [25]:
# Reduce each entry to its leading count where one exists; `children` becomes a Series
children = children['Children'].map(extract_number)

In [26]:
children.unique()

array(['2',
       'GaryDennisPhillipLindsay (with Dixie)Harry IIIMaryNathaniel (with Kathryn)',
       '6', '11', 'Matthew Guinness', '4', '5', '\nNoel\nCarey\n', '7',
       '3', 'James HavenAngelina Jolie', 'Dylan PennHopper Penn'],
      dtype=object)

In [27]:
children

ActorID
7                                                     2
16    GaryDennisPhillipLindsay (with Dixie)Harry III...
21                                                    6
22                                                    2
24                                                   11
27                                     Matthew Guinness
28                                                    4
29                                                    2
30                                                    5
32                                                    5
33                                                    6
34                                      \nNoel\nCarey\n
39                                                    7
40                                                    7
24                                                   11
42                                                    2
44                                                    6
45                                      

In [28]:
# Entries whose scraped text lists names rather than a count are filled in by hand,
# keyed by ActorID (every row with that ID is updated).
manual_counts = {16: '7', 27: '1', 34: '2', 47: '2', 69: '2'}
for fixed_id, fixed_count in manual_counts.items():
    children[fixed_id] = fixed_count

In [29]:
children

ActorID
7      2
16     7
21     6
22     2
24    11
27     1
28     4
29     2
30     5
32     5
33     6
34     2
39     7
40     7
24    11
42     2
44     6
45     4
46     3
47     2
48     6
49     7
50     3
51     4
55     6
56     3
48     6
58     2
61     4
61     4
44     6
67     4
69     2
70     2
71     3
69     2
83     3
Name: Children, dtype: object

In [30]:
# Write the cleaned counts back onto every row of the matching actor
for actor_id, child_count in zip(children.index, children):
    same_actor = df['ActorID'] == actor_id
    df.loc[same_actor, 'Children'] = child_count

df['Children'].unique()

array(['1', 0, '2', '3', '4', '7', '6', '11', '5'], dtype=object)

---

## Marriages

In [31]:
# Build the frame column by column so each column keeps its own dtype,
# instead of forcing all four lists through a single stacked array.
data = {
    'ActorID': actor_id_list,
    'Spouse': spouses_list,
    'From': wedding_date_list,
    'To': divorce_date_list,
}

marriages = pd.DataFrame(data, columns=['ActorID', 'Spouse', 'From', 'To'])
marriages.head()

Unnamed: 0,ActorID,Spouse,From,To
0,0,Hanna Ralph​ ​,1919,1921.0
1,0,Lucie Höflich​ ​,1921,1921.0
2,0,Gussy Holl ​,1923,
3,1,Viola Caldwell​ ​,1911,1913.0
4,1,Winifred Bryson ​,1918,


In [32]:
marriages['Spouse'][0]

'Hanna Ralph\u200b \u200b'

In [33]:
len(marriages['Spouse'][0])

14

In [34]:
# Remove the zero-width spaces (U+200B) left in the scraped names, then trim
without_zero_width = marriages['Spouse'].str.replace('\u200b', '')
marriages['Spouse'] = without_zero_width.str.strip()

In [35]:
len(marriages['Spouse'][0])

11

In [36]:
# to_csv does not create missing folders, so make sure data/ exists first
Path('data').mkdir(parents=True, exist_ok=True)
marriages.to_csv('data/marriages.csv')

---

## Awards dates

In [37]:
url = 'https://en.wikipedia.org/wiki/List_of_Academy_Awards_ceremonies'

# Bounded wait, and stop here on an HTTP error instead of parsing an error page
r = get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html.parser')

# Note: `tables` is rebound twice here, first to the matching HTML tables,
# then to the list of DataFrames parsed from them.
tables = soup.find_all('table', class_='wikitable sortable sticky-header')
tables = pd.read_html(StringIO(str(tables)))
awards = tables[0]

In [38]:
awards

Unnamed: 0,#,Date,Best Picture,Most awarded film(s),Number of viewers P2+,HH Rating,Host(s),Producer(s),Venue,Broadcast partner(s)
0,1st,"May 16, 1929",Wings,7th Heaven Sunrise,—,—,Douglas Fairbanks,—,The Hollywood Roosevelt Hotel,none
1,2nd,"April 3, 1930",The Broadway Melody,The Bridge of San Luis Rey The Broadway Melody...,—,—,William C. deMille,—,Ambassador Hotel,KNX-AM[a][7]
2,3rd,"November 5, 1930",All Quiet on the Western Front,All Quiet on the Western Front The Big House,—,—,Conrad Nagel,—,Ambassador Hotel,KNX-AM[a]
3,4th,"November 10, 1931",Cimarron,Cimarron,—,—,Lawrence Grant,—,Biltmore Hotel,KHJ-AM[b]
4,5th,"November 18, 1932",Grand Hotel,Bad Girl The Champ,—,—,Lionel Barrymore Conrad Nagel,—,Ambassador Hotel,KECA-AM[c]
...,...,...,...,...,...,...,...,...,...,...
92,93rd,"April 25, 2021",Nomadland,Nomadland,10.40 million,5.9,none,Jesse Collins Stacey Sher Steven Soderbergh,Union Station,ABC
93,94th,"March 27, 2022",CODA,Dune,16.62 million,9.0,Regina Hall Amy Schumer Wanda Sykes,Will Packer Shayla Cowan,Dolby Theatre,ABC
94,95th,"March 12, 2023",Everything Everywhere All at Once,Everything Everywhere All at Once,18.70 million,9.9,Jimmy Kimmel,Ricky Kirshner Glenn Weiss,Dolby Theatre,ABC
95,96th,"March 10, 2024",Oppenheimer,Oppenheimer,19.50 million,9.9,Jimmy Kimmel,Raj Kapoor Katy Mullan,Dolby Theatre,ABC


In [39]:
# Remove the row labelled 96 from the ceremonies table (in place, so tables[0] changes too).
# NOTE(review): the label is hard-coded and raises KeyError when the scraped table has no
# such row; presumably it trims a ceremony with no matching winner row — confirm against
# the current page before re-running.
awards.drop(96, inplace=True)

In [40]:
awards

Unnamed: 0,#,Date,Best Picture,Most awarded film(s),Number of viewers P2+,HH Rating,Host(s),Producer(s),Venue,Broadcast partner(s)
0,1st,"May 16, 1929",Wings,7th Heaven Sunrise,—,—,Douglas Fairbanks,—,The Hollywood Roosevelt Hotel,none
1,2nd,"April 3, 1930",The Broadway Melody,The Bridge of San Luis Rey The Broadway Melody...,—,—,William C. deMille,—,Ambassador Hotel,KNX-AM[a][7]
2,3rd,"November 5, 1930",All Quiet on the Western Front,All Quiet on the Western Front The Big House,—,—,Conrad Nagel,—,Ambassador Hotel,KNX-AM[a]
3,4th,"November 10, 1931",Cimarron,Cimarron,—,—,Lawrence Grant,—,Biltmore Hotel,KHJ-AM[b]
4,5th,"November 18, 1932",Grand Hotel,Bad Girl The Champ,—,—,Lionel Barrymore Conrad Nagel,—,Ambassador Hotel,KECA-AM[c]
...,...,...,...,...,...,...,...,...,...,...
91,92nd,"February 9, 2020",Parasite,Parasite,23.64 million,13.59[9],none,Lynette Howell Taylor Stephanie Allain,Dolby Theatre,ABC
92,93rd,"April 25, 2021",Nomadland,Nomadland,10.40 million,5.9,none,Jesse Collins Stacey Sher Steven Soderbergh,Union Station,ABC
93,94th,"March 27, 2022",CODA,Dune,16.62 million,9.0,Regina Hall Amy Schumer Wanda Sykes,Will Packer Shayla Cowan,Dolby Theatre,ABC
94,95th,"March 12, 2023",Everything Everywhere All at Once,Everything Everywhere All at Once,18.70 million,9.9,Jimmy Kimmel,Ricky Kirshner Glenn Weiss,Dolby Theatre,ABC


In [41]:
df.loc[df['Actor'].isin(['Wallace Beery', 'Fredric March'])]

Unnamed: 0,Year,ActorID,Actor,Role(s),Film,Link,Born,Died,Birthplace,Deathplace,Children
4,1931/32,4,Wallace Beery,"Andy ""Champ"" Purcell",The Champ,/wiki/Wallace_Beery,1885-04-01,1949-04-15,"Clay County, Missouri, U.S.","Beverly Hills, California, U.S.",1
5,1931/32,5,Fredric March,Dr. Henry Jekyll / Mr. Edward Hyde,Dr. Jekyll and Mr. Hyde,/wiki/Fredric_March,1897-08-31,1975-04-14,"Racine, Wisconsin, U.S.","Los Angeles, California, U.S.",2
19,1946,5,Fredric March,Platoon Sergeant Al Stephenson,The Best Years of Our Lives,/wiki/Fredric_March,1897-08-31,1975-04-14,"Racine, Wisconsin, U.S.","Los Angeles, California, U.S.",2


In [42]:
# Ceremony dates as a plain Python list, in table order
award_dates = awards['Date'].tolist()

In [43]:
award_dates[4]

'November 18, 1932'

In [44]:
# Two winners share the 1931/32 award (rows 4 and 5 of df), so the ceremony date
# at position 4 is repeated at position 5. Running this cell twice inserts it twice.
award_dates[5:5] = [award_dates[4]]

In [45]:
award_dates

['May 16, 1929',
 'April 3, 1930',
 'November 5, 1930',
 'November 10, 1931',
 'November 18, 1932',
 'November 18, 1932',
 'March 16, 1934',
 'February 27, 1935',
 'March 5, 1936',
 'March 4, 1937',
 'March 10, 1938',
 'February 23, 1939',
 'February 29, 1940',
 'February 27, 1941',
 'February 26, 1942',
 'March 4, 1943',
 'March 2, 1944',
 'March 15, 1945',
 'March 7, 1946',
 'March 13, 1947',
 'March 20, 1948',
 'March 24, 1949',
 'March 23, 1950',
 'March 29, 1951',
 'March 20, 1952',
 'March 19, 1953',
 'March 25, 1954',
 'March 30, 1955',
 'March 21, 1956',
 'March 27, 1957',
 'March 26, 1958',
 'April 6, 1959',
 'April 4, 1960',
 'April 17, 1961',
 'April 9, 1962',
 'April 8, 1963',
 'April 13, 1964',
 'April 5, 1965',
 'April 18, 1966',
 'April 10, 1967',
 'April 10, 1968',
 'April 14, 1969',
 'April 7, 1970',
 'April 15, 1971',
 'April 10, 1972',
 'March 27, 1973',
 'April 2, 1974',
 'April 8, 1975',
 'March 29, 1976',
 'March 28, 1977',
 'April 3, 1978',
 'April 9, 1979',
 'Ap

In [46]:
# Replace the award year by the ceremony date, row by row (lengths must match)
df = df.assign(Year=award_dates)
df.head()

Unnamed: 0,Year,ActorID,Actor,Role(s),Film,Link,Born,Died,Birthplace,Deathplace,Children
0,"May 16, 1929",0,Emil Jannings,"Grand Duke Sergius Alexander, August Schilling","The Last Command, The Way of All Flesh",/wiki/Emil_Jannings,1884-07-23,1950-01-02,"Rorschach, Switzerland","Strobl, Allied-occupied Austria",1
1,"April 3, 1930",1,Warner Baxter,The Cisco Kid,In Old Arizona,/wiki/Warner_Baxter,1889-03-29,1951-05-07,"Columbus, Ohio, U.S.","Beverly Hills, California, U.S.",0
2,"November 5, 1930",2,George Arliss,Benjamin Disraeli,Disraeli,/wiki/George_Arliss,1868-04-10,1946-02-05,"London, England","London, England",0
3,"November 10, 1931",3,Lionel Barrymore,Stephen Ashe,A Free Soul,/wiki/Lionel_Barrymore,1878-04-28,1954-11-15,"Philadelphia, Pennsylvania, U.S.","Los Angeles, California, U.S.",2
4,"November 18, 1932",4,Wallace Beery,"Andy ""Champ"" Purcell",The Champ,/wiki/Wallace_Beery,1885-04-01,1949-04-15,"Clay County, Missouri, U.S.","Beverly Hills, California, U.S.",1


In [47]:
df.loc[df['Actor'].isin(['Wallace Beery', 'Fredric March'])]

Unnamed: 0,Year,ActorID,Actor,Role(s),Film,Link,Born,Died,Birthplace,Deathplace,Children
4,"November 18, 1932",4,Wallace Beery,"Andy ""Champ"" Purcell",The Champ,/wiki/Wallace_Beery,1885-04-01,1949-04-15,"Clay County, Missouri, U.S.","Beverly Hills, California, U.S.",1
5,"November 18, 1932",5,Fredric March,Dr. Henry Jekyll / Mr. Edward Hyde,Dr. Jekyll and Mr. Hyde,/wiki/Fredric_March,1897-08-31,1975-04-14,"Racine, Wisconsin, U.S.","Los Angeles, California, U.S.",2
19,"March 13, 1947",5,Fredric March,Platoon Sergeant Al Stephenson,The Best Years of Our Lives,/wiki/Fredric_March,1897-08-31,1975-04-14,"Racine, Wisconsin, U.S.","Los Angeles, California, U.S.",2


In [48]:
# The column now holds ceremony dates, so name it accordingly
df = df.rename(columns={'Year': 'Date'})
df.head(1)

Unnamed: 0,Date,ActorID,Actor,Role(s),Film,Link,Born,Died,Birthplace,Deathplace,Children
0,"May 16, 1929",0,Emil Jannings,"Grand Duke Sergius Alexander, August Schilling","The Last Command, The Way of All Flesh",/wiki/Emil_Jannings,1884-07-23,1950-01-02,"Rorschach, Switzerland","Strobl, Allied-occupied Austria",1


In [49]:
# Convert the date strings (e.g. "May 16, 1929") to datetime values
df = df.assign(Date=pd.to_datetime(df['Date']))

In [50]:
# to_csv does not create missing folders, so make sure data/ exists first
Path('data').mkdir(parents=True, exist_ok=True)
df.to_csv('data/awards.csv')

---