# Web scraping to create relevant datasets
## Josh Villarreal

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [2]:
# request headers carrying a desktop-browser User-Agent, so the website
# thinks our requests come from a browser
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/50.0.2661.102 Safari/537.36'
    ),
}

### Official Academy Awards Database

Here, we scrape <a href='http://awardsdatabase.oscars.org/'>The Official Academy Awards Database</a> for data relevant to our project. We trace back past Oscar winners and Oscar nominees in the categories of "Best Actor," "Best Actress," "Best Actor in a Supporting Role," "Best Actress in a Supporting Role," and "Best Motion Picture." The goals of this section of the notebook include:
- creating a CSV file that consolidates the best actor/actress award winners and nominees, storing each actor's or actress's name, the year they were nominated for the award, whether they won the award, and the title of the movie for which they were nominated
- creating a CSV file that consolidates the best picture award winners and nominees, the year each movie was nominated for the award, and whether the movie won the award.

These CSV files will be used in the data visualization website to better understand the relationships between Oscar-nominees, and perhaps even what differentiates award-nominees from award-winners.

In [3]:
# search-results url for the awards database; the query string is a
# percent-encoded JSON object, split here into its individual fields
awards_url = (
    'http://awardsdatabase.oscars.org/Search/GetResults'
    '?query=%7B'
    '%22AwardCategory%22:[%221%22,%222%22,%223%22,%224%22,%2219%22],'
    '%22Sort%22:%223-Award%20Category-Chron%22,'
    '%22Search%22:%22Basic%22'
    '%7D'
)

In [4]:
# download the search-results page at awards_url and parse it into a soup
# - timeout: without one, requests.get can block indefinitely on a stalled connection
# - raise_for_status: stop here on an HTTP error instead of silently parsing
#   an error page (which would make every later find_all come back empty)
awards_html = requests.get(awards_url, headers=headers, timeout=30)
awards_html.raise_for_status()
awards_soup = BeautifulSoup(awards_html.text, 'html.parser')

In [5]:
# collect every <div> carrying the 'awards-result-chron' class
# (the scraping loop below reads one Oscar year out of each of these)
oscars = awards_soup.find_all('div', class_='awards-result-chron')

In [19]:
# start with an empty list of actor/actress nomination records
# (one dict per nominee is appended by the scraping loop)
act_list = list()

In [46]:
# scrape for best actor/actress awards
#
# Reset the record list here so this cell is idempotent: without the reset,
# every re-run of the cell appends a second copy of each nominee to act_list.
act_list = []

# Award titles this cell does not record: the picture awards are not acting
# awards, and the '27 special award (Charlie Chaplin) is to be added in later.
SKIPPED_AWARD_TITLES = {
    'OUTSTANDING PICTURE',
    'OUTSTANDING PRODUCTION',
    'BEST PICTURE',
    'SPECIAL AWARD',
}

for oscar in oscars:

    # get current year and make legible:
    #   - title containing '/': take the number before the slash and add 1
    #   - otherwise: keep the text before the first space
    unparsed_year = oscar.find('div', {'class': 'result-group-title'}).find('a').get_text()
    if '/' in unparsed_year:
        year = str(int(unparsed_year.split('/', 1)[0]) + 1)
    else:
        year = unparsed_year.split(' ', 1)[0]

    # extract all awards listed under current Oscar year
    awards = oscar.find_all('div', {'class': 'subgroup-awardcategory-chron'})

    # iterate over all awards
    for award in awards:

        # get current award title
        award_title_div = award.find('div', {'class': 'result-subgroup-title'})
        award_title = award_title_div.find('a').get_text()

        # skip everything that is not an acting award
        if award_title in SKIPPED_AWARD_TITLES:
            continue

        # get list of nominees
        unparsed_nominees = award.find_all('div', {'class': 'awards-result-actingorsimilar'})
        for nominee in unparsed_nominees:

            # get nominee name (text of the first link in the nomination statement)
            nominee_div = nominee.find('div', {'class': 'awards-result-nominationstatement'})
            nominee_name = nominee_div.find('a').get_text()

            # get nominee movie(s)
            movie_title = nominee.find('div', {'class': 'awards-result-film-title'}).find('a').get_text()

            # get winner boolean: True when the nominee entry contains any <span>
            # NOTE(review): presumably the page only renders a <span> for winners - confirm against the markup
            win = nominee.find('span') is not None

            # append nominee info
            act_list.append({
                'name': nominee_name,
                'year': year,
                'award': award_title,
                'movie': movie_title,
                'win': win,
            })

In [51]:
# turn the scraped nomination records into a table (one row per nominee)
act_df = pd.DataFrame(data=act_list)

# preview the last 20 rows
act_df.iloc[-20:]

Unnamed: 0,name,year,award,movie,win
1848,Antonio Banderas,2019,ACTOR IN A LEADING ROLE,Pain and Glory,False
1849,Leonardo DiCaprio,2019,ACTOR IN A LEADING ROLE,Once upon a Time...in Hollywood,False
1850,Adam Driver,2019,ACTOR IN A LEADING ROLE,Marriage Story,False
1851,Joaquin Phoenix,2019,ACTOR IN A LEADING ROLE,Joker,True
1852,Jonathan Pryce,2019,ACTOR IN A LEADING ROLE,The Two Popes,False
1853,Tom Hanks,2019,ACTOR IN A SUPPORTING ROLE,A Beautiful Day in the Neighborhood,False
1854,Anthony Hopkins,2019,ACTOR IN A SUPPORTING ROLE,The Two Popes,False
1855,Al Pacino,2019,ACTOR IN A SUPPORTING ROLE,The Irishman,False
1856,Joe Pesci,2019,ACTOR IN A SUPPORTING ROLE,The Irishman,False
1857,Brad Pitt,2019,ACTOR IN A SUPPORTING ROLE,Once upon a Time...in Hollywood,True
