# Web scraping to create relevant datasets
**Josh Villarreal**

In [3]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from progressbar import ProgressBar
import requests

In [4]:
# Request headers carrying a desktop-browser User-Agent string; passed to
# requests.get so the request identifies itself as a browser.
# NOTE(review): presumably needed because the site rejects the default
# client User-Agent -- not verified here.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

---
## Official Academy Awards Database

Here, we scrape <a href='http://awardsdatabase.oscars.org/'>The Official Academy Awards Database</a> for data relevant to our project. We trace back past Oscar winners and Oscar nominees in the categories of "Best Actor," "Best Actress," "Best Actor in a Supporting Role," "Best Actress in a Supporting Role," and "Best Motion Picture." The goal of this section of the notebook is to create a CSV file `awards.csv` that consolidates the best actor/actress award winners and nominees, storing:
- actor/actress's names
- year they were nominated for the award
- whether they won the award
- the movie title for which they were nominated.

This CSV file will be used in the data visualization website to better understand the relationships between Oscar-nominees, and perhaps even what differentiates award-nominees from award-winners.

In [8]:
# Search-results URL for the awards database. The URL-encoded `query`
# parameter decodes to the JSON object
#   {"AwardCategory":["1","2","3","4","19"],"Sort":"3-Award Category-Chron","Search":"Basic"}
# NOTE(review): the category ids presumably select the four acting
# categories plus best picture -- confirm against the site.
awards_url = 'http://awardsdatabase.oscars.org/Search/GetResults?query=%7B%22AwardCategory%22:[%221%22,%222%22,%223%22,%224%22,%2219%22],%22Sort%22:%223-Award%20Category-Chron%22,%22Search%22:%22Basic%22%7D'

In [9]:
# Download the search-results page at awards_url and parse it into a soup.
# The timeout keeps a stalled connection from hanging the notebook forever,
# and raise_for_status() surfaces an HTTP error immediately instead of
# letting an error page be parsed as if it were the results.
awards_html = requests.get(awards_url, headers=headers, timeout=60)
awards_html.raise_for_status()
awards_soup = BeautifulSoup(awards_html.text, 'html.parser')

In [10]:
# gather every <div class="awards-result-chron"> element in the page; the
# scraping loop treats each one as a single award year
oscars = awards_soup.find_all(name='div', attrs={'class': 'awards-result-chron'})

### Data creation for Oscar-nominated actors and actresses

In [None]:
# Walk every block in `oscars` and collect one record per acting nomination:
# nominee name, award year, category title, film title and whether it won.
awards_list = []

# init progress bar
pb = ProgressBar()

# scraping
for oscar in pb(oscars):

    # get current year and make legible: a label containing '/' maps to the
    # year after the first one listed; otherwise the text before the first
    # space is used as the year
    # NOTE(review): presumably labels look like '1927/28 ...' or '1950 ...' -- confirm
    unparsed_year = oscar.find('div', {'class': 'result-group-title'}).find('a').get_text()
    if '/' in unparsed_year:
        year = str(int(unparsed_year.split('/', 1)[0]) + 1)
    else:
        year = unparsed_year.split(' ', 1)[0]

    # extract all awards listed under current Oscar year
    awards = oscar.find_all('div', {'class': 'subgroup-awardcategory-chron'})

    # iterate over all awards
    for award in awards:

        # get current award title
        award_title_div = award.find('div', {'class': 'result-subgroup-title'})
        award_title = award_title_div.find('a').get_text()

        # only best actor/actress: skip the best-picture categories (titles
        # containing PICTURE or PRODUCTION) ...
        if 'PICTURE' in award_title or 'PRODUCTION' in award_title:
            continue
        # ... and the special award of '27 (Charlie Chaplin), which is added by hand
        if award_title == 'SPECIAL AWARD':
            continue

        # get list of nominees
        unparsed_nominees = award.find_all('div', {'class': 'awards-result-actingorsimilar'})
        for nominee in unparsed_nominees:

            # get nominee name (link text of the first nomination statement)
            nominee_div = nominee.find_all('div', {'class': 'awards-result-nominationstatement'})
            nominee_name = nominee_div[0].find('a').get_text()

            # get nominee movie(s)
            movie_title = nominee.find('div', {'class': 'awards-result-film-title'}).find('a').get_text()

            # get winner boolean: an entry containing any <span> counts as a win
            win = nominee.find('span') is not None

            # append nominee info
            awards_list.append({
                'name': nominee_name,
                'year': int(year),
                'award': award_title,
                'movie': movie_title,
                'win': win,
            })

In [None]:
# the '27 special award is not picked up by the scraping loop, so add it by hand
awards_list.append({
    'name': 'Charles Chaplin',
    'year': 1928,
    'award': 'SPECIAL AWARD',
    'movie': 'The Circus',
    'win': True,
})

# build the dataframe, ordered by year then award, with a fresh 0..n-1 index
awards_df = (
    pd.DataFrame(awards_list)
    .sort_values(by=['year', 'award'])
    .reset_index(drop=True)
)

In [5]:
# store resulting dataframe as csv file (export left disabled; uncomment to
# write the freshly scraped table)
#awards_df.to_csv('../data/awards.csv', index=False)

# read in data as necessary: this rebinds awards_df to the contents of the
# backup CSV, replacing any dataframe built earlier in the session
# NOTE(review): the export above targets awards.csv while this reads
# awards_backup.csv -- presumably a manually saved copy; confirm
awards_df = pd.read_csv('../data/awards_backup.csv')

In [6]:
# preview the first 20 rows of the awards table
awards_df.head(20)

Unnamed: 0,name,year,award,movie,win
0,Richard Barthelmess,1928,ACTOR,The Noose,False
1,Emil Jannings,1928,ACTOR,The Last Command,True
2,Louise Dresser,1928,ACTRESS,A Ship Comes In,False
3,Janet Gaynor,1928,ACTRESS,7th Heaven,True
4,Gloria Swanson,1928,ACTRESS,Sadie Thompson,False
5,Charles Chaplin,1928,SPECIAL AWARD,The Circus,True
6,George Bancroft,1929,ACTOR,Thunderbolt,False
7,Warner Baxter,1929,ACTOR,In Old Arizona,True
8,Chester Morris,1929,ACTOR,Alibi,False
9,Paul Muni,1929,ACTOR,The Valiant,False


---

## IMDb
Using the <a href='https://buildmedia.readthedocs.org/media/pdf/imdbpy/latest/imdbpy.pdf'>IMDbPY Python package</a> (imported below), we are able to retrieve data from the IMDb servers on actors and actresses to allow us to finish our data acquisition. The goals of the next few sections of code are to:
- generate a CSV file `actors.csv` that stores actors' and actresses' biographical information, including their place of birth, birthday, and headshot, alongside the titles of each actor's/actress's film and television appearances *(note: the titles of an actor's/actress's appearances are stored as a single string, with titles separated by the delimiter ";;", something we will parse later)*
- generate a CSV file `connections.csv` that encodes the movies/television shows that the Oscar-nominated actors and actresses from above costarred in
- generate an accompanying CSV file `connected_films.csv` that denotes a list of movies that two actors costar in.

These CSV files will be useful in our visualization because biographical data on actors/actresses and the production data on movies/TV shows will allow us to provide finer detail when presenting movies and the like that actors have in common, while the `connections.csv` file will be especially useful when thinking about displaying the web of nominees that have worked together.

### Data creation for actor/actress biographical information

In [7]:
import imdb
# IMDb access object; used for the person searches and profile lookups
ia = imdb.IMDb()

In [8]:
# TO DELETE LATER: DROP IRRELEVANT AWARDS
# remove, in place, every row whose award title contains 'PICTURE'
is_picture_award = awards_df.award.str.contains('PICTURE')
picture_rows = awards_df.loc[is_picture_award].index
awards_df.drop(index=picture_rows, inplace=True)

In [9]:
# alphabetically sorted, de-duplicated names of every nominee in awards_df
actors = sorted(set(awards_df['name'].values))

In [10]:
# Build one biography record per nominee from their IMDb profile: birthday,
# birthplace, headshot and a ';;'-joined string of the titles in their
# acting filmography.
bio_list = []

# init progress bar
pb = ProgressBar()

for actor in pb(actors):

    # get imdb profile of current actor (first search hit for the name)
    # NOTE(review): assumes the search returns at least one result and that
    # the first hit is the right person -- confirm for ambiguous names
    profile = ia.get_person(ia.search_person(actor)[0].personID)

    # get birth info. Only a missing field is treated as "no data", so that
    # Ctrl-C (KeyboardInterrupt) and genuine errors are not silently swallowed.
    # NOTE(review): assumes a missing profile field raises KeyError, as with
    # a dict -- confirm against the IMDbPY version in use
    try:
        birthday = profile['birth date']
    except KeyError:
        birthday = ''
        print(actor + ' doesn\'t have a birthday')

    try:
        birthplace = profile['birth info']['birth place']
    except (KeyError, TypeError):
        birthplace = ''
        print(actor + ' doesn\'t have a birthplace')

    # get headshot info
    try:
        headshot = profile['headshot']
    except KeyError:
        headshot = ''
        print(actor + ' doesn\'t have a headshot')

    # get list of film appearances (as an actor or actress); a profile with
    # neither entry still raises, as before
    try:
        appearances = profile['filmography']['actor']
    except KeyError:
        appearances = profile['filmography']['actress']

    # get & format list of film titles: a title ending in ')' is cut at its
    # first ' (' to drop the parenthesised suffix; endswith() also copes with
    # an empty title, which title[-1] would not
    unparsed_titles = [appearance['title'] for appearance in appearances]
    film_titles = [
        title.split(' (', 1)[0] if title.endswith(')') else title
        for title in unparsed_titles
    ]
    film_titles_string = ';;'.join(film_titles)

    # store data
    bio_list.append({
        'name': actor,
        'birthday': birthday,
        'birthplace': birthplace,
        'headshot': headshot,
        'films': film_titles_string,
    })

  0% |                                                                        |

KeyboardInterrupt: 

In [None]:
# generate DataFrame (one row per record in bio_list) and preview the first 10 rows
actors_df = pd.DataFrame(bio_list)
actors_df.head(10)

In [13]:
# save resulting dataframe (export left disabled; uncomment to write the
# freshly scraped table)
#actors_df.to_csv('../data/actors.csv', index=False)

# read in dataframe if necessary: this rebinds actors_df to the contents of
# the backup CSV, replacing any dataframe built earlier in the session
# NOTE(review): the export above targets actors.csv while this reads
# actors_backup.csv -- presumably a manually saved copy; confirm
actors_df = pd.read_csv('../data/actors_backup.csv')
actors_df.head(10)

Unnamed: 0,name,birthday,birthplace,headshot,films
0,Abigail Breslin,1996-04-14,"New York City, New York, USA",https://m.media-amazon.com/images/M/MV5BMzU5MT...,Slayers;;Saturday at the Starlight;;Stillwater...
1,Adam Driver,1983-11-19,"San Diego, California, USA",https://m.media-amazon.com/images/M/MV5BOWViYj...,65;;Gucci;;Annette;;The Last Duel;;Star Wars: ...
2,Adolph Caesar,1933-12-05,"New York City, New York, USA",https://m.media-amazon.com/images/M/MV5BMTUwMj...,Silverhawks;;Club Paradise;;ABC Afterschool Sp...
3,Adolphe Menjou,1890-02-18,"Pittsburgh, Pennsylvania, USA",https://m.media-amazon.com/images/M/MV5BMTg2Nz...,The DuPont Show with June Allyson;;Pollyanna;;...
4,Adriana Barraza,1956-03-05,"Toluca, Estado de Mexico, Mexico",https://m.media-amazon.com/images/M/MV5BM2E4OD...,Quatremares;;Yefon;;Monica;;We Can Be Heroes;;...
5,Adrien Brody,1973-04-14,"Woodhaven, Queens, New York City, New York, USA",https://m.media-amazon.com/images/M/MV5BMjI3OD...,The Salamander Lives Twice;;Chapelwaite;;El To...
6,Agnes Moorehead,1900-12-06,"Clinton, Massachusetts, USA",https://m.media-amazon.com/images/M/MV5BMTc4MD...,Rex Harrison Presents Stories of Love;;Franken...
7,Akim Tamiroff,1899-10-29,"Tiflis, Russian Empire [now Tbilisi, Republic ...",https://m.media-amazon.com/images/M/MV5BMjE5Nj...,Don Quixote;;Death of a Jew;;The Great Bank Ro...
8,Al Pacino,1940-04-25,"Manhattan, New York City, New York, USA",https://m.media-amazon.com/images/M/MV5BMTQzMz...,King Lear;;Gucci;;Axis Sally;;Hunters;;The Iri...
9,Alan Alda,1936-01-28,"Manhattan, New York City, New York, USA",https://m.media-amazon.com/images/M/MV5BMTE5ND...,Ray Donovan;;Marriage Story;;The Good Fight;;T...


### Network data creation

In [14]:
# every distinct title found in any actor's ';;'-delimited film string,
# in sorted order
all_films = sorted({
    title
    for film_string in actors_df.films.values
    for title in film_string.split(';;')
})

# lookup tables between a film title and its position in all_films
film_to_id = {film: film_id for film_id, film in enumerate(all_films)}
id_to_film = dict(enumerate(all_films))

Our plan for finding out which films feature two or more Oscar-nominated actors is as follows:
1. Generate a vector $\mathbf{x}_i = (x_{i1}, x_{i2}, \dots, x_{iJ})^T$ for each actor $i$, where $J$ is the total number of films and $x_{ik}$ is $1$ if actor $i$ is cast in film $k$ and $0$ otherwise.
2. Compute the vector $\mathbf{X} = \sum_{i} \mathbf{x}_i$. This vector has $J$ components, with component $X_k$ giving the number of Oscar-nominated actors appearing in film $k$.
3. Throw out films with $X_k = 1$. These correspond to films with only one Oscar-nominated participant.

In [15]:
# number of film slots in every indicator vector
n_films = len(all_films)

# running total X: how many actors appear in each film
summed_n_actors = np.zeros((n_films,))

# one indicator row per actor, stacked into a matrix once the loop is done
actor_rows = []

# init progress bar
pb = ProgressBar()

for actor in pb(actors):

    # titles this actor appears in, read from their row of actors_df
    films_string = actors_df.loc[actors_df['name'] == actor]['films'].values[0]
    film_ids = [film_to_id[film] for film in films_string.split(';;')]

    # indicator vector: 1 at every film the actor appears in, 0 elsewhere
    # (a repeated index is still only incremented once by this assignment)
    actor_vector = np.zeros((n_films,))
    actor_vector[film_ids] += 1

    # fold into the running total and keep the row
    summed_n_actors += actor_vector
    actor_rows.append(actor_vector)

actor_film_matrix = np.array(actor_rows)

100% |########################################################################|


In [16]:
# indices of all productions in which more than one actor appears
(costarred_ind,) = np.where(summed_n_actors > 1)

# titles of those productions
costarred_films = [id_to_film[ind] for ind in costarred_ind]

In [17]:
# lookup tables between an actor's name and their position in `actors`
id_to_actor = dict(enumerate(actors))
actor_to_id = {actor: actor_id for actor_id, actor in id_to_actor.items()}

In [18]:
# dimensions of the indicator matrix: (number of actors, number of films)
actor_film_matrix.shape

(935, 38054)

Notice that `actor_film_matrix` is a (N. actors) $\times$ (N. films)-dimensional matrix, with each row being a (N. films)-dimensional vector whose components are booleans outlining the filmography of the corresponding actor. That means that if the dot product of two rows of this matrix is a number $>0$, then the two actors are cast in the same film. We can therefore find which actors are connected to one another by computing the quantity `actor_film_matrix` $\cdot$ `actor_film_matrix`$^T$. The result, `connection_matrix`, will be a symmetric matrix with diagonal elements equal to the number of movies in each actor's filmography.

In [19]:
# actor-by-actor matrix of shared-title counts: entry (a, b) is the dot
# product of actor a's and actor b's film-indicator rows
connection_matrix = actor_film_matrix @ actor_film_matrix.T
connection_matrix

array([[ 49.,   1.,   0., ...,   2.,   0.,   0.],
       [  1.,  46.,   0., ...,   2.,   0.,   0.],
       [  0.,   0.,  25., ...,   0.,   0.,   0.],
       ...,
       [  2.,   2.,   0., ..., 104.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   4.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,  47.]])

In [20]:
# wrap the connection matrix in a dataframe and relabel both axes from
# integer positions to actor names via id_to_actor
connection_df = pd.DataFrame(connection_matrix).rename(
    index=id_to_actor,
    columns=id_to_actor,
)
connection_df

Unnamed: 0,Abigail Breslin,Adam Driver,Adolph Caesar,Adolphe Menjou,Adriana Barraza,Adrien Brody,Agnes Moorehead,Akim Tamiroff,Al Pacino,Alan Alda,...,William H. Macy,William Hickey,William Holden,William Hurt,William Powell,Winona Ryder,Woody Allen,Woody Harrelson,Yalitza Aparicio,Yul Brynner
Abigail Breslin,49.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
Adam Driver,1.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,...,2.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0
Adolph Caesar,0.0,0.0,25.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adolphe Menjou,0.0,0.0,0.0,151.0,0.0,0.0,2.0,2.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
Adriana Barraza,0.0,0.0,0.0,0.0,62.0,0.0,0.0,0.0,0.0,1.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Winona Ryder,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,65.0,0.0,2.0,0.0,0.0
Woody Allen,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0,...,2.0,0.0,1.0,0.0,0.0,0.0,48.0,0.0,0.0,0.0
Woody Harrelson,2.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,2.0,0.0,104.0,0.0,0.0
Yalitza Aparicio,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0


In [21]:
# save connection df
#connection_df.to_csv('../data/connections.csv')

Now that we have insight into which actors costar in movies together, we should store data on those movies. This is what the next fragment of code takes care of.

In [23]:
# display the connection matrix again for reference; a nonzero off-diagonal
# entry marks a pair of actors with at least one title in common
connection_matrix

array([[ 49.,   1.,   0., ...,   2.,   0.,   0.],
       [  1.,  46.,   0., ...,   2.,   0.,   0.],
       [  0.,   0.,  25., ...,   0.,   0.,   0.],
       ...,
       [  2.,   2.,   0., ..., 104.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   4.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,  47.]])

In [72]:
# Record, for every pair of actors with at least one shared title, the
# ';;'-joined, sorted list of the titles they have in common.
costars_dict = {}

# each actor's filmography as a set of titles, filled in on first use so the
# dataframe lookup and string split happen once per actor rather than once
# per connected pair
filmography_sets = {}

# init progress bar
pb = ProgressBar()

# get shared movie information
for actor1 in pb(actors):
    id1 = actor_to_id[actor1]

    # connection_matrix is symmetric, so only need to look at half of matrix
    for actor2 in actors[:id1]:

        # no connection
        if connection_matrix[id1, actor_to_id[actor2]] == 0:
            continue

        # there is a connection: make sure both filmographies are parsed
        for name in (actor1, actor2):
            if name not in filmography_sets:
                films_string = actors_df.loc[actors_df['name'] == name]['films'].values[0]
                filmography_sets[name] = set(films_string.split(';;'))

        # find movies in common
        shared_movies = filmography_sets[actor1] & filmography_sets[actor2]

        # append to dict, keyed on the alphabetically ordered pair of names
        costars_dict[tuple(sorted([actor1, actor2]))] = ';;'.join(sorted(shared_movies))

100% |########################################################################|


In [80]:
# one row per connected pair: the dict keys (pairs of actor names) become
# the 'actors' column and the shared-title strings the 'films' column
connected_films_df = (
    pd.DataFrame.from_dict(costars_dict, orient='index', columns=['films'])
    .reset_index()
    .rename(columns={'index': 'actors'})
)
connected_films_df.head(10)

Unnamed: 0,actors,films
0,"(Abigail Breslin, Adam Driver)",Law & Order: Special Victims Unit
1,"(Adolph Caesar, Agnes Moorehead)",The Twilight Zone;;The Wild Wild West
2,"(Adolphe Menjou, Agnes Moorehead)",Pollyanna;;The Swan
3,"(Adolphe Menjou, Akim Tamiroff)",The DuPont Show with June Allyson;;The Great F...
4,"(Agnes Moorehead, Akim Tamiroff)",Climax!;;Dragon Seed;;Matinee Theatre;;Playhou...
5,"(Adam Driver, Al Pacino)",Gucci;;You Don't Know Jack
6,"(Adam Driver, Alan Alda)",Marriage Story
7,"(Adriana Barraza, Alan Alda)",ER
8,"(Akim Tamiroff, Alan Alda)",Naked City;;Route 66
9,"(Abigail Breslin, Alan Arkin)",Little Miss Sunshine;;The Santa Clause 3: The ...
