This notebook is used to generate the data used in the javascript sketch.js. This notebook scrapes best picture nominated name, runtime, whether it was directed by a female, and rotten tomatoes score and saves it all in the movie_dict.json file.

In [98]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import json
import time
import numpy as np

In [2]:
# list of number of nominees each year (descending at first)
movie_num = [10,10,8,9,8,9,9,8,8,9,9,9,10,10,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,10,10,10,10,10,10,10,10,12,12,10,8,5,5,5,3];
movie_num.reverse()

## Get All Nominated Movies

In [3]:
url = 'https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

In [4]:
table = soup.find_all('table', class_='wikitable')

In [5]:
best_pictures = []
for tab in table:
    for row in tab.find_all('tr')[1:]:
        cells = row.find_all('td')
        movie = cells[0].find('a').text.strip()
        if movie == 'Columbia Pictures':
            break
        if movie == '1917':
            best_pictures.append(movie)
        if movie.isdigit() or movie.split("/")[0].isdigit():
            continue
        else:
            best_pictures.append(movie)

In [6]:
len(best_pictures)

591

In [7]:
## account for duplicates
duplicates = []
seen = {}

for item in best_pictures:
    if item not in seen:
        seen[item] = 1
    else:
        if seen[item] == 1:
            duplicates.append(item)
        seen[item] += 1

print(duplicates)

['Mutiny on the Bounty', 'Cleopatra', 'Romeo and Juliet', 'Heaven Can Wait', 'Les Misérables', 'A Star Is Born', 'Little Women', 'West Side Story', 'All Quiet on the Western Front']


In [8]:
def clean_list(lst):
    result = []
    seen = {}

    for item in lst:
        if item not in seen:
            seen[item] = 2
            result.append(item)
        else:
            suffix = "_" + str(seen[item])
            new_item = item + suffix
            seen[item] += 1
            result.append(new_item)

    return result

In [9]:
best_pictures_cleaned = clean_list(best_pictures)

In [10]:
duplicates2 = []
seen = {}

for item in best_pictures_cleaned:
    if item not in seen:
        seen[item] = 1
    else:
        if seen[item] == 1:
            duplicates.append(item)
        seen[item] += 1

print(duplicates2)

[]


## Films Directed By Female

In [11]:
female_directed = ['Children of a Lesser God', 
                   'Awakenings', 
                   'The Prince of Tides',
                   'The Piano',
                   'Lost in Translation',
                   'Little Miss Sunshine',
                   'The Hurt Locker',
                   'An Education',
                   'The Kids Are All Right',
                   "Winter's Bone",
                   "Zero Dark Thirty",
                   "Selma",
                   "Lady Bird",
                   "Little Women_2",
                   "Nomadland",
                   "Promising Young Woman",
                   "CODA",
                   "The Power of the Dog",
                   "Women Talking"
                  ]

## Scrape Runtimes

In [12]:
runtime_tuple = []
base_url = 'https://en.wikipedia.org/'
seen_movies = []
seen = {}
for tab in table:
    for row in tab.find_all('tr')[1:]:
        cells = row.find_all('td')
        movie = cells[0].find('a').text.strip()

        if movie not in seen:
            seen[movie] = 2
            seen_movies.append(movie)
        else:
            suffix = "_" + str(seen[movie])
            new_item = movie + suffix
            seen[movie] += 1
            seen_movies.append(new_item)
            movie = new_item
        if movie == 'Columbia Pictures':
            break
        if (movie.isdigit() and movie != '1917') or (movie.split("/")[0].isdigit() and movie != '1917'):
            continue
        else:
            movie_url = base_url + cells[0].a['href']
            response = requests.get(movie_url)
            soup = BeautifulSoup(response.content, 'html.parser')
            # Find the infobox table on the movie's Wikipedia page
            infobox = soup.find('table', {'class': 'infobox vevent'})
            # Extract the running time from the infobox
            running_time_row = infobox.find('th', text='Running time').parent
            # Extract the running time value
            running_time = running_time_row.find('td', {'class': 'infobox-data'}).text.strip()
            runtime_tuple.append((movie, running_time))
            print(movie, running_time)

Wings Original release:111 minutes[2]Restoration:144 minutes[3]
7th Heaven 110 min
The Racket 84 minutes
The Broadway Melody 100 minutes
Alibi 90 minutes
Hollywood Revue 130 minutes (roadshow)118 min (Turner library print)
In Old Arizona 95 minutes
The Patriot 113 minutes
All Quiet on the Western Front 152 minutes[2]133 minutes (restored)
The Big House 87 minutes
Disraeli 90 minutes (1929 release)87 minutes (1934 re-release)
The Divorcee 84 mins.
The Love Parade 107 minutes
Cimarron 124 minutes[1]
East Lynne 102 minutes, 9,188 ft., or 10 reels[2]
The Front Page 101 minutes
Skippy 85 minutes
Trader Horn 122 minutes
Grand Hotel 112 minutes
Arrowsmith 108 minutes
Bad Girl 90 minutes
The Champ 87 minutes
Five Star Final 89 minutes
One Hour with You 80 minutes
Shanghai Express 80 minutes
The Smiling Lieutenant 89 minutes
Cavalcade 112 minutes
42nd Street 89 minutes
A Farewell to Arms 88 minutes
I Am a Fugitive from a Chain Gang 93 minutes
Lady for a Day 96 minutes
Little Women 115 minutes
T

Bonnie and Clyde 111 minutes
Doctor Dolittle 152 minutes
The Graduate 106 minutes[1]
Guess Who's Coming to Dinner 108 minutes[1]
Oliver! 153 minutes
Funny Girl 149 minutes[1]
The Lion in Winter 134 minutes
Rachel, Rachel 101 minutes[1]
Romeo and Juliet_2 138 minutes
Midnight Cowboy 113 minutes
Anne of the Thousand Days 145 minutes
Butch Cassidy and the Sundance Kid 110 minutes[2]
Hello, Dolly! 146 minutes
Z 127 minutes[1]
Patton 172 minutes
Airport 137 minutes
Five Easy Pieces 98 minutes
Love Story 101 minutes[1]
M*A*S*H 116 minutes
The French Connection 104 minutes[2]
A Clockwork Orange 136 minutes[2]
Fiddler on the Roof 179 minutes[1]
The Last Picture Show 118 minutes
Nicholas and Alexandra 188 minutes[1]
The Godfather 175 minutes[1]
Cabaret 124 minutes
Deliverance 109 minutes
The Emigrants 191 minutes
Sounder 105 minutes
The Sting 129 minutes
American Graffiti 112 minutes
Cries and Whispers 91 minutes[2]
The Exorcist 122 minutes
A Touch of Class 106 minutes
The Godfather Part II 202

Lion 118 minutes[2]
Manchester by the Sea 137 minutes[1]
The Shape of Water 123 minutes[1]
Call Me by Your Name 132 minutes[1]
Darkest Hour 125 minutes[2]
Dunkirk 106 minutes[4]
Get Out 104 minutes
Lady Bird 94 minutes[2]
Phantom Thread 130 minutes[2]
The Post 116 minutes[5]
Three Billboards Outside Ebbing, Missouri 115 minutes[2]
Green Book 130 minutes[3]
Black Panther 134 minutes[1]
BlacKkKlansman 135 minutes[1]
Bohemian Rhapsody 134 minutes[2]
The Favourite 120 minutes[3]
Roma 135 minutes[9]
A Star Is Born_2 136 minutes[5]
Vice 132 minutes
Parasite 132 minutes[3][4]
Ford v Ferrari 152 minutes[2]
The Irishman 209 minutes[1]
Jojo Rabbit 108 minutes
Joker 122 minutes[1]
Little Women_2 135 minutes[1]
Marriage Story 137 minutes[2]
1917 119 minutes[1]
Once Upon a Time in Hollywood 161 minutes[1]
Nomadland 108 minutes[2]
The Father 97 minutes[2]
Judas and the Black Messiah 126 minutes
Mank 131 minutes[2]
Minari 115 minutes[1]
Promising Young Woman 113 minutes
Sound of Metal 120 minutes
The

In [13]:
import re

text = "The movie is 120 minutes long and has a runtime of 130 min."

pattern = r'(\d+)\s*(?:minutes|min)'
matches = re.findall(pattern, text)

if matches:
    minutes = max(int(m) for m in matches)
    print(minutes)

130


In [14]:
runtime_dict = {}
for tup in tqdm(runtime_tuple):
    matches = re.findall(pattern, tup[1])
    runtime = max(int(m) for m in matches)
    runtime_dict[tup[0]] = runtime

100%|██████████| 591/591 [00:00<00:00, 84173.78it/s]


In [16]:
runtime_dict['The Sundowners'] = 133

## Scrape Rotten Tomatoes Score

In [None]:
count = 0
ratings_tuple = []
base_url = 'https://en.wikipedia.org/'
seen_movies = []
seen = {}

In [55]:

for j, tab in enumerate(table[8:]):
    print("Table index: " + str(j))
    for i, row in enumerate(tab.find_all('tr')[60:]):
        print("Tab Index: " + str(i))
        cells = row.find_all('td')
        movie = cells[0].find('a').text.strip()

        if movie not in seen:
            seen[movie] = 2
            seen_movies.append(movie)
        else:
            suffix = "_" + str(seen[movie])
            new_item = movie + suffix
            seen[movie] += 1
            seen_movies.append(new_item)
            movie = new_item
        if movie == 'Columbia Pictures':
            break
        if (movie.isdigit() and movie != '1917') or (movie.split("/")[0].isdigit() and movie != '1917'):
            continue
        else:
            movie_url = base_url + cells[0].a['href']
            response = requests.get(movie_url)
            soup = BeautifulSoup(response.content, 'html.parser')
            count += 1
            print(movie)
            if count % 10 == 0:
                time.sleep(5)
            # Find the infobox table on the movie's Wikipedia page
            for link in soup.find_all('a', href=True, attrs={"class": "external text"}):
                if 'rottentomatoes' in link['href']:
                    tomatoes_response = requests.get(link['href'])
                    soup = BeautifulSoup(tomatoes_response.content, 'html.parser')
                    percentage_dict = soup.find('score-board')
                    if percentage_dict:
                        percentage = percentage_dict['tomatometerscore']
                    else:
                        percentage = 'Missing'
                    ratings_tuple.append((movie, percentage))
                    print(percentage)

Table index: 0
Tab Index: 0
An Education
93
93
Tab Index: 1
Inglourious Basterds
89
89
Tab Index: 2
Precious: Based on the Novel 'Push' by Sapphire
92
92
92
Tab Index: 3
A Serious Man
89
Missing
89
Tab Index: 4
Up
98
Tab Index: 5
Up in the Air
90
Missing
Missing
Table index: 1
Tab Index: 0
2016_2
Tab Index: 1
Moonlight_2
98
Missing
98
Tab Index: 2
Arrival_2
95
Missing
95
Tab Index: 3
Fences_2
92
92
Tab Index: 4


KeyboardInterrupt: 

In [56]:
rating_dict = {}
two_scores = []
for movie in tqdm(best_pictures_cleaned):
    for ratings_tup in ratings_tuple:
        if movie == ratings_tup[0]:
            if movie in rating_dict:
                if rating_dict[movie].isdigit() and ratings_tup[1].isdigit():
                    if rating_dict[movie] == ratings_tup[1]:
                        pass
                    else:
                        two_scores.append(movie)
                if rating_dict[movie] == 'Missing':
                    rating_dict[movie] = ratings_tup[1]
            else:
                rating_dict[movie] = ratings_tup[1]

100%|██████████| 591/591 [00:00<00:00, 2828.59it/s]


In [57]:
rating_dict['The Exorcist'] = '84'

In [58]:
missing_score = []
for movie in tqdm(best_pictures_cleaned):
    if movie not in rating_dict:
        missing_score.append(movie)

100%|██████████| 591/591 [00:00<00:00, 625968.10it/s]


In [59]:
missing_score

['127 Hours',
 'Black Swan',
 'Nomadland',
 'The Father',
 'Judas and the Black Messiah']

In [60]:
rating_dict['127 Hours'] = '93'
rating_dict['Black Swan'] = '85'
rating_dict['Nomadland'] = '93'
rating_dict['The Father'] = '98'
rating_dict['Judas and the Black Messiah'] ='97'

In [61]:
len(rating_dict)

591

In [66]:
for k, v in rating_dict.items():
    if v.isdigit():
        pass
    else:
        print(k)

East Lynne
Bad Girl
Lady for a Day
Smilin' Through
State Fair
The Little Foxes
The More the Merrier
Henry V
The Razor's Edge
The Yearling
Great Expectations
Battleground
Father of the Bride
King Solomon's Mines
Decision Before Dawn
The Rose Tattoo
Friendly Persuasion
Peyton Place
Cat on a Hot Tin Roof
Sons and Lovers
Fanny
Patton
The Theory of Everything


In [67]:
rating_dict['East Lynne'] = '94'
rating_dict['Bad Girl'] = '40'
rating_dict['Lady for a Day'] = '100'
rating_dict['Smilin\' Through'] = '46'
rating_dict['State Fair'] = '52'
rating_dict['The Little Foxes'] = '100'
rating_dict['The More the Merrier'] = '100'
rating_dict['Henry V'] = '100'
rating_dict['The Razor\'s Edge'] = '60'
rating_dict['The Yearling'] = '100'
rating_dict['Great Expectations'] = '85'
rating_dict['Battleground'] = '75'
rating_dict['Father of the Bride'] = '90'
rating_dict['King Solomon\'s Mines'] = '92'
rating_dict['Decision Before Dawn'] = '100'
rating_dict['The Rose Tattoo'] = '63'
rating_dict['Friendly Persuasion'] = '89'
rating_dict['Peyton Place'] = '65'
rating_dict['Cat on a Hot Tin Roof'] = '97'
rating_dict['Sons and Lovers'] = '85'
rating_dict['Fanny'] = '80'
rating_dict['Patton'] = '90'
rating_dict['The Theory of Everything'] = '80'

In [69]:
for key in rating_dict:
    rating_dict[key] = int(rating_dict[key])

In [70]:
with open('rating_dict.json', 'w') as f:
    json.dump(rating_dict, f)

## Get Best Picture Winners

In [71]:
best_picture_winners = []
for tab in table:
    for row in tab.find_all('tr')[1:]:
        if 'style' in row.attrs and 'background:#FAEB86' in row['style']:
            cells = row.find_all('td')
            movie = cells[0].find('a').text.strip()
            if movie == 'Columbia Pictures':
                break
            if (movie.isdigit() and movie != '1917') or (movie.split("/")[0].isdigit() and movie != '1917'):
                continue
            else:
                best_picture_winners.append(movie)

In [72]:
len(best_picture_winners)

94

In [73]:
best_picture_winners.append("Everything Everywhere All at Once")

## Create Final Movie Dict

In [92]:
movie_dict = {}
year = 1928
movies = 0
year_count = 0

for i, movie in enumerate(best_pictures_cleaned):

    female_director = False
    
    # female directed 
    if movie in female_directed:
        female_director = True
            
    # best picture
    if movie in best_picture_winners:
         best_picture = True
    else:
        best_picture = False
        
    # runtime
    runtime = runtime_dict[movie]
    
    if year in movie_dict:
        movie_dict[year][movie] = {
            'female_director': female_director,
            'best_picture': best_picture,
            'runtime': runtime,
            'rating': rating_dict[movie]
        }
    else:
        movie_dict[year] = {
            movie: {
                'female_director': female_director,
                'best_picture': best_picture,
                'runtime': runtime,
                'rating': rating_dict[movie]
            }
        }
    movies += 1
    if movies == movie_num[year_count]:
        year += 1
        year_count += 1
        movies = 0

In [93]:
for k, v in movie_dict.items():
    contains = False
    for l, m in v.items():
        if m['best_picture'] == True:
            contains = True
    if contains == False:
        print(k)

In [94]:
movie_dict

{1928: {'Wings': {'female_director': False,
   'best_picture': True,
   'runtime': 144,
   'rating': 93},
  '7th Heaven': {'female_director': False,
   'best_picture': False,
   'runtime': 110,
   'rating': 100},
  'The Racket': {'female_director': False,
   'best_picture': False,
   'runtime': 84,
   'rating': 100}},
 1929: {'The Broadway Melody': {'female_director': False,
   'best_picture': True,
   'runtime': 100,
   'rating': 42},
  'Alibi': {'female_director': False,
   'best_picture': False,
   'runtime': 90,
   'rating': 50},
  'Hollywood Revue': {'female_director': False,
   'best_picture': False,
   'runtime': 130,
   'rating': 43},
  'In Old Arizona': {'female_director': False,
   'best_picture': False,
   'runtime': 95,
   'rating': 56},
  'The Patriot': {'female_director': False,
   'best_picture': False,
   'runtime': 113,
   'rating': 100}},
 1930: {'All Quiet on the Western Front': {'female_director': False,
   'best_picture': True,
   'runtime': 152,
   'rating': 98},


In [95]:
# Define a custom key function to ignore 'The ' at the beginning of titles
def ignore_the(title):
    if title.startswith('The '):
        return title[4:]
    else:
        return title

# Loop over each year in the dictionary
for year in movie_dict:
    # Sort the movies in alphabetical order by title while ignoring 'The '
    movie_dict[year] = dict(sorted(movie_dict[year].items(), key=lambda x: ignore_the(x[0])))

In [96]:
with open('movie_dict.json', 'w') as f:
    json.dump(movie_dict, f)

In [97]:
count=0
for k, v in movie_dict.items():
    if count == 94:
        print(k)
        print(v)
    count+=1

2022
{'All Quiet on the Western Front_2': {'female_director': False, 'best_picture': False, 'runtime': 147, 'rating': 90}, 'Avatar: The Way of Water': {'female_director': False, 'best_picture': False, 'runtime': 192, 'rating': 76}, 'The Banshees of Inisherin': {'female_director': False, 'best_picture': False, 'runtime': 114, 'rating': 96}, 'Elvis': {'female_director': False, 'best_picture': False, 'runtime': 159, 'rating': 77}, 'Everything Everywhere All at Once': {'female_director': False, 'best_picture': True, 'runtime': 139, 'rating': 95}, 'The Fabelmans': {'female_director': False, 'best_picture': False, 'runtime': 151, 'rating': 92}, 'Top Gun: Maverick': {'female_director': False, 'best_picture': False, 'runtime': 130, 'rating': 96}, 'Triangle of Sadness': {'female_director': False, 'best_picture': False, 'runtime': 147, 'rating': 72}, 'Tár': {'female_director': False, 'best_picture': False, 'runtime': 158, 'rating': 91}, 'Women Talking': {'female_director': True, 'best_picture': 

In [103]:
all_ratings = []
for k, v in rating_dict.items():
    all_ratings.append(v)

In [104]:
# Calculate the 6 quantiles
all_ratings.sort()

In [105]:
grouped_list = np.array_split(all_ratings, 6)

In [106]:
grouped_list

[array([20, 29, 33, 38, 38, 40, 42, 43, 45, 45, 46, 49, 50, 50, 52, 52, 52,
        53, 56, 56, 56, 56, 57, 57, 60, 60, 60, 60, 61, 63, 63, 63, 63, 64,
        65, 65, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 68, 69, 69, 69,
        69, 70, 70, 71, 71, 71, 71, 71, 71, 72, 72, 72, 72, 72, 73, 74, 74,
        74, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 76, 76, 76, 77, 77, 77,
        78, 78, 78, 78, 78, 78, 78, 79, 79, 79, 79, 79, 79, 79]),
 array([79, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 81, 81, 81,
        81, 82, 82, 82, 82, 82, 82, 83, 83, 83, 83, 83, 83, 83, 83, 83, 83,
        83, 83, 83, 83, 83, 83, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
        84, 84, 84, 84, 84, 84, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85,
        85, 85, 85, 85, 85, 85, 85, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86,
        86, 86, 86, 86, 86, 86, 86, 86, 86, 87, 87, 87, 87, 87]),
 array([87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 88, 88, 88, 88, 88, 88, 88,
        88, 88, 88, 88, 88, 88, 