# Live Session 05: BeautifulSoup

In [4]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import json
# Listing page downloaded below with requests.get (the query string asks for page 5).
url = 'https://www.rottentomatoes.com/browse/movies_in_theaters/sort:a_z?page=5'

In [None]:
# Download the listing page. The timeout stops the cell from hanging forever on
# a stalled connection, and raise_for_status() surfaces a 4xx/5xx answer right
# here instead of letting an error page be parsed as if it were the listing.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

[]

```{html}
<a data-track="scores" data-qa="discovery-media-list-item-caption" href="/m/a_complete_unknown" slot="caption" >
```

Anatomy of HTML:

* Tag: code that represents a specific visual thing to put on the page. 'a' is typically the tag for hyperlinks.
    * Tags are often, but not always, closed with a matching closing tag such as `</a>`
    * here it's `a`

* Attributes: parameters contained within a tag
    * here it's `data-track="scores" data-qa="discovery-media-list-item-caption" href="/m/a_complete_unknown" slot="caption"`

* Navigable string: whatever is typed between the opening and closing tags
    * whatever's between `<a>` and `</a>`

In [16]:
# Collect every <a> tag whose data-track attribute equals "scores".
scores_filter = {'data-track': 'scores'}
movie_list = soup.find_all('a', scores_filter)

In [20]:
# Inspect the href attribute of one of the collected anchors.
sample_anchor = movie_list[4]
sample_anchor['href']

'/m/all_we_imagine_as_light'

In [22]:
# Turn each anchor's site-relative href into an absolute URL.
site_root = 'https://www.rottentomatoes.com'
movie_links = [site_root + anchor['href'] for anchor in movie_list]
movie_links

['https://www.rottentomatoes.com/m/2121',
 'https://www.rottentomatoes.com/m/a_complete_unknown',
 'https://www.rottentomatoes.com/m/a_knights_war',
 'https://www.rottentomatoes.com/m/a_real_pain',
 'https://www.rottentomatoes.com/m/all_we_imagine_as_light',
 'https://www.rottentomatoes.com/m/anora',
 'https://www.rottentomatoes.com/m/are_you_there_2024',
 'https://www.rottentomatoes.com/m/armand',
 'https://www.rottentomatoes.com/m/attack_on_titan_the_last_attack',
 'https://www.rottentomatoes.com/m/becoming_led_zeppelin',
 'https://www.rottentomatoes.com/m/blades_in_the_darkness',
 'https://www.rottentomatoes.com/m/bring_them_down',
 'https://www.rottentomatoes.com/m/captain_america_brave_new_world',
 'https://www.rottentomatoes.com/m/companion_2025',
 'https://www.rottentomatoes.com/m/conclave',
 'https://www.rottentomatoes.com/m/creation_of_the_gods_ii_demon_force',
 'https://www.rottentomatoes.com/m/dark_nuns',
 'https://www.rottentomatoes.com/m/den_of_thieves_2_pantera',
 'https:

In [23]:
# Download a single movie page.
# NOTE: this rebinds `url`, `r` and `soup`, which previously held the listing
# page, so cells that expect the listing-page soup must be re-run from the top.
url = 'https://www.rottentomatoes.com/m/all_we_imagine_as_light'
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() reports a 4xx/5xx answer instead of parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

In [32]:
# Find the first <h1> with class "unset" and keep its text without
# leading/trailing whitespace.
titletag = soup.find('h1', class_='unset')
title = titletag.get_text().strip()

In [34]:
# Start a record for this movie, seeded with the title.
mydict = dict(title=title)
mydict

{'title': 'All We Imagine as Light'}

In [41]:
# Take the trimmed text of the first <rt-text slot="content"> element and
# store it on the record as the description.
desc_tag = soup.find('rt-text', slot='content')
desc = desc_tag.get_text().strip()
mydict['description'] = desc
mydict

{'title': 'All We Imagine as Light',
 'description': "The light, the lives, and the textures of contemporary, working-class Mumbai are explored and celebrated by writer/director Payal Kapadia, who won the Grand Prize at this year's Cannes Film Festival for her revelatory fiction feature debut. Centering on two roommates who also work together in a city hospital--head nurse Prabha (Kani Kusruti) and recent hire Anu (Divya Prabha)--plus their coworker, cook Parvaty (Chhaya Kadam), Kapadia's film alights on moments of connection and heartache, hope and disappointment. Prabha, her husband from an arranged marriage living in faraway Germany, is courted by a doctor at her hospital; Anu carries on a romance with a Muslim man, which she must keep a secret from her strict Hindu family; Parvaty finds herself dealing with a sudden eviction from her apartment. Kapadia captures the bustle of the metropolis and the open-air tranquility of a seaside village with equal radiance, articulated by her sup

In [44]:
def _tag_text(tag):
    """Return the stripped text of a parsed tag, or '' when the tag is None."""
    return tag.text.strip() if tag is not None else ''


def scrape_one(url, timeout=30):
    """Scrape a single movie page into a flat dictionary.

    Parameters
    ----------
    url : str
        Address of the movie page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up; passed straight to
        ``requests.get``. Defaults to 30.

    Returns
    -------
    dict
        Always contains the keys 'Title', 'Synopsis', 'Critic Consensus',
        'Critics Score' and 'Audience Score' (each '' when the matching
        element is not found), plus one entry per ``category-wrap`` div,
        keyed by the label text of that div.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of returning a record of blanks.
    response.raise_for_status()
    mysoup = BeautifulSoup(response.text, 'html.parser')

    mydict = {}
    mydict['Title'] = _tag_text(mysoup.find('h1', 'unset'))

    # Named `meta` rather than `sum` so the builtin sum() is not shadowed.
    meta = mysoup.find('meta', attrs={'name': 'description'})
    mydict['Synopsis'] = meta.get('content', '') if meta is not None else ''

    # Explicit None checks replace the former bare `except:`; the consensus
    # text is intentionally stored unstripped, exactly as before.
    consensus = mysoup.find('div', 'consensus')
    if consensus is not None and consensus.p is not None:
        mydict['Critic Consensus'] = consensus.p.text
    else:
        mydict['Critic Consensus'] = ''

    mydict['Critics Score'] = _tag_text(mysoup.find('rt-text', slot="criticsScore"))
    mydict['Audience Score'] = _tag_text(mysoup.find('rt-text', slot="audienceScore"))

    for block in mysoup.find_all('div', "category-wrap"):
        # Label and value are separated by a run of three newlines.
        parts = block.text.strip().split('\n\n\n')
        if len(parts) < 2:
            # No separator means there is no value to record; skip the block
            # instead of raising IndexError.
            continue
        mydict[parts[0]] = parts[1].replace('\n', '')

    return mydict

In [46]:
# Try the scraper on the first collected link.
first_link = movie_links[0]
scrape_one(first_link)

{'Title': '2121',
 'Synopsis': '100 years from now when humans are forced underground by an ecological disaster and live in bunkers throughout the world. Newborns are prized, middle-aged people are good for labor and the elderly are harvested. When a woman has an unplanned pregnancy, her mother is now in jeopardy and she must figure out how to help her mother escape before the harvesters come to collect her organs....',
 'Critic Consensus': '',
 'Critics Score': '',
 'Audience Score': '',
 'Director': 'Serpil Altin',
 'Producer': 'Korhan Ugur, Serpil Altin',
 'Screenwriter': 'Korhan Ugur, Serpil Altin',
 'Distributor': 'Indican Pictures',
 'Genre': 'Mystery & Thriller, Drama, Sci-Fi',
 'Original Language': 'English',
 'Release Date (Theaters)': 'Jan 31, 2025, Limited',
 'Box Office (Gross USA)': '$7.9K',
 'Runtime': '1h 32m'}

In [47]:
# Scrape every collected link in order, one request per movie page.
full_movie_list = list(map(scrape_one, movie_links))
full_movie_list

[{'Title': '2121',
  'Synopsis': '100 years from now when humans are forced underground by an ecological disaster and live in bunkers throughout the world. Newborns are prized, middle-aged people are good for labor and the elderly are harvested. When a woman has an unplanned pregnancy, her mother is now in jeopardy and she must figure out how to help her mother escape before the harvesters come to collect her organs....',
  'Critic Consensus': '',
  'Critics Score': '',
  'Audience Score': '',
  'Director': 'Serpil Altin',
  'Producer': 'Korhan Ugur, Serpil Altin',
  'Screenwriter': 'Korhan Ugur, Serpil Altin',
  'Distributor': 'Indican Pictures',
  'Genre': 'Mystery & Thriller, Drama, Sci-Fi',
  'Original Language': 'English',
  'Release Date (Theaters)': 'Jan 31, 2025, Limited',
  'Box Office (Gross USA)': '$7.9K',
  'Runtime': '1h 32m'},
 {'Title': 'A Complete Unknown',
  'Synopsis': "New York, 1961. Against the backdrop of a vibrant music scene and tumultuous cultural upheaval, an 

In [49]:
# One row per scraped movie; keys missing from a record show up as NaN.
pd.DataFrame(full_movie_list)

Unnamed: 0,Title,Synopsis,Critic Consensus,Critics Score,Audience Score,Director,Producer,Screenwriter,Distributor,Genre,Original Language,Release Date (Theaters),Box Office (Gross USA),Runtime,Production Co,Rating,Sound Mix,Aspect Ratio,Release Date (Streaming),Rerelease Date (Theaters)
0,2121,100 years from now when humans are forced unde...,,,,Serpil Altin,"Korhan Ugur, Serpil Altin","Korhan Ugur, Serpil Altin",Indican Pictures,"Mystery & Thriller, Drama, Sci-Fi",English,"Jan 31, 2025, Limited",$7.9K,1h 32m,,,,,,
1,A Complete Unknown,"New York, 1961. Against the backdrop of a vibr...",Charged by Timothée Chalamet's electric perfor...,80%,96%,James Mangold,"Fred Berger, Bob Bookman, Timothée Chalamet, A...","James Mangold, Jay Cocks",Searchlight Pictures,"Biography, Drama, Music",English,"Dec 25, 2024, Wide",$69.0M,2h 21m,"Searchlight Pictures, The Picture Company, Ver...",R (Language),"Dolby Atmos, Dolby Digital",Digital 2.39:1,,
2,A Knight's War,A fearless knight braves a deadly realm to sav...,,,,Matthew Ninaber,Matthew Ninaber,Matthew Ninaber,DREAD,Fantasy,English,"Feb 7, 2025, Limited",,1h 44m,High Rise Studio,,,,"Feb 11, 2025",
3,A Real Pain,Mismatched cousins David (Jesse Eisenberg) and...,Led by a scene-stealing turn from Kieran Culki...,96%,81%,Jesse Eisenberg,"Jesse Eisenberg, Ali Herting, Dave McCary, Ewa...",Jesse Eisenberg,Searchlight Pictures,"Comedy, Drama",English,"Nov 15, 2024, Wide",$8.3M,1h 29m,"Topic Studios, Extreme Emotions, Fruit Tree",R (Some Drug Use|Language Throughout),,,"Dec 31, 2024",
4,All We Imagine as Light,"The light, the lives, and the textures of cont...",Capturing the here and now of modern India wit...,100%,67%,Payal Kapadia,,Payal Kapadia,Sideshow / Janus Films,Drama,Malayalam,"Nov 15, 2024, Limited",$1.0M,1h 58m,"Les Films Fauves, BALDR Film, Petit Chaos, art...",,Dolby Digital,Flat (1.66:1),"Feb 4, 2025",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,Vermiglio,"The lush and breathtaking beauty of the Alps, ...","Painterly and patient, Vermiglio carefully obs...",94%,,Maura Delpero,"Carole Baraton, Tatjana Kozar, Francesca Andreoli",Maura Delpero,Sideshow / Janus Films,Drama,Italian,"Dec 25, 2024, Limited",$158.2K,1h 59m,"RAI Cinema, Cinedora, Versus Production, Charades",,,,,
64,When I'm Ready,As asteroids threaten to wipe out all life on ...,,,,Andrew Johnson,"Andrew Ortenberg, Eli Samek, Jordan Dykstra, R...",Andrew Ortenberg,Quiver Distribution,"Drama, Romance",English,"Feb 7, 2025, Limited",,1h 48m,"19th Hole Productions, Film Bridge International",,,,,
65,Wicked,"Wicked, the untold story of the witches of Oz,...",Defying gravity with its magical pairing of Cy...,88%,95%,Jon M. Chu,"Marc Platt, David Stone","Winnie Holzman, Dana Fox",Universal Pictures,"Kids & Family, Musical, Fantasy, Adventure",English,"Nov 22, 2024, Wide",$471.9M,2h 40m,Marc Platt Productions,PG (Some Scary Action|Brief Suggestive Materia...,Dolby Atmos,Digital 2.39:1,"Dec 31, 2024",
66,Wolf Man,From Blumhouse and visionary writer-director L...,Director Leigh Whannell's attempt at bringing ...,51%,56%,Leigh Whannell,"Jason Blum, Ryan Gosling","Leigh Whannell, Corbett Tuck, Lauren Schuker B...",Universal Pictures,"Horror, Mystery & Thriller",English,"Jan 17, 2025, Wide",$20.6M,1h 43m,"Blumhouse Productions, Universal Pictures, Way...",R (Grisly Images|Bloody Violent Content|Some L...,Dolby Atmos,Digital 2.39:1,"Feb 4, 2025",
