# Data 360 Final

### Final web scraping code.

This notebook scrapes IMDb's chart of the 100 most popular movies. For each movie it collects the title, release year, runtime, MPAA rating, star rating, genres, and the URL of the movie's page on IMDb.

In [20]:
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
import pandas as pd


# Open IMDb's "Most Popular Movies" chart in a real browser and capture the
# rendered HTML, then pull out one <li> element per chart entry.
driver = webdriver.Safari()
try:
    driver.get("https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm")

    # Keep jumping to the bottom of the page until the document height stops
    # growing, so that anything the page loads on scroll ends up in the source.
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
        time.sleep(2)  # give the page a moment to render newly loaded content
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # Always close the browser session, even if loading or scrolling failed;
    # otherwise a stray Safari automation session is left running.
    driver.quit()

results = soup.find('ul', class_='ipc-metadata-list ipc-metadata-list--dividers-between sc-e22973a9-0 khSCXM compact-list-view ipc-metadata-list--base')
if results is None:
    # NOTE(review): the "sc-e22973a9-0 khSCXM" parts of the class string look
    # auto-generated and may change when IMDb redeploys. If the exact list is
    # not found, fall back to searching the whole document for chart entries.
    results = soup

movie_elements = results.find_all('li', class_="ipc-metadata-list-summary-item")
if not movie_elements:
    # Fail loudly here rather than silently producing an empty dataset later.
    raise RuntimeError("No movie entries found on the IMDb chart page; the page layout may have changed.")

In [36]:
# Build one record per chart entry:
#   [title, release, run time, film rating, star rating, genres, url]
# Title, metadata and star rating come from the chart page itself; the genre
# list requires one extra request to each movie's own page.

# Browser-like request headers, built once because they never change.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",}
# Seconds to wait on a single movie page; without a timeout a stalled
# connection would hang the cell indefinitely.
REQUEST_TIMEOUT = 10

movie = []

# A session reuses the connection to imdb.com across all per-movie requests.
with requests.Session() as session:
    session.headers.update(headers)

    for movie_element in movie_elements:
        genre = []

        title_tag = movie_element.find("h3")
        link_tag = movie_element.find("a", href=True)
        if title_tag is None or link_tag is None:
            # Not a regular chart entry (no title or no link): nothing to record.
            continue
        title_element = title_tag.text.strip()
        movie_url = "https://www.imdb.com" + link_tag["href"]

        star_tag = movie_element.find("span", class_="ipc-rating-star--rating")
        star_element = star_tag.text.strip() if star_tag else None

        # The metadata spans are read by position: release, run time, film rating.
        # NOTE(review): if an entry omits an earlier item, later items shift left
        # and land in the wrong column - confirm against the scraped data.
        descr_elements = movie_element.find_all("span", class_="sc-4b408797-8 iurwGb cli-title-metadata-item")
        release_element = descr_elements[0].text.strip() if len(descr_elements) > 0 else None
        time_element = descr_elements[1].text.strip() if len(descr_elements) > 1 else None
        # Own name for the film rating instead of overwriting the loop variable.
        mpaa_element = descr_elements[2].text.strip() if len(descr_elements) > 2 else None

        try:
            movie_page = session.get(movie_url, timeout=REQUEST_TIMEOUT)
            # Treat HTTP errors (403, 404, 5xx, ...) as failures instead of
            # parsing the error page as if it were a movie page.
            movie_page.raise_for_status()
            movie_soup = BeautifulSoup(movie_page.content, "html.parser")

            genre_elements = movie_soup.find_all('a', class_="ipc-chip ipc-chip--on-baseAlt")
            for genre_element in genre_elements:
                genre_text = genre_element.find('span', class_="ipc-chip__text")
                if genre_text is not None:  # ignore chips that carry no text label
                    genre.append(genre_text.text.strip())

        except requests.RequestException as exc:
            # Best effort: keep the movie and mark its genres as unknown. Only
            # network/HTTP problems are caught, so Ctrl-C and real bugs surface.
            print(f"Could not fetch genres for {title_element!r}: {exc}")
            genre.append(None)

        movie.append([title_element, release_element, time_element, mpaa_element, star_element, genre, movie_url])

# Preview the first few records only; the full list is long.
movie[:5]

[['Thunderbolts*',
  '2025',
  '2h 6m',
  'PG-13',
  '7.7',
  ['Political Drama',
   'Superhero',
   'Action',
   'Adventure',
   'Crime',
   'Drama',
   'Fantasy',
   'Sci-Fi'],
  'https://www.imdb.com/title/tt20969586/?ref_=chtmvm_i_1'],
 ['Sinners',
  '2025',
  '2h 17m',
  'R',
  '8.1',
  ['Period Drama',
   'Supernatural Horror',
   'Vampire Horror',
   'Action',
   'Drama',
   'Horror',
   'Thriller'],
  'https://www.imdb.com/title/tt31193180/?ref_=chtmvm_i_2'],
 ['Havoc',
  '2025',
  '1h 47m',
  'TV-MA',
  '5.7',
  ['Conspiracy Thriller',
   'Drug Crime',
   'Gun Fu',
   'Martial Arts',
   'One-Person Army Action',
   'Suspense Mystery',
   'Action',
   'Crime',
   'Drama',
   'Mystery'],
  'https://www.imdb.com/title/tt14123284/?ref_=chtmvm_i_3'],
 ['Another Simple Favor',
  '2025',
  '2h',
  'R',
  '5.3',
  ['Dark Comedy', 'Whodunnit', 'Comedy', 'Crime', 'Mystery', 'Thriller'],
  'https://www.imdb.com/title/tt20214908/?ref_=chtmvm_i_4'],
 ['The Accountant 2',
  '2025',
  '2h 12

In [39]:
# Column labels, listed in the same order as the fields of each scraped record.
ranking_columns = [
    "Title",
    "Release Date",
    "Run Time",
    "MPAA Film Rating",
    "Rating",
    "Genres",
    "Webpage",
]

# One row per movie; the "Genres" cell holds that movie's whole genre list.
imdb_ranking = pd.DataFrame(data=movie, columns=ranking_columns)
imdb_ranking

Unnamed: 0,Title,Release Date,Run Time,MPAA Film Rating,Rating,Genres,Webpage
0,Thunderbolts*,2025,2h 6m,PG-13,7.7,"[Political Drama, Superhero, Action, Adventure...",https://www.imdb.com/title/tt20969586/?ref_=ch...
1,Sinners,2025,2h 17m,R,8.1,"[Period Drama, Supernatural Horror, Vampire Ho...",https://www.imdb.com/title/tt31193180/?ref_=ch...
2,Havoc,2025,1h 47m,TV-MA,5.7,"[Conspiracy Thriller, Drug Crime, Gun Fu, Mart...",https://www.imdb.com/title/tt14123284/?ref_=ch...
3,Another Simple Favor,2025,2h,R,5.3,"[Dark Comedy, Whodunnit, Comedy, Crime, Myster...",https://www.imdb.com/title/tt20214908/?ref_=ch...
4,The Accountant 2,2025,2h 12m,R,7.1,"[Whodunnit, Action, Crime, Drama, Mystery, Thr...",https://www.imdb.com/title/tt7068946/?ref_=cht...
...,...,...,...,...,...,...,...
95,Trap,2024,1h 45m,PG-13,5.8,"[Psychological Horror, Psychological Thriller,...",https://www.imdb.com/title/tt26753003/?ref_=ch...
96,Inception,2010,2h 28m,PG-13,8.8,"[Action Epic, Adventure Epic, Epic, Psychologi...",https://www.imdb.com/title/tt1375666/?ref_=cht...
97,Karate Kid: Legends,2025,1h 58m,,,"[Coming-of-Age, Martial Arts, Teen Drama, Acti...",https://www.imdb.com/title/tt1674782/?ref_=cht...
98,Deep Cover,2025,1h 49m,R,,"[Comedy, Crime]",https://www.imdb.com/title/tt31121295/?ref_=ch...


In [43]:
# Long format: one row per (movie, genre) pair. Each entry of a movie's genre
# list gets its own row, and the movie's other columns are repeated on it.
imdb_ranking_exploded = imdb_ranking.explode(column='Genres')
imdb_ranking_exploded

Unnamed: 0,Title,Release Date,Run Time,MPAA Film Rating,Rating,Genres,Webpage
0,Thunderbolts*,2025,2h 6m,PG-13,7.7,Political Drama,https://www.imdb.com/title/tt20969586/?ref_=ch...
0,Thunderbolts*,2025,2h 6m,PG-13,7.7,Superhero,https://www.imdb.com/title/tt20969586/?ref_=ch...
0,Thunderbolts*,2025,2h 6m,PG-13,7.7,Action,https://www.imdb.com/title/tt20969586/?ref_=ch...
0,Thunderbolts*,2025,2h 6m,PG-13,7.7,Adventure,https://www.imdb.com/title/tt20969586/?ref_=ch...
0,Thunderbolts*,2025,2h 6m,PG-13,7.7,Crime,https://www.imdb.com/title/tt20969586/?ref_=ch...
...,...,...,...,...,...,...,...
98,Deep Cover,2025,1h 49m,R,,Comedy,https://www.imdb.com/title/tt31121295/?ref_=ch...
98,Deep Cover,2025,1h 49m,R,,Crime,https://www.imdb.com/title/tt31121295/?ref_=ch...
99,A Real Pain,2024,1h 30m,R,7.1,Buddy Comedy,https://www.imdb.com/title/tt21823606/?ref_=ch...
99,A Real Pain,2024,1h 30m,R,7.1,Comedy,https://www.imdb.com/title/tt21823606/?ref_=ch...


In [None]:
# Save the one-row-per-movie table; the DataFrame index is not written.
imdb_ranking.to_csv(path_or_buf='imdb_ranking.csv', index=False)

In [None]:
# Save the one-row-per-(movie, genre) table; the DataFrame index is not written.
imdb_ranking_exploded.to_csv(path_or_buf='imdb_ranking_genres.csv', index=False)