*Web Scraping on the IMDb Website*

### Importing Required Libraies

In [53]:
import requests                        # helps us to send HTTP requests to get HTML files
from requests import get

from bs4 import BeautifulSoup          # web scraper library in python 

import pandas as pd                    # help us to ata manipulation and analysis of HTML files 
import numpy as np                     # help in use of  mathematical functions and tools for working with arrays 

English-translated titles

In [54]:
headers = {"Accept-Language": "en-US, en;q=0.5"}

Request contents of the URL

In [55]:
url = "https://www.imdb.com/search/title/?groups=top_1000&ref_=adv_prv"

results = requests.get(url, headers=headers) 

Using BeautifulSoup

In [56]:
soup = BeautifulSoup(results.text, "html.parser")

print(soup.prettify())

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime(),pt:'java'};
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <title>
   IMDb "Top 1000"
(Sorted by Popularity Ascending) - IMDb
  </title>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
  </script>
  <link href="https://www.imdb.com/search/title/?groups=top_1000" rel="canonical"/>
  <meta content="http://www.i

"soup" variable is used to assign the method BeatifulSoup, which specifies a format for the HTML parser to use instead of treating the page as a single string.

We can print the data we've grabbed in a more readable tree format by printing soup.prettify().

Initialize empty lists where you'll store your data

In [57]:
titles = []
years = []
time = []
imdb_ratings = []
metascores = []
votes = []
us_gross = []

In [58]:
movie_div = soup.find_all('div', class_='lister-item mode-advanced') # extracts all the div containers that have a class attribute of lister-item mode-advanced from website 

In [59]:
#initiate the for loop 
#this tells your scraper to iterate through 
#every div container we stored in move_div
for container in movie_div:
    
    # Movies Title 
    name = container.h3.a.text   # h3 and a is attribute notation and tells the scraper to access each of those tags in 'text' format 
    titles.append(name)
    
    # Movies Year
    year = container.h3.find('span', class_='lister-item-year').text   # method is use to access this particular tag
    years.append(year)
    
    # Movies Duration
    runtime = container.find('span', class_='runtime').text if container.p.find('span', class_='runtime') else '' 
    time.append(runtime)
    
    # Movies Rating 
    imdb = float(container.strong.text) # strong is attribute notation that tells the scraper to access that tag
    imdb_ratings.append(imdb)
    
    # Metascore
    m_score = container.find('span', class_='metascore').text if container.find('span', class_='metascore') else '-'
    metascores.append(m_score)
    
    #  votes and gross earnings
    nv = container.find_all('span', attrs={'name': 'nv'})
    
    vote = nv[0].text  # nv[0]--->  The scraper should grab the first data from the nv tag - the votes.
    votes.append(vote)
        
    grosses = nv[1].text if len(nv) > 1 else '-'  # The nv[1] tag tells the scraper to look into the nv tag and grab the second item in the list.
    us_gross.append(grosses) 

In [60]:
print(titles)
print(years)
print(time)
print(imdb_ratings)
print(metascores)
print(votes)
print(us_gross)

['Avatar: The Way of Water', 'Avatar', 'The Banshees of Inisherin', "Guillermo del Toro's Pinocchio", 'Knives Out', 'Home Alone', 'A Christmas Story', 'Top Gun: Maverick', "It's a Wonderful Life", 'Everything Everywhere All at Once', 'Die Hard', 'The Sound of Music', 'The Batman', 'Prisoners', 'Klaus', 'All Quiet on the Western Front', 'Miracle on 34th Street', 'The Muppet Christmas Carol', 'Titanic', "Harry Potter and the Sorcerer's Stone", 'The Godfather', 'Pulp Fiction', 'Interstellar', 'The Shawshank Redemption', 'The Nightmare Before Christmas', 'Once Upon a Time in Hollywood', 'Good Will Hunting', 'Dune', 'Kantara', 'Spider-Man: No Way Home', 'The Wolf of Wall Street', 'RRR', 'American Psycho', 'Avengers: Endgame', 'The Dark Knight', 'Inception', 'The Lord of the Rings: The Fellowship of the Ring', 'Edward Scissorhands', 'Whiplash', 'In Bruges', 'Fight Club', 'Blade Runner 2049', 'Guardians of the Galaxy', 'Goodfellas', 'Django Unchained', 'Back to the Future', 'Inglourious Baste

### Build a dataframe with pandas 

In [61]:
movies = pd.DataFrame({ 'movie': titles, 'year': years, 'timeMin': time, 'imdb': imdb_ratings, 
                       'metascore': metascores, 'votes': votes, 'us_grossMillions': us_gross })

In [62]:
movies.head()

Unnamed: 0,movie,year,timeMin,imdb,metascore,votes,us_grossMillions
0,Avatar: The Way of Water,(2022),192 min,7.9,67,167963,-
1,Avatar,(2009),162 min,7.9,83,1285648,$760.51M
2,The Banshees of Inisherin,(2022),114 min,7.9,87,56507,-
3,Guillermo del Toro's Pinocchio,(2022),117 min,7.7,79,53823,-
4,Knives Out,(2019),130 min,7.9,82,658206,$165.36M


In [63]:
movies.info()   

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   movie             50 non-null     object 
 1   year              50 non-null     object 
 2   timeMin           50 non-null     object 
 3   imdb              50 non-null     float64
 4   metascore         50 non-null     object 
 5   votes             50 non-null     object 
 6   us_grossMillions  50 non-null     object 
dtypes: float64(1), object(6)
memory usage: 2.9+ KB


The year, timeMin, metascore votes, and us_grossMillions data are integer data, while us_grossMillions is a float data.

In [64]:
# The values in dataframe are in (2022) form, so to remove them we need to use the extract function & convert them to integers
movies['year'] = movies['year'].str.extract('(\d+)').astype(int)

In [65]:
movies['timeMin'] = movies['timeMin'].str.extract('(\d+)').astype(int)

In [66]:
movies['metascore'] = movies['metascore'].str.replace('-', '0').astype(int)

In [67]:
movies['votes'] = movies['votes'].str.replace(',', '').astype(int)

In [68]:
movies['us_grossMillions'] = movies['us_grossMillions'].map(lambda x: x.lstrip('$').rstrip('M'))
# Remove $ symbol and M from the string  
movies['us_grossMillions'] = pd.to_numeric(movies['us_grossMillions'], errors='coerce')

In [69]:
movies.head(10)

Unnamed: 0,movie,year,timeMin,imdb,metascore,votes,us_grossMillions
0,Avatar: The Way of Water,2022,192,7.9,67,167963,
1,Avatar,2009,162,7.9,83,1285648,760.51
2,The Banshees of Inisherin,2022,114,7.9,87,56507,
3,Guillermo del Toro's Pinocchio,2022,117,7.7,79,53823,
4,Knives Out,2019,130,7.9,82,658206,165.36
5,Home Alone,1990,103,7.7,63,587945,285.76
6,A Christmas Story,1983,93,7.9,77,158661,20.61
7,Top Gun: Maverick,2022,130,8.4,78,472465,
8,It's a Wonderful Life,1946,130,8.6,89,462825,
9,Everything Everywhere All at Once,2022,139,8.1,81,267640,


In [70]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   movie             50 non-null     object 
 1   year              50 non-null     int32  
 2   timeMin           50 non-null     int32  
 3   imdb              50 non-null     float64
 4   metascore         50 non-null     int32  
 5   votes             50 non-null     int32  
 6   us_grossMillions  39 non-null     float64
dtypes: float64(2), int32(4), object(1)
memory usage: 2.1+ KB


### Saving to CSV file

In [71]:
movies.to_csv('movies.csv')

### Summary

- Inspected our HTML for the data we need
- Wrote code to extract the data
- Put our code in a loop to grab all the data from each movie
- Built a DataFrame with pandas
- Cleaned our data in pandas
- Handled type conversion to make our data consistent 
- Saved your scraped data to a CSV