In [1]:


# Step 1 : import

import csv

import bs4
import requests
from bs4 import BeautifulSoup

print('Requests version: ' + requests.__version__)
print('Beautiful Soup version: ' + bs4.__version__)

Requests version: 2.22.0
Beautiful Soup version: 4.7.1


In [2]:
# Step 2 : Send Requests

# Download the HTML content of the IMDb Top 250 chart page.
# - timeout: without it, requests waits indefinitely if the server never answers,
#   which would leave the cell hanging.
# - raise_for_status(): fail right here on a 4xx/5xx response instead of silently
#   handing an error page to the parser in the next step.
# NOTE(review): if this raises a 403, the site may be rejecting the default
# python-requests User-Agent - confirm and pass an explicit `headers=` if needed.
r = requests.get('https://www.imdb.com/chart/top', timeout=30)
r.raise_for_status()
page_html = r.text

# Preview the first 500 characters of the downloaded markup
page_html[:500]

'\n\n\n<!DOCTYPE html>\n<html\n    xmlns:og="http://ogp.me/ns#"\n    xmlns:fb="http://www.facebook.com/2008/fbml">\n    <head>\n         \n        <meta charset="utf-8">\n        <meta http-equiv="X-UA-Compatible" content="IE=edge">\n\n    \n    \n    \n\n    \n    \n    \n\n    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">\n            <style>\n                body#styleguide-v2 {\n                    background: no-repeat fixed center top #000;\n                }\n           '

In [3]:
# Step 3 : Build a BeautifulSoup tree from the downloaded HTML so it can be searched

# Parse the raw markup with the "html.parser" backend; the resulting object is
# what the following steps query for table rows.
page_soup = BeautifulSoup(markup=page_html, features="html.parser")

In [4]:
# Step 4 : Find all the tags inside the tree that include top 250 movies' information

# Every chart entry lives in its own <tr>. Index 0 is not a movie
# (presumably the table header row - confirm), so the first movie is movies[1].
movies = page_soup.find_all(name="tr")
assert len(movies) > 1, "no <tr> rows found - the page layout may have changed"

# Get the bs4.element.Tag that includes the highest ranking movie info
movie = movies[1]

# Check the type (a bare `type(movie)` in the middle of a cell displays nothing,
# so the expectation is asserted instead)
assert isinstance(movie, bs4.element.Tag)

# Print out the row markup to see which tags/attributes carry each field
print(movie.prettify())

# Look each cell up once and reuse it for every field it contains
title_cell = movie.find(name="td", attrs={"class": "titleColumn"})
rating_tag = movie.find(name="td", attrs={"class": "ratingColumn imdbRating"}).find(name="strong")

# Get movie name.
# get_text(strip=True) is used instead of .string because .string is None when
# the tag has more than one child. The title is kept verbatim; any escaping
# needed for a file format belongs to the code that writes the file.
name = title_cell.find(name="a").get_text(strip=True)

# Get movie year: the span text looks like "(1994)", so drop the parentheses
year = title_cell.find(name="span").get_text(strip=True)
year = year.replace(")", "").replace("(", "").strip()

# Get movie rating
rating = rating_tag.get_text(strip=True)

# Get number of user ratings.
# The title attribute looks like "9.2 based on 2,147,517 user ratings":
# the 4th space-separated token is the count; remove its thousands separators.
num_user_rating = rating_tag.attrs['title']
num_user_rating = num_user_rating.split(" ")[3].replace(",", "")

<tr>
 <td class="posterColumn">
  <span data-value="1" name="rk">
  </span>
  <span data-value="9.222187111474618" name="ir">
  </span>
  <span data-value="7.791552E11" name="us">
  </span>
  <span data-value="2147517" name="nv">
  </span>
  <span data-value="-1.777812888525382" name="ur">
  </span>
  <a href="/title/tt0111161/">
   <img alt="The Shawshank Redemption" height="67" src="https://m.media-amazon.com/images/M/MV5BMDFkYTc0MGEtZmNhMC00ZDIzLWFmNTEtODM1ZmRlYWMwMWFmXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_UY67_CR0,0,45,67_AL_.jpg" width="45"/>
  </a>
 </td>
 <td class="titleColumn">
  1.
  <a href="/title/tt0111161/" title="Frank Darabont (dir.), Tim Robbins, Morgan Freeman">
   The Shawshank Redemption
  </a>
  <span class="secondaryInfo">
   (1994)
  </span>
 </td>
 <td class="ratingColumn imdbRating">
  <strong title="9.2 based on 2,147,517 user ratings">
   9.2
  </strong>
 </td>
 <td class="ratingColumn">
  <div class="seen-widget seen-widget-tt0111161 pending" data-titleid="tt01111

In [8]:
# Step 5 : Extract movie features and save data in a csv file

# File name 'imdb_top_250.csv'
filename = "imdb_top_250.csv"

# Column names written as the first row of the csv
headers = ["Rank", "Name", "Year", "Rating", "Num_user_rating"]

# `with` guarantees the file is closed even if a row fails to parse.
# newline="" is what the csv module requires of the file object, and
# lineterminator="\n" keeps the "\n" row endings this step has always written.
with open(filename, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(headers)

    # movies[0] is not a movie row (movies[1] is the top-ranked movie), so the
    # 250 chart entries are movies[1] .. movies[250]; rank is the 1-based position.
    for rank, movie in enumerate(movies[1:251], start=1):
        # Look each cell up once and reuse it for every field it contains
        title_cell = movie.find(name="td", attrs={"class": "titleColumn"})
        rating_tag = movie.find(name="td", attrs={"class": "ratingColumn imdbRating"}).find(name="strong")

        # Title is written verbatim: csv.writer quotes values containing commas,
        # so there is no need to rewrite "," as "|" and corrupt the name.
        title = title_cell.find(name="a").get_text(strip=True)

        # Span text looks like "(1994)"; keep only the year
        release_year = title_cell.find(name="span").get_text(strip=True)
        release_year = release_year.replace(")", "").replace("(", "").strip()

        imdb_rating = rating_tag.get_text(strip=True)

        # title attribute looks like "9.2 based on 2,147,517 user ratings":
        # the 4th space-separated token is the count; drop its thousands separators
        vote_count = rating_tag.attrs['title'].split(" ")[3].replace(",", "")

        writer.writerow([rank, title, release_year, imdb_rating, vote_count])