Web Scraping - Where we scrap the data from a website without using API.

Two web scraping methods in Python: 

1) Using Beautifulsoup
2) Seleinum with web drive

Both of them depend on your understanding of web page development using HTML

In [1]:
# Install beautifulSoup

!pip install --user bs4

Collecting bs4
  Using cached bs4-0.0.1.tar.gz (1.1 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py): started
  Building wheel for bs4 (setup.py): finished with status 'done'
  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1264 sha256=97aa1d843c72331f72a9dd36b29a6b1569acc5263ca6197f64f4b6f4b0c4f9a8
  Stored in directory: c:\users\idris\appdata\local\pip\cache\wheels\d4\c8\5b\b5be9c20e5e4503d04a6eac8a3cd5c2393505c29f02bea0960
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1


In [2]:
# Import libraries

from requests import get

from bs4 import BeautifulSoup

In [3]:
# Step 1: Create a link object 

url = "https://www.imdb.com/search/title/?release_date=2019-01-01,2019-12-31&start=1"

# Step 2: To request the server to connect to the webpage

response = get(url)

In [4]:
# Step 3: We need create beautifulSoup object

bs4_obj = BeautifulSoup(response.text, 'html.parser')

# Step 4: A MUST STEP - I will use a command of find_all() to extract all the containers 
# which contain the primary tag - 'div' and primary class - "lister-item mode-advanced"

master_container = bs4_obj.find_all('div', class_ = 'lister-item mode-advanced')

len(master_container)

50

In [5]:
# Lets view the 1st container

first_movie = master_container[0]

# To get the name of a movie, we have to go to h3 tag and in h3 tag we have to go to a tag. 

first_movie.h3.a.text

'The Witcher'

In [6]:
first_movie

<div class="lister-item mode-advanced">
<div class="lister-top-right">
<div class="ribbonize" data-caller="filmosearch" data-tconst="tt5180504"></div>
</div>
<div class="lister-item-image float-left">
<a href="/title/tt5180504/"> <img alt="The Witcher" class="loadlate" data-tconst="tt5180504" height="98" loadlate="https://m.media-amazon.com/images/M/MV5BMDEwOWVlY2EtMWI0ZC00OWVmLWJmZGItYTk3YjYzN2Y0YmFkXkEyXkFqcGdeQXVyMTUzMTg2ODkz._V1_UX67_CR0,0,67,98_AL_.jpg" src="https://m.media-amazon.com/images/S/sash/4FyxwxECzL-U1J8.png" width="67"/>
</a> </div>
<div class="lister-item-content">
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt5180504/">The Witcher</a>
<span class="lister-item-year text-muted unbold">(2019– )</span>
</h3>
<p class="text-muted">
<span class="certificate">18+</span>
<span class="ghost">|</span>
<span class="runtime">60 min</span>
<span class="ghost">|</span>
<span class="genre">
Action, Adventure, Drama   

In [7]:
# How we can extract the year of movie - find()

first_movie.h3.find('span', class_ = 'lister-item-year text-muted unbold').text


'(2019– )'

In [8]:
# Finding the rating of first movie from strong tag

first_movie.strong.text

'8.0'

In [10]:
meta_score_element = first_movie.find('span', class_='metascore favorable')

if meta_score_element is not None:
    meta_score = int(meta_score_element.text)
    print("Meta score:", meta_score)
else:
    print("Meta score not found for the first movie.")

Meta score not found for the first movie.


In [11]:
# Find the votes from the first movie

first_movie.find('span', {'name' : 'nv'})['data-value']

'548564'

In [12]:
# Putting every thing into a single code to extract infor from 50 continers

# Step 1: Create the empty list based on your final output

name = []
year = []
rating = []
vote = []
metascore = []

# Step 2: Extract the information 

for x in master_container:
    # extracting the movie name
    
    mov_name = x.h3.a.text
    name.append(mov_name)
    
    # extract the year
    
    mov_year = x.h3.find('span', class_ = 'lister-item-year text-muted unbold').text
    year.append(mov_year)
    
    # extract the rating
    
    mov_rating = x.strong.text
    rating.append(mov_rating)
    
   # extracting the vote
    
    mov_vote = x.find('span', {'name' : 'nv'})['data-value']
    
    vote.append(mov_vote)
    
    # extracting the metascore avoiding the missing values
    if x.find('div', class_ = 'inline-block ratings-metascore') is not None:
        mov_meta = x.find('span', class_ = 'metascore').text
    else:
        mov_meta = "NA"
    
    metascore.append(mov_meta)      


In [13]:
import pandas as pd

df = pd.DataFrame({"Movie Name" : name,
                  "Year" : year,
                  'Rating' : rating,
                  'Voting' : vote,
                  "MetaScore" : metascore})

# Removing the brackets from year column

df['Year'] = df['Year'].str.replace('(', "")
df['Year'] = df['Year'].str.replace(')', "")
df['Year'] = df['Year'].str.replace('–', "")

df

  df['Year'] = df['Year'].str.replace('(', "")
  df['Year'] = df['Year'].str.replace(')', "")


Unnamed: 0,Movie Name,Year,Rating,Voting,MetaScore
0,The Witcher,2019,8.0,548564,
1,Good Omens,20192023,8.0,100447,
2,The Boys,2019,8.7,579797,
3,What We Do in the Shadows,2019,8.6,90113,
4,ذا ماندلوريان,2019,8.7,557091,
5,Warrior,2019,8.4,34616,
6,The Righteous Gemstones,2019,8.1,43049,
7,Sex Education,20192023,8.3,310360,
8,نشوة,2019,8.3,217627,
9,The Morning Show,I 2019,8.2,113121,


For Flipkart