In [99]:
'''
Objective: Extract the list of Top 250 movies from the IMDB website, along with the key information available for each movie
Method: Web-scraping
Database: IMDB
Resource used: Python selenium package

'''

# importing required libraries

import numpy as np
import pandas as pd

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [100]:
# Address of the IMDb Top 250 chart that the notebook scrapes
base_url = 'https://www.imdb.com/chart/top/?ref_=nv_mv_250'

# Start a Chrome session with default options
option = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=option)

# Load the chart in the browser window that just opened.
# Keep that window open: the following cells query this live session.
driver.get(base_url)

In [101]:
# Collect the container element of every movie on the chart (250 expected).
# Elements can be located by class name or by XPath; XPath is used here.
#
#   find_element()  -> returns a single matching element
#   find_elements() -> returns a list with all matching elements
movie_container_xpath = "//div[@class='ipc-metadata-list-summary-item__tc']"
movies = driver.find_elements(by=By.XPATH, value=movie_container_xpath)

In [102]:
# Heading text of the first movie on the chart (rank prefix included)
first_movie = movies[0]
first_movie.find_element(by=By.CLASS_NAME, value="ipc-title__text").text

'1. The Shawshank Redemption'

In [54]:
# Heading text of the last movie on the chart.
# Index -1 selects the final container however many were found, whereas a
# fixed index of 249 raises IndexError whenever fewer than 250 are returned.
movies[-1].find_element(By.CLASS_NAME, "ipc-title__text").text

'250. Laapataa Ladies'

In [55]:
# Separating the rank from the title.
# The heading looks like '<rank>. <title>', so splitting on the first period
# only (maxsplit=1) gives a two-item list: index [0] is the rank and index [1]
# is the title (still carrying its leading space). Periods inside the title
# itself are left untouched.
# Index -1 picks the last container without assuming exactly 250 were found.
movies[-1].find_element(By.CLASS_NAME, "ipc-title__text").text.split('.', 1)

['250', ' Laapataa Ladies']

In [None]:
# Metadata wrapper of the second movie on the chart
# (index 1; 'The Godfather' when this notebook was run)
second_movie = movies[1]
parent_div = second_movie.find_element(by=By.CLASS_NAME, value="cli-title-metadata")

In [None]:
# List every metadata entry found inside parent_div
metadata_item_class = "cli-title-metadata-item"
metadata_items = parent_div.find_elements(by=By.CLASS_NAME, value=metadata_item_class)

In [None]:
# First metadata entry: the release year
year_item = metadata_items[0]
year = year_item.text
year

'1972'

In [None]:
# Second metadata entry: the running time
duration_item = metadata_items[1]
duration = duration_item.text
duration

'2h 55m'

In [None]:
# Third metadata entry: the certificate / age rating
# (the scraping loop stores this value under 'rate_type')
rating_item = metadata_items[2]
rating = rating_item.text
rating

'15'

In [146]:
# Star rating text of the second movie on the chart
star_rating_class = "ipc-rating-star--rating"
movies[1].find_element(by=By.CLASS_NAME, value=star_rating_class).text

'9.2'

In [149]:
# Vote-count text of the second movie on the chart
# (comes back with a leading space and parentheses)
vote_count_class = "ipc-rating-star--voteCount"
movies[1].find_element(by=By.CLASS_NAME, value=vote_count_class).text

' (2.1M)'

In [150]:
# Schema of one scraped movie record.
# Every field starts out as NaN so that a value the scraper cannot find
# stays marked as missing in the final table.
elements_dict = dict.fromkeys(
    [
        'rank',
        'title',
        'release_year',
        'duration',
        'rate_type',
        'star_rating',
        'vote_count',
    ],
    np.nan,
)

In [None]:
# Build one record per movie container and collect the records in `data`.
#
# Each record starts as a copy of `elements_dict`, so any field whose element
# is missing from the page keeps its NaN placeholder. Only the "element not
# found" case is tolerated; any other failure propagates instead of being
# silently recorded as missing data. The browser session is closed exactly
# once, in `finally`, whether the loop completes or fails.

# Target fields for the metadata entries, in the order they appear on the page
METADATA_FIELDS = ('release_year', 'duration', 'rate_type')


def _text_or_nan(container, class_name):
    """Return the text of the first element with `class_name` inside `container`.

    Returns NaN (the placeholder used in `elements_dict`) when no such
    element exists.
    """
    try:
        return container.find_element(By.CLASS_NAME, class_name).text
    except NoSuchElementException:
        return np.nan


# creating a blank data object
data = []

# looping through individual movies
try:
    for movie in movies:
        row = elements_dict.copy()

        # Heading looks like '<rank>. <title>': look it up once and split on
        # the first period only. Rank and title stay NaN if the heading is
        # missing or contains no period.
        heading = _text_or_nan(movie, "ipc-title__text")
        if isinstance(heading, str) and '.' in heading:
            row['rank'], row['title'] = heading.split('.', 1)

        # Year, duration and certificate live inside one metadata wrapper
        try:
            parent_div = movie.find_element(By.CLASS_NAME, "cli-title-metadata")
        except NoSuchElementException:
            elements = []
        else:
            elements = parent_div.find_elements(By.CLASS_NAME, "cli-title-metadata-item")

        # zip stops at the shorter sequence, so a movie with fewer than three
        # metadata entries fills what it has and keeps NaN for the rest
        for field, element in zip(METADATA_FIELDS, elements):
            row[field] = element.text

        row['star_rating'] = _text_or_nan(movie, "ipc-rating-star--rating")
        row['vote_count'] = _text_or_nan(movie, "ipc-rating-star--voteCount")

        data.append(row)
finally:
    # Always release the browser, exactly once
    driver.quit()

In [152]:
# Converting the scraped records into a dataframe.
# Passing the schema keys as `columns` keeps the expected columns (in schema
# order) even when `data` is empty, so later cells that select columns by
# name do not fail with a KeyError.
df = pd.DataFrame(data, columns=list(elements_dict))

df.shape


(250, 7)

In [153]:
# Leave the frame as the cell's last expression so the notebook renders it as
# a rich table rather than a wrapped plain-text dump
df

    rank                                              title release_year  \
0      1                           The Shawshank Redemption         1994   
1      2                                      The Godfather         1972   
2      3                                    The Dark Knight         2008   
3      4                              The Godfather Part II         1974   
4      5                                       12 Angry Men         1957   
..   ...                                                ...          ...   
245  246                                           The Help         2011   
246  247                                     I'm Still Here         2024   
247  248                                 Gangs of Wasseypur         2012   
248  249   Demon Slayer: Kimetsu no Yaiba - The Movie: M...         2020   
249  250                                          Andhadhun         2018   

    duration rate_type star_rating vote_count  
0     2h 22m        15         9.3     

In [154]:
# Add a normalised version of each title: surrounding whitespace removed,
# then converted to lower case. The raw 'title' column is left as scraped.
stripped_titles = df['title'].str.strip()
df['transformed_title'] = stripped_titles.str.lower()

In [155]:
# Column names of the frame, in order
df.columns.tolist()

['rank',
 'title',
 'release_year',
 'duration',
 'rate_type',
 'star_rating',
 'vote_count',
 'transformed_title']

In [156]:
# (rows, columns) of the frame after adding the normalised title column
n_rows, n_cols = df.shape
(n_rows, n_cols)

(250, 8)

In [157]:
# Writing the final dataframe to a csv file (without the index column)

import os

# NOTE(review): machine-specific absolute path; adjust it (or make it
# configurable) before running this notebook on another machine.
myDir = 'D:/Zia/data-engineering/Projects/Imdb-movies-project/Outputs'
path = os.path.join(myDir, 'imdb-movies.csv')

# to_csv does not create missing folders, so make sure the output directory
# exists first; exist_ok=True makes this a no-op when it is already there.
os.makedirs(myDir, exist_ok=True)

df.to_csv(path, index=False)