In [1]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time

## Web Scraping IMDb Reviews Using Selenium

In [2]:
# Launch Chrome and navigate to the IMDb reviews page for the title

option = Options()
option.add_argument("start-maximized")  # ask Chrome to open with a maximized window

url = "https://www.imdb.com/title/tt11737520/reviews/"

driver = webdriver.Chrome(options=option)
driver.get(url)

In [3]:
# Load everything
#
# Locate the button that loads the remaining reviews and keep pressing it until
# it can no longer be activated. The locator is an absolute XPath, so it is
# brittle: it must be updated whenever IMDb changes the page layout.

LOAD_BUTTON_XPATH = '//*[@id="__next"]/main/div/section/div/section/div/div[1]/section[1]/div[3]/div/span[2]/button'
LOAD_BUTTON_TIMEOUT = 15  # seconds to wait for the button to become clickable
LOAD_PAUSE_SECONDS = 10   # pause after each press so the next reviews can load
MAX_LOAD_ATTEMPTS = 60    # hard stop so a button that never goes away cannot hang the notebook

# Wait for the button instead of assuming the page has finished rendering.
# Raises selenium's TimeoutException if the button never becomes clickable.
load_all = WebDriverWait(driver, LOAD_BUTTON_TIMEOUT).until(
    EC.element_to_be_clickable((By.XPATH, LOAD_BUTTON_XPATH))
)

for _ in range(MAX_LOAD_ATTEMPTS):
    try:
        load_all.send_keys(Keys.ENTER)
        time.sleep(LOAD_PAUSE_SECONDS)
    except Exception:
        # The button went stale or stopped accepting input -- presumably
        # because every review has been loaded. Best-effort: stop pressing.
        break
else:
    # Only reached when the loop ran out of attempts without an exception.
    print(f"Stopped after {MAX_LOAD_ATTEMPTS} presses; the load button is still present.")

In [4]:
# Spoiler Buttons
#
# Expand every review hidden behind a spoiler button so its text can be scraped.
# The button list is re-queried after each successful click (rather than
# iterating over one snapshot) because element references taken before a click
# may no longer be valid afterwards.
# Approach adapted from other sources, including the official Selenium site.

while True:
    spoiler_buttons = driver.find_elements(By.CLASS_NAME, "review-spoiler-button")

    # No spoiler buttons left on the page: everything is expanded.
    if not spoiler_buttons:
        break

    clicked = False

    for button in spoiler_buttons:
        try:
            # Click through JavaScript so the click is not blocked by
            # overlapping elements or by the button being off-screen.
            driver.execute_script("arguments[0].scrollIntoView(true);", button)
            driver.execute_script("arguments[0].click();", button)
            time.sleep(0.4)  # short pause before re-querying the buttons
            clicked = True
            break
        except Exception:
            # `Exception` (not a bare `except`) so that interrupting the
            # kernel (KeyboardInterrupt) still stops the loop.
            # Best-effort: this button could not be clicked, try the next one.
            continue

    # Buttons remain but none could be clicked: stop instead of spinning forever.
    if not clicked:
        break


In [5]:
# Let's web scrape all the reviews


def _text_or_nan(parent, class_name):
    """Return the text of the first element under `parent` with CSS class `class_name`.

    Returns np.nan when no such element exists, so a review that lacks a field
    (for example, one posted without a rating) still yields a value for every
    column. Only NoSuchElementException is handled; any other error (such as a
    lost browser session) propagates instead of silently producing NaN rows.
    """
    try:
        return parent.find_element(By.CLASS_NAME, class_name).text
    except NoSuchElementException:
        return np.nan


# One element per review on the page
review_blocks = driver.find_elements(By.CLASS_NAME, "user-review-item")

ratings = []
review_titles = []
reviews = []
review_dates = []

for review_block in review_blocks:
    # Append to all four lists on every iteration so they stay aligned row by row.
    ratings.append(_text_or_nan(review_block, "ipc-rating-star--rating"))
    review_titles.append(_text_or_nan(review_block, "ipc-title__text"))
    reviews.append(_text_or_nan(review_block, "ipc-html-content-inner-div"))
    review_dates.append(_text_or_nan(review_block, "review-date"))



In [6]:
# Sanity check: the four lists should all contain the same number of entries

print(*map(len, (ratings, review_titles, reviews, review_dates)))

1056 1056 1056 1056


## Create DataFrame

In [7]:
# Create DataFrame

# Column name -> list of scraped values (the lists are aligned row by row)
onepiece_columns = {
    "ratings": ratings,
    "review_titles": review_titles,
    "reviews": reviews,
    "review_dates": review_dates,
}

# Build the frame and tag every row with the site it was scraped from
one_piece_imdb = pd.DataFrame(onepiece_columns).assign(source='IMDb.com')

In [8]:
# Write the scraped reviews to a CSV file in the working directory
# (pandas' default settings are used, so the row index is written as well)

output_path = 'one_piece_imdb.csv'
one_piece_imdb.to_csv(output_path)

In [9]:
# End Selenium
#
# quit() on its own is sufficient: it closes every browser window and ends the
# driver session. Calling close() first is redundant, and if close() raises
# (for example because the window was already closed by hand), quit() would be
# skipped and the driver process would be left running.

driver.quit()