In [45]:
# For this to work, you need to have all libraries below installed and have chromedriver.exe in the same folder as this file.

# Data collecting and saving
from bs4 import BeautifulSoup
import pandas as pd

# Dynamic page loading
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException

# Identifying OS
import platform

In [46]:
## Checking in on OS so we can load in the right webdriver
os = platform.system()
print(f"{os} operating system found. Launching chromedriver instance.")

def instance(os):
    if os == "Windows":
        c_driver = webdriver.Chrome("chromedriver.exe")
    else:
        c_driver == webdriver.Chrome("chromedriver")
    return c_driver

driver = instance(os)

Windows operating system found. Launching chromedriver instance.


In [48]:
# Asking for user input
imdb_page = input("Put your IMDB listing here: ").split("/")[4]

# Trying to load in the dynamic page. If no chromedriver instance found, new one is created. 
try:
    driver.get(f'https://www.imdb.com/title/{imdb_page}/reviews?ref_=tt_urv')
    html = driver.page_source
except WebDriverException:
    driver = instance(os)
    driver.get(f'https://www.imdb.com/title/{imdb_page}/reviews?ref_=tt_urv')
    html = driver.page_source
    
# Checking the amount of reviews so we know how many times to press on the 'load more reviews' button
soup = BeautifulSoup(html,"lxml")
page_title = soup.title.string.split(" -")[0]
reviews_per_chunk = 25
nr_of_reviews = int(soup.select(".lister")[0].find("span").get_text().split(" ")[0].replace(",",""))
total_button_presses = round(nr_of_reviews / reviews_per_chunk)

# Giving feedback to user about the amount of reviews
print(f"Listing: {page_title}")
if nr_of_reviews in range(1,26):
    print(f"Only one page of {nr_of_reviews} reviews.")
if total_button_presses > 0:
    print(f"{nr_of_reviews} total reviews in {total_button_presses} chunk(s) of {reviews_per_chunk} reviews.\n")
if total_button_presses > 75:
    print("That is quite a lot of reviews. This might take a while and make your computer run slower for a while.\n")
if nr_of_reviews == 0:
    print("No reviews yet. Come back later...")
    
# Loading all the reviews by pressing the load more button repeatedly
for x in range(0, total_button_presses):
    cp = x + 1
    try:
        load_more = WebDriverWait(driver, 8).until(ec.visibility_of_element_located((By.XPATH, '//*[@id="load-more-trigger"]')));
        load_more.click()
        print(f"Loading all reviews - chunk {cp} out of {total_button_presses}")
    except TimeoutException:
        print("\nHaving some issues loading more reviews... We might be done or just wait a bit for IMDB to respond!")

# Giving feedback that process is done.         
print(f"\nFinished loading of {nr_of_reviews} reviews.")

# Get all page content after all reviews are loaded    
html = driver.page_source
soup = BeautifulSoup(html,'html5lib')

Put your IMDB listing here: https://www.imdb.com/title/tt0414982/reviews?ref_=tt_ov_rt
Listing: Final Destination 3 (2006)
570 total reviews in 23 chunk(s) of 25 reviews.

Loading all reviews - chunk 1 out of 23
Loading all reviews - chunk 2 out of 23
Loading all reviews - chunk 3 out of 23
Loading all reviews - chunk 4 out of 23
Loading all reviews - chunk 5 out of 23
Loading all reviews - chunk 6 out of 23
Loading all reviews - chunk 7 out of 23
Loading all reviews - chunk 8 out of 23
Loading all reviews - chunk 9 out of 23
Loading all reviews - chunk 10 out of 23
Loading all reviews - chunk 11 out of 23
Loading all reviews - chunk 12 out of 23
Loading all reviews - chunk 13 out of 23
Loading all reviews - chunk 14 out of 23
Loading all reviews - chunk 15 out of 23
Loading all reviews - chunk 16 out of 23
Loading all reviews - chunk 17 out of 23
Loading all reviews - chunk 18 out of 23
Loading all reviews - chunk 19 out of 23
Loading all reviews - chunk 20 out of 23
Loading all revie

In [49]:
# Saving all the reviews
print("Collecting reviews from page...")

all_reviews = []

review_blocks = soup.select(".lister-item-content")
for review_block in review_blocks:
    # Review content
    content = review_block.findAll("div",{"class":"text show-more__control clickable"})
    if not content:
        content = review_block.findAll("div",{"class":"text show-more__control"})[0].get_text().strip()
    else:
        content = content[0].get_text().strip()

    # Review Score
    score = review_block.findAll("span",{"class":"rating-other-user-rating"})
    if not score:
        score = "NA"
    else:
        score = score[0].find_all('span')[0].get_text()    
    
    # Review title
    title = review_block.findAll("a",{"class":"title"})[0].get_text().strip()

    # Review author
    author = review_block.findAll("span",{"class":"display-name-link"})[0].select("a")[0].get_text()
    
    # Review date
    date = review_block.findAll("span",{"class":"review-date"})[0].get_text()
    
    review = {
        "author": author,
        "title": title,
        "date": date,
        "score": score,
        "content": content
    }
    
    all_reviews.append(review)

# Putting data into dataframe
print("Reviews saved to dataframe. See below.")    
df = pd.DataFrame(all_reviews)
df = df[['author', 'date', 'title', 'score', 'content']]
df

Collecting reviews from page...
Reviews saved to dataframe. See below.


Unnamed: 0,author,date,title,score,content
0,TheMovieMark,17 February 2006,If you enjoyed the prevous two then you should...,7,Poor Death. The guy can't seem to catch a brea...
1,Merklin,14 February 2006,I thought i was in for a disappointment. I was...,7,"I'm not gonna lie to ya, i wasn't looking forw..."
2,Davyd420,30 March 2006,"Same story, just more creative ways to die.",7,Once you've seen Final Destination there's not...
3,crazy_chick0000,17 February 2006,Very Interesting,8,I'd just have to say that this movie was a gra...
4,manicman84,11 August 2006,abounds with awesome deaths,6,"To be honest, I hadn't expected too much from ..."
5,dictator_ship_degree,21 February 2006,A short review of Final Destinaiton 3!,10,This movie is mainly about a senior named Wend...
6,law_p3-1,21 February 2006,cool,10,"this movie is the best of the year, it has eno..."
7,amazingliz,21 February 2006,Final Destination 3,10,This movie was actually quite good. I found it...
8,DramaDude926,20 February 2006,Final Destination 3 Review,10,I Just saw Final Destination 3 Today and I can...
9,blink182fan117,20 February 2006,Good closing to the series,9,I have been waiting a while for this film to c...


In [50]:
clean_title = page_title.replace(" ","").replace("''","").replace("(","_").replace(")","_").upper()
filename = f"{clean_title}IMDB_REVIEWS.csv"
csv = df.to_csv(filename,index=False)
print(f"Saved to file as '{filename}'.")

Saved to file as 'FINALDESTINATION3_2006_IMDB_REVIEWS.csv'.
