In [4]:
import re


def movie_title_to_search_string(title: str) -> str:
    """ Convert movie title string to the format accepted by IMDB search link.

        Every run of whitespace becomes a single ``+`` and every character
        that is not safe inside a URL query value (``&``, ``#``, ``?``, ...)
        is percent-encoded, so the title cannot break out of the ``q=``
        parameter it is later interpolated into.

        :param title: free-form movie title, e.g. ``"the   green mile"``
        :return: URL-safe search string, e.g. ``"the+green+mile"``
    """
    # Function-scope import keeps this cell runnable on its own.
    from urllib.parse import quote_plus

    # Collapse whitespace runs first; quote_plus then maps each space to "+".
    # (raw string: "\s" in a plain literal is an invalid escape sequence)
    return quote_plus(re.sub(r"\s+", " ", title))

In [26]:
import bs4
import requests
from typing import Tuple

# Sent with every IMDB request made in this notebook. The review-date parsing
# further down uses "%d %B %Y", i.e. it expects English month names in the
# pages that come back.
headers = {"Accept-Language": "en-US"}


class MovieNotFoundException(Exception):
    """ Raised when no movie could be extracted from the IMDB search results.
    """


def get_movie_true_title_and_id(
    movie_title: str, timeout: float = 10.0
) -> Tuple[str, str]:
    """ Look up a movie on IMDB and return its listed title and IMDB id.

        The first entry of the IMDB search results page is taken as the match.

        :param movie_title: free-form title to search for, e.g. "spiderman 2"
        :param timeout: seconds to wait for IMDB before requests gives up
        :return: (title as listed by IMDB, IMDB id such as "tt0316654")
        :raises MovieNotFoundException: if no first result with a usable link
            can be extracted from the results page
    """
    search_string = movie_title_to_search_string(movie_title)
    response = requests.get(
        f"https://www.imdb.com/find?q={search_string}",
        headers=headers,
        timeout=timeout,  # without it a stalled connection blocks forever
    )
    souped_content = bs4.BeautifulSoup(response.content, "html.parser")
    try:
        first_result = souped_content.find("td", class_="result_text")
        movie_link = first_result.find("a")
        # Keep the query in `movie_title` untouched so the error message
        # below always reports what the caller searched for.
        true_title = movie_link.text
        # The id is the third "/"-separated piece of the link target
        # (presumably "/title/<id>/..." -- verify against the live page).
        movie_id = movie_link["href"].split("/")[2]
    except (AttributeError, KeyError, IndexError, TypeError) as ex:
        # find() returned None, the link has no href, or the href is too short
        raise MovieNotFoundException(
            f"Movie with title '{movie_title}' could NOT be found"
        ) from ex
    return true_title, movie_id


In [27]:
# Test get_movie_true_title_and_id
get_movie_true_title_and_id("spiderman 2")

('Spider-Man 2', 'tt0316654')

In [82]:
from datetime import datetime
from typing import List, NamedTuple, Optional


class Review(NamedTuple):
    """ One user review of a movie, as scraped from an IMDB review page.
    """
    title: str  # review headline
    text: str  # review body as plain text
    score: Optional[int]  # rating out of 10; None when the review has none
    user: str  # display name of the reviewer
    date: str  # extract_review_from_div stores it as "YYYY/MM/DD"
    helpfulness_votes: int  # total number of helpfulness votes
    positive_helpfulness_votes: int  # votes that found the review helpful

    def __str__(self) -> str:
        """ Render the review as a short multi-line, human-readable summary.
        """
        # An unrated review used to be printed as "None/10".
        rating = "no score" if self.score is None else f"{self.score}/10"
        return (
            f"{self.title} ({rating})\n"
            f"by {self.user} on {self.date}\n"
            f"{self.text}...\n\n"
            f"{self.positive_helpfulness_votes} out of "
            f"{self.helpfulness_votes} found it useful"
        )


def process_review_text(text_div: bs4.element.Tag) -> str:
    """ Turn the review-text div into plain single-line text.

        Every link is replaced with the "[URL]" token, all remaining markup
        is dropped and every run of whitespace is shrunk to a single space.

        Note: the links are replaced in place, so ``text_div`` (and the tree
        it belongs to) is modified by this call.

        :param text_div: tag holding the review body
        :return: cleaned review text
    """
    # replace links with [URL] token
    for link in text_div.find_all("a"):
        link.replace_with("[URL]")
    # remove all other tags
    text = text_div.get_text().strip()
    # change multiple consecutive whitespaces into a single space
    # (raw string: "\s" in a plain literal is an invalid escape sequence)
    return re.sub(r"\s+", " ", text)


def extract_review_from_div(tag: bs4.element.Tag) -> Review:
    """ Extract all review info from the review-container div.

        :param tag: the "review-container" div of a single review
        :return: the parsed Review; its ``score`` is None when no rating
            could be read from the div
    """
    title = tag.find("a", class_="title").text.strip()
    user = tag.find("span", class_="display-name-link").find("a").text
    raw_date = tag.find("span", class_="review-date").text
    # change date format to numerical one, e.g. "14 April 2020" -> "2020/04/14"
    date = datetime.strptime(raw_date, "%d %B %Y").strftime("%Y/%m/%d")
    try:
        # the rating is the element right before the "/10" point-scale span
        score = int(tag.find("span", class_="point-scale").previous_sibling.text)
    except (AttributeError, ValueError, TypeError):
        score = None  # no score is specified for this review

    # The actions div starts with "<positive> out of <total> found this
    # helpful.", so tokens 0 and 3 are the two counts.
    help_text = tag.find(
        "div", class_="actions text-muted"
    ).text.strip().split(' ')
    # Drop thousands separators ("1,234") before converting; int() would
    # otherwise raise ValueError on such counts.
    positive_helpfulness_votes = int(help_text[0].replace(",", ""))
    helpfulness_votes = int(help_text[3].replace(",", ""))

    # get review text
    text_div = tag.find("div", class_="text show-more__control")
    text = process_review_text(text_div)

    return Review(
        title=title,
        text=text,
        score=score,
        user=user,
        date=date,
        helpfulness_votes=helpfulness_votes,
        positive_helpfulness_votes=positive_helpfulness_votes,
    )

In [84]:
# Test extract_review_from_div

# Fixture: a single review-container div with a 9/10 rating, two links inside
# the review text and a "26 out of 37 found this helpful." line.
div = """<div class="review-container">
        <div class="lister-item-content">
    <div class="ipl-ratings-bar">
            <span class="rating-other-user-rating">
            <svg class="ipl-icon ipl-star-icon  " xmlns="http://www.w3.org/2000/svg" fill="#000000" height="24" viewBox="0 0 24 24" width="24">
                <path d="M0 0h24v24H0z" fill="none"/>
                <path d="M12 17.27L18.18 21l-1.64-7.03L22 9.24l-7.19-.61L12 2 9.19 8.63 2 9.24l5.46 4.73L5.82 21z"/>
                <path d="M0 0h24v24H0z" fill="none"/>
            </svg>
                <span>9</span><span class="point-scale">/10</span>
            </span>
    </div>
<a href="/review/rw5638050/?ref_=tt_urv"
class="title" > Just rewatching in 2020
</a>            <div class="display-name-date">
                    <span class="display-name-link"><a href="/user/ur68951879/?ref_=tt_urv"
>koyushun</a></span><span class="review-date">14 April 2020</span>
            </div>
            <div class="content">
                <div class="text show-more__control">I was a kid when I watched this in cinema back in 2004
I just want to say after all these years after a few version of Spider-Mans and all the MCU movie.
<br>
<br>
This one is hands down the best Superhero movie <a href="google.com">link1</a>.
It has everything done within 2 hours. Perfectly <a href="google.com">link2</a> caught up what it left of from the previous Spider-Man and Toby Maguire will always be my Spider-Man.</div>
                <div class="actions text-muted">
                    26 out of 37 found this helpful.
                        <span>
                            Was this review helpful? <a href="/registration/signin?ref_=urv"
> Sign in</a> to vote.
                        </span>
                        <br/>
                    <a href="/review/rw5638050/?ref_=tt_urv"
>Permalink</a>
                </div>
            </div>
        </div>
        <div class="clear"></div>
    </div>
"""

# Name the parser explicitly (the same one the scraping functions use) so the
# parse does not depend on which parser libraries happen to be installed.
div_soup = bs4.BeautifulSoup(div, "html.parser").find("div", class_="review-container")
print(extract_review_from_div(div_soup))


Just rewatching in 2020 (9/10)
by koyushun on 2020/04/14
I was a kid when I watched this in cinema back in 2004 I just want to say after all these years after a few version of Spider-Mans and all the MCU movie. This one is hands down the best Superhero movie [URL]. It has everything done within 2 hours. Perfectly [URL] caught up what it left of from the previous Spider-Man and Toby Maguire will always be my Spider-Man....

26 out of 37 found it useful


In [95]:
# scrape_reviews stops requesting further review pages once this many review
# divs have been collected for a movie.
MAX_REVIEW_COUNT = 100


def scrape_reviews(
    movie_id: str,
    max_reviews: Optional[int] = None,
    timeout: float = 10.0,
) -> List[Review]:
    """ Get up to ``max_reviews`` reviews for the given movie id.

        Starts at the movie's reviews page and keeps following the
        "load more" pagination key until enough reviews are collected or
        no further page is offered.

        :param movie_id: IMDB id of the movie, e.g. "tt0316654"
        :param max_reviews: upper bound on the number of returned reviews;
            None means MAX_REVIEW_COUNT
        :param timeout: seconds to wait for each IMDB response
        :return: the parsed reviews, at most ``max_reviews`` of them
    """
    limit = MAX_REVIEW_COUNT if max_reviews is None else max_reviews
    request_link = f"https://www.imdb.com/title/{movie_id}/reviews"
    review_divs = []
    while len(review_divs) < limit:
        response = requests.get(
            request_link,
            headers=headers,
            timeout=timeout,  # without it a stalled connection blocks forever
        )
        souped_content = bs4.BeautifulSoup(response.content, "html.parser")
        page_divs = souped_content.find_all("div", class_="review-container")
        review_divs.extend(page_divs)

        # load more link
        load_more_div = souped_content.find("div", class_="load-more-data")
        pagination_key = load_more_div.get("data-key") if load_more_div else None
        # Stop when there is no next page, and also when a page brought no
        # reviews at all -- otherwise the loop might never reach the limit.
        if not pagination_key or not page_divs:
            break
        request_link = (
            f"https://www.imdb.com/title/{movie_id}/reviews/_ajax?ref_=undefined&" +
            f"paginationKey={pagination_key}"
        )

    # The last page may overshoot the limit, so cut the surplus off.
    return [
        extract_review_from_div(review_div)
        for review_div in review_divs[:limit]
    ]

In [99]:
# Test scrape_reviews
import random


# "tt0316654" is the id the search test above returned for "spiderman 2".
reviews = scrape_reviews("tt0316654")
print(len(reviews))
# Spot-check one randomly picked review (unseeded, so it differs per run).
print(random.choice(reviews))

100
Spider-sequel-tastic (10/10)
by blacklist-1 on 2009/09/29
My presumptions of a sequel to a successful original are that they are worn out and are a desperate attempt to cash in on it's predecessor. Look no further for conclusive evidence with the terrible Transformers: Revenge of the Fallen to know what I'm talking about.But every now and again a sequel comes along that just simply changes your perceptions and gives you faith in a continuing franchise where the sequel is just as or in the case of Spiderman 2 better than it's predecessor.Toby Maquire, Kirsten Dunst and James Franco return with director Sam Raimi who has done a terrific job in making this for me one of the best sequels of all time.One of the successes Spiderman 2 has is the depth of the characters and stories as there is more than one strand. Toby Maquire has the same natural ability which he possessed in the first film to accurately convey the character of Peter Parker who struggles with his feelings for Mary Jane w

In [100]:
from tqdm import tqdm

# Titles are free-form on purpose: the IMDB search resolves them to the
# listed title and id.
MOVIES = ["Spiderman 2", "taxi driver", "goodfellas", "the   green mile"]
COLUMNS = [
    "movie",
    "title",
    "text",
    "score",
    "user",
    "date",
    "helpfulness_votes",
    "positive_helpfulness_votes",
]


# One flat row per review: the IMDB title of the movie followed by the review
# attributes named in COLUMNS[1:], in that order.
all_reviews = []
for movie in tqdm(MOVIES):
    movie_title, movie_id = get_movie_true_title_and_id(movie)
    reviews = scrape_reviews(movie_id)
    all_reviews.extend(
        [
            (movie_title, *(getattr(review, column) for column in COLUMNS[1:]))
            for review in reviews
        ]
    )

100%|██████████| 4/4 [00:18<00:00,  4.63s/it]


In [122]:
import pandas as pd
import random


# Shuffle reproducibly and without touching `all_reviews`: random.shuffle()
# reordered the shared list in place with an unseeded RNG, so every re-run of
# this cell produced (and saved) a different row order.
SHUFFLE_SEED = 42
df = (
    pd.DataFrame(all_reviews, columns=COLUMNS)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
# NOTE: with the default settings the row index is written to the CSV as an
# unnamed first column.
df.to_csv("imdb_reviews.csv")
df.head(10)

Unnamed: 0,movie,title,text,score,user,date,helpfulness_votes,positive_helpfulness_votes
0,Spider-Man 2,Spider-Man just keeps getting better and better!,This movie takes everything from the first mov...,9.0,Green_Saturn,2010/09/03,3,3
1,Taxi Driver,An Enigmatic Masterpiece,If a picture is worth a thousand words then th...,,JosefSerf,2004/05/04,330,253
2,Goodfellas,Incredible crime drama,Goodfellas is an incredible crime drama that's...,9.0,masonsaul,2020/07/27,4,3
3,The Green Mile,Rest in Peace Michael Clarke Duncan,There's a reason why this Oscar winning tour d...,10.0,epluribusunum2010,2012/09/05,66,61
4,Goodfellas,Most overrated movie ever!,Maybe I was so disappointed in this movie beca...,1.0,mfmoore-1,2007/10/14,153,63
5,The Green Mile,Frank Darabont's Green Mile goes the extra mil...,I promise you that if you have any humanity in...,9.0,ironhorse_iv,2013/08/05,6,4
6,The Green Mile,It is Beyond its Date,There is nothing left to say after 21 years ha...,10.0,burakparlak,2020/02/03,2,2
7,The Green Mile,I loved it!!!,I had looked forward to this movie ever since ...,10.0,corky-7,1999/12/11,3,3
8,Spider-Man 2,Spider-man is back but with a greater enemy Do...,He's back but with a bigger problem now. His p...,9.0,howellgirl2011,2011/01/13,22,14
9,The Green Mile,"A Masterpiece of Cinematic Art -- Captivating,...","""The Green Mile"" expertly mixes film genres: I...",9.0,Wuchakk,2011/08/19,9,5
