In [2]:
"""
This Jupyter notebook scrapes the movie review data of the most popular
users on letterboxd.com, pausing between requests to be respectful of its servers.
@author: intelmt
"""

import urllib.request
from bs4 import BeautifulSoup, NavigableString
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementNotInteractableException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent

import time
import re
import pandas as pd
import numpy as np
import random
import gc
# Raise the per-column display width to 2000 characters so long review text
# is not cut off early when a DataFrame is displayed.
pd.set_option('display.max_colwidth', 2000)

In [None]:
# Site root; user paths such as "/someuser/" are appended to it.
LBOX = "https://letterboxd.com"
# Paginated listing of users by popularity; the page number is appended.
BASE_PEOPLE = LBOX + "/people/popular/page/"
# Path suffix of a user's paginated reviews; the page number is appended.
REV = "films/reviews/page/"

In [None]:
userNames = list()

t0 = time.time()

# Get the usernames from the people pages ranked by all-time popularity.
# range(1, 2) fetches only the 1st page; widen the range to fetch more pages.
for i in range(1, 2):
    URL = BASE_PEOPLE + str(i)
    req = urllib.request.Request(URL)
    user_agent = UserAgent().random  # fresh random user-agent per request
    req.add_header('user-agent', user_agent)
    # Parse inside the context manager so the HTTP response is always closed.
    with urllib.request.urlopen(req) as response:
        soup = BeautifulSoup(response, "html.parser")
    # Random pause between page requests to keep the load on the server low.
    time.sleep(random.randint(10,15))
    for tag in soup.find_all("h3", "title-3"):
        # A heading without a link has tag.a == None; skip it instead of
        # failing with AttributeError.
        if tag.a is None:
            continue
        href = tag.a.get("href")
        # Skip anchors without an href: a None entry would break the URL
        # concatenation done later for each user.
        if href:
            userNames.append(href)

t1 = time.time()
print(f"{t1-t0} seconds to download {len(userNames)} usernames.")

In [None]:
# Remove the repetitions of the 5 most popular userNames (the ones that appear on the right dock of every page).
# dict.fromkeys keeps the first occurrence of every name in scraped order, so the
# crawl order is the same on every run (a set would reorder the names arbitrarily).
userNames = list(dict.fromkeys(userNames))
# UNCOMMENT the following 2 lines if not starting from the 1st page for people
# userNames = [user for user in userNames if user not in \
#              ['/bratpitt/', '/deathproof/', '/davidehrlich/', '/adrianbalboa/','/silentdawn/'] ]

In [None]:
# UNCOMMENT the following import to use ChromeDriverManager
# from webdriver_manager.chrome import ChromeDriverManager

# Location of the ChromeDriver binary; replace 'chromedriver.exe' with
# ChromeDriverManager().install() when using the manager instead.
chromedriver_path = 'chromedriver.exe'

# Selenium-controlled Chrome, used to interact with the JS on the pages.
# NOTE(review): the review-scraping loop assigns a new webdriver.Chrome to
# `driver` for every page before using it, so this instance looks unused and
# is never quit -- confirm and drop this cell if so.
driver = webdriver.Chrome(chromedriver_path)

In [None]:
# from: https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
# Compiled once at definition time instead of on every call. IGNORECASE lets the
# entity part also match mixed/upper-case references such as "&Eacute;" or
# "&#x1F600;", which the lowercase-only classes would leave in the text.
_TAG_OR_ENTITY = re.compile(
    '<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', re.IGNORECASE)


def cleanhtml(raw_html):
    """Return ``raw_html`` with HTML tags and character references removed.

    Anything of the form ``<...>`` (on a single line) and any named, decimal
    or hexadecimal reference (``&name;``, ``&#123;``, ``&#x1f;``) is deleted;
    references are dropped, not decoded.

    Parameters
    ----------
    raw_html : str
        Text that may contain markup.

    Returns
    -------
    str
        The text with every match removed.
    """
    return _TAG_OR_ENTITY.sub('', raw_html)

In [None]:
# Store the number of pages of reviews to the usernames
numPageReviews = dict()


def _fetch_soup(url):
    """Download ``url`` with a random user-agent header and return it parsed.

    The HTTP response is closed before the function returns.
    """
    req = urllib.request.Request(url)
    req.add_header('user-agent', UserAgent().random)
    with urllib.request.urlopen(req) as response:
        return BeautifulSoup(response, "html.parser")


def _click_each(driver, locator, count, pause, name, thePage):
    """Click the first clickable element matching ``locator``, ``count`` times.

    Sleeps ``pause`` seconds before each entry and gives each entry at most two
    attempts. Timeout / not-interactable / stale-element failures are printed
    (with the user name and page) and do not abort the scrape.
    """
    for entry in range(count):
        time.sleep(pause)
        for _attempt in range(2):
            try:
                WebDriverWait(driver, 20).until(EC.element_to_be_clickable(locator)).click()
                break
            except (TimeoutException, ElementNotInteractableException,
                    StaleElementReferenceException) as err:
                print(name + ": " + type(err).__name__ + " Error  at :" + thePage + " at entry: " + str(entry))


def _rating_of(attribution):
    """Return the rating string of one "attribution" paragraph, or None.

    None means the paragraph has no rating span carrying a digit, i.e. the
    entry is treated as unrated. Only the first such span is used so that one
    entry can never contribute more than one rating.
    """
    for span in attribution.find_all('span', 'rating'):
        digits = [ch for ch in str(span.attrs.get('class')) if ch in "0123456789"]
        if digits:
            # Two digit characters in the class list are read as the rating 10
            # (presumably a "rated-10"-style class -- verify against the site
            # markup); otherwise the first digit is the rating.
            return '10' if len(digits) == 2 else digits[0]
    return None


def _parse_review_page(soup):
    """Extract the rated review entries from one parsed reviews page.

    Returns a tuple of seven equally long lists, in the order
    (movies, relYears, reviews, ratings, dates, reWatch, numRevLikes),
    or None when the per-field tags cannot be paired with the rating
    paragraphs by position.
    """
    entryRatings = [_rating_of(tag) for tag in soup.find_all("p", "attribution")]
    isRated = [rating is not None for rating in entryRatings]
    numRated = sum(isRated)

    def rated_only(tags):
        # Fields are paired with ratings purely by position on the page, so a
        # field list that is longer than the rating list, or that does not
        # yield exactly one tag per rated entry, cannot be aligned.
        if len(tags) > len(isRated):
            return None
        kept = [tag for tag, keep in zip(tags, isRated) if keep]
        return kept if len(kept) == numRated else None

    headlines = rated_only(soup.find_all("h2", "headline-2 prettify"))
    bodies = rated_only(soup.find_all("div", "body-text"))
    dateTags = rated_only(soup.find_all("span", "_nobr"))
    watchTags = rated_only(soup.find_all("span", "date"))
    likeTags = rated_only(soup.find_all("p", "like-link-target"))
    if any(column is None for column in (headlines, bodies, dateTags, watchTags, likeTags)):
        return None

    pageMovies = [tag.a.text for tag in headlines]
    # '0' stands in for a missing release year
    pageYears = [tag.small.a.text if tag.small else '0' for tag in headlines]
    # Only the text of child tags is collected; bare strings directly inside
    # the review body are skipped.
    pageReviews = [
        cleanhtml(''.join(str(item.text) for item in tag
                          if not isinstance(item, NavigableString)))
        for tag in bodies
    ]
    pageRatings = [rating for rating in entryRatings if rating is not None]
    pageDates = [tag.text for tag in dateTags]
    # Date text starting with " R" flags the entry as a re-watch
    pageReWatch = [1 if tag.text.startswith(" R") else 0 for tag in watchTags]
    pageLikes = [
        ''.join(ch for ch in tag.span.a.text if ch in "0123456789") if tag.span else '0'
        for tag in likeTags
    ]
    return pageMovies, pageYears, pageReviews, pageRatings, pageDates, pageReWatch, pageLikes


for name in userNames:
    # Find the total number of pages of reviews from the first page
    firstPage = LBOX + name + REV + "1"

    soup = _fetch_soup(firstPage)
    time.sleep(2)
    # Users whose first page contains an "empty" list item are skipped
    if soup.find_all("li", "empty"):
        continue

    # Store user's all review data (movie/film name + year + review + rating)
    data = list()

    pageFooter = soup.find_all("div", "paginate-pages")
    if len(pageFooter) == 0:
        numPageReviews[name] = 1
    else:
        item = soup.find_all('li','paginate-page')
        # The last pagination item holds the highest page number; fall back to
        # a single page if the footer has no page items.
        numPageReviews[name] = int(item[-1].text) if item else 1

    # Initialize lists for data
    ratings = list()
    movies = list()
    relYears = list()
    reviews = list()
    dates = list()
    reWatch = list()
    numRevLikes = list()

    # Loop through all pages and obtain the relevant data from each page
    for pageNum in range(1, numPageReviews[name]+1):
        thePage = LBOX + name + REV + str(pageNum)
        options = Options()
        user_agent = UserAgent().random
        options.add_argument(f'user-agent={user_agent}')
        driver = webdriver.Chrome('chromedriver.exe', options=options)
        try:
            driver.maximize_window()
            driver.get(thePage)
            time.sleep(random.randint(15,20))
            # Click on the spoilers links
            spoilers = driver.find_elements_by_link_text('I can handle the truth.')
            _click_each(driver, (By.LINK_TEXT, 'I can handle the truth.'),
                        len(spoilers), 7, name, thePage)
            # Click on the "more" links to reveal all text
            time.sleep(7)
            linksMore = driver.find_elements_by_class_name('reveal')
            _click_each(driver, (By.CLASS_NAME, 'reveal'),
                        len(linksMore), 5, name, thePage)

            time.sleep(7)
            soup = BeautifulSoup(driver.page_source,"html.parser")
        finally:
            # Always close the browser, even when loading or clicking fails,
            # so failed pages do not leave Chrome processes behind.
            driver.quit()

        parsed = _parse_review_page(soup)
        if parsed is None:
            # Appending partially matched fields would shift every later row
            # of this user, so the whole page is dropped and reported.
            print(name + ": review fields could not be aligned, page skipped: " + thePage)
            continue
        userColumns = (movies, relYears, reviews, ratings, dates, reWatch, numRevLikes)
        for column, pageValues in zip(userColumns, parsed):
            column.extend(pageValues)

    for row in zip(movies, relYears, reviews, ratings, dates, reWatch, numRevLikes):
        data.append([name, *row])

    df = pd.DataFrame(data, columns = ['userName' , 'filmName', 'releaseYear', 'userReview','userRating', 'reviewDate','reWatched',
                                       'reviewLikes'])
    df[['releaseYear','userRating','reWatched','reviewLikes']] = df[['releaseYear','userRating','reWatched','reviewLikes']].apply(pd.to_numeric, downcast='integer')

    # Append this user's rows (tab-separated, no header row) to the output file
    df.to_csv("movrec.tsv", sep='\t', index=False, columns=None, header=False, mode='a')

    gc.collect()


### Load the saved file for inspection

In [5]:
# Column labels assigned to the loaded file.
# NOTE(review): the scraping cell appends to "movrec.tsv" with sep='\t', while
# this cell reads "data/movie_data.tsv" with the default separator -- confirm
# the file was moved/converted in between.
reviewColumns = ['userName', 'filmName', 'releaseYear', 'userReview',
                 'userRating', 'reviewDate', 'reWatched', 'reviewLikes']
df = pd.read_csv("data/movie_data.tsv", names=reviewColumns)
df.tail()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,userName,filmName,releaseYear,userReview,userRating,reviewDate,reWatched,reviewLikes
293465.0,/silentjoe13/,Abduction,2011,"I had a feeling this film was going to be bad but I had no idea just HOW bad. Taylor Lautner, who I respect for doing his own stunts and at least TRYING to give a decent performance, is'nt even close to being the worst thing about this film. Story is cheesy and predictable. A lot of situations don't make sense. Talented actors like Signourny Weaver, Jason Isaacs, and Maria Bello feel wasted. The mustache on Lily Collins face that she calls eyebrows is distracting. I hate to be mean, but this movie deserves every ounce of shit people give it. It was a time and a waste of money.",1,"29 Sep, 2011",0,1
293466.0,/silentjoe13/,GoodFellas,1990,"Best gangster film ever. Even surpasses The Godfather, in my opinion. One of the few films where narration actually works. Joe Pesci steals the show. Some of the best camera shots I've ever seen. The editing, the dialogue, the visuals, the soundtrack are just breathtaking. Martin Scoresese's best film and probably the best film of the 90's.",10,"01 Mar, 2012",0,4
293467.0,/silentjoe13/,Drive,2011,"Probably the best surprise from 2011. Some may be turned off by its slow build up but that's one of the things that makes this so powerful. Everything can be so calm one mintue and the moment violence pops out of nowhere, it feels like some nightmare out of a David Cronenberg film. Nicholas Winding Refn creates one of the best films of the decade. Beautifully directed, body gyrating soundtrack, well acted (though Carey Mulligan seems miscast), pulse pounding action, and it has probably one of the best car chases to ever grace the screen.",10,"25 Sep, 2011",0,6
293468.0,/silentjoe13/,Cinema Paradiso,1988,Not only is this one of the greatest love letters to cinema but it is my favorite film of all time. It is about love and a boy's love for film. Magnificent story that is even better to look at. A must see for any and all film lovers.,10,"01 Mar, 2012",0,6
293469.0,/silentjoe13/,Manhattan,1979,"Witty, whimsical, poignant, and intelligent. Arguably Woody Allen's best film. Beautiful camera work and cinematography make this a beautiful love letter to the city.",10,"29 Feb, 2012",0,5


In [6]:
# Number of distinct users present in the loaded data
df['userName'].nunique()

219

In [26]:
def filter_review_by_word(df, word):
    """Return the rows of ``df`` whose 'userReview' text contains ``word``.

    Each review is converted with ``str()`` before the case-sensitive
    substring check, so non-string values are compared by their string form.
    Rows keep their original order and index labels.
    """
    matching_positions = [
        position
        for position, review in enumerate(df['userReview'])
        if word in str(review)
    ]
    return df.iloc[matching_positions]

In [43]:
# Reviews whose text contains the word "illuminating"
aboutLBXD = filter_review_by_word(df, word='illuminating')

In [44]:
# (rows, columns) of the filtered frame
aboutLBXD.shape

(182, 8)

In [45]:
# Preview the first matching reviews
aboutLBXD.head()

Unnamed: 0,userName,filmName,releaseYear,userReview,userRating,reviewDate,reWatched,reviewLikes
367.0,/worsethan/,13 Hours: The Secret Soldiers of Benghazi,2016,"Shots don't get much more semiotically loaded that one of the elite mercenary ""operators"" dropping his assault rifle on top of a chessboard and scattering the pieces. Pleated khaki nerds like CIA chief David Constable (the walking embodiment of the word 'milquetoast"") can try to think their way through foreign occupations, but it's DH Lawrence's Essential American (hard, desolate and a killer) who really know how to handle the natives. The outraged question that rings throughout the runtime is ""why aren't THESE guys in charge?!""13 Hours is illuminating on the function of ideology in film. It's surprisingly non-partisan (although there's a trail of right-wing grievances running throughout that only the true Benghazi outrage aficionado will notice), but that apolitical pose masks a much deeper manifesto of the ideal American masculine imagination. Turns out is has a lot to do with owning females and killing other males. Weird.",4,"24 Jan, 2016",0,8
713.0,/worsethan/,There Will Be Blood,2007,"Paul Thomas Anderson's There Will Be Blood deals with epic themes and settings: the emergence of modern American capitalism and religion on the wide-open California oil fields of the early twentieth century. What's remarkable about the film's approach is that these issues are largely pushed to the periphery of the frame. At the center of the frame, for nearly every shot in the this two and a half hour film are Daniel Day-Lewis's piercing, fiery eyes. The film is monomanically focused on the character of oil baron Daniel Plainview.This is a radical departure for Anderson. His films tend to feature large casts of characters, all struggling to overcome their personal weaknesses and traumatic pasts and forge real connections with each other. There Will Be Blood never takes the focus off of Plainview, and his character arc is one of raging misanthropy and a repeated turning away from human trust and companionship. The viewer gets a long and harrowing view of the rocky outcroppings of Plainview's burning mind. Although larger issues of class, capitalism and culture are kept in the background, the film does offer a critique of capitalism that comes from a unique angle. While a character like Charles Foster Kane begins Citizen Kane as a young idealist who has his humanity sapped from him by his isolating wealth and growing power, Plainview starts out the film as an isolated, vengeful misanthrope, and it becomes apparent throughout the course of the film that he has sought out wealth and power explicitly to allow him to dominate those around him. Also, we see that his world-encompassing mistrust and contempt serve him very well in his frantic grasp for wealth.The result is one of the most vivid and terrifying depictions of human misanthropy in film history. 
Jonny Greenwood's dissonant score suggests the demonic, and Anderson's camera captures stark shots of desolate California brush and an oil fire belching forth from the earth like a portal to hell with a grace and rest...",10,"05 Nov, 2012",0,75
914.0,/justwannaboogie/,The Fog of War,2003,"'At the end we lucked out. It was luck that prevented nuclear war. We came that close to nuclear war at the end. Rational individuals: Kennedy was rational; Khrushchev was rational; Castro was rational. Rational individuals came that close to total destruction of their societies. And that danger exists today.'Errol Morris, like his friend Werner Herzog, has a particular gift when it comes to making documentary films on his own terms, extracting information from his idiosyncratic and interesting subjects and fittingly, in the case of The Fog of War, there's a degree of ambiguity in the presentationit would be too easy to paint Robert McNamara as a monster for his involvement and alleged culpability for major events in American historynamely the Vietnam war, or perhaps attempt to show him in the opposite light, perhaps a man seeking redemption or forgivenessMorris intelligently creates space for both but interpretations but avoids bending to either whim and allows audiences to make their own minds up.Even at the age of 85 McNamara is an engaging and sharp-witted interviewee, he had a knack for knowing exactly what to say in a way that was frank and forthwith but without divulging too muchlater in the film we see some reluctance to say anything more on Vietnam, suggesting he had some foibles not to be unearthed in this life or the next, but for the most part he displays total honesty in discussing his shortcomings and achievements through the lens of accrued hindsight.Morris breaks the film up into eleven tenets or 'life lessons' from McNamara's philosophy on war in a manner suggestive of irony, but in actual fact they are illuminating, logical and often terrifyingly truthful. To counterpoint McNamara as a talking head, Morris creatively inserts stock footage, graphs, charts and photographs, ushered along with a riveting score by Philip Glass that provides a relentless rhythm. 
The chilling conclusion reached is that we've learnt so much but are doomed to repeat ou...",8,"10 Dec, 2019",0,14
1864.0,/justwannaboogie/,Isle of Dogs,2018,"Wes Anderson has been gradually perfecting his unique comedy style ever since Bottle Rocket. A true auteur, he has a clear vision and an inimitable approach to filmmaking. Much like the work of Jacques Tati, the appeal for me is through the craftsmanship, charm and unique quirkiness rather than the laughs. In fact I didn't laugh at all during Isle of Dogs, yet still came out having loved what I'd seen.His second attempt at stop-motion animation since 2009's Fantastic Mr. Fox, the attention to detail and visual appeal is stunning, everything is meticulously crafted to the smallest degree. The animation itself has a tactile quality to it, it looks real because it is real. One standout part for me involves some empty sake bottles being used as multicoloured lighting, illuminating the scene and characters with a gorgeous glow. My only gripe is the breakneck pace; blink and you'll miss it!Able to find perfection in an imperfect world, the story is as heartwarming and grounded as you'd expect from Anderson. Refusing to deviate from his usual stylistic choices, the trademark symmetry, deadpan delivery and use of colour can all be found. His gift for physical comedy never disappoints either. He understands the importance of using every dimension of the frame in the same way that the silent era greats like Keaton and Lloyd did to hone their gags.With a staggering ensemble cast of Anderson veterans and well selected newcomers alike (it had me at Greta Gerwig), this is a heartwarming and thoroughly delightful slice of imagination. At a basic level it is a tale of a young boy and his dog, but leaves its wonders fully open to interpretation.",8,"04 Apr, 2018",0,25
2062.0,/justwannaboogie/,Menashe,2017,"Menashe offers a fascinating and illuminating insight into the Hasidic Jewish community of New York, of which most viewers will be completely unfamiliar with. Almost entirely in the Yiddish language, using non-actors and with a large amount of religious jargon in the dialogue it certainly isn't setting its sights on general audiences, which is admirably bold.What makes it so worthwhile though is its universally appealing themes of loss, regret and the daily struggle of life as we follow the titular character of Menashe as he experiences mishap after mishap. Trapped in a suffocating and absurdly strict religion, he is obliged to find a new wife in order to regain custody of his son and struggles financially to support him.Performances are natural, entirely believable and even in such a short runtime I became thoroughly invested in Menashe, rooting for him all the way.A24 really are an unstoppable force of nature at the moment, I've seen almost ten of their films this year and loved every one.",7,"22 Dec, 2017",0,5
