In [30]:
import random
import re
import time
import urllib.request

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup, NavigableString
from fake_useragent import UserAgent
from selenium import webdriver
# NOTE: this alias shadows the built-in `Exception` for the whole notebook, so
# `except Exception:` would try to catch a module. Catch WebDriverException instead.
import selenium.common.exceptions as Exception
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
BASE_PEOPLE = "https://letterboxd.com/people/popular/page/"
LBOX = "https://letterboxd.com"
REV = "films/reviews/page/"

# Inclusive range of "popular people" listing pages to fetch.
# Widen the range to collect more usernames.
FIRST_PEOPLE_PAGE = 1700
LAST_PEOPLE_PAGE = 1700

# Profile paths (e.g. "/someuser/") collected from the listing pages.
userNames = list()

t0 = time.time()

# Get the usernames from the people pages ordered by all-time popularity
for i in range(FIRST_PEOPLE_PAGE, LAST_PEOPLE_PAGE + 1):
    URL = BASE_PEOPLE + str(i)
    req = urllib.request.Request(URL)
    user_agent = UserAgent().random
    req.add_header('user-agent', user_agent)
    # Parse inside the `with` so the HTTP response is closed even if parsing fails.
    with urllib.request.urlopen(req) as response:
        soup = BeautifulSoup(response, "html.parser")
    for tag in soup.find_all("h3", "title-3"):
        link = tag.a
        href = link.get("href") if link is not None else None
        # Skip headings without a usable profile link instead of crashing or storing None.
        if href:
            userNames.append(href)

    # Small pause between listing pages.
    time.sleep(0.25)
t1 = time.time()
print(f"{t1-t0} seconds to download {len(userNames)} usernames.")

9.450112104415894 seconds to download 35 usernames.


In [3]:
from webdriver_manager.chrome import ChromeDriverManager
# driver = webdriver.Chrome(ChromeDriverManager().install())  #using Selenium-Chrome simulator to interact with JS

In [4]:
# Matches either an HTML tag (`<...>`, non-greedy, single line) or a character
# entity: named (`&amp;`), decimal (`&#233;`) or hexadecimal (`&#xE9;`).
# IGNORECASE is needed because entity names and hex digits may be upper-case
# (`&Eacute;`, `&#XE9;`); compiled once instead of on every call.
_HTML_MARKUP = re.compile(
    r'<.*?>|&(?:[a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});',
    re.IGNORECASE,
)


def cleanhtml(raw_html):
    """Return `raw_html` with HTML tags and character entities removed.

    Tags and entities are deleted outright (entities are not decoded to the
    character they stand for). Text that merely contains `&` or `<` without
    forming a tag or entity is left untouched.

    Args:
        raw_html: String that may contain HTML markup.

    Returns:
        The input string with every tag/entity match replaced by ''.
    """
    return _HTML_MARKUP.sub('', raw_html)

In [15]:
# Store num. of pages of reviews to usernames
numPageReviews = dict()
# Store all data of reviews to usernames
# NOTE(review): nothing in this cell writes to allReviewData; rows go straight to movrec.tsv.
allReviewData = dict()

# Column layout of every row appended to movrec.tsv (the file is written without a header).
REVIEW_COLUMNS = ['userName', 'filmName', 'releaseYear', 'userReview', 'userRating',
                  'reviewDate', 'reWatched', 'reviewLikes']
NUMERIC_COLUMNS = ['releaseYear', 'userRating', 'reWatched', 'reviewLikes']


def _click_each(driver, locator, pause, page_url):
    """Click the first element matching `locator` once per match currently on the page.

    The number of matches is counted up front; for each one the function
    sleeps `pause` seconds, waits up to 20 s for a matching element to be
    present and clicks it, retrying once if Selenium raises.

    Args:
        driver: Selenium WebDriver with the page already loaded.
        locator: `(By.<strategy>, value)` tuple.
        pause: Seconds to sleep before each click.
        page_url: URL included in the error message when a click fails.
    """
    for entry in range(len(driver.find_elements(*locator))):
        time.sleep(pause)
        for _attempt in range(2):
            try:
                WebDriverWait(driver, 20).until(EC.presence_of_element_located(locator)).click()
                break
            except WebDriverException as exc:
                # The built-in `Exception` is shadowed by a module alias in this notebook,
                # so the Selenium base class is caught explicitly.
                print(f"Error {exc} at :{page_url} at entry: {entry}")


def _rated_only(tags, rating_spans):
    """Yield the tags whose same-position entry in `rating_spans` is non-empty.

    `zip` stops at the shorter sequence, so a page with mismatched tag counts
    no longer raises IndexError.
    """
    for tag, spans in zip(tags, rating_spans):
        if spans:
            yield tag


# Resolve the chromedriver binary once; the path does not change between pages.
driver_path = ChromeDriverManager().install()
t0 = time.time()

for name in userNames[:2]:
    # Find num. of pages of reviews from the first page
    firstPage = LBOX + name + REV + "1"

    req = urllib.request.Request(firstPage)
    user_agent = UserAgent().random
    req.add_header('user-agent', user_agent)
    # Parse inside the `with` so the HTTP response is always closed.
    with urllib.request.urlopen(req) as response:
        soup = BeautifulSoup(response, "html.parser")
    time.sleep(2)

    # Store user's all review data (movie/film name + year + review + rating)
    data = list()

    pageFooter = soup.find_all("div", "paginate-pages")
    item = soup.find_all('li', 'paginate-page')
    if pageFooter and item:
        # The last pagination entry carries the highest page number.
        numPageReviews[name] = int(item[-1].text)
    else:
        numPageReviews[name] = 1

    # Initialize lists for data
    ratings = list()
    movies = list()
    relYears = list()
    reviews = list()
    dates = list()
    reWatch = list()
    numRevLikes = list()

    # Loop through all pages and obtain data from each page
    for pageNum in range(1, numPageReviews[name] + 1):
        thePage = LBOX + name + REV + str(pageNum)
        # A fresh browser with a new random user agent is started for every page.
        options = Options()
        user_agent = UserAgent().random
        options.add_argument(f'user-agent={user_agent}')
        driver = webdriver.Chrome(driver_path, options=options)
        try:
            driver.maximize_window()
            driver.get(thePage)
            time.sleep(random.randint(10, 20))
            # Click on the spoilers links
            _click_each(driver, (By.LINK_TEXT, 'I can handle the truth.'), 7, thePage)
            # Click on the "more" links to reveal all text
            time.sleep(7)
            _click_each(driver, (By.CLASS_NAME, 'reveal'), 5, thePage)

            time.sleep(7)
            soup = BeautifulSoup(driver.page_source, "html.parser")
        finally:
            # Always release the browser, even when loading or clicking fails.
            driver.quit()

        ratingList = list()  # per review entry: its rating spans (empty when unrated)
        for tag in soup.find_all("p", "attribution"):
            spans = tag.find_all('span', 'rating')
            ratingList.append(spans)
            for span in spans:
                # The rating is read from the digits in the span's class list.
                # NOTE(review): a rating span without any digit still raises IndexError here.
                rating = [ch for ch in str(span.attrs.get('class')) if ch in "0123456789"]
                if len(rating) == 2:
                    rating = ['10']
                ratings.append(rating[0])

        # Every loop below keeps only the entries that carry a rating.
        for tag in _rated_only(soup.find_all("h2", "headline-2 prettify"), ratingList):
            movies.append(tag.a.text)
            relYears.append(tag.small.a.text)

        for tag in _rated_only(soup.find_all("div", "body-text"), ratingList):
            # Join the text of the child elements only; bare strings directly under the div are skipped.
            rev = "".join(str(child.text) for child in tag
                          if not isinstance(child, NavigableString))
            reviews.append(cleanhtml(rev))

        for tag in _rated_only(soup.find_all("span", "_nobr"), ratingList):
            dates.append(tag.text)

        for tag in _rated_only(soup.find_all("span", "date"), ratingList):
            # Text beginning with " R" is recorded as a rewatch (1), anything else as 0.
            reWatch.append(1 if tag.text.startswith(" R") else 0)

        for tag in _rated_only(soup.find_all("p", "like-link-target"), ratingList):
            if tag.span:
                numLike = ''.join([ch for ch in tag.span.a.text if ch in "0123456789"])
            else:
                numLike = '0'
            numRevLikes.append(numLike)

    scraped = (movies, relYears, reviews, ratings, dates, reWatch, numRevLikes)
    if len({len(column) for column in scraped}) > 1:
        # Report misalignment instead of failing with IndexError while building rows.
        print(f"Warning: misaligned columns for {name}: "
              f"{[len(column) for column in scraped]}; unmatched entries are dropped.")
    for row in zip(*scraped):
        data.append([name, *row])

    df = pd.DataFrame(data, columns=REVIEW_COLUMNS)
    df[NUMERIC_COLUMNS] = df[NUMERIC_COLUMNS].apply(pd.to_numeric, downcast='integer')

    # Appends without a header: re-running this cell adds the same rows again.
    df.to_csv("movrec.tsv", sep='\t', index=False, columns=None, header=False, mode='a')

t1 = time.time()


Looking for [chromedriver 79.0.3945.36 win32] driver in cache 
File found in cache by path [C:\Users\tanmu\AppData\Roaming\SPB_16.6\.wdm\drivers\chromedriver\79.0.3945.36\win32\chromedriver.exe]

Looking for [chromedriver 79.0.3945.36 win32] driver in cache 
File found in cache by path [C:\Users\tanmu\AppData\Roaming\SPB_16.6\.wdm\drivers\chromedriver\79.0.3945.36\win32\chromedriver.exe]

Looking for [chromedriver 79.0.3945.36 win32] driver in cache 
File found in cache by path [C:\Users\tanmu\AppData\Roaming\SPB_16.6\.wdm\drivers\chromedriver\79.0.3945.36\win32\chromedriver.exe]

Looking for [chromedriver 79.0.3945.36 win32] driver in cache 
File found in cache by path [C:\Users\tanmu\AppData\Roaming\SPB_16.6\.wdm\drivers\chromedriver\79.0.3945.36\win32\chromedriver.exe]


In [16]:
# Sanity check: every scraped column should hold the same number of entries.
tuple(len(column) for column in (ratings, movies, relYears, reviews, dates, reWatch, numRevLikes))

(27, 27, 27, 27, 27, 27, 27)

In [17]:
# Show the number of review pages recorded for each scraped user path.
numPageReviews

{'/bapz/': 1, '/vaderhader/': 3}

In [23]:
# movrec.tsv is written without a header row, so the column names are supplied here.
tsv_columns = [
    'userName',
    'filmName',
    'releaseYear',
    'userReview',
    'userRating',
    'reviewDate',
    'reWatched',
    'reviewLikes',
]
test = pd.read_table("movrec.tsv", names=tsv_columns)

In [27]:
# Preview the first 30 rows of the table reloaded from movrec.tsv.
test.head(30)

Unnamed: 0,userName,filmName,releaseYear,userReview,userRating,reviewDate,reWatched,reviewLikes
0,/bapz/,The Room,2003,"galera parece adorar,gosto de ver os melhores ...",8,"24 Sep, 2017",0,2
1,/bapz/,James and the Giant Peach,1996,Choro sempre Infância,10,"24 Sep, 2017",0,0
2,/bapz/,Kung Fury,2015,queria ter feito,10,"24 Sep, 2017",0,1
3,/vaderhader/,Paddington 2,2017,"Perfection, the cure to sadness.",10,"26 Dec, 2019",1,0
4,/vaderhader/,Star Wars: The Rise of Skywalker,2019,"Like a Star Wars pinball machine, it never sto...",3,"23 Dec, 2019",0,2
5,/vaderhader/,Long Shot,2019,Long shot is a “good enough movie” that works ...,6,"22 Dec, 2019",0,0
6,/vaderhader/,It’s a Wonderful Life,1946,For me Princess Mononoke will always be my fav...,10,"04 Dec, 2019",0,1
7,/vaderhader/,The Report,2019,"It drags the CIA and Obama so that’s good, but...",7,"30 Nov, 2019",0,0
8,/vaderhader/,The Big Short,2015,This film feels even better after watching Vic...,9,"10 Nov, 2019",1,0
9,/vaderhader/,Knives Out,2019,Knives Out is always entertaining. That is the...,10,"11 Oct, 2019",0,1
