In [52]:
import urllib.request
from bs4 import BeautifulSoup, NavigableString
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementNotInteractableException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent

import time
import re
import pandas as pd
import numpy as np
import random

In [2]:
BASE_PEOPLE = "https://letterboxd.com/people/popular/page/"
LBOX = "https://letterboxd.com"
REV = "films/reviews/page/"

userNames = list()

t0 = time.time()

# Collect profile links from the "popular people" listing.
# Pages 11 through 20 are fetched (the end of range() is exclusive).
for i in range(11, 21):
    URL = BASE_PEOPLE + str(i)
    req = urllib.request.Request(URL)
    # A fresh random user-agent is sent with every request.
    user_agent = UserAgent().random
    req.add_header('user-agent', user_agent)
    # Close the HTTP response as soon as the markup has been parsed.
    with urllib.request.urlopen(req) as response:
        soup = BeautifulSoup(response, "html.parser")
    # Random pause between listing pages.
    time.sleep(random.randint(10,15))
    for tag in soup.find_all("h3", "title-3"):
        # Skip headings without a link (or without an href) instead of
        # raising AttributeError / storing None in the username list.
        if tag.a is None:
            continue
        href = tag.a.get("href")
        if href:
            userNames.append(href)

t1 = time.time()
print(f"{t1-t0} seconds to download {len(userNames)} usernames.")

174.5681767463684 seconds to download 350 usernames.


In [102]:
# Drop the most popular accounts (the ones listed on the right dock) from the scraped usernames
RIGHT_DOCK_USERS = ('/bratpitt/', '/deathproof/', '/davidehrlich/', '/adrianbalboa/', '/silentdawn/')
userNames = list(filter(lambda handle: handle not in RIGHT_DOCK_USERS, userNames))

In [63]:
from webdriver_manager.chrome import ChromeDriverManager

# Start a Selenium-driven Chrome session so that JavaScript on the pages can be interacted with.
# NOTE(review): no quit() for this particular session appears in the visible cells - confirm it gets closed.
chromedriver_path = 'chromedriver.exe'
driver = webdriver.Chrome(chromedriver_path)


In [5]:
def cleanhtml(raw_html):
    """Return ``raw_html`` with markup removed.

    Two kinds of substrings are deleted (replaced by the empty string):
    anything between ``<`` and the next ``>``, and character entities of
    the form ``&name;``, ``&#123;`` or ``&#x1f;`` (lowercase letters/hex
    digits only, as written in the pattern).
    """
    tag_or_entity = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return tag_or_entity.sub('', raw_html)

In [None]:
# Store num. of pages of reviews to usernames
numPageReviews = dict()

# Column layout of every row appended to movrec.tsv, and the subset that is
# converted to numeric dtypes before writing.
REVIEW_COLUMNS = ['userName', 'filmName', 'releaseYear', 'userReview', 'userRating',
                  'reviewDate', 'reWatched', 'reviewLikes']
NUMERIC_COLUMNS = ['releaseYear', 'userRating', 'reWatched', 'reviewLikes']


def _click_each_match(driver, locator, pause, name, thePage):
    """Click the elements matching ``locator`` once per match found up front.

    The number of matches is counted once; then, for each of them, the
    function sleeps ``pause`` seconds and clicks whichever element is
    currently the first clickable match (up to two attempts per entry).
    Timeouts and non-interactable/stale elements are reported with
    ``print`` and do not abort the scrape.

    Parameters:
        driver: the Selenium WebDriver holding the loaded page.
        locator: ``(By.<strategy>, value)`` tuple identifying the links.
        pause: seconds to sleep before each click.
        name: username path, used only in the log messages.
        thePage: URL of the page, used only in the log messages.
    """
    found = driver.find_elements(*locator)
    for entry in range(len(found)):
        time.sleep(pause)
        for _attempt in range(2):
            try:
                WebDriverWait(driver, 20).until(EC.element_to_be_clickable(locator)).click()
                break
            except TimeoutException:
                print(name + ": TimeoutException Error  at :" + thePage + " at entry: " + str(entry))
            except ElementNotInteractableException:
                print(name + ": ElementNotInteractableException Error  at :" + thePage + " at entry: " + str(entry))
            except StaleElementReferenceException:
                print(name + ": StaleElementReferenceException Error  at :" + thePage + " at entry: " + str(entry))


t0 = time.time()

# NOTE(review): the first six usernames are skipped - presumably they were
# scraped in an earlier run; confirm before re-running from scratch.
for name in userNames[6:]:
    # Find num. of pages of reviews from the first page
    firstPage = LBOX + name + REV + "1"

    req = urllib.request.Request(firstPage)
    user_agent = UserAgent().random
    req.add_header('user-agent', user_agent)
    # Close the HTTP response as soon as the markup has been parsed.
    with urllib.request.urlopen(req) as response:
        soup = BeautifulSoup(response, "html.parser")
    time.sleep(2)

    # Store user's all review data (movie/film name + year + review + rating)
    data = list()

    # No pagination footer (or no page items inside it) means a single page.
    pageFooter = soup.find_all("div", "paginate-pages")
    item = soup.find_all('li', 'paginate-page')
    if len(pageFooter) == 0 or len(item) == 0:
        numPageReviews[name] = 1
    else:
        numPageReviews[name] = int(item[-1].text)

    # Initialize lists for data (accumulated over all pages of this user)
    ratings = list()
    movies = list()
    relYears = list()
    reviews = list()
    dates = list()
    reWatch = list()
    numRevLikes = list()

    # Loop through all pages and obtain data from each page
    for pageNum in range(1, numPageReviews[name] + 1):
        thePage = LBOX + name + REV + str(pageNum)
        options = Options()
        user_agent = UserAgent().random
        options.add_argument(f'user-agent={user_agent}')
        # A new browser (with a new random user-agent) is started per page.
        driver = webdriver.Chrome('chromedriver.exe', options=options)
        try:
            driver.maximize_window()
            driver.get(thePage)
            time.sleep(random.randint(10, 20))
            # Click on the spoilers links
            _click_each_match(driver, (By.LINK_TEXT, 'I can handle the truth.'), 7, name, thePage)
            # Click on the "more" links to reveal all text
            time.sleep(7)
            _click_each_match(driver, (By.CLASS_NAME, 'reveal'), 5, name, thePage)

            time.sleep(7)
            soup = BeautifulSoup(driver.page_source, "html.parser")
        finally:
            # Always shut the browser down, even when loading or clicking
            # raised, so a failed page does not leave a Chrome process behind.
            driver.quit()

        # One entry per "attribution" paragraph: the rating spans found in it.
        # An empty entry marks a review without a rating, which is skipped below.
        # NOTE(review): the loops below pair tags with ratingList purely by
        # position, which assumes every tag type occurs exactly once per review
        # and in the same order - verify against the page markup.
        ratingList = list()
        for tag in soup.find_all("p", "attribution"):
            spans = tag.find_all('span', 'rating')
            ratingList.append(spans)
            for span in spans:
                # The rating is read from the digits in the span's class list;
                # two digits are taken to mean a rating of 10.
                digits = [ch for ch in str(span.attrs.get('class')) if ch in "0123456789"]
                ratings.append('10' if len(digits) == 2 else digits[0])

        for idx, tag in enumerate(soup.find_all("h2", "headline-2 prettify")):
            if not ratingList[idx]:
                continue
            movies.append(tag.a.text)
            # '0' stands in for a missing release year.
            relYears.append(tag.small.a.text if tag.small else '0')

        for idx, tag in enumerate(soup.find_all("div", "body-text")):
            if not ratingList[idx]:
                continue
            # Only the text of child elements is kept; bare strings directly
            # under the body-text div are skipped.
            rev = ""
            for child in tag:
                if isinstance(child, NavigableString):
                    continue
                rev += str(child.text)
            reviews.append(cleanhtml(rev))

        for idx, tag in enumerate(soup.find_all("span", "_nobr")):
            if not ratingList[idx]:
                continue
            dates.append(tag.text)

        for idx, tag in enumerate(soup.find_all("span", "date")):
            if not ratingList[idx]:
                continue
            # Text starting with " R" is recorded as a rewatch (1), anything else as 0.
            reWatch.append(1 if tag.text.startswith(" R") else 0)

        for idx, tag in enumerate(soup.find_all("p", "like-link-target")):
            if not ratingList[idx]:
                continue
            if tag.span:
                numLike = ''.join([ch for ch in tag.span.a.text if ch in "0123456789"])
            else:
                numLike = '0'
            numRevLikes.append(numLike)

    # One row per collected rating; the other lists are indexed in parallel.
    for row in range(len(ratings)):
        data.append([name, movies[row], relYears[row], reviews[row], ratings[row],
                     dates[row], reWatch[row], numRevLikes[row]])

    df = pd.DataFrame(data, columns=REVIEW_COLUMNS)
    df[NUMERIC_COLUMNS] = df[NUMERIC_COLUMNS].apply(pd.to_numeric, downcast='integer')

    # Append this user's rows to the shared TSV (no header, no index).
    df.to_csv("movrec.tsv", sep='\t', index=False, columns=None, header=False, mode='a')

t1 = time.time()

In [99]:
# Load the scraped reviews back from the header-less TSV, supplying the column names explicitly
tsv_columns = [
    'userName', 'filmName', 'releaseYear', 'userReview',
    'userRating', 'reviewDate', 'reWatched', 'reviewLikes',
]
test = pd.read_table("movrec.tsv", names=tsv_columns)


In [100]:
# Distinct usernames present in the loaded TSV
test.userName.unique()

array(['/worsethan/', '/justwannaboogie/', '/owene73/', '/valdesbian/',
       '/ashmatxx/'], dtype=object)

In [101]:
# Show the last rows of the loaded TSV
test.tail()

Unnamed: 0,userName,filmName,releaseYear,userReview,userRating,reviewDate,reWatched,reviewLikes
5854,/ashmatxx/,"Love, Simon",2018,I'm so super glad that this film is doing posi...,4,"22 Oct, 2018",0,1
5855,/ashmatxx/,The Tale,2018,This film really took its time getting me inve...,7,"22 Oct, 2018",0,0
5856,/ashmatxx/,The Kissing Booth,2018,Don’t know what else I expected from a film I ...,3,"22 Oct, 2018",0,0
5857,/ashmatxx/,John Mulaney: Kid Gorgeous at Radio City,2018,I had never seen my partner cry with laughter ...,8,"22 Oct, 2018",1,1
5858,/ashmatxx/,Set It Up,2018,If you want to watch something while you’re no...,5,"22 Oct, 2018",0,0


In [104]:
# Number of entries in userNames at the time this cell was run
len(userNames)

300