In [1]:
import json
import os
import time
from urllib.parse import quote_plus

import pandas as pd
import requests as req
import tqdm
from bs4 import BeautifulSoup as bs

# Working directory for the scrape. Defaults to the original local path but can be
# overridden with the NETFLIX_SCRAPE_DIR environment variable so the notebook also
# runs on other machines. The value must end with a path separator because the
# paths below are built by plain string concatenation.
wd = os.environ.get(
    "NETFLIX_SCRAPE_DIR",
    "D:/Mestrado/2020-2/Curso DS/Recommender System - Movies/scrape/",
)
if not wd.endswith(("/", "\\")):
    wd += "/"

# Directory where the downloaded search pages are cached.
raw_data_dir = wd+"raw_data/"

# Letterboxd film-search URL template; {title} is filled in per movie.
url = "https://letterboxd.com/search/films/{title}/"

# Keep only the rows whose type is "Movie". reset_index() renumbers the rows 0..n-1
# and keeps the previous index as an extra "index" column.
netflix_titles = (
    pd.read_csv(wd+"netflix_titles.csv")
    .loc[lambda frame: frame["type"] == "Movie"]
    .reset_index()
)

In [4]:
# Preview the first five movie rows
netflix_titles.head(n=5)

Unnamed: 0,index,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...
3,6,70304989,Movie,Automata,Gabe Ibáñez,"Antonio Banderas, Dylan McDermott, Melanie Gri...","Bulgaria, United States, Spain, Canada","September 8, 2017",2014,R,110 min,"International Movies, Sci-Fi & Fantasy, Thrillers","In a dystopian future, an insurance adjuster f..."
4,7,80164077,Movie,Fabrizio Copano: Solo pienso en mi,"Rodrigo Toro, Francisco Schultz",Fabrizio Copano,Chile,"September 8, 2017",2017,TV-MA,60 min,Stand-Up Comedy,Fabrizio Copano takes audience participation t...


# Get the HTML source of the search pages and save it to local disk

In [3]:
# Download the Letterboxd search page for every movie and cache it on disk.
# Pages that are already cached are skipped, so the cell can be re-run to resume.
os.makedirs(raw_data_dir, exist_ok=True)

for i in tqdm.tqdm(netflix_titles.index, desc="searching letterboxd pages"):
    title_id = netflix_titles.show_id[i]
    page_path = raw_data_dir+"letterboxd_search_{}.html".format(title_id)
    if os.path.exists(page_path):
        continue

    # quote_plus turns spaces into "+" (as before) and also percent-encodes characters
    # such as "#", "?" and "/" that would otherwise change the meaning of the URL
    # (e.g. the title "#realityhigh" would become an empty search plus a fragment).
    title = quote_plus(str(netflix_titles.title[i]))

    try:
        response = req.get(url.format(title=title), timeout=30)
        # Do not cache error pages: a cached file is never fetched again.
        response.raise_for_status()
    except req.RequestException as err:
        tqdm.tqdm.write("show_id {}: search request failed ({})".format(title_id, err))
        time.sleep(2)
        continue

    # saving
    with open(page_path, 'w', encoding='utf8') as output:
        output.write(response.text)

    # Pause between requests to avoid hammering the site.
    time.sleep(2)

searching letterboxd pages: 100%|████████████████████████████████████████████████| 4265/4265 [2:04:15<00:00,  1.75s/it]


# Processing raw HTML files to extract video title, URL and other info

In [14]:
import glob
from difflib import SequenceMatcher
import re

In [41]:
# Parse every cached search page, pick the search result that best matches the
# Netflix title and write one JSON record per matched movie (JSON-lines format).

# show_id -> (title, release_year), built once instead of scanning the whole table
# for every file. setdefault keeps the first row if a show_id appears more than once.
netflix_by_show_id = {}
for known_id, known_title, known_year in zip(netflix_titles.show_id,
                                             netflix_titles.title,
                                             netflix_titles.release_year):
    netflix_by_show_id.setdefault(int(known_id), (known_title, known_year))

# The file is opened with 'w' (not 'a+'): every run processes all cached pages,
# so appending would duplicate every record when the cell is re-run.
with open(wd+"letterboxd_movies_page_wimg.json", 'w', encoding='utf8') as output:
    for search_page in tqdm.tqdm(sorted(glob.glob(raw_data_dir+"/letterboxd_search_*")), desc="working on film pages"):
        # The show_id is encoded in the file name: letterboxd_search_<show_id>.html
        film_id = int(search_page.split("letterboxd_search_", 1)[1].split(".html")[0])
        if film_id not in netflix_by_show_id:
            tqdm.tqdm.write("skipping {}: show_id not found in netflix_titles".format(search_page))
            continue
        # info from the netflix_titles data frame
        # (TODO: check that its format matches what is extracted from the HTML)
        film_title, film_year = netflix_by_show_id[film_id]

        with open(search_page, "r", encoding='utf8') as film_page:
            parsed = bs(film_page, 'html.parser')
        film_details = parsed.find_all("div", attrs={"class":"film-detail-content"})
        # NOTE(review): posters are paired with results by position; this assumes both
        # lists come in the same order on the page -- confirm against the HTML.
        img_details = parsed.find_all("div", attrs={"class": re.compile("^react-component film-poster film-poster")})

        # Reset per page so a page without an acceptable match writes nothing
        # (and never reuses the match of a previous page or a previous run).
        result = {}
        best_ratio = 0
        for i, film in enumerate(film_details):
            img = img_details[i] if i < len(img_details) else None

            # The title wrapper holds the title link first and the year link second.
            wrappers = film.find_all("span", attrs={"class":"film-title-wrapper"})
            links = wrappers[0].find_all("a") if wrappers else []

            try:
                href, title = links[0]['href'], links[0].text
            except (IndexError, KeyError):
                href, title = None, ''

            try:
                year = links[1].text
                year_ok = int(year) == film_year
            except (IndexError, ValueError):
                year = None
                year_ok = False

            try:
                director = film.find_all("p", attrs={"class":"film-metadata"})[0].find_all("a")[0].text
            except IndexError:
                director = None

            try:
                # First URL of the srcset attribute.
                img_source = img.find_all("img", attrs={"class": "image"})[0]['srcset'].partition(" ")[0]
            except (AttributeError, IndexError, KeyError):
                img_source = None

            # check if match is ok: compare against the scraped title both as-is and
            # without its last five characters (the trailing " YYYY" seen in the
            # scraped titles), and keep the better score.
            this_ratio = max(SequenceMatcher(None, film_title, title).ratio(),
                             SequenceMatcher(None, film_title, title[0:-5]).ratio())
            if not year_ok and this_ratio < 0.6:
                continue

            if this_ratio > 0.95 or this_ratio > best_ratio:
                result = {'show_id':film_id, 'title':title, 'href':href, 'year':year, 'director':director,
                          'similarity':this_ratio, 'img':img_source}
                best_ratio = this_ratio
                if this_ratio > 0.95:
                    # Near-perfect match: stop looking at further results.
                    break

        if result:
            output.write("{}\n".format(json.dumps(result)))


working on film pages: 100%|███████████████████████████████████████████████████████| 4265/4265 [03:48<00:00, 18.64it/s]


# Verify results

In [42]:
# Show the (rows, columns) shape of the movies table
netflix_titles.shape

(4265, 13)

In [51]:
# look at saved json
df = pd.read_json(wd+"letterboxd_movies_page_wimg.json", lines=True)
print('Total number of movies:', df.shape[0])

Total number of movies: 4086


In [52]:
# First rows of the matches; a bare last expression uses the notebook's rich table
# display instead of the wrapped plain-text output that print() produces.
df.head()

   show_id                              title  \
0  1005494                  The Stranger 1946   
1  1008581                       Stripes 1981   
2  1029730  Teenage Mutant Ninja Turtles 1990   
3  1064058        Tremors 2: Aftershocks 1996   
4  1065372            The Trigger Effect 1996   

                                  href    year      director  similarity  \
0                  /film/the-stranger/  1946.0  Orson Welles    1.000000   
1                       /film/stripes/  1981.0  Ivan Reitman    1.000000   
2  /film/teenage-mutant-ninja-turtles/  1990.0  Steve Barron    0.835821   
3         /film/tremors-2-aftershocks/  1996.0  S. S. Wilson    1.000000   
4            /film/the-trigger-effect/  1996.0   David Koepp    1.000000   

                                                 img  
0  https://a.ltrbxd.com/resized/film-poster/3/9/1...  
1  https://a.ltrbxd.com/resized/sm/upload/34/li/e...  
2  https://a.ltrbxd.com/resized/film-poster/5/1/0...  
3  https://a.ltrbxd.com/resi

In [53]:
# Last rows of the matches; a bare last expression uses the notebook's rich table
# display instead of the wrapped plain-text output that print() produces.
df.tail()

      show_id                     title                        href    year  \
4081   915927                Rocky 1976                /film/rocky/  1976.0   
4082   916043             Rocky II 1979             /film/rocky-ii/  1979.0   
4083   916061             Rocky IV 1985             /film/rocky-iv/  1985.0   
4084   923574  Rumble in the Bronx 1995  /film/rumble-in-the-bronx/  1995.0   
4085   973861          Sling Blade 1996          /film/sling-blade/  1996.0   

                director  similarity  \
4081    John G. Avildsen         1.0   
4082  Sylvester Stallone         1.0   
4083  Sylvester Stallone         1.0   
4084        Stanley Tong         1.0   
4085  Billy Bob Thornton         1.0   

                                                    img  
4081  https://a.ltrbxd.com/resized/sm/upload/4z/ql/9...  
4082  https://a.ltrbxd.com/resized/film-poster/5/1/0...  
4083  https://a.ltrbxd.com/resized/sm/upload/vn/7i/5...  
4084  https://a.ltrbxd.com/resized/sm/upload/so/of/i

In [54]:
# Attach the Netflix metadata to each match, then count the matches that fail both
# acceptance criteria: the years differ AND the title similarity is below 0.6.
df2 = pd.merge(df, netflix_titles, how='left', on='show_id')
year_mismatch = df2.year.sub(df2.release_year).ne(0)
low_similarity = df2.similarity.lt(0.6)
bad_match = df2.show_id[year_mismatch & low_similarity]
print("Total bad matchs: %.0f out of %.0f." % (bad_match.shape[0], len(df2)))

Total bad matchs: 0 out of 4086.
