<h3>Getting film data from Letterboxd user list</h3>

In [53]:
import datetime
import re
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

<p>Accessing user profile</p>

In [85]:
# Letterboxd account whose films pages are scraped.
USERNAME = "igorbonato"
# Site root; it keeps its trailing slash because the username is appended directly to it.
BASE_SITE_URL = "https://letterboxd.com/"
# Timestamp captured once, when this cell runs; it is only used to name the export file.
CURRENT_DATE = datetime.datetime.now()
# CSV destination: datasets/<user>-<year>_<month>_<day>-<hour>-<minute>.csv
# (the date and time components are not zero-padded).
OUTPUT_PATH = f"datasets/{USERNAME}-{CURRENT_DATE.year}_{CURRENT_DATE.month}_{CURRENT_DATE.day}-{CURRENT_DATE.hour}-{CURRENT_DATE.minute}.csv"


In [55]:
def get_user_profile(username):
    """Scrape every film listed on a Letterboxd user's films pages into a table.

    Fetches ``<BASE_SITE_URL><username>/films/``, reads the pagination links to
    learn how many listing pages exist, then hands off to ``get_page_data`` to
    collect the film records and to ``create_table`` to build the table.

    Args:
        username: Letterboxd account name; it is inserted into the URL as-is.

    Returns:
        Whatever ``create_table`` returns for the collected film records.

    Raises:
        requests.HTTPError: if the films page answers with a 4xx/5xx status.
        requests.Timeout: if the server does not answer within 30 seconds.
    """
    username_path = f"{BASE_SITE_URL}{username}/films/"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    films_page = requests.get(username_path, timeout=30)
    # Fail loudly instead of parsing an error page as if it were the film list.
    films_page.raise_for_status()
    # Naming the parser makes the parse independent of which optional parsers
    # happen to be installed (and silences bs4's "no parser specified" warning).
    films_page_content = BeautifulSoup(films_page.content, "html.parser")
    all_pages = films_page_content.select("li.paginate-page a")
    # The last pagination link carries the highest page number; a listing
    # without pagination links is treated as a single page.
    num_total_pages = int(all_pages[-1].string) if all_pages else 1
    total_films = get_page_data(username_path, num_total_pages)
    return create_table(total_films)

In [56]:
def get_page_data(user_path, total_pages):
    """Download each listing page of a user's films and collect the film records.

    Pages are requested as ``<user_path>page/<n>/`` for ``n`` from 1 to
    ``total_pages`` inclusive, and each parsed page is passed to ``get_films``.

    Args:
        user_path: URL of the user's films listing, ending with a slash.
        total_pages: number of listing pages to fetch; 0 fetches nothing.

    Returns:
        A single list with the records returned by ``get_films`` for every
        page, in page order.

    Raises:
        requests.HTTPError: if any listing page answers with a 4xx/5xx status.
        requests.Timeout: if a page does not answer within 30 seconds.
    """
    total_films = []
    for page in range(1, total_pages + 1):
        page_url = f"{user_path}page/{page}/"
        # A timeout keeps one stalled request from hanging the whole scrape.
        page_request = requests.get(page_url, timeout=30)
        # Without this check an error page would silently contribute zero films.
        page_request.raise_for_status()
        # Explicit parser: same parse result regardless of installed parsers.
        page_content = BeautifulSoup(page_request.content, "html.parser")
        total_films.extend(get_films(page_content))
    return total_films

In [71]:
def get_films(page):
    """Build one film record per poster entry found on a listing page.

    Args:
        page: parsed listing page exposing ``select``.

    Returns:
        A list with the result of ``get_film_info`` for every
        ``li.poster-container`` element, in document order.
    """
    return [get_film_info(poster) for poster in page.select("li.poster-container")]
    

In [58]:
def get_film_info(film):
    """Collect one film's details from its poster entry and from its own page.

    The title and the user's rating are read from the poster entry itself.
    Everything else comes from the film's page, which is downloaded here,
    so every call costs one HTTP request.

    Args:
        film: a poster entry from a listing page; its first ``div`` must
            carry a ``data-film-slug`` attribute.

    Returns:
        A dict with the keys ``title``, ``rated``, ``cast``, ``director``,
        ``year``, ``min``, ``genre``, ``poster``, ``general_rated`` and
        ``description``, in that order.

    Raises:
        requests.HTTPError: if the film page answers with a 4xx/5xx status.
        requests.Timeout: if the film page does not answer within 30 seconds.
    """
    div_film = film.find("div")
    rating_film = film.find(class_="rating")
    img_film = film.find("img")
    # The site root's trailing slash is dropped before the slug is appended --
    # presumably the slug already starts with "/"; confirm against live markup.
    film_path = f"{BASE_SITE_URL[:-1]}{div_film['data-film-slug']}"
    # A timeout keeps one stalled film page from hanging the whole scrape.
    film_request = requests.get(film_path, timeout=30)
    # Fail loudly rather than extracting fields from an error page.
    film_request.raise_for_status()
    # Explicit parser: same parse result regardless of installed parsers.
    film_content = BeautifulSoup(film_request.content, "html.parser")
    return {
        "title": get_title(img_film),
        "rated": get_rated(rating_film),
        "cast": get_cast(film_content),
        "director": get_director(film_content),
        "year": get_year(film_content),
        "min": get_length(film_content),
        "genre": get_genres(film_content),
        "poster": get_poster(film_content),
        "general_rated": get_general_rated(film_content),
        "description": get_description(film_content),
    }


In [82]:
def get_general_rated(film):
    """Read the leading token of the ``twitter:data2`` meta tag's content.

    Args:
        film: parsed film page exposing ``select``.

    Returns:
        The text before the first space in the ``content`` attribute of the
        first element whose ``name`` is ``twitter:data2``, or ``""`` when no
        such element exists.
    """
    matches = film.select('[name="twitter:data2"]')
    if not matches:
        return ""
    content = matches[0]["content"]
    return content.split(" ")[0]

def get_rated(film):
    """Convert a star-glyph rating into a number.

    Every "★" in the element's text counts as 1 and every "½" as 0.5.

    Args:
        film: the rating element (anything exposing ``text``), or a falsy
            value when the entry has no rating.

    Returns:
        ``""`` when ``film`` is falsy; otherwise the summed rating, which is
        an ``int`` when no "½" is present and a ``float`` when one is.
    """
    if not film:
        return ""
    glyphs = film.text
    full_stars = glyphs.count("★")
    half_stars = glyphs.count("½")
    if half_stars:
        return full_stars + 0.5 * half_stars
    return full_stars

def get_poster(film):
    """Return the poster image URL from a film page.

    Looks up the first element with class ``really-lazy-load`` and reads the
    ``src`` of the ``img`` inside it.

    Args:
        film: parsed film page exposing ``find``.

    Returns:
        The image's ``src`` value, or ``""`` when the wrapper, the image or
        the attribute is missing. The empty-string fallback matches what the
        other field getters return, so a single page without a poster no
        longer aborts the whole scrape.
    """
    poster_wrapper = film.find(class_="really-lazy-load")
    if poster_wrapper is None:
        return ""
    poster = poster_wrapper.find("img")
    if poster is None:
        return ""
    return poster.get("src", "")

def get_title(film):
    """Return the ``alt`` attribute of the poster image, used as the film title.

    Args:
        film: the poster ``img`` element (anything supporting ``film["alt"]``).
    """
    alt_text = film["alt"]
    return alt_text

def get_year(film):
    """Return the text of the first ``.number a`` link on a film page.

    Args:
        film: parsed film page exposing ``select``.

    Returns:
        The link text (the notebook stores it as the film's year), or ``""``
        when the page has no such link. The empty-string fallback mirrors
        ``get_director`` and avoids an ``IndexError`` that would otherwise
        abort the whole scrape.
    """
    year_film = film.select(".number a")
    return year_film[0].text if year_film else ""

def get_cast(film):
    """List the link texts found in the cast section of a film page.

    Args:
        film: parsed film page exposing ``select``.

    Returns:
        The ``string`` of every ``div.cast-list p a`` link, in document
        order, with the first "Show All…" entry (if any) removed.
    """
    cast_list = []
    for link in film.select("div.cast-list p a"):
        cast_list.append(link.string)
    try:
        # Drops only the first occurrence, exactly like a guarded list.remove.
        cast_list.remove("Show All…")
    except ValueError:
        pass
    return cast_list

def get_director(film):
    """Return the text of the first ``span.prettify`` element on a film page.

    Args:
        film: parsed film page exposing ``select``.

    Returns:
        The element's text (stored by the notebook as the director), or
        ``""`` when no such element exists.
    """
    matches = film.select("span.prettify")
    if matches:
        return matches[0].text
    return ""

def get_length(film):
    """Extract the running time, in minutes, from a film page.

    Reads the text of the first ``p.text-link`` element and returns the
    number that is followed by "min" (as in "114 mins").

    Args:
        film: parsed film page exposing ``select``.

    Returns:
        The minutes as a string of digits (thousands separators removed), or
        ``""`` when the paragraph is missing or holds no running time.
        Previously a missing paragraph raised ``IndexError`` and a paragraph
        without a running time returned its squashed text.
    """
    length_film = film.select("p.text-link")
    if not length_film:
        return ""
    # \s also matches the non-breaking space that may separate number and unit.
    runtime = re.search(r"(\d[\d,]*)\s*min", length_film[0].text)
    if runtime is None:
        return ""
    return "".join(char for char in runtime.group(1) if char.isdigit())

def get_genres(film):
    """List the genre link texts of a film page.

    Args:
        film: parsed film page exposing ``select``.

    Returns:
        The ``string`` of every link whose ``href`` matches "genre" inside
        the first ``div.text-sluglist.capitalize`` element, or the
        single-item placeholder list ``["Sem Genero"]`` when the page has no
        such element.
    """
    containers = film.select("div.text-sluglist.capitalize")
    if not containers:
        return ["Sem Genero"]
    genre_links = containers[0].find_all(href=re.compile("genre"))
    return [link.string for link in genre_links]

def get_description(film):
    """Return the ``content`` of the page's ``description`` meta element.

    Args:
        film: parsed film page exposing ``select``.

    Returns:
        The ``content`` attribute of the first element whose ``name`` is
        ``description``.

    Raises:
        IndexError: if the page has no such element.
    """
    meta_tags = film.select('[name="description"]')
    first_tag = meta_tags[0]
    return first_tag["content"]
   

In [63]:
def create_table(user_total_film_info):
    """Turn the scraped film records into a DataFrame and export it as CSV.

    Args:
        user_total_film_info: list of per-film dicts, as produced by
            ``get_film_info``.

    Returns:
        The DataFrame built from the records. As a side effect the same
        table is written to ``OUTPUT_PATH`` (index column included).
    """
    user_films_table = pd.DataFrame(user_total_film_info)
    # to_csv does not create missing folders; without this the export fails
    # at the very end of the scrape when the target directory is absent.
    Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)
    user_films_table.to_csv(OUTPUT_PATH)
    return user_films_table

In [86]:
# Run the full scrape for the configured user. This issues one HTTP request per
# listing page plus one per film, and also writes the CSV export to OUTPUT_PATH.
table  = get_user_profile(USERNAME)

In [87]:
# Show the scraped table as this cell's output.
table

Unnamed: 0,title,rated,cast,director,year,min,genre,poster,general_rated,description
0,The Tinder Swindler,3,"[Shimon Yehuda Hayut, Kristoffer Kumar, Pernil...",Felicity Morris,2022,114,"[documentary, crime]",https://a.ltrbxd.com/resized/film-poster/8/2/9...,3.17,"Posing as a wealthy, jet-setting diamond mogul..."
1,Spider-Man: No Way Home,3.5,"[Tom Holland, Zendaya, Benedict Cumberbatch, J...",Jon Watts,2021,148,"[action, adventure, science-fiction]",https://a.ltrbxd.com/resized/film-poster/5/6/0...,4.18,Peter Parker is unmasked and no longer able to...
2,Being the Ricardos,3.5,"[Nicole Kidman, Javier Bardem, J.K. Simmons, N...",Aaron Sorkin,2021,132,"[drama, history]",https://a.ltrbxd.com/resized/sm/upload/iy/k0/l...,2.95,Follows Lucille Ball and Desi Arnaz as they fa...
3,Don't Look Up,3,"[Jennifer Lawrence, Leonardo DiCaprio, Meryl S...",Adam McKay,2021,138,"[drama, science-fiction, comedy]",https://a.ltrbxd.com/resized/film-poster/5/7/2...,3.16,"Two low-level astronomers, upon discovering th..."
4,Nightmare Alley,3,"[Bradley Cooper, Cate Blanchett, Rooney Mara, ...",Guillermo del Toro,2021,150,"[crime, thriller, drama]",https://a.ltrbxd.com/resized/film-poster/5/2/4...,3.52,An ambitious carnival man with a talent for ma...
...,...,...,...,...,...,...,...,...,...,...
1376,Dial M for Murder,4,"[Ray Milland, Grace Kelly, Robert Cummings, Jo...",Alfred Hitchcock,1954,105,"[thriller, crime]",https://a.ltrbxd.com/resized/film-poster/5/1/5...,4.02,An ex-tennis pro carries out a plot to have hi...
1377,Peter Pan,3,"[Bobby Driscoll, Kathryn Beaumont, Hans Conrie...",Hamilton Luske,1953,77,"[adventure, family, fantasy, animation]",https://a.ltrbxd.com/resized/film-poster/4/5/9...,3.49,"Leaving the safety of their nursery behind, We..."
1378,Dumbo,2,"[Sterling Holloway, Herman Bing, John McLeish,...",Ben Sharpsteen,1941,64,"[animation, family]",https://a.ltrbxd.com/resized/film-poster/4/5/3...,3.38,Dumbo is a baby elephant born with over-sized ...
1379,Modern Times,4,"[Charlie Chaplin, Paulette Goddard, Henry Berg...",Charlie Chaplin,1936,87,"[comedy, drama]",https://a.ltrbxd.com/resized/film-poster/4/9/8...,4.23,The Tramp struggles to live in modern industri...
