# Letterboxd movie and user data scraping project

In [13]:
import pandas as pd
import numpy as np
import datetime as dt
from bs4 import BeautifulSoup, Tag
from base64 import b64decode

import os
from collections import defaultdict
import json
import glob
import time
from tqdm import tqdm
import gzip
from typing import Any, Hashable

from crawlbase import CrawlingAPI
import requests
from urllib import parse

## Helper functions, constants, and web scraping APIs

In [2]:
# helper functions and constants

WEB_PREFIX = 'https://letterboxd.com'
USER_REVIEWS_PATH = os.path.join(os.getcwd(), 'user_reviews/')

# Keys for the various web scraping APIs. The values are placeholders;
# substitute real keys locally and keep them out of version control.
ZYTE_API_KEY = 'API_KEY'
# NOTE(review): `ZYTE_API_KET` looks like a misspelling of `ZYTE_API_KEY`.
# It is kept so that anything referring to the old name keeps working.
ZYTE_API_KET = 'API_KEY'

CRAWLBASE_API_KEY = 'API_KEY'

SCRAPER_API_KEY = 'API_KEY'

WEBSCRAPERAPI_API_KEY = 'API_KEY'
# Alias spelled after the service name (WebScrapingAPI) so both spellings resolve.
WEBSCRAPINGAPI_API_KEY = WEBSCRAPERAPI_API_KEY

# converts a star rating string to its numeric equivalent
def convert_rating_to_float(rating):
    """Translate a star string into a numeric score.

    Each '★' adds 1.0 and each '½' adds 0.5; every other character is
    ignored. A string containing neither symbol yields the integer 0.
    """
    symbol_values = {'★': 1.0, '½': 0.5}
    score = 0
    for symbol in rating:
        # Unknown characters contribute an integer 0, leaving the type as is.
        score += symbol_values.get(symbol, 0)
    return score

# get a list of all users who have been scraped
def get_used_users():
    """Read 'used_users.txt' and return the first comma-separated field of each line.

    Newline characters are removed from every entry; blank lines yield ''.
    """
    with open('used_users.txt', 'r') as users_file:
        return [row.partition(',')[0].replace('\n', '') for row in users_file]

# returns True iff this user has been scraped already
def is_user_used(user):
    """Report whether `user` appears in the list returned by `get_used_users()`.

    The list is re-read on every call.
    """
    already_scraped = get_used_users()
    return user in already_scraped

# fetch a page using the requests library and return it parsed
def get_request(link):
    """Download `link` with a plain GET and parse the body as HTML.

    Returns a BeautifulSoup tree built with the 'html.parser' backend. The
    HTTP status code is not inspected.
    """
    page = requests.get(link)
    return BeautifulSoup(page.text, 'html.parser')

# return a request passed through Zyte API
def get_request_zyte(link):
    """Fetch `link` through the Zyte extract endpoint and parse it as HTML.

    The JSON reply's 'httpResponseBody' field is base64-decoded and then
    decoded as UTF-8 before being handed to BeautifulSoup.
    """
    payload = {
        'url': link,
        'httpResponseBody': True,
    }
    api_response = requests.post(
        'https://api.zyte.com/v1/extract',
        auth=(ZYTE_API_KEY, ''),
        json=payload,
    )
    encoded_body = api_response.json()['httpResponseBody']
    html_text = b64decode(encoded_body).decode('utf-8')
    return BeautifulSoup(html_text, 'html.parser')

# return a request passed through Crawlbase's Crawling API
def get_request_crawlbase(link):
    """Fetch `link` via Crawlbase's CrawlingAPI client and parse it as HTML.

    Raises:
        Exception: if the reply's 'status_code' is anything other than 200;
            the message embeds the whole reply.
    """
    client = CrawlingAPI({'token': CRAWLBASE_API_KEY})
    reply = client.get(link)
    if reply['status_code'] != 200:
        raise Exception(f'Error: {reply}')
    return BeautifulSoup(reply['body'], 'html.parser')

# return a request passed through ScraperAPI
def get_request_scraperapi(link):
    """Fetch `link` through api.scraperapi.com and parse the reply as HTML.

    The API key and target URL are sent as a URL-encoded query string.
    """
    query = parse.urlencode({'api_key': SCRAPER_API_KEY, 'url': link})
    reply = requests.get('http://api.scraperapi.com/', params=query)
    return BeautifulSoup(reply.text, 'html.parser')

def get_request_webscrapingapi(link):
    """Fetch `link` through the WebScrapingAPI v2 endpoint and parse it as HTML.

    Args:
        link: URL of the page to retrieve.

    Returns:
        A BeautifulSoup tree built from the response text with 'html.parser'.
    """
    scraper_url = 'https://api.webscrapingapi.com/v2'

    # The key constant defined in this file is WEBSCRAPERAPI_API_KEY; the
    # previously referenced WEBSCRAPINGAPI_API_KEY did not exist (NameError).
    params = {
        'api_key': WEBSCRAPERAPI_API_KEY,
        'url': link,
    }

    response = requests.get(scraper_url, params=params)
    return BeautifulSoup(response.text, 'html.parser')

## Scraping Letterboxd reviews

In [3]:
# Scrapes the review provided at `link`
def scrape_review(link, debug=False):
    """Scrape a single Letterboxd review page into a flat dictionary.

    Args:
        link: Absolute URL of the review page.
        debug: When true, print which review is being scraped.

    Returns:
        A dict with the keys 'user', 'movie', 'movie_id', 'movie_year',
        'rating', 'liked', 'when_watched', 'rewatched', 'review',
        'num_likes', 'tags', 'liked_reviews' and 'review_link'.

    Raises:
        AttributeError, TypeError: when an expected element is missing from
            the page; apart from the rating, tags and liked-reviews
            sections the lookups are not guarded.
    """
    # Alternative fetchers (get_request_zyte, get_request_crawlbase,
    # get_request_scraperapi) can be swapped in here.
    soup = get_request(link)

    user = soup.find(class_='person-summary').find('a')['href'][1:-1]
    title_wrapper = soup.find(class_='film-title-wrapper')
    title_link = title_wrapper.find('a')
    movie = title_link.get_text(strip=True)
    movie_id = title_link['href'].split('/')[2]
    movie_year = int(title_wrapper.find('small', class_='metadata').find('a').get_text(strip=True))

    if debug:
        print(f'[DEBUG] Scraping "{movie}" ({movie_year}) review for user {user}')

    watch_metadata = soup.find(class_='film-viewing-info-wrapper')

    # get rating
    rating_qualifier = watch_metadata.find('span', class_='rating')

    try:
        rating = convert_rating_to_float(rating_qualifier.get_text(strip=True))
    except AttributeError as e:
        # No rating span was found (presumably an unrated entry): report and use 0.
        print(e)
        rating = 0

    # get other watch metadata
    liked = 1 if isinstance(watch_metadata.find('span', class_='has-icon'), Tag) else 0
    when_watched = watch_metadata.find('meta', content=True)['content']
    rewatched = 1 if 'Rewatched' in watch_metadata.find(class_='view-date date-links').get_text(strip=True) else 0

    # get review
    review_div = soup.find('div', class_='review body-text -prose -hero -loose')
    review_text = ' '.join([p.get_text() for p in review_div.find_all('p')])
    num_likes = int(soup.find('p', class_='like-link-target')['data-count'])

    # get tags
    tags_section = soup.find('ul', class_='tags')
    if tags_section is not None:
        tags = [t.get_text(strip=True) for t in tags_section.find_all('li')]
    else:
        tags = []

    # Map each liked review's href to its rating (0 when it shows none).
    # Every list item is handled inside the loop; previously only the last
    # item was recorded and an empty list raised NameError.
    liked_reviews = {}
    all_liked_reviews = soup.find('ul', id='liked-reviews')
    if all_liked_reviews is not None:
        for item in all_liked_reviews.find_all('li'):
            anchor = item.find('a')
            if anchor is None:
                continue
            rating_tag = anchor.find(class_='rating')
            if rating_tag is not None:
                liked_review_rating = convert_rating_to_float(rating_tag.get_text(strip=True))
            else:
                liked_review_rating = 0
            liked_reviews[anchor['href']] = liked_review_rating

    # Site-relative href of this review: everything after scheme and host,
    # with a leading '/'.
    review_link = '/'.join([''] + link.split('/')[3:])

    result = {'user': user,
              'movie': movie,
              'movie_id': movie_id,
              'movie_year': movie_year,
              'rating': rating,
              'liked': liked,
              'when_watched': when_watched,
              'rewatched': rewatched,
              'review': review_text,
              'num_likes': num_likes,
              'tags': tags,
              'liked_reviews': liked_reviews,
              'review_link': review_link
             }
    return result

## Scraping Letterboxd diary - 50 most recent watched films

### Scrape user diary from link

In [4]:
def scrape_diary(link, debug=False):
    """Scrape one page of a user's Letterboxd diary.

    Args:
        link: Absolute URL of the diary page.
        debug: When true, print the diary owner once and pass the flag on
            to `scrape_review` for reviewed entries.

    Returns:
        A list with one dict per diary row, using the same keys as
        `scrape_review`. Rows whose year or date cannot be parsed
        (ValueError) are reported on stdout and skipped.

    Raises:
        AttributeError, TypeError: when the diary table or an expected
            element of a row is missing; these are not caught here.
    """
    # Alternative fetchers (get_request_zyte, get_request_crawlbase,
    # get_request_scraperapi) can be swapped in here.
    soup = get_request(link)
    table = soup.find('table', id='diary-table')
    rows = []

    # The first <tr> is skipped (presumably the table header).
    diary_rows = table.find_all('tr')[1:]
    if not diary_rows:
        return rows

    # The diary owner is read from the page, not the row, so resolve it once.
    user = soup.find(class_='profile-mini-person').find('a')['href'][1:-1]
    if debug:
        print(f'[DEBUG] Loading diary for user {user}')

    curr_month = ''

    # iterate each row in the table of diary entries
    for row in diary_rows:
        try:
            headline = row.find('h3', class_='headline-3 prettify')
            movie = headline.get_text()
            film_href = headline.find('a')['href']
            movie_id = film_href.split('/')[3]
            movie_year = int(row.find(class_='td-released center').get_text())

            # The month/year cell can be empty; reuse the last non-empty one.
            month = row.find(class_='td-calendar').get_text().strip()
            if month != '':
                curr_month = month
            else:
                month = curr_month

            day = row.find(class_='td-day diary-day center').get_text(strip=True)

            date = str(day) + ' ' + month
            when_watched = dt.datetime.strptime(date, '%d %b %Y').strftime('%Y-%m-%d')

            rewatched = 1 if isinstance(row.find(class_='td-rewatch center'), Tag) else 0
            rating = convert_rating_to_float(row.find(class_='hide-for-owner').find(class_='rating').get_text(strip=True))
            liked = 1 if isinstance(row.find(class_='has-icon icon-16 large-liked icon-liked hide-for-owner'), Tag) else 0
            review_icon = row.find(class_='has-icon icon-review icon-16 tooltip')

            # if reviewed, scrape from review page
            if isinstance(review_icon, Tag):
                review_link = WEB_PREFIX + review_icon['href']
                result = scrape_review(review_link, debug=debug)
            else:
                result = {'user': user,
                          'movie': movie,
                          'movie_id': movie_id,
                          'movie_year': movie_year,
                          'rating': rating,
                          'liked': liked,
                          'when_watched': when_watched,
                          'rewatched': rewatched,
                          'review': '',
                          'num_likes': 0,
                          'tags': [],
                          'liked_reviews': {},
                          'review_link': film_href
                          }
            rows.append(result)
        except ValueError as e:
            print(f'Error for movie {movie} for user {user}: {e} ({link})')
            continue

    return rows

### Save user diary

In [5]:
# saves one page of `user`'s diary as JSON (the original note says the diary is sorted by review activity)
def save_user_diary(user, page, folder_name, debug=True):
    """Scrape diary page `page` of `user` and dump it to a JSON file.

    The file is written to '<folder_name>/<user>_recent_reviews_<page>.json'.
    Returns the list produced by `scrape_diary`.
    """
    diary_link = ''.join([WEB_PREFIX, '/', user, '/films/diary/page/', str(page), '/'])
    file_path = f'{folder_name}/{user}_recent_reviews_{page}.json'

    if debug:
        print(f'\n[DEBUG] Loading diary for user {user}: {file_path}')

    recent_50_watches = scrape_diary(diary_link)
    with open(file_path, 'w') as out_file:
        json.dump(recent_50_watches, out_file)

    if debug:
        print(f'[DEBUG] Saved to file {file_path}')

    return recent_50_watches

## Scrape Letterboxd movie page

In [6]:
# scrape letterboxd homepage for a movie
def scrape_letterboxd_movie_page(movie_name):
    """Scrape a film's main Letterboxd page into a dictionary.

    Args:
        movie_name: The film's URL slug; it is returned as 'movie_id'.

    Returns:
        A dict with the keys 'movie_name', 'movie_id', 'movie_year',
        'movie_tagline', 'movie_desc', 'movie_len', 'genres',
        'top_keywords', 'in_collection', 'collection_name', 'studios',
        'countries', 'primary_language', 'spoken_langauges', 'imdb_id',
        'tmdb_id', 'cast' and 'crew'. Optional sections that are absent
        produce empty or zero values.

    Raises:
        AttributeError, ValueError: when the title or release year element
            is missing or not numeric.
    """
    movie_link = WEB_PREFIX + '/film/' + movie_name
    # Alternative fetchers (get_request_zyte, get_request_crawlbase,
    # get_request_scraperapi) can be swapped in here.
    soup = get_request(movie_link)

    # get basic movie details
    movie_id = movie_name
    movie_name = soup.find('h1', class_='headline-1 filmtitle').get_text()
    movie_year = int(soup.find(class_='releaseyear').get_text())

    # get tagline and description
    tagline_tag = soup.find(class_='tagline')
    movie_tagline = tagline_tag.get_text() if tagline_tag is not None else ''

    desc_tag = soup.find(class_='truncate')
    movie_desc = desc_tag.get_text(strip=True) if desc_tag is not None else ''

    # get movie length and IMDb / TMDb ids; all stay 0 when the footer is absent
    details_footer = soup.find('p', class_='text-link text-footer')
    movie_len = 0
    imdb_id = 0
    tmdb_id = 0

    if details_footer is not None:
        try:
            # Text before 'min', minus the one character preceding it.
            movie_len = int(details_footer.get_text(strip=True).split('min')[0][:-1])
        except ValueError:
            movie_len = 0

        for ext_link in details_footer.find_all('a'):
            source = ext_link.get('data-track-action')
            if source == 'IMDb':
                # Fifth path segment with its two-character prefix dropped.
                imdb_id = int(ext_link['href'].split('/')[4][2:])
            elif source == 'TMDb':
                try:
                    tmdb_id = int(ext_link['href'].split('/')[4])
                except IndexError:
                    tmdb_id = 0

    # get genres and top keywords; defaults cover a missing tab, a tab
    # without any div, and the unexpected "more than two divs" layout
    genres = []
    top_keywords = []
    genre_tab = soup.find(id='tab-genres')
    if genre_tab is not None:
        sections = genre_tab.find_all('div')
        if len(sections) > 2:
            print(f'[DEBUG] PAGE FOR {movie_name} HAS ADDITIONAL INFO')
        elif sections:
            genres = [g.get_text() for g in sections[0].find_all('a')]
            if len(sections) == 2:
                top_keywords = [k.get_text() for k in sections[1].find_all('a')
                                if 'Show All' not in k.get_text()]

    # get (if any) related films
    has_collection = soup.find('section', id='related')
    if has_collection is not None:
        in_collection = 1
        collection_name = has_collection.find('h2', class_='section-heading').find('a')['href'].split('/')[3]
    else:
        in_collection = 0
        collection_name = ''

    # get production details
    prod_details = soup.find(id='tab-details')

    studios = {}
    countries = {}
    primary_language = ''
    spoken_languages = []
    if prod_details is not None:
        for detail in prod_details.find_all('a'):
            parts = detail['href'].split('/')
            label = detail.get_text(strip=True)
            if len(parts) > 2 and parts[1] == 'studio':
                studios[parts[2]] = label
            if len(parts) > 3 and parts[2] == 'country':
                countries[parts[3]] = label
            if len(parts) > 2 and parts[2] == 'language':
                # The first language link is the primary language; any
                # further ones are recorded as spoken languages.
                if primary_language == '':
                    primary_language = label
                else:
                    spoken_languages.append(label)

    # get cast and crew
    has_cast = soup.find(class_='cast-list')
    if has_cast is not None:
        cast = {c.get_text(): c.get('data-original-title') for c in has_cast.find_all('a') if 'Show All' not in c.get_text()}
    else:
        cast = {}

    has_crew = soup.find(id='tab-crew')
    if has_crew is not None:
        crew_html = [c.find_all('a') for c in has_crew.find_all(class_='text-sluglist')]
        crew = defaultdict(list)

        for links in crew_html:
            if not links:
                continue
            # The role is taken from the first link's href and applied to
            # every member of the same list.
            role = links[0]['href'].split('/')[1]
            for member in links:
                crew[member.get_text()].append(role)

        crew = dict(crew)
    else:
        crew = {}

    # NOTE: the key 'spoken_langauges' is misspelled but kept as-is, since
    # previously saved data uses that spelling.
    result = {'movie_name': movie_name,
              'movie_id': movie_id,
              'movie_year': movie_year,
              'movie_tagline': movie_tagline,
              'movie_desc': movie_desc,
              'movie_len': movie_len,
              'genres': genres,
              'top_keywords': top_keywords,
              'in_collection': in_collection,
              'collection_name': collection_name,
              'studios': studios,
              'countries': countries,
              'primary_language': primary_language,
              'spoken_langauges': spoken_languages,
              'imdb_id': imdb_id,
              'tmdb_id': tmdb_id,
              'cast': cast,
              'crew': crew
              }

    return result

# get letterboxd-specific user-movie interaction stats from the film's /csi/.../stats/ page
def get_movie_interaction_stats(movie_name):
    """Scrape watch, list and like counts and the top-250 rank for a film.

    The three counts are returned as strings with commas removed;
    'top_250_rank' is an int, 0 when the page shows no rank.
    """
    movie_link = ''.join([WEB_PREFIX, '/csi/film/', movie_name, '/stats/'])
    # Alternative fetchers (get_request_zyte, get_request_crawlbase,
    # get_request_scraperapi) can be swapped in here.
    soup = get_request(movie_link)

    def stat_title(css_class):
        # Title attribute of the link inside the given stat element.
        return soup.find(class_=css_class).find('a')['title']

    # The slices trim the fixed wording around the number in each title.
    num_watched = stat_title('stat filmstat-watches')[11:-8].replace(',', '')
    num_listed = stat_title('stat filmstat-lists')[11:-6].replace(',', '')
    num_liked = stat_title('stat filmstat-likes')[9:-8].replace(',', '')

    rank_tag = soup.find(class_='stat filmstat-top250')
    top_250_rank = 0 if rank_tag is None else int(rank_tag.find('a').get_text(strip=True))

    return {'num_watched': num_watched,
            'num_listed': num_listed,
            'num_liked': num_liked,
            'top_250_rank': top_250_rank
            }

# get letterboxd-specific user ratings for movies from the film's /csi/.../rating-histogram/ page
def get_movie_hist(movie_name):
    """Scrape the rating histogram of a film.

    Every <li> on the page contributes one count, read from the text before
    the first non-breaking space (0 for empty items). Counts are paired in
    order with the ten half-star bucket names; surplus items are dropped.
    """
    movie_rating_dist_link = ''.join([WEB_PREFIX, '/csi/film/', movie_name, '/rating-histogram/'])
    # Alternative fetchers (get_request_zyte, get_request_crawlbase,
    # get_request_scraperapi) can be swapped in here.
    soup = get_request(movie_rating_dist_link)

    hist_keys = ['num_half_star', 'num_one_star', 'num_one_half_star', 'num_two_star', 'num_two_half_star',
                 'num_three_star', 'num_three_half_star', 'num_four_star', 'num_four_half_star', 'num_five_star']
    item_texts = (item.get_text(strip=True) for item in soup.find_all('li'))
    hist_vals = [
        int(text.split('\xa0')[0].replace(',', '')) if text != '' else 0
        for text in item_texts
    ]

    return dict(zip(hist_keys, hist_vals))

# get all letterboxd data for a movie; scrapes three separate pages
def get_letterboxd_stats(movie_name):
    """Merge main-page details, interaction stats and rating histogram.

    All three pages are scraped first; the stats and histogram entries are
    then folded into the main-page dict, which is returned.
    """
    page_info = scrape_letterboxd_movie_page(movie_name)
    interaction_info = get_movie_interaction_stats(movie_name)
    histogram_info = get_movie_hist(movie_name)

    for extra in (interaction_info, histogram_info):
        page_info.update(extra)

    return page_info

# Processing list of user IDs

### A list of user IDs from approximately the top 5000 reviewers on Letterboxd.com was scraped, and for each of those users, (up to) the top 250 reviews were collected into a set of 5 .json files.

In [280]:
# Shallow copy: the list is new, but the entry dicts are shared with
# `movie_data`, so the conversion below is visible through both names.
movie_data_copy = list(movie_data)

# JSON cannot encode sets: turn every set found one level down inside a
# dict-valued field into a list.
for entry in movie_data_copy:
    for nested in entry.values():
        if not isinstance(nested, dict):
            continue
        for inner_key, inner_val in nested.items():
            if isinstance(inner_val, set):
                nested[inner_key] = list(inner_val)

In [276]:
# IDs present in `all_movie_ids` that have not been scraped yet, in their
# original order. The loop variable is deliberately not called `id`, which
# would shadow the builtin for the rest of the session; membership is
# tested against a set built once.
scraped_lookup = set(scraped_movie_ids)
unused_movie_ids = [movie_id for movie_id in all_movie_ids if movie_id not in scraped_lookup]

len(unused_movie_ids)

3500

In [None]:
# get list of popular reviewers

In [376]:
unused_users = set()

In [377]:
# Walk the "popular reviewers" listing page by page and collect the user
# name behind every avatar link, pausing one second between requests.
for page_number in range(0, 1000):
    pop_reviewers = f'https://letterboxd.com/reviewers/popular/page/{page_number}/'
    soup = get_request(pop_reviewers)
    avatar_links = soup.find_all(class_='avatar')
    unused_users.update({u['href'].replace('/', '') for u in avatar_links})
    time.sleep(1)

In [379]:
unused_users_list = list(unused_users)

In [383]:
# Keep only the users that are not listed in used_users.txt. The file is
# read once into a set instead of once per candidate through is_user_used().
already_used = set(get_used_users())
users_to_use = [u for u in unused_users_list if u not in already_used]

In [385]:
# Persist the remaining users, one name per line.
with open('users_to_use.txt', 'w') as out_file:
    out_file.writelines(user + '\n' for user in users_to_use)

In [362]:
len(unused_users_list)

725

In [49]:
# Reload the user list: first comma-separated field of every line, newline removed.
with open('users_to_use.txt', 'r') as users_file:
    users_to_use = [row.partition(',')[0].replace('\n', '') for row in users_file]

In [50]:
len(users_to_use)

2831

In [63]:
users_to_use[:5]

['believeitornot',
 'layanadelrey',
 'ashleepradella',
 'cinemaconsumer',
 'mrmichaeldjones']

In [66]:
# Print every position at which 'jbouie' occurs in the list.
for position, name in enumerate(users_to_use):
    if name == 'jbouie':
        print(position)

221


In [47]:
# Persist the user list, one name per line.
with open('users_to_use.txt', 'w') as out_file:
    out_file.writelines(user + '\n' for user in users_to_use)

In [102]:
# Print every position at which 'jay525' occurs in the list.
for position, name in enumerate(users_to_use):
    if name == 'jay525':
        print(position)

1438


In [101]:
# Scrape diary pages 1-10 for the users at positions 1417-1424 and store
# them under 'reviews_10'. A page that raises AttributeError is reported
# and skipped; every attempt is followed by a 3-second pause.
for user_index in range(1417, 1425):
    user_to_save = users_to_use[user_index]
    for page_number in range(1, 11):
        try:
            save_user_diary(user_to_save, page_number, 'reviews_10')
        except AttributeError as err:
            print(err)
        time.sleep(3)


[DEBUG] Loading diary for user thejoelynch: reviews_10/thejoelynch_recent_reviews_1.json
Error for movie Stiletto for user thejoelynch: invalid literal for int() with base 10: '' (https://letterboxd.com/thejoelynch/films/diary/page/1/)
'NoneType' object has no attribute 'get_text'
[DEBUG] Saved to file reviews_10/thejoelynch_recent_reviews_1.json

[DEBUG] Loading diary for user thejoelynch: reviews_10/thejoelynch_recent_reviews_2.json
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
[DEBUG] Saved to file reviews_10/thejoelynch_recent_reviews_2.json

[DEBUG] Loading diary for user thejoelynch: reviews_10/thejoelynch_recent_reviews_3.json
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
[DEBUG] Saved to file reviews_10/thejoelynch_recent_reviews_3.json

[DEBUG] Loadi

# Create cleaned .json and compressed .json (gzip) from individual .json files

In [121]:
# Concatenate the contents of every .json file found in the folders
# reviews_4 ... reviews_13 into one list.
folders = ['reviews_' + str(n) for n in range(4, 14)]
combined_data = []

for folder in folders:
    json_files = glob.glob(os.path.join(folder, '*.json'))
    print(f'[DEBUG] Loading {len(json_files)} files from {folder}')
    for json_file in tqdm(json_files):
        with open(json_file, 'r') as file:
            combined_data.extend(json.load(file))

[DEBUG] Loading 9978 files from reviews_4


100%|███████████████████████████████████████| 9978/9978 [05:48<00:00, 28.61it/s]


[DEBUG] Loading 9892 files from reviews_5


100%|█████████████████████████████████████| 9892/9892 [1:35:22<00:00,  1.73it/s]


[DEBUG] Loading 4314 files from reviews_6


100%|███████████████████████████████████████| 4314/4314 [43:18<00:00,  1.66it/s]


[DEBUG] Loading 0 files from reviews_7


0it [00:00, ?it/s]


[DEBUG] Loading 4933 files from reviews_8


100%|██████████████████████████████████████| 4933/4933 [00:15<00:00, 324.66it/s]


[DEBUG] Loading 7731 files from reviews_9


100%|██████████████████████████████████████| 7731/7731 [00:09<00:00, 844.62it/s]


[DEBUG] Loading 4926 files from reviews_10


100%|██████████████████████████████████████| 4926/4926 [00:06<00:00, 804.42it/s]


[DEBUG] Loading 4998 files from reviews_11


100%|█████████████████████████████████████| 4998/4998 [00:04<00:00, 1121.50it/s]


[DEBUG] Loading 4998 files from reviews_12


100%|██████████████████████████████████████| 4998/4998 [00:06<00:00, 751.06it/s]


[DEBUG] Loading 3296 files from reviews_13


100%|███████████████████████████████████████| 3296/3296 [31:46<00:00,  1.73it/s]


In [3]:
def make_hashable(value: Any) -> Hashable:
    """Recursively turn dicts and lists into hashable equivalents.

    A dict becomes a frozenset of (key, converted value) pairs, a list
    becomes a tuple of converted items; any other value is returned as is.
    """
    if isinstance(value, list):
        return tuple(map(make_hashable, value))
    if isinstance(value, dict):
        return frozenset({(key, make_hashable(item)) for key, item in value.items()})
    return value

def convert_from_frozenset(value: Any) -> Any:
    """Recursively turn frozensets and tuples back into dicts and lists.

    A frozenset made up solely of 2-tuples (including an empty one) is read
    as a mapping and becomes a dict; any other frozenset becomes a list.
    Lists and tuples become lists, dict values are converted in place of
    the originals, and everything else is returned unchanged.
    """
    if isinstance(value, dict):
        return {key: convert_from_frozenset(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [convert_from_frozenset(item) for item in value]
    if not isinstance(value, frozenset):
        return value

    looks_like_mapping = all(isinstance(entry, tuple) and len(entry) == 2 for entry in value)
    if looks_like_mapping:
        return {convert_from_frozenset(key): convert_from_frozenset(item) for key, item in value}
    return [convert_from_frozenset(entry) for entry in value]

### 68325 duplicates were removed to produce a collection of just over 2 million diary entries

In [None]:
# De-duplicate the entries: hash each one, let a set drop the repeats, then
# rebuild plain dicts and lists from the frozen form.
frozen_entries = {make_hashable(d) for d in combined_data}
unique_data = [convert_from_frozenset(dict(fs)) for fs in frozen_entries]

In [113]:
# save gzip: the combined entries as gzip-compressed JSON text
with gzip.open('all_letterboxd_reviews_unfiltered.json.gz', mode='wt', encoding='utf-8') as gz_out:
    json.dump(combined_data, gz_out)

In [123]:
# save json: the combined entries as plain JSON
with open('all_letterboxd_reviews_unfiltered.json', mode='w') as json_out:
    json.dump(combined_data, json_out)

In [5]:
all_data = []

In [6]:
# Append the contents of the four review dumps to `all_data`, in this order.
review_sources = [
    '1_new_letterboxd_reviews_unfiltered.json',
    '2_letterboxd_reviews.json',
    '1_letterboxd_reviews.json',
    'new_letterboxd_reviews_unfiltered.json',
]
for source_path in review_sources:
    with open(source_path, 'r') as f:
        temp_data = json.load(f)
        all_data.extend(temp_data)

In [7]:
len(all_data)

5423863

In [8]:
# Hash every entry into a set so that exact duplicates collapse.
unique_data = set()
unique_data.update(make_hashable(d) for d in tqdm(all_data))

100%|██████████████████████████████| 5423863/5423863 [05:51<00:00, 15444.95it/s]


In [9]:
unique_data_list = list(unique_data)

In [10]:
# Turn each top-level frozenset of (key, value) pairs back into a dict.
new_unique_data = []
new_unique_data.extend(dict(fs) for fs in tqdm(unique_data_list))

100%|███████████████████████████████| 3582031/3582031 [13:37<00:00, 4384.14it/s]


In [11]:
# Rebinding first drops the set previously held under this name; the
# nested frozensets and tuples are then restored to dicts and lists.
unique_data = []
unique_data.extend(convert_from_frozenset(d) for d in tqdm(new_unique_data))

100%|███████████████████████████████| 3582031/3582031 [17:23<00:00, 3432.76it/s]


In [12]:
# Write the de-duplicated entries as plain JSON.
with open('final_letterboxd_reviews.json', mode='w') as json_out:
    json.dump(unique_data, json_out)

In [None]:
# Write the de-duplicated entries as gzip-compressed JSON text.
with gzip.open('final_letterboxd_reviews.json.gz', mode='wt', encoding='utf-8') as gz_out:
    json.dump(unique_data, gz_out)

In [38]:
len(unique_data)

3582031

# Scrape movie pages for movie features

In [13]:
# Load the movie records scraped so far.
with open('letterboxd_movies.json', mode='r') as movie_data_file:
    movie_data_raw = json.load(movie_data_file)

In [15]:
movie_data_raw[0]['movie_id']

'the-substance'

In [16]:
used_movie_ids = list({x['movie_id'] for x in movie_data_raw})

In [17]:
len(used_movie_ids)

73070

In [18]:
used_movie_ids[0]

'the-blind-2023'

In [None]:
# Optional: reload the de-duplicated reviews from disk instead of relying
# on the in-memory `unique_data`.
# with open('final_letterboxd_reviews.json', 'r') as f:
#     raw_reviews = json.load(f)

# One DataFrame row per entry of `unique_data`.
df = pd.DataFrame(unique_data)

# get all unique movie_id values

all_ids = set(df['movie_id'].values)

In [None]:
to_use = []

In [23]:
# Read the ids: first comma-separated field of every line, newline removed.
with open('new_used_movie_ids.txt', 'r') as ids_file:
    to_use = [row.partition(',')[0].replace('\n', '') for row in ids_file]

In [29]:
used_movie_ids = set(used_movie_ids)

In [33]:
len(used_movie_ids) + len(unused_ids)

173022

In [30]:
# Drop repeats, then keep only the ids that have not been scraped yet.
to_use = set(to_use)
unused_ids = [candidate for candidate in to_use if candidate not in used_movie_ids]

In [36]:
unused_ids = set(unused_ids)

In [37]:
len(unused_ids)

99952

In [7]:
# Persist the unscraped ids, one per line.
with open('new_unused_ids.txt', 'w') as ids_out:
    ids_out.writelines(item + "\n" for item in unused_ids)

NameError: name 'unused_ids' is not defined

In [15]:
movie_data = []

In [None]:
unused_ids = list(unused_ids)

In [7]:
# Reload the ids: first comma-separated field of every line, newline removed.
with open('new_unused_ids.txt', 'r') as ids_file:
    all_movie_ids = [row.partition(',')[0].replace('\n', '') for row in ids_file]

In [32]:
# Print every position at which 'underground' occurs in the id list.
for position, movie_slug in enumerate(all_movie_ids):
    if movie_slug == 'underground':
        print(position)

98951


In [31]:
# Scrape the movies at positions 98759-98951 and collect their records.
batch = list(all_movie_ids)[98759:98952]
for id_ in batch:
    print(f'[DEBUG] Appending to `movie_data`: {id_}')
    page = get_letterboxd_stats(id_)
    movie_data.append(page)

[DEBUG] Appending to `movie_data`: overwatch-be-the-hero
[DEBUG] Appending to `movie_data`: most-likely-to-murder-2019
[DEBUG] Appending to `movie_data`: first-day-of-work
[DEBUG] Appending to `movie_data`: banning
[DEBUG] Appending to `movie_data`: the-damned-2013
[DEBUG] Appending to `movie_data`: invaders
[DEBUG] Appending to `movie_data`: buster-and-billie
[DEBUG] Appending to `movie_data`: tragedy-in-a-temporary-town-1956
[DEBUG] Appending to `movie_data`: quantez
[DEBUG] Appending to `movie_data`: habitat-2014
[DEBUG] Appending to `movie_data`: la-riviere-de-diamants
[DEBUG] Appending to `movie_data`: maid-of-honor-2006
[DEBUG] Appending to `movie_data`: fastball
[DEBUG] Appending to `movie_data`: omg-o-manchi-ghost
[DEBUG] Appending to `movie_data`: so-this-is-london-1930
[DEBUG] Appending to `movie_data`: the-scream-1993
[DEBUG] Appending to `movie_data`: mnemonics-of-shape-and-reason
[DEBUG] Appending to `movie_data`: tears-of-god
[DEBUG] Appending to `movie_data`: aayiram-por

In [29]:
# Print every position at which 'two-sisters' occurs in the id list.
for position, movie_slug in enumerate(all_movie_ids):
    if movie_slug == 'two-sisters':
        print(position)

98758


In [33]:
# Save the records scraped in this session.
with open('new_movie_data_last2.json', mode='w') as json_out:
    json.dump(movie_data, json_out)

## Save and collect into one file

In [430]:
del movie_data_copy

In [429]:
# Merge the first batch of movie files into letterboxd_movies.json.
files = ['movie_data.json'] + [f'updated_movie_data_{n}.json' for n in range(6)]
movie_data = []
for file in files:
    with open(file, 'r') as f:
        temp_data = json.load(f)
        movie_data.extend(temp_data)

with open('letterboxd_movies.json', 'w') as merged_out:
    json.dump(movie_data, merged_out)

In [34]:
# Concatenate every movie file, in this order, into one list.
files = [
    'letterboxd_movies.json',
    'new_movie_data_0_6899.json',
    'new_movie_data_10000_16606.json',
    'new_movie_data_20000_28560.json',
    'new_movie_data_30000_38196.json',
    'new_movie_data_38196_40000.json',
    'new_movie_data_40000_50000.json',
    'new_movie_data_50000_60000.json',
    'new_movie_data_60000_77008.json',
    'new_movie_data_6899_10000.json',
    'new_movie_data_last1.json',
    'new_movie_data_last3.json',
    'new_movie_data_last2.json',
]
all_movie_data = []
for file in tqdm(files):
    with open(file, 'r') as f:
        temp_data = json.load(f)
        all_movie_data.extend(temp_data)

100%|███████████████████████████████████████████| 13/13 [00:26<00:00,  2.04s/it]


In [35]:
# Write the merged movie records as plain JSON.
with open('final_letterboxd_movies.json', mode='w') as json_out:
    json.dump(all_movie_data, json_out)