# Setup

In [None]:
# this mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


FOLDERNAME = 'cs229_proj/'


assert FOLDERNAME is not None, "[!] Enter the foldername."

# now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

%cd drive/My\ Drive/$FOLDERNAME/

Mounted at /content/drive
/content/drive/My Drive/cs229_proj


In [None]:
# Confirm the working directory and list what the project folder contains.
!pwd
!ls

In [None]:
from IPython.display import Image
import re
import requests
from bs4 import BeautifulSoup
import json
import urllib.request
import random
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math

# Web Scraping

We take the ~7000 most popular movies on Letterboxd and collect each one's user ratings (its rating histogram) and its poster. Letterboxd's listing pages are rendered dynamically with JavaScript, so they cannot be scraped with plain HTTP requests; instead, we manually saved the 100 listing pages as HTML files and use regular expressions on them to extract each movie's ID and poster URL. We then use the movie ID to fetch the rating histogram from a Letterboxd endpoint that returns static HTML (no JavaScript required). Finally, we rewrite each poster URL to request the largest available poster size and save the image into one of 100 folders (one per listing page), with the file named after the movie ID.

## Web Scraping through HTML files (100 Letterboxd pages)

In [None]:
# Install the scraping dependencies into the Colab VM.
# NOTE(review): bs4 and requests are already imported by an earlier cell, so on
# a fresh kernel this install runs too late to help - consider moving it up.
!pip3 install beautifulsoup4 requests



In [None]:
# List the project folder; the saved listing pages live in the `html` entry.
!ls

boxd_scrape.ipynb	     MovieGenre.csv	      SampleMoviePosters
collect_movie_posters.ipynb  movie_to_ratings_1.json
html			     movie_to_ratings_2.json


In [None]:
# Folder holding the saved listing pages, read below as "<page number>.html".
html_folder = '/content/drive/My Drive/cs229_proj/html/'

# Inclusive range of page numbers to parse.
start_page = 1
end_page = 100

In [None]:
# Collect every film slug from the saved listing pages, in page order.
movie_links = []
lengths = []  # number of slugs found on each page, in the same page order

# A slug never contains '/' or '"', so match exactly that rather than a greedy
# `.*`, which would merge several entries if they ever shared one line.
film_link_pattern = re.compile(r'data-film-link="/film/([^/"]+)/" data-film-in-watchlist')

for i in range(start_page, end_page + 1):
    with open(html_folder + str(i) + ".html", "r", encoding='utf-8') as f:
        text = f.read()
    # Parse after the file is closed; only the page text is needed from here on.
    result = film_link_pattern.findall(text)
    movie_links.extend(result)
    lengths.append(len(result))
print(len(movie_links))

6873


In [None]:
# Eyeball the scraped slugs and the per-page match counts.
# NOTE(review): this dumps the full slug list; a slice would keep the output readable.
print(movie_links)
print(lengths)

['parasite-2019', 'joker-2019', 'knives-out-2019', 'pulp-fiction', 'inception', 'get-out-2017', 'midsommar', 'fight-club', 'spider-man-into-the-spider-verse', 'once-upon-a-time-in-hollywood', 'the-dark-knight', 'lady-bird', 'la-la-land', 'the-grand-budapest-hotel', 'interstellar', 'baby-driver', 'inglourious-basterds', 'the-shining', 'avengers-infinity-war', 'the-wolf-of-wall-street', 'black-panther', 'jojo-rabbit', 'whiplash-2014', 'avengers-endgame', 'gone-girl', 'her', 'spirited-away', 'arrival-2016', 'mad-max-fury-road', 'eternal-sunshine-of-the-spotless-mind', 'guardians-of-the-galaxy', 'django-unchained', 'the-social-network', 'the-silence-of-the-lambs', 'uncut-gems', 'little-women-2019', 'call-me-by-your-name', 'kill-bill-vol-1', 'the-lighthouse-2019', 'scott-pilgrim-vs-the-world', 'the-truman-show', '1917', 'hereditary', 'moonlight-2016', 'the-godfather', 'thor-ragnarok', 'soul-2020', 'forrest-gump', 'the-matrix', 'se7en', 'marriage-story-2019', 'dunkirk-2017', 'us-2019', 'a-qu

In [None]:
# Header dict handed to requests.get when fetching the histograms.
# NOTE(review): the Access-Control-* entries look like CORS *response* headers;
# presumably only User-Agent matters on a request - confirm before trimming.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }

In [None]:
# Histogram bar labels searched for in the tooltip titles, and the score stored
# for each one (same index in both lists). The single-★ bar (score 2) is not
# listed here; the scraping loop extracts it separately.
rating_str_lst = ['half-★', '★½', '★★', '★★½', '★★★', '★★★½', '★★★★', '★★★★½', '★★★★★']
rating_lst = [1, 3, 4, 5, 6, 7, 8, 9, 10]
# Filled by the scraping loop: film slug -> {score: number of ratings}.
movie_to_ratings = {}

In [None]:
# Scrape the rating histogram for every slug in `movie_links` and store it in
# `movie_to_ratings` as {score: number of ratings}.
count = 0

for movie in movie_links:
    count += 1
    if count % 50 == 0:
        print(count)  # progress marker every 50 films
    rating_to_num = {}
    url = "https://letterboxd.com/csi/film/" + movie + "/rating-histogram/"
    # `headers` must be passed by keyword: the second positional parameter of
    # requests.get is `params`, so the previous positional call appended these
    # values to the query string and never sent the User-Agent.
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.content, 'html.parser')
    text = soup.prettify()
    skip = False
    for rating_str, rating in zip(rating_str_lst, rating_lst):
        result = re.search('title="(.*)' + rating_str + ' ratings', text)
        if result is None:
            # One bar could not be found, so the histogram is unusable: drop the film.
            skip = True
            print('skip')
            break
        if result.group(1).split(' ')[0] == "No":
            # The captured prefix starts with "No" instead of a number: zero ratings.
            rating_to_num[rating] = 0
        else:
            # Strip thousands separators before converting, e.g. "1,502".
            val = int((result.group(1)).replace(',', ''))
            rating_to_num[rating] = val
    # handle 2-ratings
    if not skip:
        # This pattern matches every bar whose label ends in "★" (not the "½"
        # ones); index 0 is the half-★ bar, so index 1 is taken as the single-★
        # bar. Assumes the bars appear in ascending order - TODO confirm.
        result = re.findall('title="(.*)★ ratings', text)
        val = 0
        first_token = result[1].split(' ')[0]
        # "26\xa0★" is a one-off guard kept as-is: presumably one film whose
        # single-★ tooltip did not match (e.g. singular wording), so index 1
        # landed on its ★★ bar - TODO confirm and handle generally.
        if first_token != "No" and first_token != "26\xa0★":
            val = int((result[1]).replace(',', ''))
        rating_to_num[2] = val
        movie_to_ratings[movie] = rating_to_num

In [None]:
# Number of films with a stored histogram (films that hit 'skip' are absent).
print(len(movie_to_ratings))

In [None]:
# Inspect the scraped histograms before saving them.
# NOTE(review): this prints the whole dict; the second line repeats the count above.
print(movie_to_ratings)
print(len(movie_to_ratings.keys()))

In [None]:
# Persist the scraped histograms. JSON object keys are always strings, so the
# integer score keys are written out as "1" ... "10".
serialized_ratings = json.dumps(movie_to_ratings)
with open("movie_to_ratings.json", "w") as outfile:
    outfile.write(serialized_ratings)

## Sanity Check -- Jesse's 758 movies

In [None]:
# Same folder as before; the sanity-check pages are read below as
# "jesse<page number>.html".
html_folder = '/content/drive/My Drive/cs229_proj/html/'

# Inclusive page range for the sanity-check set. These overwrite the values
# used for the main scrape until they are reset further down.
start_page = 1
end_page = 11

In [None]:
# Collect every film slug from the saved "jesse<N>.html" pages, in page order.
movie_links = []
lengths = []  # number of slugs found on each page, in the same page order

# A slug never contains '/' or '"', so match exactly that rather than a greedy
# `.*`, which would merge several entries if they ever shared one line.
film_link_pattern = re.compile(r'data-film-link="/film/([^/"]+)/" data-film-in-watchlist')

for i in range(start_page, end_page + 1):
    with open(html_folder + "jesse" + str(i) + ".html", "r", encoding='utf-8') as f:
        text = f.read()
    # Parse after the file is closed; only the page text is needed from here on.
    result = film_link_pattern.findall(text)
    movie_links.extend(result)
    lengths.append(len(result))
print(len(movie_links))

758


In [None]:
# Header dict handed to requests.get when fetching the histograms.
# NOTE(review): the Access-Control-* entries look like CORS *response* headers;
# presumably only User-Agent matters on a request - confirm before trimming.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }

In [None]:
# Histogram bar labels searched for in the tooltip titles, and the score stored
# for each one (same index in both lists). The single-★ bar (score 2) is not
# listed here; the scraping loop extracts it separately.
rating_str_lst = ['half-★', '★½', '★★', '★★½', '★★★', '★★★½', '★★★★', '★★★★½', '★★★★★']
rating_lst = [1, 3, 4, 5, 6, 7, 8, 9, 10]
# Reset the result dict so the sanity-check scrape starts empty.
movie_to_ratings = {}

In [None]:
# Scrape the rating histogram for every slug in `movie_links` and store it in
# `movie_to_ratings` as {score: number of ratings}.
count = 0

for movie in movie_links:
    # Hard-coded exclusion kept as-is; presumably this film's histogram page
    # breaks the parsing below - the reason is not recorded.
    if movie == "finding-jesus":
        continue
    count += 1
    if count % 50 == 0:
        print(count)  # progress marker every 50 films
    rating_to_num = {}
    url = "https://letterboxd.com/csi/film/" + movie + "/rating-histogram/"
    # `headers` must be passed by keyword: the second positional parameter of
    # requests.get is `params`, so the previous positional call appended these
    # values to the query string and never sent the User-Agent.
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.content, 'html.parser')
    text = soup.prettify()
    skip = False
    for rating_str, rating in zip(rating_str_lst, rating_lst):
        result = re.search('title="(.*)' + rating_str + ' ratings', text)
        if result is None:
            # One bar could not be found, so the histogram is unusable: drop the film.
            skip = True
            print('skip')
            break
        if result.group(1).split(' ')[0] == "No":
            # The captured prefix starts with "No" instead of a number: zero ratings.
            rating_to_num[rating] = 0
        else:
            # Strip thousands separators before converting, e.g. "1,502".
            val = int((result.group(1)).replace(',', ''))
            rating_to_num[rating] = val
    # handle 2-ratings
    if not skip:
        # This pattern matches every bar whose label ends in "★" (not the "½"
        # ones); index 0 is the half-★ bar, so index 1 is taken as the single-★
        # bar. Assumes the bars appear in ascending order - TODO confirm.
        result = re.findall('title="(.*)★ ratings', text)
        val = 0
        first_token = result[1].split(' ')[0]
        # "26\xa0★" is a one-off guard kept as-is: presumably one film whose
        # single-★ tooltip did not match (e.g. singular wording), so index 1
        # landed on its ★★ bar - TODO confirm and handle generally.
        if first_token != "No" and first_token != "26\xa0★":
            val = int((result[1]).replace(',', ''))
        rating_to_num[2] = val
        movie_to_ratings[movie] = rating_to_num

50
100
skip
150
200
250
300
350
skip
400
450
skip
500
550
600
650
700
750


In [None]:
# Compare the number of stored histograms with the number of slugs collected,
# then inspect the raw values.
# NOTE(review): the second line prints the whole dict.
print(len(movie_to_ratings))
print(movie_to_ratings)

754
{'dune-2021': {1: 519, 3: 898, 4: 3638, 5: 4911, 6: 14778, 7: 24546, 8: 62007, 9: 52178, 10: 42939, 2: 915}, 'hes-all-that': {1: 17848, 3: 4800, 4: 6090, 5: 2648, 6: 2892, 7: 811, 8: 746, 9: 123, 10: 1163, 2: 11704}, 'operation-varsity-blues-the-college-admissions-scandal': {1: 15, 3: 176, 4: 926, 5: 1841, 6: 5244, 7: 2925, 8: 1834, 9: 226, 10: 263, 2: 97}, 'to-all-the-boys-always-and-forever': {1: 2098, 3: 3228, 4: 9120, 5: 8836, 6: 15996, 7: 7408, 8: 5882, 9: 1056, 10: 2199, 2: 3933}, 'enola-holmes': {1: 1503, 3: 4022, 4: 15298, 5: 20424, 6: 48550, 7: 31469, 8: 24333, 9: 4233, 10: 7198, 2: 3665}, 'nomadland': {1: 837, 3: 1508, 4: 6028, 5: 7933, 6: 25624, 7: 35819, 8: 76065, 9: 43771, 10: 32594, 2: 1812}, 'im-thinking-of-ending-things': {1: 3519, 3: 4349, 4: 14614, 5: 13878, 6: 33071, 7: 35089, 8: 56425, 9: 28322, 10: 20307, 2: 6487}, 'tenet': {1: 3182, 3: 7106, 4: 26540, 5: 30201, 6: 74393, 7: 75553, 8: 95424, 9: 30994, 10: 29714, 2: 8196}, 'host-2020': {1: 877, 3: 1222, 4: 3468,

In [None]:
# UPDATE to new JSON (no overwriting please)
# with open("jesse_movie_to_ratings.json", "w") as outfile:
#     json.dump(movie_to_ratings, outfile)

## More Scraping (Images + Such)

In [None]:
import os
import requests

In [None]:
# Input folder with the saved listing pages and output root for the posters.
html_folder = '/content/drive/My Drive/cs229_proj/html/'
poster_folder_path = '/content/drive/My Drive/cs229_proj/letterboxd_posters/'
# NOTE(review): the Access-Control-* entries look like CORS *response* headers;
# presumably only User-Agent matters on a request - confirm before trimming.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }

# Restore the full page range (the sanity-check section set end_page to 11).
start_page = 1
end_page = 100

In [None]:
# Collect the film slugs again, this time also remembering which page each
# slug came from.
movie_links = []
lengths = []  # number of slugs found on each page, in the same page order

page_num_to_movie_links = {}  # page number -> list of slugs on that page

# A slug never contains '/' or '"', so match exactly that rather than a greedy
# `.*`, which would merge several entries if they ever shared one line.
film_link_pattern = re.compile(r'data-film-link="/film/([^/"]+)/" data-film-in-watchlist')

for i in range(start_page, end_page + 1):
    with open(html_folder + str(i) + ".html", "r", encoding='utf-8') as f:
        text = f.read()
    # Parse after the file is closed; only the page text is needed from here on.
    result = film_link_pattern.findall(text)
    movie_links.extend(result)
    lengths.append(len(result))
    page_num_to_movie_links[i] = result
print(len(movie_links))

6873


In [None]:
# One entry per parsed page is expected.
# NOTE(review): the second line prints every slug of every page.
print(len(page_num_to_movie_links))
print(page_num_to_movie_links)

100
{1: ['parasite-2019', 'joker-2019', 'knives-out-2019', 'pulp-fiction', 'inception', 'get-out-2017', 'midsommar', 'fight-club', 'spider-man-into-the-spider-verse', 'once-upon-a-time-in-hollywood', 'the-dark-knight', 'lady-bird', 'la-la-land', 'the-grand-budapest-hotel', 'interstellar', 'baby-driver', 'inglourious-basterds', 'the-shining', 'avengers-infinity-war', 'the-wolf-of-wall-street', 'black-panther', 'jojo-rabbit', 'whiplash-2014', 'avengers-endgame', 'gone-girl', 'her', 'spirited-away', 'arrival-2016', 'mad-max-fury-road', 'eternal-sunshine-of-the-spotless-mind', 'guardians-of-the-galaxy', 'django-unchained', 'the-social-network', 'the-silence-of-the-lambs', 'uncut-gems', 'little-women-2019', 'call-me-by-your-name', 'kill-bill-vol-1', 'the-lighthouse-2019', 'scott-pilgrim-vs-the-world', 'the-truman-show', '1917', 'hereditary', 'moonlight-2016', 'the-godfather', 'thor-ragnarok', 'soul-2020', 'forrest-gump', 'the-matrix', 'se7en', 'marriage-story-2019', 'dunkirk-2017', 'us-2019

In [None]:
# Map each film slug to the display title found in the saved listing pages.
movie_to_title = {}

for i in range(start_page, end_page + 1):
    with open(html_folder + str(i) + ".html", "r", encoding='utf-8') as f:
        text = f.read()
    # Iterate the page's own slug list without overwriting the global `movie_links`.
    for movie_link in page_num_to_movie_links[i]:
        # Escape the slug and close it with '/', so that a slug which is a
        # prefix of another one on the same page (e.g. "rec" vs "rec-2")
        # cannot pick up the other film's title.
        title_pattern = 'data-film-name="(.*)" data-poster-url="/film/' + re.escape(movie_link) + '/'
        result = re.findall(title_pattern, text)
        if len(result) > 0:
            movie_to_title[movie_link] = result[0]
        else:
            print("Failed on " + movie_link)
    print("Finished Page " + str(i))
print(len(movie_to_title.keys()))
print(movie_to_title)

In [None]:
# with open("movie_to_title.json", "w") as outfile:
#     json.dump(movie_to_title, outfile)

In [None]:
# Map each film slug to its poster URL, located through the <img> whose `alt`
# text equals the film title. Slugs that cannot be resolved are recorded per
# page in `page_num_to_failures`.
movie_to_url = {}
n_failures = 0
page_num_to_failures = {}
for i in range(start_page, end_page + 1):
    with open(html_folder + str(i) + ".html", "r", encoding='utf-8') as f:
        text = f.read()
    failures = []
    # Iterate the page's own slug list without overwriting the global `movie_links`.
    for movie_link in page_num_to_movie_links[i]:
        # A slug whose title lookup failed earlier is treated as a failure
        # here too, instead of aborting the whole run with a KeyError.
        movie_title = movie_to_title.get(movie_link)
        result = []
        if movie_title is not None:
            # Titles are literal text, not patterns: escape them so characters
            # such as '*', '?', '(', '[' or '+' match themselves. This replaces
            # the hand-written special case for "Don't F**k with Cats".
            poster_pattern = 'alt="' + re.escape(movie_title) + '" srcset="(.*) 2x" class="image">'
            result = re.findall(poster_pattern, text)
        if len(result) > 0:
            # NOTE(review): two films with the same title on one page would
            # both receive the first match - verify if that can occur.
            movie_to_url[movie_link] = result[0]
        else:
            failures.append(movie_link)
            print("Failed on " + movie_link)
            n_failures += 1
    page_num_to_failures[i] = failures
    print("Finished Page " + str(i))
print(len(movie_to_url.keys()))
print(movie_to_url)

In [None]:
# Total number of slugs without a poster URL, and the per-page breakdown
# (pages without failures map to an empty list).
print(n_failures)
print(page_num_to_failures)

In [None]:
# One known poster URL and its film slug, used by the next cells to try out
# the download and the URL rewrite on a single film.
sample_url = 'https://a.ltrbxd.com/resized/film-poster/4/2/6/4/0/6/426406-parasite-0-140-0-210-crop.jpg?k=2d9ea1c1b9'
sample_movieid = "parasite-2019"

In [None]:
# Download the sample poster as a smoke test for the URL and the output folder.
url = sample_url
page = requests.get(url)
# Fail loudly rather than saving an HTTP error body under a .jpg name.
page.raise_for_status()
with open(poster_folder_path + sample_movieid + ".jpg", 'wb') as f:
    f.write(page.content)

In [None]:
# import os 
 
# main_photos_directory = "/content/drive/My Drive/cs229_proj/letterboxd_posters/"
# for i in range(1, 101):
#     os.mkdir(main_photos_directory + str(i)) 

In [None]:
# Check the three inputs of the poster download: slugs per page, poster URL
# per slug, and the output root folder.
# NOTE(review): the first two lines print the full dicts.
print(page_num_to_movie_links)
print(movie_to_url)
print(poster_folder_path)

{1: ['parasite-2019', 'joker-2019', 'knives-out-2019', 'pulp-fiction', 'inception', 'get-out-2017', 'midsommar', 'fight-club', 'spider-man-into-the-spider-verse', 'once-upon-a-time-in-hollywood', 'the-dark-knight', 'lady-bird', 'la-la-land', 'the-grand-budapest-hotel', 'interstellar', 'baby-driver', 'inglourious-basterds', 'the-shining', 'avengers-infinity-war', 'the-wolf-of-wall-street', 'black-panther', 'jojo-rabbit', 'whiplash-2014', 'avengers-endgame', 'gone-girl', 'her', 'spirited-away', 'arrival-2016', 'mad-max-fury-road', 'eternal-sunshine-of-the-spotless-mind', 'guardians-of-the-galaxy', 'django-unchained', 'the-social-network', 'the-silence-of-the-lambs', 'uncut-gems', 'little-women-2019', 'call-me-by-your-name', 'kill-bill-vol-1', 'the-lighthouse-2019', 'scott-pilgrim-vs-the-world', 'the-truman-show', '1917', 'hereditary', 'moonlight-2016', 'the-godfather', 'thor-ragnarok', 'soul-2020', 'forrest-gump', 'the-matrix', 'se7en', 'marriage-story-2019', 'dunkirk-2017', 'us-2019', '

In [None]:
# Swap the size segment of the sample URL (140x210) for the larger 460x690
# variant and show the resulting address.
url = sample_url.replace("0-140-0-210", "0-460-0-690")
print(url)

https://a.ltrbxd.com/resized/film-poster/4/2/6/4/0/6/426406-parasite-0-460-0-690-crop.jpg?k=2d9ea1c1b9


In [None]:
# for i in range(1, 101):
#     movie_links = page_num_to_movie_links[i]
#     for movie in movie_links:
#         if movie in movie_to_url:
#             url = movie_to_url[movie]
#             url = url.replace("0-140-0-210", "0-460-0-690")
#             page = requests.get(url)
#             f_ext = os.path.splitext(url)[-1]
#             with open(poster_folder_path + str(i) + "/" + movie +".jpg", 'wb') as f:
#                 f.write(page.content)
#     print("Handled Page " + str(i))

Handled Page 81
Handled Page 82
Handled Page 83
Handled Page 84
Handled Page 85
Handled Page 86
Handled Page 87
Handled Page 88
Handled Page 89
Handled Page 90
Handled Page 91
Handled Page 92
Handled Page 93
Handled Page 94
Handled Page 95
Handled Page 96
Handled Page 97
Handled Page 98
Handled Page 99
Handled Page 100


### Handle the 42 failures

In [None]:
# Print one `"slug" : ,` line per failed film, ready to be pasted into a dict
# literal and completed by hand with the poster URL.
for failures in page_num_to_failures.values():
    for failure in failures:
        print('"' + failure + '" : ,')

In [None]:
# Show the failed slugs page by page. Note that `count` is the number of
# pages with at least one failure, not the number of failed films.
count = 0
for i, failures in page_num_to_failures.items():
    if failures:
        count += 1
        print(i, failures)
print(count)

In [None]:
failure_to_url = {
    "birdman-or-the-unexpected-virtue-of-ignorance" : "https://a.ltrbxd.com/resized/film-poster/1/3/9/7/9/5/139795-birdman-0-140-0-210-crop.jpg?k=f46736b6ad",
    "birds-of-prey-and-the-fantabulous-emancipation-of-one-harley-quinn" : "https://a.ltrbxd.com/resized/film-poster/4/2/6/1/3/1/426131-birds-of-prey-and-the-fantabulous-emancipation-of-one-har-0-140-0-210-crop.jpg?k=c19e0e176f",
    "500-days-of-summer" : "https://a.ltrbxd.com/resized/film-poster/3/9/3/5/0/39350--500-days-of-summer-0-140-0-210-crop.jpg?k=9f6f7802db",
    "o-brother-where-art-thou" : "https://a.ltrbxd.com/resized/sm/upload/a0/sz/qf/mq/erBsxkzVwkXuvuzUBpVnNxxPYhO-0-140-0-210-crop.jpg?k=afe13f9469",
    "romeo-juliet-1996" : "https://a.ltrbxd.com/resized/film-poster/5/1/6/4/7/51647-romeo-juliet-0-140-0-210-crop.jpg?k=f9cbf2f4b5",
    "the-meyerowitz-stories-new-and-selected" : "https://a.ltrbxd.com/resized/sm/upload/3h/s9/0c/uc/lJr90SKXhEBaSchymgzOrbC56kA-0-140-0-210-crop.jpg?k=247dfb0e79",
    "wont-you-be-my-neighbor" : "https://a.ltrbxd.com/resized/film-poster/4/2/0/4/7/1/420471-won-t-you-be-my-neighbor--0-140-0-210-crop.jpg?k=bce8512934",
    "can-you-ever-forgive-me" : "https://a.ltrbxd.com/resized/film-poster/3/3/6/5/5/5/336555-can-you-ever-forgive-me--0-140-0-210-crop.jpg?k=e391c849f9",
    "rec" : "https://a.ltrbxd.com/resized/sm/upload/wc/8v/37/hl/cOW5Oc3vnjD2QgK68BUvm6GGLZ4-0-140-0-210-crop.jpg?k=a7d97e49a0",
    "what-did-jack-do" : "https://a.ltrbxd.com/resized/film-poster/4/5/7/7/9/7/457797-what-did-jack-do--0-140-0-210-crop.jpg?k=aa8b916388",
    "whos-afraid-of-virginia-woolf" : "https://a.ltrbxd.com/resized/film-poster/5/1/6/9/4/51694-who-s-afraid-of-virginia-woolf--0-140-0-210-crop.jpg?k=a1a6f6d5e3",
    "euphoria-fck-anyone-whos-not-a-sea-blob" : "https://a.ltrbxd.com/resized/sm/upload/vo/i8/ev/22/euphoria-0-140-0-210-crop.jpg?k=0ab16be076",
    "what-ever-happened-to-baby-jane" : "https://a.ltrbxd.com/resized/sm/upload/cq/ht/cp/b9/7Zr0EuDMcIvnJREsLmaAWi5x7mV-0-140-0-210-crop.jpg?k=0f52134e31",
    "where-is-my-friends-house" : "https://a.ltrbxd.com/resized/film-poster/1/5/6/6/9/15669-where-is-the-friend-s-house--0-140-0-210-crop.jpg?k=a0f665dfee",
    "mash" : "https://a.ltrbxd.com/resized/film-poster/5/1/4/7/3/51473-m-a-s-h-0-140-0-210-crop.jpg?k=2d38aafcf3",
    "why-him" : "https://a.ltrbxd.com/resized/film-poster/2/8/7/5/0/9/287509-why-him--0-140-0-210-crop.jpg?k=f9bf6b09e3",
    "bpm-beats-per-minute" : "https://a.ltrbxd.com/resized/sm/upload/jn/dw/3f/mx/azLtGx5ZhdTSP2b4oNLWtiE51OW-0-140-0-210-crop.jpg?k=392be8b518",
    "the-human-centipede-first-sequence" : "https://a.ltrbxd.com/resized/sm/upload/le/h6/9n/tu/rhy5WMyLVmYQ9PfEM60pg25E3TL-0-140-0-210-crop.jpg?k=b0288f8187",
    "evangelion-3010-thrice-upon-a-time" : "https://a.ltrbxd.com/resized/film-poster/2/0/8/8/7/7/208877-evangelion-3-0-1-0-thrice-upon-a-time-0-140-0-210-crop.jpg?k=c804def72b",
    "dude-wheres-my-car" : "https://a.ltrbxd.com/resized/film-poster/4/7/5/6/2/47562-dude-where-s-my-car--0-140-0-210-crop.jpg?k=980e4d7ed8",
    "evangelion-10-you-are-not-alone" : "https://a.ltrbxd.com/resized/film-poster/4/2/5/7/1/42571-evangelion-1-0-you-are-not-alone-0-140-0-210-crop.jpg?k=c1f816818b",
    "evangelion-20-you-can-not-advance" : "https://a.ltrbxd.com/resized/sm/upload/3p/l8/hu/en/odCtbDVJMxaqsTt4fIkM7UHZ8ta-0-140-0-210-crop.jpg?k=761c539565",
    "summer-of-soul-or-when-the-revolution-could-not-be-televised" : "https://a.ltrbxd.com/resized/film-poster/6/9/5/4/8/8/695488-summer-of-soul-or-when-the-revolution-could-not-be-televi-0-140-0-210-crop.jpg?k=352ba03a1a",
    "evangelion-30-you-can-not-redo" : "https://a.ltrbxd.com/resized/film-poster/6/2/0/9/1/62091-evangelion-3-0-you-can-not-redo-0-140-0-210-crop.jpg?k=7c53f170a5",
    "knife-heart" : "https://a.ltrbxd.com/resized/film-poster/4/0/7/2/0/6/407206-knife-heart-0-140-0-210-crop.jpg?k=7eb2f8ee1e",
    "quo-vadis-aida" : "https://a.ltrbxd.com/resized/film-poster/6/5/0/6/0/6/650606-quo-vadis-aida--0-140-0-210-crop.jpg?k=67947d4435",
    "whats-up-doc-1972" : "https://a.ltrbxd.com/resized/film-poster/4/8/1/6/5/48165-what-s-up-doc--0-140-0-210-crop.jpg?k=b825bde557",
    "hail-satan" : "https://a.ltrbxd.com/resized/film-poster/4/9/4/1/5/0/494150-hail-satan--0-140-0-210-crop.jpg?k=0a02bf29da",
    "what-about-bob" : "https://a.ltrbxd.com/resized/film-poster/4/6/3/7/8/46378-what-about-bob--0-140-0-210-crop.jpg?k=2a35cac217",
    "why-dont-you-play-in-hell" : "https://a.ltrbxd.com/resized/sm/upload/3c/k2/pv/29/h6ZGWLMv9Q1KzC9lB2rIQIm8jiG-0-140-0-210-crop.jpg?k=1ea2c9a9d4",
    "rec-2" : "https://a.ltrbxd.com/resized/sm/upload/xd/n3/o5/mv/g2Dk8ud1KpXt2RWzvcgsuuPkPuJ-0-140-0-210-crop.jpg?k=6a62c15571",
    "whats-your-number" : "https://a.ltrbxd.com/resized/sm/upload/lq/kx/kg/ub/yugpF7km9nEE5GLpSWMSkGKHGNd-0-140-0-210-crop.jpg?k=d6278890af",
    "who-killed-captain-alex" : "https://a.ltrbxd.com/resized/film-poster/2/4/3/6/0/5/243605-who-killed-captain-alex--0-140-0-210-crop.jpg?k=a02f85e2f9",
    "they-shoot-horses-dont-they" : "https://a.ltrbxd.com/resized/film-poster/3/3/8/8/6/33886-they-shoot-horses-don-t-they--0-140-0-210-crop.jpg?k=f13b6ffaa3",
    "what-happened-miss-simone" : "https://a.ltrbxd.com/resized/film-poster/2/4/5/2/5/4/245254-what-happened-miss-simone--0-140-0-210-crop.jpg?k=caa24ecacd",
    "the-human-centipede-2-full-sequence" : "https://a.ltrbxd.com/resized/sm/upload/6o/of/zh/bv/kFtAdkCO0vXN2RWu2oMcR9GZ9Hi-0-140-0-210-crop.jpg?k=29631f91eb",
    "everything-you-always-wanted-to-know-about-sex-but-were-afraid-to-ask" : "https://a.ltrbxd.com/resized/film-poster/4/5/1/2/7/45127-everything-you-always-wanted-to-know-about-sex-but-were-afraid-to-ask-0-140-0-210-crop.jpg?k=3fb044b467",
    "are-we-there-yet" : "https://a.ltrbxd.com/resized/sm/upload/7r/7v/3b/8x/e081B8b8ZCAfeyof5Zm6dNwJOmp-0-140-0-210-crop.jpg?k=65e0796b6c",
    "what-time-is-it-there" : "https://a.ltrbxd.com/resized/film-poster/3/6/9/7/2/36972-what-time-is-it-there--0-140-0-210-crop.jpg?k=102985e8a9",
    "october-ten-days-that-shook-the-world" : "https://a.ltrbxd.com/resized/film-poster/5/1/4/2/9/51429-october-0-140-0-210-crop.jpg?k=1d0b2769fd",
    "flesh-blood-1985" : "https://a.ltrbxd.com/resized/film-poster/4/4/3/0/6/44306-flesh-blood-0-140-0-210-crop.jpg?k=0ab532f73c",
    "patti-cakes" : "https://a.ltrbxd.com/resized/sm/upload/1f/l8/ft/mb/11MVxyp77zUPwc4cmqsUumNQYWK-0-140-0-210-crop.jpg?k=5c7fd9e3e3"
}

In [None]:
# Download the posters for the films whose URL had to be looked up by hand,
# saving each into the folder of the page it was listed on.
for i in range(start_page, end_page + 1):
    failures = page_num_to_failures[i]
    if len(failures) > 0:
        for movie in failures:
            if movie in failure_to_url:
                # Request the 460x690 variant instead of the 140x210 thumbnail.
                url = failure_to_url[movie].replace("0-140-0-210", "0-460-0-690")
                page = requests.get(url)
                if not page.ok:
                    # Do not save an HTTP error body under a .jpg name.
                    print("Failed to download " + movie + " (HTTP " + str(page.status_code) + ")")
                    continue
                with open(poster_folder_path + str(i) + "/" + movie + ".jpg", 'wb') as f:
                    f.write(page.content)
        print("Handled Page " + str(i))

Handled Page 2
Handled Page 3
Handled Page 8
Handled Page 9
Handled Page 14
Handled Page 17
Handled Page 18
Handled Page 21
Handled Page 25
Handled Page 26
Handled Page 27
Handled Page 35
Handled Page 36
Handled Page 39
Handled Page 41
Handled Page 43
Handled Page 45
Handled Page 48
Handled Page 49
Handled Page 50
Handled Page 53
Handled Page 54
Handled Page 55
Handled Page 56
Handled Page 57
Handled Page 60
Handled Page 63
Handled Page 64
Handled Page 71
Handled Page 73
Handled Page 78
Handled Page 80
Handled Page 83
Handled Page 95
Handled Page 98


In [None]:
# 6873 photos

In [None]:
# Map each film slug to its release year as found in the saved listing pages;
# films whose year attribute is empty are stored as -1.
movie_to_year = {}

for i in range(start_page, end_page + 1):
    with open(html_folder + str(i) + ".html", "r", encoding='utf-8') as f:
        text = f.read()
    # Iterate the page's own slug list without overwriting the global `movie_links`.
    for movie_link in page_num_to_movie_links[i]:
        # The slug is literal text, so escape it before building the pattern.
        year_pattern = ('data-poster-url="/film/' + re.escape(movie_link)
                        + '/image-150/" data-film-release-year="(.*)" data-new-list-with-film-action=')
        result = re.findall(year_pattern, text)
        if len(result) > 0:
            if result[0] == "":
                movie_to_year[movie_link] = -1
            else:
                movie_to_year[movie_link] = int(result[0])
        else:
            print("Failed on " + movie_link)
    print("Finished Page " + str(i))
# Report the dict built by this cell (the previous version printed
# movie_to_title here, a copy-paste leftover).
print(len(movie_to_year.keys()))
print(movie_to_year)

Finished Page 1
Finished Page 2
Finished Page 3
Finished Page 4
Finished Page 5
Finished Page 6
Finished Page 7
Finished Page 8
Finished Page 9
Finished Page 10
Finished Page 11
Finished Page 12
Finished Page 13
Finished Page 14
Finished Page 15
Finished Page 16
Finished Page 17
Finished Page 18
Finished Page 19
Finished Page 20
Finished Page 21
Finished Page 22
Finished Page 23
Finished Page 24
Finished Page 25
Finished Page 26
Finished Page 27
Finished Page 28
Finished Page 29
Finished Page 30
Finished Page 31
Finished Page 32
Finished Page 33
Finished Page 34
Finished Page 35
Finished Page 36
Finished Page 37
Finished Page 38
Finished Page 39
Finished Page 40
Finished Page 41
Finished Page 42
Finished Page 43
Finished Page 44
Finished Page 45
Finished Page 46
Finished Page 47
Finished Page 48
Finished Page 49
Finished Page 50
Finished Page 51
Finished Page 52
Finished Page 53
Finished Page 54
Finished Page 55
Finished Page 56
Finished Page 57
Finished Page 58
Finished Page 59
Finish

In [None]:
# Inspect the slug -> release year mapping (-1 marks an empty year attribute).
print(movie_to_year)

{'parasite-2019': 2019, 'joker-2019': 2019, 'knives-out-2019': 2019, 'pulp-fiction': 1994, 'inception': 2010, 'get-out-2017': 2017, 'midsommar': 2019, 'fight-club': 1999, 'spider-man-into-the-spider-verse': 2018, 'once-upon-a-time-in-hollywood': 2019, 'the-dark-knight': 2008, 'lady-bird': 2017, 'la-la-land': 2016, 'the-grand-budapest-hotel': 2014, 'interstellar': 2014, 'baby-driver': 2017, 'inglourious-basterds': 2009, 'the-shining': 1980, 'avengers-infinity-war': 2018, 'the-wolf-of-wall-street': 2013, 'black-panther': 2018, 'jojo-rabbit': 2019, 'whiplash-2014': 2014, 'avengers-endgame': 2019, 'gone-girl': 2014, 'her': 2013, 'spirited-away': 2001, 'arrival-2016': 2016, 'mad-max-fury-road': 2015, 'eternal-sunshine-of-the-spotless-mind': 2004, 'guardians-of-the-galaxy': 2014, 'django-unchained': 2012, 'the-social-network': 2010, 'the-silence-of-the-lambs': 1991, 'uncut-gems': 2019, 'little-women-2019': 2019, 'call-me-by-your-name': 2017, 'kill-bill-vol-1': 2003, 'the-lighthouse-2019': 20

In [None]:
# with open("movie_to_year.json", "w") as outfile:
#     json.dump(movie_to_year, outfile)

In [None]:
# Re-check the page -> slugs mapping that is inverted in the next cell.
print(page_num_to_movie_links)

{1: ['parasite-2019', 'joker-2019', 'knives-out-2019', 'pulp-fiction', 'inception', 'get-out-2017', 'midsommar', 'fight-club', 'spider-man-into-the-spider-verse', 'once-upon-a-time-in-hollywood', 'the-dark-knight', 'lady-bird', 'la-la-land', 'the-grand-budapest-hotel', 'interstellar', 'baby-driver', 'inglourious-basterds', 'the-shining', 'avengers-infinity-war', 'the-wolf-of-wall-street', 'black-panther', 'jojo-rabbit', 'whiplash-2014', 'avengers-endgame', 'gone-girl', 'her', 'spirited-away', 'arrival-2016', 'mad-max-fury-road', 'eternal-sunshine-of-the-spotless-mind', 'guardians-of-the-galaxy', 'django-unchained', 'the-social-network', 'the-silence-of-the-lambs', 'uncut-gems', 'little-women-2019', 'call-me-by-your-name', 'kill-bill-vol-1', 'the-lighthouse-2019', 'scott-pilgrim-vs-the-world', 'the-truman-show', '1917', 'hereditary', 'moonlight-2016', 'the-godfather', 'thor-ragnarok', 'soul-2020', 'forrest-gump', 'the-matrix', 'se7en', 'marriage-story-2019', 'dunkirk-2017', 'us-2019', '

In [None]:
# Invert page -> slugs into slug -> page. A slug listed on more than one page
# keeps the page that is visited last.
movie_to_page_num = {
    movie: page_num
    for page_num, movies in page_num_to_movie_links.items()
    for movie in movies
}

In [None]:
# Inspect the slug -> page number mapping.
print(movie_to_page_num)

{'parasite-2019': 1, 'joker-2019': 1, 'knives-out-2019': 1, 'pulp-fiction': 1, 'inception': 1, 'get-out-2017': 1, 'midsommar': 1, 'fight-club': 1, 'spider-man-into-the-spider-verse': 1, 'once-upon-a-time-in-hollywood': 1, 'the-dark-knight': 1, 'lady-bird': 1, 'la-la-land': 1, 'the-grand-budapest-hotel': 1, 'interstellar': 1, 'baby-driver': 1, 'inglourious-basterds': 1, 'the-shining': 1, 'avengers-infinity-war': 1, 'the-wolf-of-wall-street': 1, 'black-panther': 1, 'jojo-rabbit': 1, 'whiplash-2014': 1, 'avengers-endgame': 1, 'gone-girl': 1, 'her': 1, 'spirited-away': 1, 'arrival-2016': 1, 'mad-max-fury-road': 1, 'eternal-sunshine-of-the-spotless-mind': 1, 'guardians-of-the-galaxy': 1, 'django-unchained': 1, 'the-social-network': 1, 'the-silence-of-the-lambs': 1, 'uncut-gems': 1, 'little-women-2019': 1, 'call-me-by-your-name': 1, 'kill-bill-vol-1': 1, 'the-lighthouse-2019': 1, 'scott-pilgrim-vs-the-world': 1, 'the-truman-show': 1, '1917': 1, 'hereditary': 1, 'moonlight-2016': 1, 'the-godf

In [None]:
# with open("movie_to_page_num.json", "w") as outfile:
#     json.dump(movie_to_page_num, outfile)

# Data Augmentation

https://imerit.net/blog/13-best-movie-data-sets-for-machine-learning-projects-all-pbm/

We combine the Letterboxd data scraped above with three other datasets, from Rotten Tomatoes, TMDB, and IMDb, along with my own Letterboxd ratings and Roger Ebert's ratings.

### Letterboxd Data from above as JSON to CSV (Pandas)

In [None]:
def get_rating_counts_in_order(movie_ratings):
    """Return the ten rating counts ordered from rating 1 up to rating 10.

    Args:
        movie_ratings: mapping indexed by the string keys "1" through "10".

    Returns:
        A list of the ten values, in ascending rating order.

    Raises:
        KeyError: if any of the ten string keys is missing.
    """
    return [movie_ratings[str(score)] for score in range(1, 11)]

In [None]:
# One row per movie: id, data[3], data[0], data[1], followed by the ten
# rating counts (rating 1 .. rating 10) taken from data[2].
letterboxd_data = []
for movieid, data in movieid_to_data.items():
    letterboxd_data.append(
        [movieid, data[3], data[0], data[1], *get_rating_counts_in_order(data[2])]
    )

In [None]:
print(letterboxd_data[0])

['parasite-2019', 1, 'Parasite', 2019, 850, 1502, 711, 3900, 3798, 23756, 31679, 167575, 172909, 547471]


In [None]:
print(letterboxd_data)

[['parasite-2019', 1, 'Parasite', 2019, 850, 1502, 711, 3900, 3798, 23756, 31679, 167575, 172909, 547471], ['joker-2019', 1, 'Joker', 2019, 5030, 11400, 8768, 37878, 35134, 118256, 109299, 248930, 102271, 188145], ['knives-out-2019', 1, 'Knives Out', 2019, 1011, 2965, 1897, 12495, 13468, 77147, 99389, 271924, 123158, 157946], ['pulp-fiction', 1, 'Pulp Fiction', 1994, 1660, 3821, 1626, 10863, 7688, 51895, 47635, 192128, 117437, 313059], ['inception', 1, 'Inception', 2010, 1250, 3550, 2063, 13735, 11410, 71166, 69034, 232098, 117643, 245070], ['get-out-2017', 1, 'Get Out', 2017, 1071, 2865, 1673, 10046, 9612, 60424, 74074, 241095, 111921, 212312], ['midsommar', 1, 'Midsommar', 2019, 4425, 9056, 5303, 23798, 21271, 81963, 88044, 198784, 77215, 115478], ['fight-club', 1, 'Fight Club', 1999, 1069, 2709, 1501, 9343, 7963, 50079, 53178, 191481, 110863, 240121], ['spider-man-into-the-spider-verse', 1, 'Spider-Man: Into the Spider-Verse', 2018, 506, 1626, 678, 4438, 4394, 31460, 42291, 167701, 

In [None]:
# Four descriptive columns, then one integer-named column per rating 1..10.
letterboxd_cols = ['movie_id', 'poster_path', 'title', 'year'] + list(range(1, 11))
letterboxd_df = pd.DataFrame(data=letterboxd_data, columns=letterboxd_cols)

In [None]:
letterboxd_df

Unnamed: 0,movie_id,poster_path,title,year,1,2,3,4,5,6,7,8,9,10
0,parasite-2019,1,Parasite,2019,850,1502,711,3900,3798,23756,31679,167575,172909,547471
1,joker-2019,1,Joker,2019,5030,11400,8768,37878,35134,118256,109299,248930,102271,188145
2,knives-out-2019,1,Knives Out,2019,1011,2965,1897,12495,13468,77147,99389,271924,123158,157946
3,pulp-fiction,1,Pulp Fiction,1994,1660,3821,1626,10863,7688,51895,47635,192128,117437,313059
4,inception,1,Inception,2010,1250,3550,2063,13735,11410,71166,69034,232098,117643,245070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6828,2-days-in-paris,100,2 Days in Paris,2007,44,106,118,440,713,1891,1805,1452,231,246
6829,blancanieves-2012,100,Blancanieves,2012,14,37,40,153,257,768,1178,1846,694,498
6830,grumpy-old-men,100,Grumpy Old Men,1993,24,112,111,632,932,3260,2326,1678,274,387
6831,dead-snow-2-red-vs-dead,100,Dead Snow 2: Red vs. Dead,2014,100,192,202,590,857,2093,2051,1749,386,302


## Critic Reviews!!

Rotten Tomatoes

https://www.kaggle.com/stefanoleone992/rotten-tomatoes-movies-and-critic-reviews-dataset?select=rotten_tomatoes_critic_reviews.csv

In [None]:
# Load the Rotten Tomatoes movies CSV from the mounted Drive folder.
rt_path = '/content/drive/My Drive/cs229_proj/rotten_tomatoes_movies.csv'
rt_df = pd.read_csv(rt_path)
# Display the loaded frame as the cell output.
rt_df

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,streaming_release_date,runtime,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2015-11-25,119.0,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,2012-09-04,90.0,Sony Pictures Classics,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19
2,m/10,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,2014-07-24,122.0,Waner Bros.,Fresh,67.0,24.0,Spilled,53.0,14684.0,2,16,8
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,2017-01-13,95.0,Criterion Collection,Certified-Fresh,100.0,54.0,Upright,97.0,105386.0,6,54,0
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,2016-06-10,127.0,Disney,Fresh,89.0,27.0,Upright,74.0,68918.0,5,24,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17707,m/zoot_suit,Zoot Suit,Mexican-American gangster Henry Reyna (Daniel ...,,R,"Drama, Musical & Performing Arts",Luis Valdez,Luis Valdez,"Daniel Valdez, Edward James Olmos, Charles Aid...",1981-10-02,2016-04-19,104.0,MCA Universal Home Video,Rotten,56.0,9.0,Upright,74.0,1195.0,2,5,4
17708,m/zootopia,Zootopia,From the largest elephant to the smallest shre...,The brilliantly well-rounded Zootopia offers a...,PG,"Action & Adventure, Animation, Comedy","Byron Howard, Rich Moore, Jared Bush","Jared Bush, Phil Johnston","J.K. Simmons, Kristen Bell, Octavia Spencer, A...",2016-03-04,2016-06-07,108.0,Walt Disney Animation Studios,Certified-Fresh,98.0,291.0,Upright,92.0,101511.0,50,285,7
17709,m/zorba_the_greek,Zorba the Greek,Traveling to inspect an abandoned mine his fat...,,NR,"Action & Adventure, Art House & International,...",,,"Anthony Quinn, Alan Bates, Irene Papas, Lila K...",1964-12-17,2015-11-25,142.0,Fox,Fresh,80.0,10.0,Upright,86.0,7146.0,0,8,2
17710,m/zulu,Zulu,"In 1879, the Zulu nation hands colonial Britis...",Zulu patiently establishes a cast of colorful ...,PG,"Classics, Drama","Cy Endfield, Cyril Endfield","Cy Endfield, John Prebble","Stanley Baker, Jack Hawkins, Ulla Jacobsson, J...",1964-06-17,2017-01-08,135.0,Paramount Pictures,Fresh,96.0,23.0,Upright,91.0,30193.0,6,22,1


In [None]:
rt_df.columns

Index(['rotten_tomatoes_link', 'movie_title', 'movie_info',
       'critics_consensus', 'content_rating', 'genres', 'directors', 'authors',
       'actors', 'original_release_date', 'streaming_release_date', 'runtime',
       'production_company', 'tomatometer_status', 'tomatometer_rating',
       'tomatometer_count', 'audience_status', 'audience_rating',
       'audience_count', 'tomatometer_top_critics_count',
       'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count'],
      dtype='object')

In [None]:
# Plain Python lists of the title / date (or year) columns of both frames.
rt_titles, rt_release = (
    rt_df[col].tolist() for col in ("movie_title", "original_release_date")
)
movies_full_titles, movies_full_year = (
    movies_full_df[col].tolist() for col in ("title", "year")
)

In [None]:
# Build the set of (title, year) pairs for Rotten Tomatoes. The year is the
# first four characters of the release-date string; entries whose release date
# is a float (missing value) are stored unchanged and counted.
rt_pairs = set()
nan_count = 0
for title, release in zip(rt_titles, rt_release):
    if type(release) is float:
        nan_count += 1
        rt_pairs.add((title, release))
    else:
        rt_pairs.add((title, int(release[:4])))
print(nan_count)

print(len(rt_pairs))

1166
17708


In [None]:
# Set of (title, year) pairs present in movies_full_df.
full_pairs = set(zip(movies_full_titles, movies_full_year))

print(len(full_pairs))

2220


In [None]:
def update_rt_year(row):
    """Return the release year of a Rotten Tomatoes row, or NaN when missing.

    Args:
        row: mapping/Series with an 'original_release_date' entry.

    Returns:
        int built from the first four characters of the date value, or
        float('nan') when the date value is a float (how a missing date
        shows up in the loaded frame).
    """
    release_date = row['original_release_date']
    if type(release_date) is float:
        return float('nan')
    return int(release_date[:4])

# RIP to the 1166 NAN rows
# Derive an integer `year` column and drop the rows that have no release date.
rt_df["year"] = rt_df.apply(update_rt_year, axis=1)
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
rt_df = rt_df[rt_df["year"].notna()].copy()
rt_df["year"] = rt_df["year"].astype(int)
rt_df

In [None]:
# (title, year) pairs found in both Rotten Tomatoes and movies_full_df.
intersect_rt_pairs = rt_pairs & full_pairs
print(len(intersect_rt_pairs))

1891


In [None]:
# (title, year) pairs of movies_full_df that have no Rotten Tomatoes match.
missing_full = full_pairs - rt_pairs
print(len(missing_full))
# Hard-coded difference between two counts printed elsewhere in this notebook
# (2030 title-only matches vs 1891 title+year matches).
print(2030 - 1891)
print(missing_full)

329
139
{('Monsoon Wedding', 2001), ('The Little Prince', 2015), ('Shortbus', 2006), ('Brüno', 2009), ('Mission: Impossible II', 2000), ('The Secret of Kells', 2009), ('Allegiant', 2016), ('Star Trek III: The Search for Spock', 1984), ('Fish Tank', 2009), ('Christmas with the Kranks', 2004), ('Just Married', 2003), ('Sense and Sensibility', 1995), ('Children of Heaven', 1997), ('Spirit: Stallion of the Cimarron', 2002), ('The Fault in Our Stars', 2014), ('Incendies', 2010), ('The Avengers', 2012), ('Enter the Void', 2009), ('Ip Man 3', 2015), ('Dogtooth', 2009), ('Dirty Pretty Things', 2002), ('Downfall', 2004), ('Seventh Son', 2014), ('The Land Before Time', 1988), ('Daybreakers', 2009), ('The Godfather: Part III', 1990), ('Ponyo', 2008), ('Hansel & Gretel: Witch Hunters', 2013), ('Star Trek V: The Final Frontier', 1989), ("River's Edge", 1986), ('The Man with the Iron Fists', 2012), ('J. Edgar', 2011), ("Meek's Cutoff", 2010), ("Porky's", 1981), ('Extremely Loud & Incredibly Close', 

In [None]:
def update_with_rt(movies_df, old_df, title, year, old_field, target_field):
    """Copy one field of the matching source row onto the matching target rows.

    The source row is the first row of `old_df` whose 'movie_title' and 'year'
    equal `title` and `year`; its `old_field` value is written, in place, into
    `target_field` of every `movies_df` row whose 'title' and 'year' match.

    Raises:
        IndexError: if `old_df` has no row for (title, year).
    """
    source_rows = (old_df['movie_title'] == title) & (old_df['year'] == year)
    value = old_df.loc[source_rows, old_field].iloc[0]

    target_rows = (movies_df['title'] == title) & (movies_df['year'] == year)
    movies_df.loc[target_rows, target_field] = value

In [None]:
# Rotten Tomatoes source columns and the movies_full_df columns they are copied
# into. The two lists are parallel: the same index refers to the same field.
old_fields = ["movie_info", "critics_consensus", "actors", "tomatometer_status", "tomatometer_rating",
              "tomatometer_count", "audience_status", "audience_rating", "audience_count", "tomatometer_top_critics_count",
              "tomatometer_fresh_critics_count", "tomatometer_rotten_critics_count"]
new_fields = ["rt_info", "critics_consensus", "actors", "tm_status", "tm_rating",
              "tm_count", "audience_status", "audience_rating", "audience_count", "tm_top_critics_count",
              "tm_fresh_critics_count", "tm_rotten_critics_count"]
assert len(old_fields) == len(new_fields), "old_fields and new_fields must stay parallel"

# Target columns that receive text values. They are created with object dtype
# so that writing strings into them later does not rely on pandas silently
# upcasting a float64 column (deprecated since pandas 2.1).
text_fields = {"rt_info", "critics_consensus", "actors", "tm_status", "audience_status"}

for new_field in new_fields:
    if new_field in text_fields:
        movies_full_df[new_field] = pd.Series(np.nan, index=movies_full_df.index, dtype=object)
    else:
        movies_full_df[new_field] = np.nan
movies_full_df

In [None]:
# For every (title, year) shared with Rotten Tomatoes, copy each source field
# into its corresponding movies_full_df column.
for title, year in intersect_rt_pairs:
    for old_field, new_field in zip(old_fields, new_fields):
        update_with_rt(movies_full_df, rt_df, title, year, old_field, new_field)

In [None]:
movies_full_df

In [None]:
# movies_full_df.to_csv('/content/drive/My Drive/cs229_proj/movies_full.csv')

In [None]:
# Titles shared by both datasets when the year is ignored.
rt_intersect_titles = set(movies_full_titles) & set(rt_titles)
print(len(rt_intersect_titles))
print(len(movies_full_titles))

2030
2220


## Jesse Doan

https://letterboxd.com/jdoan/

Web scraping my own Letterboxd profile for user ratings. As with the movie IDs and posters above, this required manually downloading a batch of HTML files. Conveniently, Letterboxd lets you filter a user's films by an exact rating, so we only need to extract the movie IDs from each of those filtered pages and keep careful track of which rating each page belongs to. And since I've only seen ~800 films, brute-forcing it is not too hard.

In [None]:
# Number of downloaded HTML pages per rating, listed for rating 1 through 10.
page_counts = [2, 2, 3, 5, 5, 7, 8, 8, 5, 3]
# Total number of pages across all ratings.
print(sum(page_counts))

48


In [None]:
movies = []   # one list of movie ids per rating, from rating 1 to rating 10
lengths = []  # number of movie ids collected per rating
ratings = list(range(1, 11))
page_counts = [2, 2, 3, 5, 5, 7, 8, 8, 5, 3]

jesse_folder = '/content/drive/My Drive/cs229_proj/jesse_html/'

# Pages are stored as "<rating>-<page>.html"; collect every film id on each.
for rating, n_pages in zip(ratings, page_counts):
    cur_movie = []
    for page in range(1, n_pages + 1):
        page_path = f"{jesse_folder}{rating}-{page}.html"
        with open(page_path, "r", encoding='utf-8') as f:
            cur_movie += re.findall('data-film-link="/film/(.*)/" data-film-in-watchlist', f.read())
    movies.append(cur_movie)
    lengths.append(len(cur_movie))

print(lengths)
print(sum(lengths))

[19, 26, 40, 73, 79, 111, 139, 140, 84, 51]
762


In [None]:
# Flatten the per-rating lists into movie id -> rating. If an id shows up
# under several ratings, the highest-indexed (last processed) rating wins.
jesse_movie_to_rating = {}
for rating, cur_movies in zip(ratings, movies):
    jesse_movie_to_rating.update(dict.fromkeys(cur_movies, rating))

In [None]:
print(jesse_movie_to_rating)

{'the-lion-king-2019': 1, 'aladdin-2019': 1, 'mulan-2020': 1, 'x-men-origins-wolverine': 1, 'the-kissing-booth': 1, 'the-last-airbender': 1, 'hes-all-that': 1, 'knock-knock-2015': 1, 'movie-43': 1, 'elektra': 1, 'birdemic-shock-and-terror': 1, 'gods-not-dead': 1, 'zoom': 1, 'space-chimps': 1, 'f-the-prom': 1, 'inspector-gadget-2': 1, 'the-secret-2006': 1, 'finding-jesus': 1, 'wiener-dog-internationals': 1, 'suicide-squad-2016': 2, 'star-wars-episode-i-the-phantom-menace': 2, 'the-secret-life-of-pets': 2, 'host-2020': 2, 'transformers-revenge-of-the-fallen': 2, 'green-lantern': 2, 'unfriended': 2, 'scooby-doo-2-monsters-unleashed': 2, 'alvin-and-the-chipmunks': 2, 'no-strings-attached': 2, 'jingle-all-the-way': 2, 'the-circle-2017': 2, 'alvin-and-the-chipmunks-the-squeakquel': 2, 'gi-joe-the-rise-of-cobra': 2, 'battleship': 2, 'gi-joe-retaliation': 2, 'mulan-ii': 2, 'inspector-gadget': 2, 'project-almanac': 2, 'spy-kids-all-the-time-in-the-world': 2, 'underdog': 2, 'cyberbully': 2, 'the

In [None]:
# with open("movie_to_jesse_rating.json", "w") as outfile:
#     json.dump(jesse_movie_to_rating, outfile)

## Roger Ebert

https://letterboxd.com/ebert_roger/

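Web-scraped the Letterboxd user ratings of Roger Ebert (the famous film critic). The approach is the same as for my own profile (Jesse Doan), except that we only handle the popular movies covered by our scraped Letterboxd dataset (Ebert has seen A LOT of movies). Note: the cells in this section are not in execution order. The scraping cells that build `movie_links_1` … `movie_links_10` appear further down and must be run first, then the cells that build `movie_to_ebert_rating` and `ebert_intersect_full`, and only then the cell that fills in `ebert_rating`.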

In [None]:
# Add an `ebert_rating` column filled with NaN; it is populated per movie id
# by a later loop over ebert_intersect_full.
movies_full_df["ebert_rating"] = np.nan
# movies_full_df = movies_full_df.drop('Unnamed: 0', 1)
# movies_full_df = movies_full_df.drop('Unnamed: 0.1', 1)
movies_full_df

In [None]:
# Write Ebert's rating onto every movies_full_df row with a matching movie_id.
# NOTE: ebert_intersect_full and movie_to_ebert_rating are assigned in cells
# further down this notebook, so those cells must have been run first.
for movieid in ebert_intersect_full:
    is_movie = movies_full_df['movie_id'] == movieid
    movies_full_df.loc[is_movie, 'ebert_rating'] = movie_to_ebert_rating[movieid]

In [None]:
# movies_full_df.to_csv('movies_full.csv')

In [None]:
# Ebert's star ratings (out of 4) and the scraped list that holds each one:
#   movie_links_1  -> 0.5/4    movie_links_6  -> 2.5/4
#   movie_links_3  -> 1/4      movie_links_8  -> 3/4
#   movie_links_4  -> 1.5/4    movie_links_9  -> 3.5/4
#   movie_links_5  -> 2/4      movie_links_10 -> 4/4
ebert_rating_movies = [movie_links_1, movie_links_3, movie_links_4, movie_links_5, movie_links_6, movie_links_8, movie_links_9, movie_links_10]
ebert_ratings = [0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4]

# Print the list sizes from the highest rating (4/4) down to the lowest (0.5/4).
for links in reversed(ebert_rating_movies):
    print(len(links))

846
630
900
450
450
180
180
90


In [None]:
# Flatten the per-rating lists into movie id -> Ebert star rating. If an id
# appears in several lists, the rating of the later list wins.
movie_to_ebert_rating = {}
for rating, cur_movies in zip(ebert_ratings, ebert_rating_movies):
    movie_to_ebert_rating.update(dict.fromkeys(cur_movies, rating))

In [None]:
print(movie_to_ebert_rating)
print(len(movie_to_ebert_rating))

# with open("movie_to_ebert_rating.json", "w") as outfile:
#     json.dump(movie_to_ebert_rating, outfile)

{'hellraiser': 0.5, 'charlies-angels': 0.5, 'the-last-airbender': 0.5, 'friday-the-13th-part-2': 0.5, 'pink-flamingos': 0.5, 'the-devils': 0.5, 'the-beyond': 0.5, 'resident-evil-apocalypse': 0.5, 'jason-x': 0.5, 'the-texas-chainsaw-massacre': 0.5, 'hellbound-hellraiser-ii': 0.5, 'josie-and-the-pussycats': 0.5, 'police-academy': 0.5, 'the-hot-chick': 0.5, 'freddy-got-fingered': 0.5, 'the-hitcher': 0.5, 'the-doom-generation': 0.5, 'wolf-creek': 0.5, 'death-race': 0.5, 'cant-buy-me-love': 0.5, 'i-spit-on-your-grave-2010': 0.5, 'battle-los-angeles': 0.5, 'the-life-of-david-gale': 0.5, 'death-race-2000': 0.5, 'jaws-the-revenge': 0.5, 'the-human-centipede-2-full-sequence': 0.5, 'spice-world': 0.5, 'day-of-the-woman': 0.5, 'ishtar': 0.5, 'the-good-son': 0.5, 'battlefield-earth': 0.5, 'caligula': 0.5, 'mighty-morphin-power-rangers-the-movie': 0.5, 'revolver': 0.5, 'death-to-smoochy': 0.5, 'the-limits-of-control': 0.5, 'rosencrantz-guildenstern-are-dead': 0.5, 'the-ghost-and-the-darkness': 0.5,

In [None]:
# Sanity check: the two parallel lists should have the same length.
print(len(ebert_rating_movies))
print(len(ebert_ratings))
# Hard-coded per-rating list sizes (the values printed when the lists were built).
counts = [846, 630, 900, 450, 450, 180, 180, 90]
print(sum(counts))

8
8
3726


In [None]:
# Every movie id Ebert rated, across all of the per-rating lists.
ebert_movies_set = set().union(*ebert_rating_movies)

In [None]:
# Movie-id sets of the scraped Letterboxd data and of movies_full_df.
json_movie_set = set(movieid_to_data.keys())
full_movie_set = set(movies_full_df["movie_id"].tolist())

# Sizes of the Ebert, scraped-Letterboxd and movies_full_df id sets, in that order.
for id_set in (ebert_movies_set, json_movie_set, full_movie_set):
    print(len(id_set))

3726
6833
2220


In [None]:
# Ebert-rated movies that also exist in the scraped Letterboxd data ...
ebert_intersect_json = json_movie_set & ebert_movies_set
print(len(ebert_intersect_json))

# ... and in movies_full_df.
ebert_intersect_full = full_movie_set & ebert_movies_set
print(len(ebert_intersect_full))

# Hard-coded ratio of two of the counts printed in this section (1577 / 2220).
print(1577/2220)

2914
1577
0.7103603603603603


In [None]:
testing = 'bulletproof-monk'# 'somewhere-in-time'# 'the-bone-collector'# 'country-strong'# 'tequila-sunrise'# 'guess-who'# 'dragnet-1987'# 'under-siege-2-dark-territory'# 'the-medallion' # 'air-bud'# 'boy-a' # 'red-rock-west' # 'decalogue-ii' # 'kinsey' 
def try_word(word):
    """Report whether `word` is a key of the global `movieid_to_data`.

    When it is, prints "Folder" with element 3 of that movie's entry, then
    "Yeet"; otherwise prints "Nah dog". Returns None.
    """
    if word not in movieid_to_data:
        print("Nah dog")
        return
    print("Folder", movieid_to_data[word][3])
    print("Yeet")
try_word(testing)

Nah dog


In [None]:
# 4/4 ratings - stop after popularity reaches page 100 (somehow)
# Pages are read from ebert10/10-<page>.html.
ebert10_folder = '/content/drive/My Drive/cs229_proj/ebert_html/ebert10/'
start_file = '10-'
start_page, end_page = 1, 47

movie_links_10 = []
lengths = []  # reset here; not filled by this cell

for i in range(start_page, end_page + 1):
    page_path = f"{ebert10_folder}{start_file}{i}.html"
    with open(page_path, "r", encoding='utf-8') as f:
        text = f.read()
    result = re.findall('data-film-link="/film/(.*)/" data-film-in-watchlist', text)
    movie_links_10 += result
    if len(result) < 18:
        # fewer than 18 matches on a page is reported
        print("Fail on ", i)
# unique ids collected vs 18 per page for every page read
print(len(set(movie_links_10)))
print(18 * (end_page - start_page + 1))
print(movie_links_10)

846
846
['pulp-fiction', 'inception', 'the-dark-knight', 'the-shining', 'inglourious-basterds', 'spirited-away', 'eternal-sunshine-of-the-spotless-mind', 'the-silence-of-the-lambs', 'the-social-network', 'kill-bill-vol-1', 'the-truman-show', 'forrest-gump', 'the-godfather', 'se7en', 'star-wars', 'taxi-driver', 'the-shawshank-redemption', 'the-empire-strikes-back', 'ratatouille', '2001-a-space-odyssey', 'goodfellas', 'no-country-for-old-men', 'up', 'blade-runner', 'iron-man-2008', 'zodiac', 'alien', 'toy-story', 'titanic-1997', 'psycho', 'lost-in-translation', 'harry-potter-and-the-philosophers-stone', 'the-big-lebowski', 'finding-nemo', 'fargo', 'return-of-the-jedi', 'my-neighbor-totoro', 'american-beauty', 'batman-begins', 'the-departed', 'jaws', 'mulholland-drive', 'schindlers-list', 'avatar', 'pans-labyrinth', 'shrek', 'harry-potter-and-the-chamber-of-secrets', '500-days-of-summer', 'apocalypse-now', 'saving-private-ryan', 'the-godfather-part-ii', 'oldboy', 'raiders-of-the-lost-ark'

In [None]:
# 3.5/4 ratings - stop after popularity reaches page 100 (somehow)
# Pages are read from ebert9/9-<page>.html.
ebert9_folder = '/content/drive/My Drive/cs229_proj/ebert_html/ebert9/'
start_file = '9-'
start_page, end_page = 1, 35

movie_links_9 = []
lengths = []  # reset here; not filled by this cell

for i in range(start_page, end_page + 1):
    page_path = f"{ebert9_folder}{start_file}{i}.html"
    with open(page_path, "r", encoding='utf-8') as f:
        text = f.read()
    result = re.findall('data-film-link="/film/(.*)/" data-film-in-watchlist', text)
    movie_links_9 += result
    if len(result) < 18:
        # fewer than 18 matches on a page is reported
        print("Fail on ", i)
# unique ids collected vs 18 per page for every page read
print(len(set(movie_links_9)))
print(18 * (end_page - start_page + 1))
print(movie_links_9)

630
630
['black-swan', 'shutter-island', 'drive-2011', 'back-to-the-future', 'walle', 'the-perks-of-being-a-wallflower', 'the-incredibles', 'the-lord-of-the-rings-the-return-of-the-king', 'harry-potter-and-the-prisoner-of-azkaban', 'fantastic-mr-fox', 'moonrise-kingdom', 'the-lion-king', 'there-will-be-blood', 'amelie', 'requiem-for-a-dream', 'kill-bill-vol-2', 'harry-potter-and-the-deathly-hallows-part-2', 'harry-potter-and-the-goblet-of-fire', 'little-miss-sunshine', 'star-wars-episode-iii-revenge-of-the-sith', 'superbad', 'star-wars-episode-i-the-phantom-menace', 'toy-story-2', 'the-nightmare-before-christmas', 'clueless', 'the-royal-tenenbaums', 'silver-linings-playbook', 'eyes-wide-shut', 'school-of-rock', 'the-pianist', 'midnight-in-paris', 'the-princess-bride', 'ghostbusters', 'terminator-2-judgment-day', 'mulan', 'aliens', 'before-sunset', 'punch-drunk-love', 'the-hangover', 'the-amazing-spider-man', 'the-girl-with-the-dragon-tattoo-2011', 'the-green-mile', 'easy-a', 'heat-1995

In [None]:
# 3/4 ratings - stop after popularity reaches page 100 (somehow)
# Pages are read from ebert8/8-<page>.html.
ebert8_folder = '/content/drive/My Drive/cs229_proj/ebert_html/ebert8/'
start_file = '8-'
start_page, end_page = 1, 50

movie_links_8 = []
lengths = []  # reset here; not filled by this cell

for i in range(start_page, end_page + 1):
    page_path = f"{ebert8_folder}{start_file}{i}.html"
    with open(page_path, "r", encoding='utf-8') as f:
        text = f.read()
    result = re.findall('data-film-link="/film/(.*)/" data-film-in-watchlist', text)
    movie_links_8 += result
    if len(result) < 18:
        # fewer than 18 matches on a page is reported
        print("Fail on ", i)
# unique ids collected vs 18 per page for every page read
print(len(set(movie_links_8)))
print(18 * (end_page - start_page + 1))
print(movie_links_8)

900
900
['the-matrix', 'the-avengers-2012', 'the-lord-of-the-rings-the-fellowship-of-the-ring', 'american-psycho', 'jurassic-park', 'memento', 'the-breakfast-club', 'the-dark-knight-rises', 'the-lord-of-the-rings-the-two-towers', 'good-will-hunting', 'monsters-inc', 'the-prestige', 'captain-america-the-first-avenger', 'catch-me-if-you-can-2002', 'ferris-buellers-day-off', 'toy-story-3', 'shaun-of-the-dead', 'the-hunger-games', 'before-sunrise', 'iron-man-2', 'mean-girls', 'trainspotting', 'scream', 'harry-potter-and-the-deathly-hallows-part-1', 'harry-potter-and-the-half-blood-prince', 'coraline', 'pirates-of-the-caribbean-the-curse-of-the-black-pearl', 'zombieland', 'v-for-vendetta', 'in-the-mood-for-love', 'the-sixth-sense', 'shrek-2', 'oceans-eleven-2001', 'the-cabin-in-the-woods', 'jennifers-body', 'charlie-and-the-chocolate-factory', 'back-to-the-future-part-ii', 'how-to-train-your-dragon', 'wreck-it-ralph', '21-jump-street', 'chungking-express', 'brave-2012', 'aladdin', 'cars', '

In [None]:
# 2.5/4 ratings - stop after popularity reaches page 100 (somehow)
# Pages are read from ebert6/6-<page>.html.
ebert6_folder = '/content/drive/My Drive/cs229_proj/ebert_html/ebert6/'
start_file = '6-'
start_page, end_page = 1, 25

movie_links_6 = []
lengths = []  # reset here; not filled by this cell

for i in range(start_page, end_page + 1):
    page_path = f"{ebert6_folder}{start_file}{i}.html"
    with open(page_path, "r", encoding='utf-8') as f:
        text = f.read()
    result = re.findall('data-film-link="/film/(.*)/" data-film-in-watchlist', text)
    movie_links_6 += result
    if len(result) < 18:
        # fewer than 18 matches on a page is reported
        print("Fail on ", i)
# unique ids collected vs 18 per page for every page read
print(len(set(movie_links_6)))
print(18 * (end_page - start_page + 1))
print(movie_links_6)
# last id collected
print(movie_links_6[-1])

450
450
['reservoir-dogs', 'donnie-darko', '10-things-i-hate-about-you', 'howls-moving-castle', 'spider-man', 'leon-the-professional', 'the-thing', 'home-alone', 'harry-potter-and-the-order-of-the-phoenix', 'full-metal-jacket', 'twilight-2008', 'the-curious-case-of-benjamin-button', 'the-master-2012', 'the-rocky-horror-picture-show', 'heathers', 'x-men-first-class', 'the-life-aquatic-with-steve-zissou', 'big-fish', 'the-incredible-hulk', 'rushmore', 'back-to-the-future-part-iii', 'o-brother-where-art-thou', 'x-men', 'girl-interrupted', 'the-intouchables', 'madagascar', 'star-trek', 'top-gun', 'independence-day', 'shrek-the-third', 'insidious', 'once-upon-a-time-in-the-west', 'the-twilight-saga-breaking-dawn-part-2', 'cloudy-with-a-chance-of-meatballs', 'the-others', 'the-twilight-saga-breaking-dawn-part-1', 'mrs-doubtfire', 'taken', 'to-kill-a-mockingbird', 'limitless', 'the-butterfly-effect', 'the-untouchables', 'oceans-thirteen', 'the-blues-brothers', 'mission-impossible-iii', 'austi

In [None]:
# 2/4 ratings - stop after popularity reaches page 100 (somehow)
# Pages are read from ebert5/5-<page>.html.
ebert5_folder = '/content/drive/My Drive/cs229_proj/ebert_html/ebert5/'
start_file = '5-'
start_page, end_page = 1, 25

movie_links_5 = []
lengths = []  # reset here; not filled by this cell

for i in range(start_page, end_page + 1):
    page_path = f"{ebert5_folder}{start_file}{i}.html"
    with open(page_path, "r", encoding='utf-8') as f:
        text = f.read()
    result = re.findall('data-film-link="/film/(.*)/" data-film-in-watchlist', text)
    movie_links_5 += result
    if len(result) < 18:
        # fewer than 18 matches on a page is reported
        print("Fail on ", i)
# unique ids collected vs 18 per page for every page read
print(len(set(movie_links_5)))
print(18 * (end_page - start_page + 1))
print(movie_links_5)

450
450
['fight-club', 'a-clockwork-orange', 'dead-poets-society', 'edward-scissorhands', 'die-hard', 'star-wars-episode-ii-attack-of-the-clones', 'the-devil-wears-prada', 'gladiator-2000', 'beetlejuice', 'mamma-mia', 'spider-man-3', 'the-texas-chain-saw-massacre', 'snatch', 'pitch-perfect', 'saw', 'batman-1989', 'home-alone-2-lost-in-new-york', 'romeo-juliet-1996', '13-going-on-30', '300', 'batman-returns', 'how-the-grinch-stole-christmas-2000', 'brazil', 'the-elephant-man', 'the-lost-world-jurassic-park', 'clue', 'quantum-of-solace', 'the-twilight-saga-eclipse', 'lost-highway', 'pirates-of-the-caribbean-on-stranger-tides', 'x-men-origins-wolverine', 'an-american-werewolf-in-london', 'rope', 'bee-movie', 'the-addams-family', 'click', 'the-ring-2002', 'the-craft', 'national-treasure', 'labyrinth', 'dumb-and-dumber', 'the-hangover-part-ii', 'dogville', 'bottle-rocket-1996', 'starship-troopers', 'shark-tale', 'war-of-the-worlds', 'i-robot', 'hook', 'army-of-darkness', 'big-trouble-in-lit

In [None]:
# 1.5/4 ratings - stop after popularity reaches page 100 (somehow)
# Pages are read from ebert4/4-<page>.html.
ebert4_folder = '/content/drive/My Drive/cs229_proj/ebert_html/ebert4/'
start_file = '4-'
start_page, end_page = 1, 10

movie_links_4 = []
lengths = []  # reset here; not filled by this cell

for i in range(start_page, end_page + 1):
    page_path = f"{ebert4_folder}{start_file}{i}.html"
    with open(page_path, "r", encoding='utf-8') as f:
        text = f.read()
    result = re.findall('data-film-link="/film/(.*)/" data-film-in-watchlist', text)
    movie_links_4 += result
    if len(result) < 18:
        # fewer than 18 matches on a page is reported
        print("Fail on ", i)
# unique ids collected vs 18 per page for every page read
print(len(set(movie_links_4)))
print(18 * (end_page - start_page + 1))
print(movie_links_4)

180
180
['thor', 'the-usual-suspects', 'jumanji', 'the-princess-diaries', 'napoleon-dynamite', 'raising-arizona', 'step-brothers', 'the-lovely-bones', 'men-in-black-ii', 'constantine', 'harold-and-maude', 'white-chicks', 'how-to-lose-a-guy-in-10-days', 'happy-gilmore', 'fast-furious', 'night-at-the-museum-battle-of-the-smithsonian', 'alien-resurrection', 'dead-man', 'the-princess-diaries-2-royal-engagement', 'the-strangers', 'halloween-iii-season-of-the-witch', 'big-daddy', 'godzilla', 'a-nightmare-on-elm-street-3-dream-warriors', 'empire-records', 'silent-hill', 'final-destination-2', 'pearl-harbor', 'footloose', 'nacho-libre', 'day-of-the-dead', 'rush-hour-2', 'godzilla-1998', 'spy-kids-3-d-game-over', 'star-wars-the-clone-wars', 'the-hunger', 'mallrats', 'the-hills-have-eyes-2006', 'me-myself-irene', 'paranormal-activity-2', 'blade-trinity', 'the-a-team', 'the-girl-next-door-2004', 'this-means-war', 'jawbreaker', 'johnny-english', 'patch-adams', 'mr-deeds', 'the-pink-panther', 'debs

In [None]:
# 1/4 ratings - stop after popularity reaches page 100 (somehow)
# Pages are read from ebert3/3-<page>.html.
ebert3_folder = '/content/drive/My Drive/cs229_proj/ebert_html/ebert3/'
start_file = '3-'
start_page, end_page = 1, 10

movie_links_3 = []
lengths = []  # reset here; not filled by this cell

for i in range(start_page, end_page + 1):
    page_path = f"{ebert3_folder}{start_file}{i}.html"
    with open(page_path, "r", encoding='utf-8') as f:
        text = f.read()
    result = re.findall('data-film-link="/film/(.*)/" data-film-in-watchlist', text)
    movie_links_3 += result
    if len(result) < 18:
        # fewer than 18 matches on a page is reported
        print("Fail on ", i)
# unique ids collected vs 18 per page for every page read
print(len(set(movie_links_3)))
print(18 * (end_page - start_page + 1))
print(movie_links_3)
# spot-check whether the id 100 places from the end is in movieid_to_data
try_word(movie_links_3[-100])

180
180
['blue-velvet', 'kick-ass', 'fear-and-loathing-in-las-vegas', 'dirty-dancing', 'the-twilight-saga-new-moon', 'scooby-doo', 'hocus-pocus', 'zoolander', 'the-raid-2011', 'dune', 'fantastic-four-2005', 'the-village', 'ace-ventura-pet-detective', 'armageddon', 'transformers-revenge-of-the-fallen', 'transformers-dark-of-the-moon', 'taste-of-cherry', 'i-know-what-you-did-last-summer', 'resident-evil', 'fast-times-at-ridgemont-high', 'just-go-with-it', 'bad-boys-ii', 'team-america-world-police', 'the-wedding-singer', 'a-cinderella-story', 'wet-hot-american-summer', 'the-brood', 'a-nightmare-on-elm-street-2010', 'children-of-the-corn', 'flubber', 'scrooged', 'v-h-s', 'the-tenant', 'the-texas-chainsaw-massacre-2', 'the-grudge', 'the-bucket-list', 'the-waterboy', 'the-green-hornet', 'catwoman', 'old-school', 'wild-wild-west', 'the-league-of-extraordinary-gentlemen', 'paranormal-activity-3', 'the-frighteners', 'thir13en-ghosts', 'high-tension', 'stargate', 'snake-eyes', 'daddy-day-care', 

In [None]:
# 0.5/4 ratings - stop after popularity reaches page 100 (somehow)
# Pages are read from ebert1/1-<page>.html.
ebert1_folder = '/content/drive/My Drive/cs229_proj/ebert_html/ebert1/'
start_file = '1-'
start_page, end_page = 1, 5

movie_links_1 = []
lengths = []  # reset here; not filled by this cell

for i in range(start_page, end_page + 1):
    page_path = f"{ebert1_folder}{start_file}{i}.html"
    with open(page_path, "r", encoding='utf-8') as f:
        text = f.read()
    result = re.findall('data-film-link="/film/(.*)/" data-film-in-watchlist', text)
    movie_links_1 += result
    if len(result) < 18:
        # fewer than 18 matches on a page is reported
        print("Fail on ", i)
# unique ids collected vs 18 per page for every page read
print(len(set(movie_links_1)))
print(18 * (end_page - start_page + 1))
print(movie_links_1)

In [None]:
try_word(movie_links_1[36])

Folder 92
Yeet


In [None]:
def find_last(movie_links):
    """Print the last entry of `movie_links` that is a key of `movieid_to_data`.

    Prints three lines: the index of that entry against the list length,
    "Folder" with element 3 of its `movieid_to_data` entry, and the id itself.
    All three values print as None when no entry is found. Returns None.
    """
    last_i = last_folder = last_id = None
    for i, movie_id in enumerate(movie_links):
        if movie_id in movieid_to_data:
            # keep overwriting so the final values belong to the last match
            last_i, last_folder, last_id = i, movieid_to_data[movie_id][3], movie_id
    print(str(last_i) + " vs length " + str(len(movie_links)))
    print("Folder", last_folder)
    print(last_id)
find_last(movie_links_3)

90 vs length 180
Folder 99
a-lot-like-love


In [None]:
find_last(movie_links_1)

36 vs length 90
Folder 92
rosencrantz-guildenstern-are-dead


In [None]:
find_last(movie_links_5)

344 vs length 450
Folder 100
grumpy-old-men


In [None]:
find_last(movie_links_6)

303 vs length 450
Folder 100
the-way-of-the-gun


In [None]:
find_last(movie_links_8)

790 vs length 900
Folder 100
2-days-in-paris


In [None]:
find_last(movie_links_9)

536 vs length 630
Folder 100
mr-hollands-opus


In [None]:
find_last(movie_links_10)

796 vs length 846
Folder 100
blancanieves-2012


## Other Datasets

TMDB from Kaggle - https://www.kaggle.com/tmdb/tmdb-movie-metadata?select=tmdb_5000_movies.csv

Movie Industry (IMDB data) from Kaggle - https://www.kaggle.com/danielgrijalvas/movies


Augment the datasets by joining on title and year. (Unfortunately, there are probably more movies that could be linked up, but either the title was slightly off or the year was missing.) We are assuming that a title together with a release year uniquely identifies a movie, which is not too big an assumption given that we are mostly handling popular films.

In [None]:
# Load the TMDB credits, TMDB movies and movie-industry (IMDB) CSVs from Drive.
tmdb_credits_path = '/content/drive/My Drive/cs229_proj/tmdb_5000_credits.csv'
tmdb_credits_df = pd.read_csv(tmdb_credits_path)

tmdb_movies_path = '/content/drive/My Drive/cs229_proj/tmdb_5000_movies.csv'
tmdb_movies_df = pd.read_csv(tmdb_movies_path)

movie_industry_path = '/content/drive/My Drive/cs229_proj/movie_industry.csv'
movie_industry_df = pd.read_csv(movie_industry_path)

In [None]:
tmdb_credits_df

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."
...,...,...,...,...
4798,9367,El Mariachi,"[{""cast_id"": 1, ""character"": ""El Mariachi"", ""c...","[{""credit_id"": ""52fe44eec3a36847f80b280b"", ""de..."
4799,72766,Newlyweds,"[{""cast_id"": 1, ""character"": ""Buzzy"", ""credit_...","[{""credit_id"": ""52fe487dc3a368484e0fb013"", ""de..."
4800,231617,"Signed, Sealed, Delivered","[{""cast_id"": 8, ""character"": ""Oliver O\u2019To...","[{""credit_id"": ""52fe4df3c3a36847f8275ecf"", ""de..."
4801,126186,Shanghai Calling,"[{""cast_id"": 3, ""character"": ""Sam"", ""credit_id...","[{""credit_id"": ""52fe4ad9c3a368484e16a36b"", ""de..."


In [None]:
def update_year(row):
    """Return the release year parsed from ``row['release_date']``.

    Args:
        row: Any mapping-like object (e.g. a DataFrame row) with a
            ``'release_date'`` entry.

    Returns:
        The first four characters of the release date as an ``int`` when the
        entry is a string of at least four characters; ``float('nan')`` for
        anything else (NaN, ``None``, empty or too-short strings).

    Raises:
        ValueError: if the first four characters are not an integer literal.
    """
    release_date = row['release_date']
    # Check for "is a usable string" rather than "is not a float": the latter
    # lets None and numpy float NaN through to the slice below, where they crash.
    if isinstance(release_date, str) and len(release_date) >= 4:
        return int(release_date[:4])
    return float('nan')

# Add an integer "year" column derived from release_date.
# Rows whose year cannot be derived are dropped by that condition itself
# (instead of by one hard-coded title) so the integer cast can never meet NaN.
# Built as a chain so no intermediate frame is mutated in place.
tmdb_movies_df = (
    tmdb_movies_df
    .assign(year=tmdb_movies_df.apply(update_year, axis=1))
    .dropna(subset=["year"])
    .astype({"year": int})
)

In [None]:
# Preview the TMDB movies table (now including the derived "year" column).
tmdb_movies_df

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,year
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,2009
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,2007
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,2015
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.312950,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,2012
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,220000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",,9367,"[{""id"": 5616, ""name"": ""united states\u2013mexi...",es,El Mariachi,El Mariachi just wants to play his guitar and ...,14.269792,"[{""name"": ""Columbia Pictures"", ""id"": 5}]","[{""iso_3166_1"": ""MX"", ""name"": ""Mexico""}, {""iso...",1992-09-04,2040920,81.0,"[{""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,"He didn't come looking for trouble, but troubl...",El Mariachi,6.6,238,1992
4799,9000,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""...",,72766,[],en,Newlyweds,A newlywed couple's honeymoon is upended by th...,0.642552,[],[],2011-12-26,0,85.0,[],Released,A newlywed couple's honeymoon is upended by th...,Newlyweds,5.9,5,2011
4800,0,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...",http://www.hallmarkchannel.com/signedsealeddel...,231617,"[{""id"": 248, ""name"": ""date""}, {""id"": 699, ""nam...",en,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",1.444476,"[{""name"": ""Front Street Pictures"", ""id"": 3958}...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2013-10-13,0,120.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,"Signed, Sealed, Delivered",7.0,6,2013
4801,0,[],http://shanghaicalling.com/,126186,[],en,Shanghai Calling,When ambitious New York attorney Sam is sent t...,0.857008,[],"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-05-03,0,98.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,A New Yorker in Shanghai,Shanghai Calling,5.7,7,2012


In [None]:
# Preview the movie-industry table.
movie_industry_df

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7663,More to Life,,Drama,2020,"October 23, 2020 (United States)",3.1,18.0,Joseph Ebanks,Joseph Ebanks,Shannon Bond,United States,7000.0,,,90.0
7664,Dream Round,,Comedy,2020,"February 7, 2020 (United States)",4.7,36.0,Dusty Dukatz,Lisa Huston,Michael Saquella,United States,,,Cactus Blue Entertainment,90.0
7665,Saving Mbango,,Drama,2020,"April 27, 2020 (Cameroon)",5.7,29.0,Nkanya Nkwai,Lynno Lovert,Onyama Laura,United States,58750.0,,Embi Productions,
7666,It's Just Us,,Drama,2020,"October 1, 2020 (United States)",,,James Randall,James Randall,Christina Roz,United States,15000.0,,,120.0


In [None]:
# Distinct movie names in the movie-industry table, followed by how many there are.
industry_titles = {*movie_industry_df['name'].tolist()}
print(industry_titles, len(industry_titles), sep='\n')

7512


In [None]:
# Distinct titles in the Letterboxd table, followed by how many there are.
letterboxd_titles = {*letterboxd_df['title'].tolist()}
print(letterboxd_titles, len(letterboxd_titles), sep='\n')

{'Good Bye Lenin!', 'Gloria', 'The Talented Mr. Ripley', 'Son of the Mask', 'Ritual', 'The American Friend', 'The Bounty Hunter', 'Liz and the Blue Bird', 'A League of Their Own', 'The Muppet Christmas Carol', 'Unknown', 'Summer of 84', 'Friday the 13th Part VIII: Jason Takes Manhattan', 'The Grinch', 'Hide and Seek', 'Less Than Zero', 'Sing Street', 'Devilman Crybaby', 'Avatar', 'Mowgli: Legend of the Jungle', 'Saw II', 'Lords of Dogtown', 'Marjorie Prime', 'The Willoughbys', 'Orphan', 'Reservoir Dogs', 'The Meg', 'La Notte', 'Long Weekend', 'The Wandering Earth', 'Fifty Shades of Grey', 'Blood Simple', 'Ted', 'Little Women', 'The Change-Up', 'Words on Bathroom Walls', '3 Idiots', 'On the Road', 'Hellboy', 'They Came Together', 'North Hollywood', 'You Get Me', 'The Beyond', 'Tokyo Godfathers', 'Last Vegas', 'Sherlock, Jr.', 'Free State of Jones', 'The Boy and the Beast', 'Hell Fest', 'The Letter Room', 'Beasts of the Southern Wild', 'Police Story', 'Sudden Impact', 'Starstruck', 'Amer

In [None]:
# Distinct values of the TMDB "title" column, followed by how many there are.
tmdb_titles = {*tmdb_movies_df['title'].tolist()}
print(tmdb_titles, len(tmdb_titles), sep='\n')

{'Elizabeth', 'Men of Honor', 'The Talented Mr. Ripley', 'The Pirate', 'The Big Wedding', 'Plush', 'The Railway Man', 'Son of the Mask', 'One True Thing', 'Saving Face', 'Bound by Honor', 'Admission', 'The Unborn', 'Highlander: The Final Dimension', 'Samsara', 'Osmosis Jones', 'Get Hard', 'Pokémon: Spell of the Unknown', 'The Bounty Hunter', 'Evil Words', 'Curse of the Golden Flower', 'The Christmas Bunny', 'Insidious', 'The Boat That Rocked', 'A League of Their Own', "Barney's Version", 'Undercover Brother', '3 Strikes', 'The Shallows', 'Desperado', 'The Muppet Christmas Carol', 'Diary of a Mad Black Woman', 'Unknown', 'Salvation Boulevard', 'Addicted', 'The Theory of Everything', 'Niagara', 'In the Shadow of the Moon', 'End of Watch', "Bill & Ted's Excellent Adventure", 'Friday the 13th Part VIII: Jason Takes Manhattan', 'Halloween', 'Happy Feet Two', 'Hide and Seek', 'Sparkler', 'The Thomas Crown Affair', 'Ride with the Devil', 'Men in Black II', 'Gone with the Wind', 'Supercapitali

In [None]:
# Same as above, but using the TMDB "original_title" column.
tmdb_titles_2 = {*tmdb_movies_df['original_title'].tolist()}
print(tmdb_titles_2, len(tmdb_titles_2), sep='\n')

{'Elizabeth', 'Men of Honor', 'The Talented Mr. Ripley', 'The Pirate', 'The Big Wedding', 'Plush', 'The Railway Man', 'Son of the Mask', 'One True Thing', 'Saving Face', 'Bound by Honor', 'Admission', 'The Unborn', 'Samsara', '一個好人', 'Osmosis Jones', 'Get Hard', 'The Bounty Hunter', 'The Christmas Bunny', 'Insidious', 'The Boat That Rocked', 'A League of Their Own', "Barney's Version", 'Undercover Brother', '3 Strikes', 'The Shallows', 'Desperado', 'The Muppet Christmas Carol', 'Diary of a Mad Black Woman', 'Unknown', 'Salvation Boulevard', 'Addicted', 'The Theory of Everything', 'Niagara', 'In the Shadow of the Moon', 'End of Watch', 'El Crimen del Padre Amaro', "Bill & Ted's Excellent Adventure", 'Friday the 13th Part VIII: Jason Takes Manhattan', 'Halloween', 'Happy Feet Two', 'Hide and Seek', 'Sparkler', 'The Thomas Crown Affair', 'Ride with the Devil', 'Men in Black II', 'Gone with the Wind', 'Time Changer', 'Little Man', 'Repo! The Genetic Opera', 'Zoom', 'Bobby Jones: Stroke of 

In [None]:
# Title-only overlap between Letterboxd and each external table, then across all three.
industry_and_letterboxd = letterboxd_titles.intersection(industry_titles)
tmdb_and_letterboxd = letterboxd_titles.intersection(tmdb_titles)
all_intersect_titles = letterboxd_titles.intersection(industry_titles, tmdb_titles)

for overlap in (industry_and_letterboxd, tmdb_and_letterboxd, all_intersect_titles):
    print(len(overlap))

3800
2606
2292


In [None]:
# Titles that appear on only one side of the Letterboxd / movie-industry comparison.
missing_letterboxd_1 = letterboxd_titles.difference(industry_titles)
missing_industry_1 = industry_titles.difference(letterboxd_titles)

one_sided = (missing_letterboxd_1, missing_industry_1)
# Sizes first, then the sorted titles themselves.
for titles in one_sided:
    print(len(titles))

for titles in one_sided:
    print(sorted(titles))

2837
3712
['#Alive', '(500) Days of Summer', '11.22.63', '12 Angry Men', '12 Hour Shift', '13 Hours: The Secret Soldiers of Benghazi', '13th', '16 Wishes', '1900', '1922', '1941', '1BR', '2 Days in Paris', '2 or 3 Things I Know About Her', '20 Feet from Stardom', '20,000 Days on Earth', '20,000 Leagues Under the Sea', '2001: A Space Odyssey', '2010', '2036: Nexus Dawn', '21 Bridges', '22 July', '24 Frames', '28 Days Later', '3 Men and a Baby', '3 Women', '3 from Hell', '35 Shots of Rum', '365 Days', '42nd Street', '5 Centimeters per Second', '6 Underground', '7 Days in Hell', '71 Fragments of a Chronology of Chance', '7500', '8:46', '8½', '964 Pinocchio', 'A Bay of Blood', 'A Better Tomorrow', 'A Bittersweet Life', 'A Boy and His Dog', 'A Bride for Rip Van Winkle', 'A Bridge Too Far', 'A Brighter Summer Day', 'A Charlie Brown Christmas', 'A Charlie Brown Thanksgiving', 'A Christmas Prince', 'A City of Sadness', 'A Classic Horror Story', 'A Clockwork Orange', 'A Close Shave', 'A Coffee 

### Building our Dataframe

In [None]:
# Collect the columns needed to build (title, year) keys shared by all 3 tables.
#
#   movie_industry_df -> name,  year
#   letterboxd_df     -> title, year
#   tmdb_movies_df    -> title, release_date[:4]
#
# The name/title values should be the same across the tables.

industry_titles, industry_years = (movie_industry_df[col].tolist() for col in ('name', 'year'))

boxd_titles, boxd_years = (letterboxd_df[col].tolist() for col in ('title', 'year'))

# tmdb_years holds raw release-date strings here; the year is sliced out later.
tmdb_titles, tmdb_years = (tmdb_movies_df[col].tolist() for col in ('title', 'release_date'))

In [None]:
# Build the set of (title, year) keys available in each table.
industry_pairs = set(zip(industry_titles, industry_years))
boxd_pairs = set(zip(boxd_titles, boxd_years))

tmdb_pairs = set()
for title, release_date in zip(tmdb_titles, tmdb_years):
    # tmdb_years holds release-date values; only a string of at least four
    # characters can be sliced into a year. Testing the type explicitly
    # (rather than len(str(...))) keeps NaN and any other non-string value
    # out of the slice instead of crashing on it.
    if isinstance(release_date, str) and len(release_date) >= 4:
        tmdb_pairs.add((title, int(release_date[:4])))
    else:
        # Report the titles that cannot be keyed.
        print(title)
        print('nan')

print(len(industry_pairs))
print(len(boxd_pairs))
print(len(tmdb_pairs))

America Is Still the Place
nan
7668
6833
4802


In [None]:
# (title, year) keys present in all three tables.
intersect_pairs = tmdb_pairs.intersection(industry_pairs, boxd_pairs)
print(len(intersect_pairs))

2220


In [None]:
# One row per Letterboxd movie whose (data[0], data[1]) key is shared by all
# three tables: [movie id, data[3], data[0], data[1], *rating counts].
letterboxd_data_pruned = []
for movieid, data in movieid_to_data.items():
    rating_counts = get_rating_counts_in_order(data[2])
    if (data[0], data[1]) not in intersect_pairs:
        continue
    row = [movieid, data[3], data[0], data[1]] + rating_counts
    letterboxd_data_pruned.append(row)
print(len(letterboxd_data_pruned))

2220


In [None]:
# Four identifier columns followed by integer-named columns 1..10, which
# receive the values returned by get_rating_counts_in_order.
letterboxd_cols = ['movie_id', 'poster_path', 'title', 'year', *range(1, 11)]
letterboxd_pruned_df = pd.DataFrame(data=letterboxd_data_pruned, columns=letterboxd_cols)
letterboxd_pruned_df

Unnamed: 0,movie_id,poster_path,title,year,1,2,3,4,5,6,7,8,9,10
0,pulp-fiction,1,Pulp Fiction,1994,1660,3821,1626,10863,7688,51895,47635,192128,117437,313059
1,inception,1,Inception,2010,1250,3550,2063,13735,11410,71166,69034,232098,117643,245070
2,fight-club,1,Fight Club,1999,1069,2709,1501,9343,7963,50079,53178,191481,110863,240121
3,the-dark-knight,1,The Dark Knight,2008,757,1771,962,6094,5548,37917,40828,175350,125042,335098
4,the-grand-budapest-hotel,1,The Grand Budapest Hotel,2014,624,2238,1081,8321,6965,47211,52375,182761,101769,173334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2215,school-daze,100,School Daze,1988,22,52,96,404,669,1555,1440,1126,231,167
2216,trees-lounge,100,Trees Lounge,1996,3,24,46,166,350,1403,1867,1791,381,316
2217,kill-the-messenger,100,Kill the Messenger,2014,12,43,84,480,895,2703,2354,1204,185,112
2218,my-name-is-khan,100,My Name Is Khan,2010,104,149,128,434,599,1885,1930,2827,847,1278


In [None]:
# Work on an independent (deep) copy so the pruned Letterboxd frame stays untouched.
movies_df = letterboxd_pruned_df.copy(deep=True)
movies_df

Unnamed: 0,movie_id,poster_path,title,year,1,2,3,4,5,6,7,8,9,10
0,pulp-fiction,1,Pulp Fiction,1994,1660,3821,1626,10863,7688,51895,47635,192128,117437,313059
1,inception,1,Inception,2010,1250,3550,2063,13735,11410,71166,69034,232098,117643,245070
2,fight-club,1,Fight Club,1999,1069,2709,1501,9343,7963,50079,53178,191481,110863,240121
3,the-dark-knight,1,The Dark Knight,2008,757,1771,962,6094,5548,37917,40828,175350,125042,335098
4,the-grand-budapest-hotel,1,The Grand Budapest Hotel,2014,624,2238,1081,8321,6965,47211,52375,182761,101769,173334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2215,school-daze,100,School Daze,1988,22,52,96,404,669,1555,1440,1126,231,167
2216,trees-lounge,100,Trees Lounge,1996,3,24,46,166,350,1403,1867,1791,381,316
2217,kill-the-messenger,100,Kill the Messenger,2014,12,43,84,480,895,2703,2354,1204,185,112
2218,my-name-is-khan,100,My Name Is Khan,2010,104,149,128,434,599,1885,1930,2827,847,1278


In [None]:
# 27 placeholder columns (11 numeric + 16 text), created in display order and
# filled in afterwards from the TMDB and movie-industry tables.

numeric_columns = [
    "tmdb_budget", "imdb_budget",
    "tmdb_revenue", "imdb_revenue",
    "tmdb_vote_average", "tmdb_vote_count",
    "imdb_vote_average", "imdb_vote_count",
    "tmdb_popularity",
    "tmdb_runtime", "imdb_runtime",
]

text_columns = [
    "main_genre", "genres",
    "director", "writer", "main_actor", "mpaa_rating",
    "overview", "tagline", "keywords", "release_date",
    "main_prod_company", "production_companies",
    "country", "production_countries",
    "original_language", "spoken_languages",
]

for column in numeric_columns:
    movies_df[column] = np.nan

# Text columns are created with object dtype. A plain np.nan assignment would
# make them float64, and writing strings into a float64 column is deprecated
# in recent pandas (incompatible-dtype setitem).
for column in text_columns:
    movies_df[column] = pd.Series(np.nan, index=movies_df.index, dtype=object)

In [None]:
# Preview the frame after the placeholder columns were added.
movies_df # should have 14 + 27 = 41 columns

Unnamed: 0,movie_id,poster_path,title,year,1,2,3,4,5,6,7,8,9,10,tmdb_budget,imdb_budget,tmdb_revenue,imdb_revenue,tmdb_vote_average,tmdb_vote_count,imdb_vote_average,imdb_vote_count,tmdb_popularity,tmdb_runtime,imdb_runtime,main_genre,genres,director,writer,main_actor,mpaa_rating,overview,tagline,keywords,release_date,main_prod_company,production_companies,country,production_countries,original_language,spoken_languages
0,pulp-fiction,1,Pulp Fiction,1994,1660,3821,1626,10863,7688,51895,47635,192128,117437,313059,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,inception,1,Inception,2010,1250,3550,2063,13735,11410,71166,69034,232098,117643,245070,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,fight-club,1,Fight Club,1999,1069,2709,1501,9343,7963,50079,53178,191481,110863,240121,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,the-dark-knight,1,The Dark Knight,2008,757,1771,962,6094,5548,37917,40828,175350,125042,335098,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,the-grand-budapest-hotel,1,The Grand Budapest Hotel,2014,624,2238,1081,8321,6965,47211,52375,182761,101769,173334,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2215,school-daze,100,School Daze,1988,22,52,96,404,669,1555,1440,1126,231,167,,,,,,,,,,,,,,,,,,,,,,,,,,,
2216,trees-lounge,100,Trees Lounge,1996,3,24,46,166,350,1403,1867,1791,381,316,,,,,,,,,,,,,,,,,,,,,,,,,,,
2217,kill-the-messenger,100,Kill the Messenger,2014,12,43,84,480,895,2703,2354,1204,185,112,,,,,,,,,,,,,,,,,,,,,,,,,,,
2218,my-name-is-khan,100,My Name Is Khan,2010,104,149,128,434,599,1885,1930,2827,847,1278,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# movie_industry_df --> name, year
# letterboxd_df --> title, year
# tmdb_movies_df --> title, release_date[:4]

def _first_match(old_df, key_field, title, year, old_field):
    """Return ``old_field`` from the first ``old_df`` row keyed by (title, year).

    Args:
        old_df: Source DataFrame with a ``key_field`` column and a ``'year'`` column.
        key_field: Name of the title column in ``old_df`` (``'title'`` or ``'name'``).
        title, year: Key of the wanted row.
        old_field: Column to read.

    Raises:
        IndexError: if no row matches. The message names the missing key; a
            bare ``.iloc[0]`` would only report an out-of-bounds index.
    """
    matches = old_df.loc[(old_df[key_field] == title) & (old_df['year'] == year), old_field]
    if matches.empty:
        raise IndexError(
            "no row with {}={!r} and year={!r} to read {!r} from".format(
                key_field, title, year, old_field
            )
        )
    return matches.iloc[0]


def _store(movies_df, title, year, target_field, value):
    """Write ``value`` into ``target_field`` of every ``movies_df`` row keyed by (title, year)."""
    movies_df.loc[(movies_df['title'] == title) & (movies_df['year'] == year), target_field] = value


def update_with_tmdb(movies_df, old_df, title, year, old_field, target_field):
    """Copy ``old_field`` of the matching TMDB row into ``movies_df[target_field]``.

    The source row is matched on ``old_df['title']`` and ``old_df['year']``;
    ``movies_df`` is modified in place. Raises ``IndexError`` if ``old_df``
    has no matching row.
    """
    value = _first_match(old_df, 'title', title, year, old_field)
    _store(movies_df, title, year, target_field, value)


def update_with_industry(movies_df, old_df, title, year, old_field, target_field):
    """Copy ``old_field`` of the matching movie-industry row into ``movies_df[target_field]``.

    Same as ``update_with_tmdb`` except that the source row is matched on
    ``old_df['name']`` instead of ``old_df['title']``.
    """
    value = _first_match(old_df, 'name', title, year, old_field)
    _store(movies_df, title, year, target_field, value)


def update_with_tmdb_dicts(movies_df, old_df, title, year, old_field, target_field):
    """Store the ``"name"`` values of a JSON-encoded list of dicts as a JSON list.

    ``old_field`` of the matching TMDB row must hold a JSON string encoding a
    list of objects that each have a ``"name"`` key. The names are re-encoded
    with ``json.dumps`` and written to ``movies_df[target_field]`` in place.
    Raises ``IndexError`` if ``old_df`` has no matching row.
    """
    value = _first_match(old_df, 'title', title, year, old_field)
    names = [entry["name"] for entry in json.loads(value)]
    _store(movies_df, title, year, target_field, json.dumps(names))

In [None]:

# One entry per target column, in the order the columns are filled:
# (updater, source frame, source column, target column in movies_df).
field_updates = [
    (update_with_tmdb, tmdb_movies_df, 'budget', 'tmdb_budget'),
    (update_with_industry, movie_industry_df, 'budget', 'imdb_budget'),

    (update_with_tmdb, tmdb_movies_df, 'revenue', 'tmdb_revenue'),
    (update_with_industry, movie_industry_df, 'gross', 'imdb_revenue'),

    (update_with_tmdb, tmdb_movies_df, 'vote_average', 'tmdb_vote_average'),
    (update_with_tmdb, tmdb_movies_df, 'vote_count', 'tmdb_vote_count'),

    (update_with_industry, movie_industry_df, 'score', 'imdb_vote_average'),
    (update_with_industry, movie_industry_df, 'votes', 'imdb_vote_count'),

    (update_with_tmdb, tmdb_movies_df, 'popularity', 'tmdb_popularity'),

    (update_with_tmdb, tmdb_movies_df, 'runtime', 'tmdb_runtime'),
    (update_with_industry, movie_industry_df, 'runtime', 'imdb_runtime'),

    (update_with_industry, movie_industry_df, 'genre', 'main_genre'),
    (update_with_tmdb_dicts, tmdb_movies_df, 'genres', 'genres'),

    (update_with_industry, movie_industry_df, 'director', 'director'),
    (update_with_industry, movie_industry_df, 'writer', 'writer'),
    (update_with_industry, movie_industry_df, 'star', 'main_actor'),

    (update_with_industry, movie_industry_df, 'rating', 'mpaa_rating'),
    (update_with_tmdb, tmdb_movies_df, 'overview', 'overview'),
    (update_with_tmdb, tmdb_movies_df, 'tagline', 'tagline'),

    (update_with_tmdb_dicts, tmdb_movies_df, 'keywords', 'keywords'),

    (update_with_tmdb, tmdb_movies_df, 'release_date', 'release_date'),

    (update_with_industry, movie_industry_df, 'company', 'main_prod_company'),
    (update_with_tmdb_dicts, tmdb_movies_df, 'production_companies', 'production_companies'),

    (update_with_industry, movie_industry_df, 'country', 'country'),
    (update_with_tmdb_dicts, tmdb_movies_df, 'production_countries', 'production_countries'),

    (update_with_tmdb, tmdb_movies_df, 'original_language', 'original_language'),
    (update_with_tmdb_dicts, tmdb_movies_df, 'spoken_languages', 'spoken_languages'),
]

# Fill every placeholder column for each movie shared by all three tables.
for title, year in intersect_pairs:
    for updater, source_df, old_field, target_field in field_updates:
        updater(movies_df, source_df, title, year, old_field, target_field)

movies_df

In [None]:
# movies_df.to_csv('/content/drive/My Drive/cs229_proj/movies_full.csv')

In [None]:
# movies_df["nan"] = np.nan
# movies_df

### Getting those GENRES

In [None]:
# Decode the JSON genre list stored in the first TMDB row, then show the
# whole list followed by its first entry.
first_genres_json = tmdb_movies_df["genres"].iloc[0]
example_genres = json.loads(first_genres_json)
print(example_genres, example_genres[0], sep='\n')

[{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 878, 'name': 'Science Fiction'}]
{'id': 28, 'name': 'Action'}


In [None]:
def get_genres(genres):
    """Return the ``"name"`` value of every entry in ``genres``, in order.

    Args:
        genres: Iterable of mappings that each have a ``"name"`` key.

    Returns:
        A new list of the names (empty when ``genres`` is empty).
    """
    return [entry["name"] for entry in genres]

In [None]:
# Reduce the decoded example to just the genre names.
example_genre_lst = get_genres(example_genres)
print(example_genre_lst)

['Action', 'Adventure', 'Fantasy', 'Science Fiction']


### Seeing Frequencies of Titles

In [None]:
# Count how many Letterboxd entries share each title (data[0] of every entry).
titles_to_count = {}
for data in movieid_to_data.values():
    title = data[0]
    titles_to_count[title] = titles_to_count.get(title, 0) + 1

# Largest number of entries sharing one title (0 when there are no entries).
max_count = max(titles_to_count.values(), default=0)

print(max_count)

4


In [None]:
# buckets[k] = number of distinct titles that appear exactly k times.
# Sized from the data so a title that appears five or more times cannot
# index past the end of the list.
buckets = [0] * (max(titles_to_count.values(), default=0) + 1)

# Titles that appear at least twice, mapped to their number of appearances.
repeated_titles = {}

for title, count in titles_to_count.items():
    buckets[count] += 1
    if count >= 2:
        repeated_titles[title] = count

print(buckets)
print(sum(buckets[2:])) # Number of distinct titles shared by two or more movies

[0, 6456, 168, 11, 2]
181


In [None]:
# List every repeated title together with its number of appearances.
for title, appearances in repeated_titles.items():
    print(title, appearances)

Little Women 2
It 2
Psycho 2
The Lion King 2
A Star Is Born 3
Oldboy 2
The Thing 2
The Invisible Man 2
Frozen 2
Scarface 2
Halloween 3
Suspiria 2
Ghostbusters 2
Mulan 2
The Girl with the Dragon Tattoo 2
Aladdin 2
Watchmen 2
Carrie 2
A Nightmare on Elm Street 2
Beauty and the Beast 3
Alice in Wonderland 2
Les Misérables 2
Dune 2
The Hunt 2
Hercules 2
Paddington 2 2
Batman 2
Saw 2
Godzilla 3
Murder on the Orient Express 2
True Grit 2
House 2
The Fly 2
The Host 2
Metropolis 2
The Mummy 3
The Jungle Book 2
RoboCop 2
Ghost in the Shell 2
Solaris 2
The Parent Trap 2
Night of the Living Dead 2
Mortal Kombat 2
Shame 2
Total Recall 2
Candyman 2
Poltergeist 2
King Kong 2
Dracula 4
The Woman in the Window 3
Mystic River 2
Lolita 2
Point Break 2
The Wicker Man 2
Friday the 13th 2
Insomnia 2
Cape Fear 2
Funny Games 2
The Beguiled 2
The Addams Family 2
Pinocchio 2
The Karate Kid 2
Life 2
Fantastic Four 2
Bottle Rocket 2
Hellboy 2
Cinderella 4
Dawn of the Dead 2
Peter Pan 2
Dumbo 2
Tag 2
Pet Sematary

In [None]:
# Persist movieid_to_data as JSON (path is relative to the working directory).
with open("movieid_to_data.json", "w") as json_file:
    json.dump(movieid_to_data, json_file)

# User Ratings from Letterboxd - Data Cleaning

In [None]:
# Load the raw user ratings; the previews below show user_id, movie_id and rating_val columns.
user_ratings_path = '/content/drive/My Drive/cs229_proj/user_to_rating.csv'
user_ratings_df = pd.read_csv(user_ratings_path)

In [None]:
# Preview the user ratings.
user_ratings_df

Unnamed: 0,user_id,movie_id,rating_val
0,shawn_stubbs,the-addams-family,9
1,ramiraff,the-truman-show,9
2,joshua_toomes,the-bourne-identity-2002,8
3,henryg9876,up,8
4,denny_crane,eight-legged-freaks,5
...,...,...,...
392175,brunoamato,far-from-heaven,8
392176,brandonh,the-incredible-burt-wonderstone,4
392177,madrianpl,the-abyss,7
392178,dsparky,kung-fu-panda-2,4


In [None]:
# Distinct movie ids that appear in the user ratings.
movie_set = {*user_ratings_df["movie_id"].tolist()}
print(movie_set, len(movie_set), sep='\n') # 126,814 movies

In [None]:
# Distinct movie ids of the augmented movie table.
# NOTE(review): movies_full_df is not created in the nearby cells; presumably
# it is movies_full.csv read back in. Confirm it exists on a fresh kernel.
movies_full_set = {*movies_full_df["movie_id"].tolist()}
print(movies_full_set, len(movies_full_set), sep='\n') # 2,220 movies

{'the-matrix', 'memoirs-of-a-geisha', 'that-awkward-moment', 'the-terminator', 'diary-of-a-wimpy-kid', 'the-girl-next-door-2004', 'harry-potter-and-the-half-blood-prince', 'argo', 'inception', 'saw-vi', 'ghost-ship', 'the-wolfman', 'slow-west', 'red-cliff', 'coraline', 'species', 'the-departed', 'lights-out-2016', 'con-air', 'the-last-stand-2013', 'lincoln', 'the-cat-in-the-hat-2003', 'kung-pow-enter-the-fist', 'captain-america-civil-war', 'million-dollar-baby', 'scarface-1983', 'paranormal-activity-4', 'mulan', 'intolerable-cruelty', 'xxx-state-of-the-union', '4-months-3-weeks-and-2-days', 'driving-miss-daisy', 'mean-creek', 'session-9', 'mo-better-blues', 'a-beautiful-mind', 'the-hunt-for-red-october', 'dinosaur', 'begin-again', 'if-i-stay', 'a-very-long-engagement', 'i-frankenstein', 'notes-on-a-scandal', 'brokeback-mountain', 'uhf', 'the-heat', 'another-earth', 'a-night-at-the-roxbury', 'inside-man', 'downfall', 'sunshine-cleaning', 'metropolitan', 'aloha', 'michael-clayton', 'drac

In [None]:
# Movie ids of the augmented table that also have at least one user rating.
movies_intersect_set = movies_full_set.intersection(movie_set)

print(len(movies_intersect_set)) # 2,220 movies

2220


In [None]:
# Number of ratings per movie, restricted to the movies in movies_intersect_set.
movie_counts = {}
for movie in user_ratings_df["movie_id"].tolist():
    if movie not in movies_intersect_set:
        continue
    movie_counts[movie] = movie_counts.get(movie, 0) + 1

In [None]:
# Mean number of ratings per movie over the movies counted above.
counts = [*movie_counts.values()]
mean_ratings_per_movie = sum(counts) / len(counts)

print(mean_ratings_per_movie) # 176.657 average count of users that rated one of our 2220 movies

176.65765765765767


In [None]:
# Number of ratings per movie over every movie id in the ratings table.
all_movie_counts = {}
for movie in user_ratings_df["movie_id"].tolist():
    all_movie_counts[movie] = all_movie_counts.get(movie, 0) + 1

all_counts = [*all_movie_counts.values()]
print(sum(all_counts) / len(all_counts)) # 11.82 ratings per movie on average
print(len(all_counts)) # 126814 movies with 1.5 million ratings

11.82834702793067
126814


In [None]:
desired_movie_id = list(movies_intersect_set)
# Keep only ratings of movies in the augmented table and renumber the rows from 0.
in_augmented_table = user_ratings_df['movie_id'].isin(movies_full_set)
user_ratings_df = user_ratings_df[in_augmented_table].reset_index(drop=True)
user_ratings_df

Unnamed: 0,user_id,movie_id,rating_val
0,shawn_stubbs,the-addams-family,9
1,ramiraff,the-truman-show,9
2,joshua_toomes,the-bourne-identity-2002,8
3,henryg9876,up,8
4,denny_crane,eight-legged-freaks,5
...,...,...,...
392175,brunoamato,far-from-heaven,8
392176,brandonh,the-incredible-burt-wonderstone,4
392177,madrianpl,the-abyss,7
392178,dsparky,kung-fu-panda-2,4


In [None]:
# Preview the user ratings again.
user_ratings_df

Unnamed: 0,user_id,movie_id,rating_val
11,shawn_stubbs,the-addams-family,9
17,ramiraff,the-truman-show,9
19,joshua_toomes,the-bourne-identity-2002,8
20,henryg9876,up,8
23,denny_crane,eight-legged-freaks,5
...,...,...,...
1499975,brunoamato,far-from-heaven,8
1499978,brandonh,the-incredible-burt-wonderstone,4
1499979,madrianpl,the-abyss,7
1499982,dsparky,kung-fu-panda-2,4


In [None]:
# Surprisingly, I'm already in this dataset...

is_jesse = user_ratings_df['user_id'] == 'jdoan'
jesse = user_ratings_df[is_jesse]
jesse

Unnamed: 0,user_id,movie_id,rating_val
16762,jdoan,diary-of-a-wimpy-kid,4
17072,jdoan,percy-jackson-the-olympians-the-lightning-thief,4
23484,jdoan,inside-out-2015,6
32258,jdoan,hero-2002,8
32270,jdoan,the-ghost-writer,9
55383,jdoan,iron-man-2,4
83706,jdoan,kick-ass,6
104873,jdoan,hugo,7
113470,jdoan,incendies,8
116280,jdoan,holes,6


In [None]:
# rando = user_ratings_df.loc[user_ratings_df['user_id'] == 'ralfmakesmovies']
# rando

In [None]:
# Distinct movie ids left in the ratings table.
some_set = {*user_ratings_df['movie_id'].tolist()}

In [None]:
# Sanity check: how many of the remaining rated movie ids are in the augmented table.
some_intersect = some_set.intersection(movies_full_set)
print(len(some_intersect))

2220


In [None]:
# Save the ratings table. The path is relative to the working directory, and
# the row index is written too (to_csv default), so it comes back as an extra
# unnamed column when the file is read again.
user_ratings_df.to_csv('user_ratings_full.csv')

In [None]:
# Parallel lists: element i of each list comes from row i of user_ratings_df.
users, movies, ratings = (
    user_ratings_df[col].tolist() for col in ('user_id', 'movie_id', 'rating_val')
)

In [None]:
# Will collect every distinct (user, movie, rating) triple.
ratings_set = set()

In [None]:
# Nested mapping user -> {movie -> rating}; a later rating of the same
# (user, movie) pair overwrites an earlier one. Every triple is also
# recorded in ratings_set.
user_to_rating_dict = {}

for cur_user, cur_movie, cur_rating in zip(users, movies, ratings):
    user_to_rating_dict.setdefault(cur_user, {})[cur_movie] = cur_rating
    ratings_set.add((cur_user, cur_movie, cur_rating))

In [None]:
# Number of rated movies per user; the total should equal the number of ratings.
some_counts = [len(rated_movies) for rated_movies in user_to_rating_dict.values()]
print(sum(some_counts))

392180


In [None]:
# Number of distinct (user, movie, rating) triples.
print(len(ratings_set))

392180


In [None]:
# Number of distinct users that have at least one rating.
print(len(user_to_rating_dict)) # 5227 users

5227


In [None]:
# List every user id, then inspect one user's {movie: rating} mapping.
print(user_to_rating_dict.keys())

rando_dict = user_to_rating_dict['jdoan']
print(len(rando_dict), rando_dict, sep='\n')

dict_keys(['shawn_stubbs', 'ramiraff', 'joshua_toomes', 'henryg9876', 'denny_crane', 'dolfinn', 'lordzadik', 'boston_', 'danscully', 'filmranked', 'moviesco', 'rolle', 'bamsarnett', 'nadiaboff', 'dphbjs', 'aadya', 'naterichard98', 'rosecoloredfilm', 'laurenwood23', 'bobinkafa', 'theslayerbuffy', 'chrispiine', 'bluilluminati', 'nikhil1004', 'chip', 'outerspaceboy', 'etienneone', 'kaisersoze', 'clayton_dillard', 'wilbus', 'movie_mike96', 'eggsy', 'robertsaucedo', 'mariogaborovic', 'isaacramen', 'sara13', 'killfinger', 'tonyd9779', 'thesherminator', 'kwhitesays', 'khoss', 'raphaklopper', 'bethanymaic', 'dorzaby', 'peepinstead', 'utterpiffle', 'musachaudhry', 'flamexio', 'demyfilms', 'deadpegasus', 'madihatesfilm', 'anatrnd', 'thesparkknight', 'doctorsatan', 'mcastimovies', 'prashprash', 'fzilon', 'mattstrohl99', 'duckman', 'sebastiandavis', 'fatyoshi', 'samimcknnn', 'filmslikedreams', 'mollylaich', 'the_blacklodge', 'bacchichiccups', 'p3achglow', 'princesssleia', 'mr_sheldrake', 'coz22998

In [None]:
# with open("user_to_rating_dict.json", "w") as outfile:
#     json.dump(user_to_rating_dict, outfile)

## Injecting my own Letterboxd ratings to user ratings dataset

401 of the 762 movies I rated made the final cut, i.e. those 401 are contained in our augmented dataset.

In [None]:
# movies_full_path = '/content/drive/My Drive/cs229_proj/movies_full.csv'
# movies_full_df = pd.read_csv(movies_full_path)

# Saved ratings table.
user_ratings_full_path = '/content/drive/My Drive/cs229_proj/user_ratings_full.csv'
ratings_full_df = pd.read_csv(user_ratings_full_path)

# Personal ratings, stored as a JSON object keyed by movie id.
jesse_ratings_path = '/content/drive/My Drive/cs229_proj/movie_to_jesse_rating.json'
movie_to_jesse_rating = {}
with open(jesse_ratings_path, 'r') as ratings_file:
    movie_to_jesse_rating = json.load(ratings_file)

In [None]:
# Collect the ids of the movies I rated, then report how many ratings were loaded.
jesse_movie_set = set(movie_to_jesse_rating)
print(len(movie_to_jesse_rating))

762


In [None]:
# Distinct movie ids that occur anywhere in the user-ratings table.
movie_set = set(ratings_full_df['movie_id'])
print(len(movie_set))

2220


In [None]:
# Movies I rated that also occur in the user-ratings table:
# print the size of the overlap, then the overlap itself.
jesse_intersect = jesse_movie_set & movie_set
print(len(jesse_intersect), jesse_intersect, sep='\n')

402
{'gattaca', 'alvin-and-the-chipmunks', 'i-am-love', 'black-swan', 'thor', 'megamind', 'despicable-me', 'forrest-gump', 'se7en', 'the-secret-life-of-pets', 'pans-labyrinth', 'alice-in-wonderland-2010', 'slumdog-millionaire', 'boogie-nights', 'the-imitation-game', 'star-trek', 'before-midnight', 'kiss-kiss-bang-bang-2005', 'amour', 'jingle-all-the-way', 'x-men-days-of-future-past', 'shakespeare-in-love', 'creed', 'kung-fu-panda', 'how-to-train-your-dragon', 'x-men-apocalypse', 'the-avengers-2012', 'hot-fuzz', 'tron-legacy', 'tangled-2010', 'glory', 'before-sunset', 'vanilla-sky', 'the-chronicles-of-narnia-prince-caspian', 'the-matrix', 'intolerable-cruelty', 'kick-ass', 'children-of-men', 'hugo', 'scott-pilgrim-vs-the-world', 'stand-by-me', 'die-hard', 'the-prestige', 'frozen-2013', 'the-great-gatsby-2013', 'g-force', 'click', 'night-at-the-museum', 'drag-me-to-hell', 'harry-potter-and-the-goblet-of-fire', 'jurassic-park', 'stuart-little-2', 'anomalisa', 'moonrise-kingdom', 'pay-it-f

In [None]:
# Build one record per overlapping movie, attributed to user 'jdoan'.
# Iterate in sorted order: a set of strings has no stable iteration order
# across kernel sessions (string hashing is randomised per process), which
# would otherwise reorder the injected rows on every run.
# NOTE: despite its name, jesse_df is a plain list of dicts at this point; a
# later cell converts it into a DataFrame.
jesse_df = [
    {'user_id': 'jdoan',
     'movie_id': movie,
     'rating_val': movie_to_jesse_rating[movie]}
    for movie in sorted(jesse_intersect)
]


In [None]:
# Dump the raw records for a quick visual check before the DataFrame is built.
print(jesse_df)

[{'user_id': 'jdoan', 'movie_id': 'gattaca', 'rating_val': 6}, {'user_id': 'jdoan', 'movie_id': 'alvin-and-the-chipmunks', 'rating_val': 2}, {'user_id': 'jdoan', 'movie_id': 'i-am-love', 'rating_val': 6}, {'user_id': 'jdoan', 'movie_id': 'black-swan', 'rating_val': 10}, {'user_id': 'jdoan', 'movie_id': 'thor', 'rating_val': 4}, {'user_id': 'jdoan', 'movie_id': 'megamind', 'rating_val': 6}, {'user_id': 'jdoan', 'movie_id': 'despicable-me', 'rating_val': 5}, {'user_id': 'jdoan', 'movie_id': 'forrest-gump', 'rating_val': 7}, {'user_id': 'jdoan', 'movie_id': 'se7en', 'rating_val': 8}, {'user_id': 'jdoan', 'movie_id': 'the-secret-life-of-pets', 'rating_val': 2}, {'user_id': 'jdoan', 'movie_id': 'pans-labyrinth', 'rating_val': 10}, {'user_id': 'jdoan', 'movie_id': 'alice-in-wonderland-2010', 'rating_val': 4}, {'user_id': 'jdoan', 'movie_id': 'slumdog-millionaire', 'rating_val': 7}, {'user_id': 'jdoan', 'movie_id': 'boogie-nights', 'rating_val': 10}, {'user_id': 'jdoan', 'movie_id': 'the-imit

In [None]:
# Turn the list of record dicts into a table (one column per dict key) and
# rebind the same name to it; the bare name on the last line renders the frame.
jesse_df = pd.DataFrame(data=jesse_df)
jesse_df

Unnamed: 0,user_id,movie_id,rating_val
0,jdoan,gattaca,6
1,jdoan,alvin-and-the-chipmunks,2
2,jdoan,i-am-love,6
3,jdoan,black-swan,10
4,jdoan,thor,4
...,...,...,...
397,jdoan,into-the-woods-2014,5
398,jdoan,zero-dark-thirty,7
399,jdoan,the-big-lebowski,9
400,jdoan,aliens,8


In [None]:
# Keep every row whose user is not 'jdoan' -- presumably so that the injection
# below cannot duplicate ratings if this user is already present in the table.
new_df = ratings_full_df[ratings_full_df['user_id'].ne('jdoan')]
new_df

Unnamed: 0,user_id,movie_id,rating_val
0,shawn_stubbs,the-addams-family,9
1,ramiraff,the-truman-show,9
2,joshua_toomes,the-bourne-identity-2002,8
3,henryg9876,up,8
4,denny_crane,eight-legged-freaks,5
...,...,...,...
392175,brunoamato,far-from-heaven,8
392176,brandonh,the-incredible-burt-wonderstone,4
392177,madrianpl,the-abyss,7
392178,dsparky,kung-fu-panda-2,4


In [None]:
# Stack my rows underneath the filtered ratings (row-wise concatenation).
new_rating_df = pd.concat((new_df, jesse_df), axis=0)

In [None]:
# The concatenation kept each frame's own row labels, so labels repeat;
# renumber the rows 0..n-1 and discard the old labels (drop=True).
new_rating_df = new_rating_df.reset_index(drop=True)
new_rating_df

Unnamed: 0,user_id,movie_id,rating_val
0,shawn_stubbs,the-addams-family,9
1,ramiraff,the-truman-show,9
2,joshua_toomes,the-bourne-identity-2002,8
3,henryg9876,up,8
4,denny_crane,eight-legged-freaks,5
...,...,...,...
392546,jdoan,into-the-woods-2014,5
392547,jdoan,zero-dark-thirty,7
392548,jdoan,the-big-lebowski,9
392549,jdoan,aliens,8


In [None]:
# Show the ratings table as loaded from CSV; the cells above derive new frames
# from it and do not modify it.
ratings_full_df

Unnamed: 0,user_id,movie_id,rating_val
0,shawn_stubbs,the-addams-family,9
1,ramiraff,the-truman-show,9
2,joshua_toomes,the-bourne-identity-2002,8
3,henryg9876,up,8
4,denny_crane,eight-legged-freaks,5
...,...,...,...
392175,brunoamato,far-from-heaven,8
392176,brandonh,the-incredible-burt-wonderstone,4
392177,madrianpl,the-abyss,7
392178,dsparky,kung-fu-panda-2,4


In [None]:
# Disabled save step: strips "Unnamed*" columns and writes the CSVs back out.
# NOTE(review): as written, the column clean-up is applied to ratings_full_df
# but the frame that gets saved is new_rating_df, and the write reuses the
# file name that was read as input -- confirm both before re-enabling.
# movies_full_df = movies_full_df.loc[:,~movies_full_df.columns.str.match("Unnamed")]
# ratings_full_df = ratings_full_df.loc[:,~ratings_full_df.columns.str.match("Unnamed")]
# movies_full_df.to_csv('movies_full.csv', index=False)
# new_rating_df.to_csv('user_ratings_full.csv', index=False)