In [None]:
from google.colab import drive
drive.mount('/content/drive')

# tmdb_api_key = input("Enter TMDb API key: ")

# Root folder inside Google Drive. Change it to a folder that exists in your
# drive; it must already contain the dataset path listed right below.
base_path = "/content/drive/MyDrive/mrec"
movies_path = base_path + "/datasets/movies/movies_metadata.csv"

# Everything from here on is written by the scraper/processing cells, so these
# files do not need to exist beforehand.

# Example API call:
# https://agoodmovietowatch.com/api/content/?page=1&limit=30&content_type=movie&mood=funny
# mood_content_types = ["movie", "show"]
# Full mood list offered by the site (kept for reference):
# moods = ["funny", "romantic", "mind-blowing", "feel-good", "thrilling", "thought-provoking", "weird", "uplifting", "challenging", "dark", "dramatic", "easy", "emotional", "heart-warming", "inspiring", "instructive-2", "intense", "no-plot", "slow", "smart"]
moods = [
  "funny",
  "romantic",
  "feel-good",
  "thrilling",
  "thought-provoking",
  "uplifting",
  "challenging",
  "dark",
  "dramatic",
  "easy",
  "emotional",
  "heart-warming",
  "inspiring",
  "intense",
]
moods_path = base_path + "/scraped_data/moods.json"

# IMDb reviews: raw HTML fragments and the parsed title/content pairs
reviews_raw_path = base_path + "/scraped_data/imdb_reviews_raw.json"
reviews_path = base_path + "/scraped_data/imdb_reviews.json"

# per-mood word rankings and the movies grouped by their assigned mood
ranking_path = base_path + "/scraped_data/word_ranking.json"
mooded_movies_path = base_path + "/scraped_data/mooded_movies.json"

# pickled sentence embeddings
embeddings_path = base_path + "/scraped_data/movie_embeddings.pickle"
base_embeddings_path = base_path + "/scraped_data/base_embeddings.pickle"
mood_embeddings_path = base_path + "/scraped_data/mood_embeddings.pickle"

# movie -> most similar movies
related_path = base_path + "/scraped_data/movie_relations.json"

# word -> movies inverted index
word_to_movies_path = base_path + "/scraped_data/word_to_movies.json"

def get_imdb_id(movie):
  """Return the IMDb title id of a scraped item, or None if it has none.

  Extracts e.g. "tt8009428" from "https://www.imdb.com/title/tt8009428/".
  The id is located by pattern rather than by position, so URLs without a
  trailing slash or with a query string are handled too.

  Args:
    movie: scraped item dict; must contain a "meta" dict (KeyError otherwise).

  Returns:
    The "tt<digits>" id as a string, or None when "meta" has no usable
    "imdb" URL.
  """
  import re  # local import: this cell has no import of its own

  meta = movie["meta"] # required
  imdb_url = meta.get("imdb")
  if not imdb_url:
    return None
  match = re.search(r"(?:^|/)(tt\d+)(?:[/?#]|$)", imdb_url)
  return match.group(1) if match else None

def get_imdb_items():
  """Return the unique IMDb ids of every item listed under `moods`.

  Reads the scraped mood file at `moods_path`; items without an IMDb id are
  skipped. Ids are returned in first-seen order.
  """
  import json  # local import: other cells import json only later in the notebook

  # for now, only use the mood-mapped dataset
  with open(moods_path) as file:
    data = json.load(file)

  # resolve each item's id exactly once
  imdb_ids = (get_imdb_id(item) for mood in moods for item in data[mood])

  # dict.fromkeys removes duplicates while keeping insertion order
  return list(dict.fromkeys(imdb_id for imdb_id in imdb_ids if imdb_id is not None))

Mounted at /content/drive


## Mood Scraper
This will scrape movies from https://agoodmovietowatch.com/mood/ and dump them to Google Drive.

In [None]:
!pip install --quiet --progress-bar off requests
import requests
import json

# Fetch every result page for each mood until the API returns an empty page,
# then store all results as one JSON file keyed by mood.
data = {}
for mood in moods:
  mood_data = []
  page = 1
  while True:
    url = f"https://agoodmovietowatch.com/api/content/?page={page}&mood={mood}"
    # a timeout keeps a stalled connection from hanging the cell forever, and
    # raise_for_status surfaces HTTP errors instead of failing later on ["data"]
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    page_data = response.json()["data"]
    if len(page_data) == 0: break
    mood_data += page_data
    page += 1
  # `page` counts requests made, including the final empty page
  print(f"Scraped {page} pages for mood {mood}")
  data[mood] = mood_data

with open(moods_path, "w") as file:
  json.dump(data, file)

Scraped 17 pages for mood funny
Scraped 9 pages for mood romantic
Scraped 8 pages for mood mind-blowing
Scraped 8 pages for mood feel-good
Scraped 18 pages for mood thrilling
Scraped 18 pages for mood thought-provoking
Scraped 9 pages for mood weird
Scraped 8 pages for mood uplifting
Scraped 8 pages for mood challenging
Scraped 7 pages for mood dark
Scraped 14 pages for mood dramatic
Scraped 12 pages for mood easy
Scraped 11 pages for mood emotional
Scraped 8 pages for mood heart-warming
Scraped 8 pages for mood inspiring
Scraped 9 pages for mood instructive-2
Scraped 10 pages for mood intense
Scraped 5 pages for mood no-plot
Scraped 9 pages for mood slow
Scraped 6 pages for mood smart


## Data Processing

In [None]:
import json
import csv

# Print how many items each mood has, then report the scraped IMDb ids that do
# not appear in the larger movies_metadata.csv dataset.
with open(moods_path) as file:
  data = json.load(file)

total = 0
for mood in moods:
  print(f"{mood}: {len(data[mood])}")
  total += len(data[mood])
print(f"total {total}")

# every IMDb id found in the mood data, de-duplicated in first-seen order
scraped_ids = list(dict.fromkeys(
  imdb_id
  for mood in moods
  for imdb_id in (get_imdb_id(item) for item in data[mood])
  if imdb_id is not None
))

# the CSV has no index by id, so read it once and collect its ids into a set
with open(movies_path, newline='') as file:
  dataset_ids = {row["imdb_id"] for row in csv.DictReader(file)}

# `unknown` is the list of scraped items that are not in the larger dataset
unknown = [imdb_id for imdb_id in scraped_ids if imdb_id not in dataset_ids]
if len(unknown) != 0:
  print(f"I could not find {len(unknown)} items in the dataset: {unknown}")

funny: 151
romantic: 75
mind-blowing: 63
feel-good: 69
thrilling: 161
thought-provoking: 167
weird: 80
uplifting: 61
challenging: 68
dark: 51
dramatic: 126
easy: 106
emotional: 92
heart-warming: 62
inspiring: 62
instructive-2: 71
intense: 84
no-plot: 36
slow: 74
smart: 46
total 1705
I could not find 331 items in the dataset: ['tt8009428', 'tt10417836', 'tt11317142', 'tt7826376', 'tt4677934', 'tt10801368', 'tt12851524', 'tt14129378', 'tt14064072', 'tt12614214', 'tt11394180', 'tt10366460', 'tt11000902', 'tt13660958', 'tt8036816', 'tt10642834', 'tt8980602', 'tt0459159', 'tt14218830', 'tt7886936', 'tt8594324', 'tt13406094', 'tt7158430', 'tt1489887', 'tt8129450', 'tt3147316', 'tt9138170', 'tt8526872', 'tt9484998', 'tt4686844', 'tt10098620', 'tt4971344', 'tt7639280', 'tt8594510', 'tt5957766', 'tt10062292', 'tt5619658', 'tt5905354', 'tt5688932', 'tt3762198', 'tt5912064', 'tt7476116', 'tt3521126', 'tt7738450', 'tt5791098', 'tt7649694', 'tt5295524', 'tt3824648', 'tt7767422', 'tt4169146', 'tt843

## IMDB Review Scraping (unused)

In [None]:
!pip install --quiet --progress-bar off requests beautifulsoup4
import json
import requests
from bs4 import BeautifulSoup
from time import sleep

items = get_imdb_items()

# https://www.imdb.com/title/tt8009428/reviews/_ajax
raw_data = {}  # item id -> list of raw response bodies, one per fetched page
data = {}      # item id -> list of {"title": ..., "content": ...} reviews

def scrape_review_page(item_id, page):
  """Fetch one page of reviews for `item_id` and record it.

  The raw response text is appended to `raw_data[item_id]` and every
  non-empty review on the page to `data[item_id]`.

  Args:
    item_id: IMDb title id, e.g. "tt8009428".
    page: pagination key of the page to fetch, or None for the first page.

  Returns:
    The pagination key of the next page, or None when there is no
    ".load-more-data" element on the page.
  """
  url = f"https://www.imdb.com/title/{item_id}/reviews/_ajax" + (f"?paginationKey={page}" if page is not None else "")

  # Retry in a loop (not by recursion) so a long outage cannot exhaust the stack.
  # NOTE: this retries forever, including on permanent errors such as 404.
  while True:
    try:
      response = requests.get(url, timeout=30)
    except requests.RequestException as e:
      print(f"Request failed ({e}), sleeping for 30 seconds and retrying...")
      sleep(30)
      continue
    if response.status_code == 200: break
    print("Encountered non-200 status code, sleeping for 30 seconds and retrying...")
    sleep(30)

  response_text = response.text
  raw_data.setdefault(item_id, []).append(response_text)
  soup = BeautifulSoup(response_text)
  for review_item in soup.select(".lister-item"):
    title_el = review_item.select_one(".review-container > .lister-item-content > .title")
    title = title_el.get_text() if title_el is not None else None
    content_el = review_item.select_one(".review-container > .lister-item-content > .content > .text")
    content = content_el.get_text() if content_el is not None else None
    if title is None and content is None: continue # skip review if it's empty
    # string keys: the unquoted names used the review text itself as the keys
    data.setdefault(item_id, []).append({"title": title, "content": content})
  load_more_data = soup.select_one(".load-more-data")
  if load_more_data is None: return None
  return load_more_data.get("data-key")

def scrape_reviews(item_id):
    """Scrape every review page of `item_id` by following the pagination keys."""
    next_page = None
    while True:
      next_page = scrape_review_page(item_id, next_page)
      if next_page is None: break

for position, item_id in enumerate(items, start=1):
  scrape_reviews(item_id)
  print(f"Scraped {item_id} ({position}/{len(items)})")

with open(reviews_raw_path, "w") as file:
  json.dump(raw_data, file)

with open(reviews_path, "w") as file:
  json.dump(data, file)

## Word Count (TF-IDF)

In [None]:
!pip install --quiet --progress-bar off scikit-learn sentence-transformers

from itertools import chain
import json
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

[?25l
[?25h[?25l
[?25h[?25l
[?25h[?25l
[?25h[?25l
[?25h[?25l
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [None]:
# Per-mood lists of the title / overview / tagline of every dataset movie that
# also appears in the scraped mood data.
titles = {mood: [] for mood in moods}
overviews = {mood: [] for mood in moods}
taglines = {mood: [] for mood in moods}

with open(moods_path) as file:
  data = json.load(file)

# Index imdb id -> mood once instead of rescanning all scraped items for every
# CSV row. Only moods listed in `moods` are considered (the JSON file may hold
# more), and a movie listed under several moods keeps the first one.
mood_by_imdb_id = {}
for item_mood in moods:
  for item in data[item_mood]:
    imdb_id = get_imdb_id(item)
    if imdb_id is not None:
      mood_by_imdb_id.setdefault(imdb_id, item_mood)

with open(movies_path, newline='') as file:
  reader = csv.DictReader(file)

  for row in reader:
    mood = mood_by_imdb_id.get(row["imdb_id"])
    if mood is None: continue # this wasn't mood-mapped, skip

    titles[mood].append(row["title"])
    overviews[mood].append(row["overview"])
    taglines[mood].append(row["tagline"])

In [None]:
def word_rankings(top = -1):
  """Rank the words of each mood's collected texts by TF-IDF weight.

  For every mood, all titles, overviews and taglines are joined into one
  document and vectorized with English stop words removed. Because each
  vectorizer is fitted on that single document, the weights are effectively
  normalized term frequencies.

  Args:
    top: number of highest-weighted words to keep per mood. A negative value
      (the default) or None keeps every word.

  Returns:
    Dict mapping mood -> list of (word, weight) pairs, highest weight first.
  """
  ranking = {}
  for mood in moods:
    v = TfidfVectorizer(stop_words='english')
    r = v.fit_transform([' '.join(titles[mood] + overviews[mood] + taglines[mood])])
    words_sorted = sorted(zip(v.get_feature_names_out(), r.toarray()[0]), key=lambda x: x[1], reverse=True)
    # plain str/float so the result serializes cleanly to JSON
    words_sorted = [(str(word), float(weight)) for word, weight in words_sorted]
    # slicing with the default -1 would silently drop the last word
    ranking[mood] = words_sorted if top is None or top < 0 else words_sorted[:top]
  return ranking

In [None]:
rankings = word_rankings(top=50)

In [None]:
# Persist the per-mood word rankings as JSON at `ranking_path`.
with open(ranking_path, "w") as file:
  json.dump(rankings, file)

In [None]:
# Reload the rankings from disk, so later cells can run without recomputing them.
# After the JSON round trip each entry is a [word, weight] list.
rankings = {}
with open(ranking_path) as file:
  rankings = json.load(file)

print(rankings)

{'funny': [['black', 0.21457053771143725], ['family', 0.1716564301691498], ['film', 0.1716564301691498], ['news', 0.1716564301691498], ['story', 0.1716564301691498], ['young', 0.1716564301691498], ['burn', 0.12874232262686236], ['gangster', 0.12874232262686236], ['jane', 0.12874232262686236], ['life', 0.12874232262686236], ['luke', 0.12874232262686236], ['real', 0.12874232262686236], ['cat', 0.0858282150845749], ['cocaine', 0.0858282150845749], ['dadan', 0.0858282150845749], ['day', 0.0858282150845749], ['deal', 0.0858282150845749], ['doesn', 0.0858282150845749], ['dot', 0.0858282150845749], ['euro', 0.0858282150845749], ['event', 0.0858282150845749], ['home', 0.0858282150845749], ['job', 0.0858282150845749], ['lesbian', 0.0858282150845749], ['married', 0.0858282150845749], ['million', 0.0858282150845749], ['network', 0.0858282150845749], ['new', 0.0858282150845749], ['shih', 0.0858282150845749], ['snatch', 0.0858282150845749], ['stolen', 0.0858282150845749], ['stories', 0.085828215084

In [None]:
# Assign every dataset movie to the mood whose top-ranked words overlap most
# with the movie's own title/overview/tagline words.
mooded_movies = {mood: [] for mood in moods}

with open(movies_path, newline='') as file:
  reader = csv.DictReader(file)

  for row in reader:
    title = row['title']
    overview = row['overview']
    tagline = row['tagline']

    try:
      v = TfidfVectorizer(stop_words='english')
      r = v.fit_transform([' '.join([title, overview, tagline])])
      word_to_float = dict(zip(v.get_feature_names_out(), r.toarray()[0]))
    except Exception as e:
      print(f"Error in tf-idf vectorizer for {title} ({row['imdb_id']}): {e}")
      # skip the row: falling through would score it with the previous
      # movie's words (or fail outright on the first row)
      continue

    # mood score = sum over the mood's top words of (movie weight * mood weight)
    scores = {
      mood: sum(word_to_float.get(word, 0) * weight for word, weight in rankings[mood])
      for mood in moods
    }

    # ties (including "no overlap at all") resolve to the first mood in `moods`
    best_mood = max(scores, key=scores.get)

    row["score"] = float(scores[best_mood])
    mooded_movies[best_mood].append(row)

In [None]:
import json

# Persist the mood -> scored movie rows mapping as JSON at `mooded_movies_path`.
with open(mooded_movies_path, "w") as file:
  json.dump(mooded_movies, file)

In [None]:
import json

# Reload the mood -> movies mapping and show the five best-scoring titles per mood.
with open(mooded_movies_path) as file:
  mooded_movies = json.load(file)

for mood in moods:
  ranked = mooded_movies[mood]
  # highest score first; the sorted list stays stored under the mood
  ranked.sort(key=lambda entry: entry["score"], reverse=True)
  top_five = [f"- {entry['title']}" for entry in ranked[:5]]
  print(f"{mood}:")
  print("\n".join(top_five))
  print("-----")

funny:
- The Watermelon Woman
- The Story of Luke
- The Young Offenders
- Broadcast News
- Black Cat, White Cat
-----
romantic:
- Keith
- Kal Ho Naa Ho
- The Black Orchid
- Julayi
- Original Sin
-----
mind-blowing:
- 35 Shots of Rum
- The Double Life of Veronique
- Primer
- Christ lives in Siberia
- Carnal Knowledge
-----
feel-good:
- Whisper of the Heart
- The Grand Seduction
- The Wackness
- Somers Town
- Mr. Deeds Goes to Town
-----
thrilling:
- Do You Wanna Know a Secret?
- Clockers
- Croupier
- Dry Cleaning
- 13 Tzameti
-----
thought-provoking:
- Grizzly Man
- Alfie
- H.M. Pulham, Esq.
- Yield to the Night
- One Hundred and One Nights
-----
weird:
- Lars and the Real Girl
- For a Handful of Kisses
- Down by Love
- U Be Dead
- A Summer Story
-----
uplifting:
- Rams
- Chef
- We Are the Best!
- Salmon Fishing in the Yemen
- Kung Fu Chefs
-----
challenging:
- The Prestige
- Blackfish
- Death by Death
- The Whistleblower
- Snowtown
-----
dark:
- No Man's Island
- Martha Marcy May Marle

## BERT Embedding

In [None]:
!pip install --quiet --progress-bar off sentence-transformers
import json, csv
from sentence_transformers import SentenceTransformer

[?25l
[?25h[?25l
[?25h[?25l
[?25h[?25l
[?25h[?25l
[?25h[?25l
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [None]:
model = SentenceTransformer("all-MiniLM-L6-v2")

def try_float(s):
  """Parse `s` as a float, returning 0.0 when it is not a usable number.

  Unparseable values (None, "", arbitrary text) and NaN map to 0.0, so the
  result is always safe to use as a sort key.
  """
  import math  # local import: math is not imported elsewhere in this notebook

  try:
    value = float(s)
  except (TypeError, ValueError, OverflowError):
    return 0.0
  # NaN compares false with everything and would make sorting inconsistent
  return 0.0 if math.isnan(value) else value

# Embed the 5000 most popular dataset movies. Each row keeps its CSV fields
# plus an "embedding" entry holding the pickled vector.
embedded_movies = []
with open(movies_path, newline='') as file:
  reader = csv.DictReader(file)

  for row in sorted(reader, key=lambda x: try_float(x['popularity'] or "0.0"), reverse=True)[:5000]:
    title = row['title']
    overview = row['overview']
    tagline = row['tagline']

    try:
      # named `text` so the builtin `input` is not shadowed
      text = title + ". " + overview + ". " + tagline
      print(title + ": " + str(len(text)))
      r = model.encode(text)
    except Exception as e:
      print(f"Error embedding for {title} ({row['imdb_id']}): {e}")
      # skip the row: falling through would store the previous movie's embedding
      continue

    row["embedding"] = r.dumps()
    embedded_movies.append(row)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Minions: 216
Wonder Woman: 139
Beauty and the Beast: 202
Baby Driver: 174
Big Hero 6: 231
Deadpool: 399
Guardians of the Galaxy Vol. 2: 170
Avatar: 212
John Wick: 152
Gone Girl: 225
The Hunger Games: Mockingjay - Part 1: 175
War for the Planet of the Apes: 515
Captain America: Civil War: 340
Pulp Fiction: 320
Pirates of the Caribbean: Dead Men Tell No Tales: 520
The Dark Knight: 430
Blade Runner: 295
The Avengers: 336
Captain Underpants: The First Epic Movie: 214
The Circle: 338
The Bad Batch: 121
The Maze Runner: 277
Dawn of the Planet of the Apes: 260
Alien: Covenant: 347
Ghost in the Shell: 265
Boyka: Undisputed IV: 379
Whiplash: 189
Fight Club: 332
What Happened to Monday: 297
Wish Upon: 136
Thor: Ragnarok: 286
Logan: 251
Guardians of the Galaxy: 218
47 Meters Down: 297
The Shawshank Redemption: 466
The Last King: 333
The Dark Tower: 414
John Wick: Chapter 2: 308
The Fate of the Furious: 198
Forrest Gump: 509
Pirates of the Caribbean: The Curse of the Black Pearl: 388
Security: 496

In [None]:
import pickle
# Persist the embedded movie rows as a pickle at `embeddings_path`.
with open(embeddings_path, "wb") as file:
  pickle.dump(embedded_movies, file)

In [None]:
import html
import re

# Embed every scraped reference movie of each mood. Each item keeps its scraped
# fields plus an "embedding" entry holding the pickled vector.
embedded_moods = {mood: [] for mood in moods}
with open(moods_path) as file:
  movies = json.load(file)

  for mood in moods:
    for movie in movies[mood]:
      # drop the " (YYYY)" suffix and decode HTML entities such as "&#038;"
      title = html.unescape(re.sub(" \\(\\d{4}\\)", "", movie['title']))
      overview = html.unescape(movie.get('shortSummary') or '')
      # todo: lookup summary/long overview in other dataset

      try:
        text = title + ". " + overview
        print(title + ": " + str(len(text)))
        r = model.encode(text)
      except Exception as e:
        # scraped items carry their IMDb link under "meta", not an "imdb_id" key
        imdb_ref = (movie.get("meta") or {}).get("imdb", "unknown id")
        print(f"Error embedding for {title} ({imdb_ref}): {e}")
        # skip the item: falling through would store the previous embedding
        continue

      movie["embedding"] = r.dumps()
      embedded_moods[mood].append(movie)

Hustle: 80
Work in Progress: 80
Shiva Baby: 86
Upload: 93
Documentary Now!: 76
Starstruck: 62
Only Murders in the Building: 83
Killing It: 76
Donkeyhead: 50
Rushmore: 81
The Death of Mr. Lazarescu: 101
Bagdad Café: 60
The Afterparty: 86
Dick Johnson Is Dead: 79
CODA: 60
Our Flag Means Death: 93
Pretend it’s a City: 86
Defending Your Life: 81
This Close: 86
The Forty-Year-Old Version: 87
Broadcast News: 83
The Kid Detective: 121
The Thick of It: 83
Abbott Elementary: 54
The Daytrippers: 65
Raising Victor Vargas: 93
Dating Amber: 56
Ghosts: 84
The White Lotus: 65
The African Doctor: 82
Hearts Beat Loud: 46
Booksmart: 121
Everybody Wants Some!!: 149
Breeders: 79
Deadbeat: 57
Limbo: 104
Living in Oblivion: 85
Dolemite Is My Name: 68
Palm Springs: 55
The Death of Stalin: 21
Feel Good: 55
The Young Offenders: 45
The Sisters Brothers: 22
Man Like Mobeen: 138
Back to Life: 99
Loudermilk: 73
Never Have I Ever: 94
Flowers: 95
Stan &#038; Ollie: 68
Norsemen: 60
Wild Tales: 173
Sorry to Bother You

In [None]:
import pickle
# Persist the per-mood embedded reference movies as a pickle at `base_embeddings_path`.
with open(base_embeddings_path, "wb") as file:
  pickle.dump(embedded_moods, file)

## Compare Embeddings

In [None]:
!pip install --quiet --progress-bar off scikit-learn
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
import pickle
# Load the embedded dataset movies and the per-mood reference movies.
# NOTE: pickle can execute arbitrary code; only load files written by this notebook.
with open(embeddings_path, "rb") as file:
  embeddings = pickle.load(file)

with open(base_embeddings_path, "rb") as file:
  base_embeddings = pickle.load(file)

# decode the stored vectors of every reference movie, grouped by mood
loaded_base_embeddings = {}
for mood in moods:
  loaded_base_embeddings[mood] = [pickle.loads(base["embedding"]) for base in base_embeddings[mood]]

In [None]:
# Sanity check: list the ten dataset movies most similar to one reference movie.
# print([x["title"] for x in embeddings if x["title"] in [m["title"] for m in base_embeddings]])
title = "Starstruck"
ms = []

# search every mood's reference list; looking only at the first mood fails for
# titles that are listed under a different mood
matches = [x for mood in moods for x in base_embeddings[mood] if title in x["title"]]
if not matches:
  raise LookupError(f"No reference movie has a title containing {title!r}")
b = matches[0]
b_embedding = pickle.loads(b["embedding"])
print(b["title"])

for m in embeddings:
  m_embedding = pickle.loads(m["embedding"])
  # despite the name this is a cosine similarity: higher means more alike
  distance = float(cosine_similarity([m_embedding], [b_embedding])[0][0])
  
  ms.append({
    "title": m["title"],
    "genres": m["genres"],
    "release_date": m["release_date"],
    "distance": distance
  })

print(json.dumps(sorted(ms, key=lambda x: x["distance"], reverse=True)[:10], indent=2))


Starstruck
[
  {
    "title": "Galaxy Quest",
    "genres": "[{'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}, {'id': 878, 'name': 'Science Fiction'}]",
    "release_date": "1999-12-23",
    "distance": 0.39984142780303955
  },
  {
    "title": "Maps to the Stars",
    "genres": "[{'id': 18, 'name': 'Drama'}]",
    "release_date": "2014-05-21",
    "distance": 0.3658679723739624
  },
  {
    "title": "The Emoji Movie",
    "genres": "[{'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}, {'id': 16, 'name': 'Animation'}]",
    "release_date": "2017-07-28",
    "distance": 0.34616613388061523
  },
  {
    "title": "Ghost Machine",
    "genres": "[{'id': 28, 'name': 'Action'}, {'id': 27, 'name': 'Horror'}, {'id': 878, 'name': 'Science Fiction'}, {'id': 53, 'name': 'Thriller'}]",
    "release_date": "2009-02-01",
    "distance": 0.3414096236228943
  },
  {
    "title": "Pirates of Silicon Valley",
    "genres": "[{'id': 99, 'name': 'Documentary'}, {'id': 18, 'name': 

In [None]:
# Assign each embedded movie to the mood whose reference movies it is, on
# average, most similar to.
movies = {mood: [] for mood in moods}

for movie in embeddings:
  m_embedding = pickle.loads(movie["embedding"])
  top_mood = None
  top_score = None

  for mood in moods:
    references = loaded_base_embeddings[mood]
    if len(references) == 0: continue # no reference movies: nothing to average

    # one call compares the movie against all references of the mood; the cast
    # to a plain float keeps the score JSON-serializable later on
    mood_score = float(cosine_similarity([m_embedding], references).mean(dtype=float))

    if top_mood is None or mood_score > top_score:
      top_mood = mood
      top_score = mood_score

  if top_mood is None: continue # every mood was empty, nothing to assign

  print(f"{movie['title']} is {top_mood}")
  movie["mood"] = top_mood
  movie["score"] = top_score
  movies[top_mood].append(movie)

Minions is dark
Wonder Woman is dramatic
Beauty and the Beast is emotional
Baby Driver is thrilling
Big Hero 6 is inspiring
Deadpool is emotional
Guardians of the Galaxy Vol. 2 is emotional
Avatar is challenging
John Wick is dark
Gone Girl is challenging
The Hunger Games: Mockingjay - Part 1 is intense
War for the Planet of the Apes is intense
Captain America: Civil War is dramatic
Pulp Fiction is dark
Pirates of the Caribbean: Dead Men Tell No Tales is dark
The Dark Knight is funny
Blade Runner is inspiring
The Avengers is thrilling
Captain Underpants: The First Epic Movie is funny
The Circle is challenging
The Bad Batch is intense
The Maze Runner is emotional
Dawn of the Planet of the Apes is thought-provoking
Alien: Covenant is challenging
Ghost in the Shell is challenging
Boyka: Undisputed IV is intense
Whiplash is inspiring
Fight Club is intense
What Happened to Monday is challenging
Wish Upon is challenging
Thor: Ragnarok is thrilling
Logan is dramatic
Guardians of the Galaxy is 

In [None]:
import pickle
# Persist the mood -> movies mapping (embeddings included) as a pickle at `mood_embeddings_path`.
with open(mood_embeddings_path, "wb") as file:
  pickle.dump(movies, file)

In [None]:
import pickle, json

# Reload the mood -> movies mapping written by the previous cell.
# NOTE: pickle can execute arbitrary code; only load files written by this notebook.
with open(mood_embeddings_path, "rb") as file:
  movies = pickle.load(file)

# drop the "embedding" entry from every movie before exporting
for mood in moods:
  for movie in movies[mood]:
    if "embedding" in movie:
      del movie["embedding"]

# NOTE: this writes to `mooded_movies_path`, the same file the TF-IDF section writes
with open(mooded_movies_path, "w") as file:
  json.dump(movies, file)

## Movie to Movie Related

In [None]:
import pickle

base_embeddings = {}  # reset to empty, as before

# NOTE: pickle can execute arbitrary code; only load files written by this notebook.
with open(embeddings_path, "rb") as file:
  embeddings = pickle.load(file)

# replace each movie's pickled vector with the decoded object, in place
for movie in embeddings:
  movie["embedding"] = pickle.loads(movie["embedding"])

In [None]:
!pip install --quiet --progress-bar off scikit-learn
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# For every movie, keep the TOP_RELATED most similar other movies.
# Result: movie id -> [{"score": similarity, "id": other movie id}, ...],
# most similar first.
TOP_RELATED = 50

# One vectorised call yields the full pairwise similarity matrix. Its size
# follows len(embeddings), so nothing breaks when fewer than 5000 movies were
# embedded.
vectors = [movie["embedding"] for movie in embeddings]
similarity = cosine_similarity(vectors) if vectors else []

related = {}
for i, movie in enumerate(embeddings):
  row = similarity[i]
  # stable descending order; take one extra candidate because the movie
  # itself ranks among its own nearest neighbours and is filtered out
  order = (-row).argsort(kind="stable")[:TOP_RELATED + 1]
  neighbours = [k for k in order if k != i][:TOP_RELATED]
  related[movie["id"]] = [{"score": float(row[k]), "id": embeddings[k]["id"]} for k in neighbours]

In [None]:
# Persist the movie id -> related movies mapping as JSON at `related_path`.
with open(related_path, "w") as file:
  json.dump(related, file)

In [None]:
import json
import pickle

embeddings = []
related = []

# Reload the saved relations (JSON) and the embedded movies (pickle) from disk.
with open(related_path, "r") as file:
  related = json.load(file)

# NOTE: pickle can execute arbitrary code; only load files written by this notebook.
with open(embeddings_path, "rb") as file:
  embeddings = pickle.load(file)

## Word to Movie

In [None]:
import pickle
embeddings = []
# Reload the embedded movie rows; the "embedding" entries are left as stored.
# NOTE: pickle can execute arbitrary code; only load files written by this notebook.
with open(embeddings_path, "rb") as file:
  embeddings = pickle.load(file)

In [None]:
from collections import Counter
import re

# words that carry no meaning for the word -> movie index
stopwords = ['the', 'to', 'and', 'a', 'in', 'it', 'is', 'i', 'that', 'had', 'on', 'for', 'were', 'was', 'by', 'are', 'of', 'an']

def segment(txt):
  """Split `txt` into lowercase index words.

  Tokens are split on whitespace, lowercased, stripped of the punctuation
  characters / " , ' . ‘ ’ ! ? ( ) : and of leading/trailing hyphens.
  Stop words and tokens that end up empty (e.g. a lone "-") are dropped.

  Args:
    txt: text to split; None is treated as an empty string.

  Returns:
    List of words in their original order.
  """
  # raw string: the pattern previously relied on invalid escapes such as "\)"
  punctuation = r"""[/",'.‘’!?():]"""
  tokens = (re.sub(punctuation, "", raw.lower()).strip().strip("-") for raw in (txt or "").split())
  return [token for token in tokens if token and token not in stopwords]

# Inverted index: word -> [{"id": movie id, "score": weighted count}, ...].
# Each occurrence of a word adds the weight of the field it occurs in.
# NOTE(review): the overview weight was written as `0.`, so overview words are
# indexed with a score of 0 unless they also occur elsewhere - confirm that
# this is the intended weight and not a truncated value.
field_weights = {"title": 2, "overview": 0., "tagline": 0.2}

inverted_list = {}

for movie in embeddings:
  words = Counter()
  # each field is counted exactly once; the previous outer loop over the field
  # names ignored its variable and so applied every weight three times
  for key, weight in field_weights.items():
    for w in segment(movie[key] or ""):
      words[w] += weight

  for word, score in words.items():
    inverted_list.setdefault(word, []).append({"id": movie["id"], "score": score})

In [None]:
import json
# Persist the word -> movies inverted index as JSON at `word_to_movies_path`.
with open(word_to_movies_path, "w") as file:
  json.dump(inverted_list, file)

In [None]:
import json
# Reload the word -> movies inverted index from disk.
w_to_m = {}
with open(word_to_movies_path, "r") as file:
  # json.load reads the file; json.dump(file) raised a TypeError here
  w_to_m = json.load(file)

## (Key)word to Embedding to Movies

In [None]:
!pip install --quiet --progress-bar=off sentence-transformers scikit-learn
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
import pickle
embeddings = []
# Reload the embedded movie rows; the "embedding" entries are left as stored.
# NOTE: pickle can execute arbitrary code; only load files written by this notebook.
with open(embeddings_path, "rb") as file:
  embeddings = pickle.load(file)

In [None]:
model = SentenceTransformer("all-MiniLM-L6-v2")

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Decode each movie's stored vector in place. Only values that are still
# pickled bytes are decoded, so re-running this cell is harmless.
for movie in embeddings:
  if isinstance(movie["embedding"], (bytes, bytearray)):
    movie["embedding"] = pickle.loads(movie["embedding"])

In [None]:
# Free-text search: embed the query and print the five closest movies.
query = "minion"
emb = model.encode(query)

# score every movie against the query (the score is stored on the record itself)
for record in embeddings:
  similarity = cosine_similarity([record["embedding"]], [emb])
  record["score"] = similarity[0][0]

ranked = sorted(embeddings, key=lambda record: record["score"], reverse=True)
for hit in ranked[:5]:
  print("-----")
  print(hit["title"])
  print(hit["overview"])
  print("-----\n")

-----
Minions
Minions Stuart, Kevin and Bob are recruited by Scarlet Overkill, a super-villain who, alongside her inventor husband Herb, hatches a plot to take over the world.
-----

-----
Small Fry
A fast food restaurant mini variant of Buzz forcibly switches places with the real Buzz and his friends have to deal with the obnoxious impostor.
-----

-----
The Sandlot
Scotty Smalls moves to a new neighborhood with his mom and stepdad, and wants to learn to play baseball. The neighborhood baseball guru Rodriquez takes Smalls under his wing. They fall into adventures involving baseball, treehouse sleep-ins, the desirous lifeguard at the local pool, the snooty rival ball team, and the travelling fair.
-----

-----
Honey, I Shrunk the Kids
The scientist father of a teenage girl and boy accidentally shrinks his and two other neighborhood teens to the size of insects. Now the teens must fight diminutive dangers as the father searches for them.
-----

-----
Stuart Little
The adventures of a he