In [40]:
import sys
# Print the path of the Python interpreter backing this kernel (handy for checking
# that the installs below go into the same Python installation).
print(sys.executable)

# !pip3.10 install numpy
# !pip3.10 install tenacity
# !pip3.10 install pandas
# !pip3.10 install tiktoken
# !pip3.10 install  matplotlib
# also: plotly, scipy, scikit-learn (scikit-learn is the pip package name; it is imported as sklearn)

/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10


In [72]:
import openai
from dotenv import dotenv_values
import os

# Location of the .env file that holds OPENAI_API_KEY. Set the OPENAI_DOTENV_PATH
# environment variable to point at your own file; when it is unset, the path the
# notebook was originally written with is used.
env_file_path = os.environ.get(
    "OPENAI_DOTENV_PATH",
    "/Users/julianboyce/Documents/work/gpt/.env",
)
config = dotenv_values(env_file_path)

# Fail with an actionable message instead of a bare KeyError('OPENAI_API_KEY').
# NOTE(review): dotenv_values appears to return an empty mapping for a missing file,
# so a wrong path would also surface here - confirm against the python-dotenv docs.
if "OPENAI_API_KEY" not in config:
    raise KeyError(
        f"OPENAI_API_KEY not found in {env_file_path!r}; "
        "check the path or set OPENAI_DOTENV_PATH to your .env file"
    )
openai.api_key = config["OPENAI_API_KEY"]

In [42]:
# Sanity check that your API key was loaded (leave the print commented out so the key is not saved in the notebook output)
####### print(openai.api_key)

In [43]:
import pandas as pd
import numpy as np
from tenacity import retry, wait_random_exponential, stop_after_attempt
import pickle
import tiktoken

## Load The Movie Data

Reference: https://www.udemy.com/course/mastering-openai/learn/lecture/37398616#overview

You can download the full movie dataset, [Movie Plots](https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots), from kaggle.com.

Save it next to this notebook under the file name `wiki_movie_plots_deduped.csv`.

In [44]:
# Path is relative to the notebook's working directory.
dataset_path = "./wiki_movie_plots_deduped.csv"
# Load the whole CSV; later cells rely on the "Origin/Ethnicity", "Release Year",
# "Title", "Genre" and "Plot" columns.
df = pd.read_csv(dataset_path)

In [52]:
# Narrow our data set to the 1000 most recent American movies (!!!to save money!!!)
# Every selected plot is sent to the embeddings API later on, so this number drives the cost.
N_MOVIES = 1000
movies = df[df["Origin/Ethnicity"] == "American"].sort_values("Release Year", ascending=False).head(N_MOVIES)

In [53]:
# Display the selected movies (pandas renders a truncated preview for large frames).
movies

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
17376,2017,Phantom Thread,American,Paul Thomas Anderson,Paul Thomas Anderson (director/screenplay); Da...,drama,https://en.wikipedia.org/wiki/Phantom_Thread,"In 1954 London, renowned fashion designer Reyn..."
17243,2017,"Everything, Everything",American,Stella Meghie,Stella Meghie (director); J. Mills Goodloe (sc...,"romance, drama","https://en.wikipedia.org/wiki/Everything,_Ever...","Eighteen-year-old Maddy suffers from SCID, an ..."
17241,2017,Alien: Covenant,American,Ridley Scott,"Ridley Scott (director); Michael Green, Jack P...","sci-fi, horror",https://en.wikipedia.org/wiki/Alien:_Covenant,"In a prologue, business magnate Peter Weyland ..."
17240,2017,Paris Can Wait,American,Eleanor Coppola,Eleanor Coppola (director/screenplay); Diane L...,"comedy, romance",https://en.wikipedia.org/wiki/Paris_Can_Wait,Anne (Diane Lane) is in Cannes with her husban...
17239,2017,The Wall,American,Doug Liman,Doug Liman (director); Dwain Worrell (screenpl...,"drama, thriller",https://en.wikipedia.org/wiki/The_Wall_(2017_f...,"During the Iraq War, U.S. Army Staff Sergeant ..."
...,...,...,...,...,...,...,...,...
16217,2012,Celeste and Jesse Forever,American,Lee Toland Krieger,"Rashida Jones, Andy Samberg, Chris Messina, Ar...",romantic comedy,https://en.wikipedia.org/wiki/Celeste_and_Jess...,Celeste (Rashida Jones) and Jesse (Andy Samber...
16246,2012,Deadline,American,Curt Hahn,"Eric Roberts, Steve Talley, J.D. Souther, Davi...",drama,https://en.wikipedia.org/wiki/Deadline_(2012_f...,Young newsparer reporter Matt Harper (Steve Ta...
16216,2012,Casa de Mi Padre,American,Matt Piedmont,"Will Ferrell, Gael García Bernal, Diego Luna, ...",comedy,https://en.wikipedia.org/wiki/Casa_de_Mi_Padre,Armando Álvarez (Will Ferrell) has lived and w...
16245,2012,Deadfall,American,Stefan Ruzowitzky,"Eric Bana, Olivia Wilde, Jason Cavalier, Charl...",drama,https://en.wikipedia.org/wiki/Deadfall_(2012_f...,"After a casino heist gone wrong, siblings Addi..."


In [54]:
# Extract the movie plots as an array (the `.values` of the "Plot" column, not a Python list),
# in the same row order as `movies`
movie_plots = movies["Plot"].values

## Generating The Embeddings

In [55]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text, model="text-embedding-ada-002"):
    """Request the embedding of ``text`` from the OpenAI Embedding endpoint.

    The call is wrapped in tenacity's ``retry``: it waits with a randomized
    exponential backoff (min=1, max=20) between tries and stops after 6 attempts.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``"embedding"`` entry of the first item in the response's ``"data"``.
    """
    # Newlines can negatively affect performance, so put the text on a single line.
    flattened = " ".join(text.split("\n"))
    response = openai.Embedding.create(input=flattened, model=model)
    first_result = response["data"][0]
    return first_result["embedding"]

In [56]:
# Tokenizer for the embedding model; used to count tokens before any API request is made.
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

In [57]:
# Tokenize every plot with `enc` and add up the token counts.
total_tokens = sum(len(enc.encode(plot_text)) for plot_text in movie_plots)

In [58]:
# Estimate what embedding all plots will cost before calling the API.
# Assumed price in USD per 1,000 tokens - TODO confirm against current OpenAI pricing for the model.
USD_PER_1K_TOKENS = 0.0004
cost = total_tokens * (USD_PER_1K_TOKENS / 1000)
print(f"Estimated cost ${cost:.2f}")

Estimated cost $0.30


## TODO: Instead of storing this to a file you should persist this to a vector database

In [59]:
# Establish a cache of embeddings so that already-embedded texts are not requested again.
# The cache is a dict mapping (text, model) tuples to embeddings, persisted as a pickle file.

# Path of the pickle file backing the cache (relative to the working directory).
embedding_cache_path = "movie_embeddings_cache2.pkl"

# Load the cache if the file exists, otherwise start with an empty dict; then write it
# straight back so the file exists from here on.
# Only a missing file is handled - any other load error propagates.
# NOTE: unpickling can execute arbitrary code; only load a cache file you created yourself.
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}
with open(embedding_cache_path, "wb") as embedding_cache_file:
    pickle.dump(embedding_cache, embedding_cache_file)

# define a function to retrieve embeddings from the cache if present, and otherwise request via the API
def embedding_from_string(
    string,
    model="text-embedding-ada-002",
    embedding_cache=embedding_cache
):
    """Return embedding of given string, using a cache to avoid recomputing.

    Args:
        string: The text to embed.
        model: Embedding model name; part of the cache key, so the same text
            embedded with different models is cached separately.
        embedding_cache: Dict mapping ``(string, model)`` to an embedding.
            Defaults to the module-level cache and is updated in place on a miss.

    Returns:
        The embedding stored in the cache for ``(string, model)``.

    Side effects:
        On a cache miss, calls ``get_embedding``, prints a progress line and
        rewrites the pickle file at ``embedding_cache_path``.
    """
    import os  # local import so this cell does not depend on an `import os` elsewhere

    cache_key = (string, model)
    if cache_key not in embedding_cache:
        embedding_cache[cache_key] = get_embedding(string, model)
        print(f"GOT EMBEDDING FROM OPENAI FOR {string[:20]}")
        # Write to a temporary file first and then swap it into place: opening the
        # real cache with "wb" truncates it, so an interruption mid-dump would
        # otherwise destroy every embedding that had already been paid for.
        temp_cache_path = f"{embedding_cache_path}.tmp"
        with open(temp_cache_path, "wb") as embedding_cache_file:
            pickle.dump(embedding_cache, embedding_cache_file)
        os.replace(temp_cache_path, embedding_cache_path)
    return embedding_cache[cache_key]

In [60]:
# This line actually generates the embeddings: plots already in the cache are reused,
# every other plot triggers an OpenAI request (logged as "GOT EMBEDDING FROM OPENAI FOR ...").
plot_embeddings = [embedding_from_string(plot, model="text-embedding-ada-002") for plot in movie_plots]

GOT EMBEDDING FROM OPENAI FOR In 1954 London, reno
GOT EMBEDDING FROM OPENAI FOR Eighteen-year-old Ma
GOT EMBEDDING FROM OPENAI FOR In a prologue, busin
GOT EMBEDDING FROM OPENAI FOR Anne (Diane Lane) is
GOT EMBEDDING FROM OPENAI FOR During the Iraq War,
GOT EMBEDDING FROM OPENAI FOR A contemporary tale 
GOT EMBEDDING FROM OPENAI FOR Recently fired from 
GOT EMBEDDING FROM OPENAI FOR The movie opens with
GOT EMBEDDING FROM OPENAI FOR Mary (Debra Winger) 
GOT EMBEDDING FROM OPENAI FOR In 2014, Peter Quill
GOT EMBEDDING FROM OPENAI FOR A young street magic
GOT EMBEDDING FROM OPENAI FOR Having made a career
GOT EMBEDDING FROM OPENAI FOR When her car breaks 
GOT EMBEDDING FROM OPENAI FOR Mikael (Oscar Isaac)
GOT EMBEDDING FROM OPENAI FOR Julia Banks is being
GOT EMBEDDING FROM OPENAI FOR This documentary fol
GOT EMBEDDING FROM OPENAI FOR Best friends Mindy (
GOT EMBEDDING FROM OPENAI FOR Dash (voiced by Schw
GOT EMBEDDING FROM OPENAI FOR Ireland, 1905: Percy
GOT EMBEDDING FROM OPENAI FOR T

GOT EMBEDDING FROM OPENAI FOR San Francisco, 1998:
GOT EMBEDDING FROM OPENAI FOR Robin Cavendish fall
GOT EMBEDDING FROM OPENAI FOR Frankie (Harris Dick
GOT EMBEDDING FROM OPENAI FOR Patricia "Dumbo" Dom
GOT EMBEDDING FROM OPENAI FOR Jimmy Logan is laid 
GOT EMBEDDING FROM OPENAI FOR Private UK-based bod
GOT EMBEDDING FROM OPENAI FOR Thomas Webb (Callum 
GOT EMBEDDING FROM OPENAI FOR Ingrid Thorburn is a
GOT EMBEDDING FROM OPENAI FOR Connie Nikas forcibl
GOT EMBEDDING FROM OPENAI FOR Surly Squirrel is no
GOT EMBEDDING FROM OPENAI FOR The unconventional, 
GOT EMBEDDING FROM OPENAI FOR In 1943, dollmaker S
GOT EMBEDDING FROM OPENAI FOR A team of trained op
GOT EMBEDDING FROM OPENAI FOR During a winter seas
GOT EMBEDDING FROM OPENAI FOR Karla Dyson (Halle B
GOT EMBEDDING FROM OPENAI FOR Eleven-year-old Jake
GOT EMBEDDING FROM OPENAI FOR James lives in an un
GOT EMBEDDING FROM OPENAI FOR On July 23, 1967, th
GOT EMBEDDING FROM OPENAI FOR Gene is an emoji tha
GOT EMBEDDING FROM OPENAI FOR W

GOT EMBEDDING FROM OPENAI FOR Shortly after the de
GOT EMBEDDING FROM OPENAI FOR After surviving the 
GOT EMBEDDING FROM OPENAI FOR Sixteen-year-old asp
GOT EMBEDDING FROM OPENAI FOR As a result of the B
GOT EMBEDDING FROM OPENAI FOR Sophie, a 10-year-ol
GOT EMBEDDING FROM OPENAI FOR A young Charlie Roan
GOT EMBEDDING FROM OPENAI FOR A Jack Russell Terri
GOT EMBEDDING FROM OPENAI FOR Brothers Mike and Da
GOT EMBEDDING FROM OPENAI FOR The story begins in 
GOT EMBEDDING FROM OPENAI FOR During the 1980s, U.
GOT EMBEDDING FROM OPENAI FOR Physicists Abby Yate
GOT EMBEDDING FROM OPENAI FOR Twenty years after t
GOT EMBEDDING FROM OPENAI FOR Scrat, trying to bur
GOT EMBEDDING FROM OPENAI FOR A supermarket called
GOT EMBEDDING FROM OPENAI FOR In a textile factory
GOT EMBEDDING FROM OPENAI FOR After faking his dea
GOT EMBEDDING FROM OPENAI FOR Rocky, Alex, and Mon
GOT EMBEDDING FROM OPENAI FOR In feudal Japan, 12-
GOT EMBEDDING FROM OPENAI FOR A Jewish nobleman, J
GOT EMBEDDING FROM OPENAI FOR F

GOT EMBEDDING FROM OPENAI FOR In 2002, Ava, a 14-y
GOT EMBEDDING FROM OPENAI FOR When a blue collar w
GOT EMBEDDING FROM OPENAI FOR Two years after the 
GOT EMBEDDING FROM OPENAI FOR In 1993, in the work
GOT EMBEDDING FROM OPENAI FOR Hazel Grace Lancaste
GOT EMBEDDING FROM OPENAI FOR Allyson Field is a y
GOT EMBEDDING FROM OPENAI FOR In 1985, 13-year-old
GOT EMBEDDING FROM OPENAI FOR When Joel (Rudd) and
GOT EMBEDDING FROM OPENAI FOR As a young boy, Noah
GOT EMBEDDING FROM OPENAI FOR Cedric (Kevin Hart) 
GOT EMBEDDING FROM OPENAI FOR In Belleville, New J
GOT EMBEDDING FROM OPENAI FOR Three MIT students –
GOT EMBEDDING FROM OPENAI FOR After performing a s
GOT EMBEDDING FROM OPENAI FOR Five years after the
GOT EMBEDDING FROM OPENAI FOR Thirteen-year-old Ja
GOT EMBEDDING FROM OPENAI FOR Two years following 
GOT EMBEDDING FROM OPENAI FOR In 2015, an alien ra
GOT EMBEDDING FROM OPENAI FOR Mac (Seth Rogen) and
GOT EMBEDDING FROM OPENAI FOR In 1882, in the town
GOT EMBEDDING FROM OPENAI FOR M

GOT EMBEDDING FROM OPENAI FOR A young couple, Cath
GOT EMBEDDING FROM OPENAI FOR Paul, a mechanic, sp
GOT EMBEDDING FROM OPENAI FOR Mia Hall and her fam
GOT EMBEDDING FROM OPENAI FOR After his girlfriend
GOT EMBEDDING FROM OPENAI FOR Ben and George, a sa
GOT EMBEDDING FROM OPENAI FOR Confronted with the 
GOT EMBEDDING FROM OPENAI FOR Marv regains conscio
GOT EMBEDDING FROM OPENAI FOR Scarlett Marlowe, a 
GOT EMBEDDING FROM OPENAI FOR Professor Jonathan V
GOT EMBEDDING FROM OPENAI FOR Emily (Olesya Rulin)
GOT EMBEDDING FROM OPENAI FOR The story, about los
GOT EMBEDDING FROM OPENAI FOR Following their succ
GOT EMBEDDING FROM OPENAI FOR The story opens in 2
GOT EMBEDDING FROM OPENAI FOR Chip, a young dancer
GOT EMBEDDING FROM OPENAI FOR Frances Halladay is 
GOT EMBEDDING FROM OPENAI FOR Generation Iron chro
GOT EMBEDDING FROM OPENAI FOR The film opens in an
GOT EMBEDDING FROM OPENAI FOR Reggie the turkey ha
GOT EMBEDDING FROM OPENAI FOR Charley Brewster, Ed
GOT EMBEDDING FROM OPENAI FOR P

GOT EMBEDDING FROM OPENAI FOR In the city of New O
GOT EMBEDDING FROM OPENAI FOR In 2009, an elderly 
GOT EMBEDDING FROM OPENAI FOR Blynn Lehman, one of
GOT EMBEDDING FROM OPENAI FOR An expectant couple 
GOT EMBEDDING FROM OPENAI FOR During a sweltering 
GOT EMBEDDING FROM OPENAI FOR After her husband Ma
GOT EMBEDDING FROM OPENAI FOR Season of Miracles c
GOT EMBEDDING FROM OPENAI FOR  Walter Mitty is a n
GOT EMBEDDING FROM OPENAI FOR Salesman Zach Newman
GOT EMBEDDING FROM OPENAI FOR The film begins with
GOT EMBEDDING FROM OPENAI FOR Sam (Josh Gad) is a 
GOT EMBEDDING FROM OPENAI FOR Grace is the young s
GOT EMBEDDING FROM OPENAI FOR While drunk, Robert 
GOT EMBEDDING FROM OPENAI FOR John Moon's wife rec
GOT EMBEDDING FROM OPENAI FOR Charlie Sheen and Li
GOT EMBEDDING FROM OPENAI FOR In preparation for h
GOT EMBEDDING FROM OPENAI FOR As the story begins,
GOT EMBEDDING FROM OPENAI FOR The film opens with 
GOT EMBEDDING FROM OPENAI FOR Set entirely inside 
GOT EMBEDDING FROM OPENAI FOR S

GOT EMBEDDING FROM OPENAI FOR Chris Farraday (Mark
GOT EMBEDDING FROM OPENAI FOR 12-year old Ida Clay
GOT EMBEDDING FROM OPENAI FOR Zack (Haas) is a you
GOT EMBEDDING FROM OPENAI FOR A mixed-blood Native
GOT EMBEDDING FROM OPENAI FOR Eight years after th
GOT EMBEDDING FROM OPENAI FOR Newly transferred co
GOT EMBEDDING FROM OPENAI FOR Swanson (Tim Heideck
GOT EMBEDDING FROM OPENAI FOR In 1760, the Collins
GOT EMBEDDING FROM OPENAI FOR Casper Galloway, an 
GOT EMBEDDING FROM OPENAI FOR Beth Winter (Keaton)
GOT EMBEDDING FROM OPENAI FOR Kate is a shark expe
GOT EMBEDDING FROM OPENAI FOR In February 2012, Se
GOT EMBEDDING FROM OPENAI FOR Arkansas, November 1
GOT EMBEDDING FROM OPENAI FOR Teenager Elena Peter
GOT EMBEDDING FROM OPENAI FOR Will Shaw (Henry Cav
GOT EMBEDDING FROM OPENAI FOR In the Chatham Islan
GOT EMBEDDING FROM OPENAI FOR The film is about a 
GOT EMBEDDING FROM OPENAI FOR The Citizen integrat
GOT EMBEDDING FROM OPENAI FOR Strange electrical c
GOT EMBEDDING FROM OPENAI FOR T

## Plot The Embeddings Using Atlas

In [71]:
# One {"Title": ..., "Genre": ...} dict per movie, in the same row order as `movies`.
data = movies.loc[:, ["Title", "Genre"]].to_dict(orient="records")

In [63]:
from nomic import atlas

In [73]:
# Upload the plot embeddings, together with each movie's Title/Genre record, to Nomic Atlas,
# which creates a project and a map for them (see the log output of this cell).
project = atlas.map_embeddings(
    embeddings=np.array(plot_embeddings),
    data=data
)

[32m2023-04-13 20:37:15.632[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m965[0m - [1mCreating project `flippant-loft` in organization `coltsteele1`[0m
[32m2023-04-13 20:37:16.774[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m100[0m - [1mUploading embeddings to Atlas.[0m
4it [00:09,  2.34s/it]                       
[32m2023-04-13 20:37:26.243[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_add_data[0m:[36m1577[0m - [1mUpload succeeded.[0m
[32m2023-04-13 20:37:26.243[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m119[0m - [1mEmbedding upload succeeded.[0m
[32m2023-04-13 20:37:27.083[0m | [1mINFO    [0m | [36mnomic.project[0m:[36mcreate_index[0m:[36m1282[0m - [1mCreated map `flippant-loft` in project `flippant-loft`: https://atlas.nomic.ai/map/5899fca0-27a9-40c7-9a65-0c962306e065/469708b0-74ef-4839-91bc-7186f85f13d7[0m
[32m2023-04-13 20:37:27.083[0m | [1mINFO  

## Recommending Movies By Plot

In [74]:
from openai.embeddings_utils import distances_from_embeddings, indices_of_nearest_neighbors_from_distances

In [75]:
def print_recommendations_from_strings(
    strings,
    index_of_source_string,
    k_nearest_neighbors=3,
    model="text-embedding-ada-002"
):
    """Print the strings whose embeddings are nearest to a chosen source string.

    Args:
        strings: Indexable collection of texts to compare.
        index_of_source_string: Position in ``strings`` of the query text.
        k_nearest_neighbors: Maximum number of matches to print.
        model: Embedding model name used for every string.

    Returns:
        None. Each match is printed with its rank, its distance and its text.
    """
    # Get all of the embeddings. `model` is passed through so that a non-default
    # model is actually used (it was previously accepted but ignored).
    embeddings = [embedding_from_string(string, model=model) for string in strings]
    # get embedding for our specific query string
    query_embedding = embeddings[index_of_source_string]
    # get distances between our embedding and all other embeddings
    # (uses the default metric of distances_from_embeddings)
    distances = distances_from_embeddings(query_embedding, embeddings)
    # get indices of the nearest neighbors
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)

    query_string = strings[index_of_source_string]
    match_count = 0
    for i in indices_of_nearest_neighbors:
        # Skip the query itself, and any other entry with exactly the same text.
        if query_string == strings[i]:
            continue
        if match_count >= k_nearest_neighbors:
            break
        match_count += 1
        print(f"Found {match_count} closest match: ")
        print(f"Distance of: {distances[i]} ")
        print(strings[i])

In [76]:
# Recommend plots similar to the one at position 2 of movie_plots
# (the third row of the `movies` table shown earlier, "Alien: Covenant").
print_recommendations_from_strings(strings=movie_plots, index_of_source_string=2)

Found 1 closest match: 
Distance of: 0.13645957078907822 
In the near future, the unmanned Pilgrim 7 space probe returns from Mars to Earth orbit with soil samples potentially containing evidence of extraterrestrial life. The probe is captured and its samples retrieved by the International Space Station and its six-member crew. Exobiologist Hugh Derry, who is paralyzed from the waist down, revives a dormant cell from the sample, which quickly grows into a multi-celled organism that American school children name "Calvin". Hugh realizes that Calvin's cells can change their specialisation, acting as muscle, sensor, and neuron cells all at once.
An accident in the lab causes Calvin to become dormant; Hugh attempts to revive Calvin with electric shocks, but Calvin immediately becomes hostile and attacks Hugh, crushing his hand. While Hugh lies unconscious from Calvin's attack, Calvin uses Hugh's electric shock tool to escape its enclosure; now free in the laboratory, Calvin devours a lab r