# Embeddings

In [1]:
from openai import OpenAI

In [2]:
# load environment variables
import os
from dotenv import load_dotenv

load_dotenv()


True

In [3]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [4]:
# Getting embeddings from OpenAI

client = OpenAI(api_key=OPENAI_API_KEY)

def get_embeddings(input_text: str, model: str = "text-embedding-3-small"):
    response = client.embeddings.create(
        input=input_text,
        model=model
    )
    return response.data[0].embedding

In [5]:
get_embeddings("Hi, this is a test embedding.")

[0.007825891487300396,
 -0.007966236211359501,
 0.02817588299512863,
 -0.030448131263256073,
 -0.009784035384654999,
 -0.03886881843209267,
 0.011120651848614216,
 0.03079565055668354,
 -0.018418580293655396,
 -0.009015480056405067,
 0.033281758427619934,
 -0.01148822158575058,
 -0.014208236709237099,
 -0.03485896810889244,
 0.012758007273077965,
 0.03098277747631073,
 -0.014916643500328064,
 0.031704552471637726,
 -0.013359485194087029,
 0.031062975525856018,
 0.014769615605473518,
 -0.03774606063961983,
 0.013399583287537098,
 0.04886671155691147,
 0.018766099587082863,
 -0.06378335505723953,
 0.03296096995472908,
 0.039777714759111404,
 0.01742948405444622,
 -0.05370526388287544,
 0.014876545406877995,
 -0.03392333537340164,
 -0.0018629096448421478,
 -0.02459375001490116,
 -0.031490691006183624,
 -0.00301407091319561,
 0.005944603122770786,
 0.05333101004362106,
 -0.041943036019802094,
 -0.002763455267995596,
 0.01346641406416893,
 -0.012664444744586945,
 -0.00926943775266409,
 0.02

In [6]:
import requests
import os


In [7]:
headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {os.getenv('TMDB_API_READ_ACCESS_TOKEN')}"
}
def get_movies_top_rated():
    url = "https://api.themoviedb.org/3/movie/top_rated?language=en-US&page=1"
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}")
        return None

In [8]:
movies = get_movies_top_rated()


In [9]:
movies

{'page': 1,
 'results': [{'adult': False,
   'backdrop_path': '/zfbjgQE1uSd9wiPTX4VzsLi0rGG.jpg',
   'genre_ids': [18, 80],
   'id': 278,
   'original_language': 'en',
   'original_title': 'The Shawshank Redemption',
   'overview': 'Imprisoned in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.',
   'popularity': 53.0997,
   'poster_path': '/9cqNxx0GxF0bflZmeSMuL5tnGzr.jpg',
   'release_date': '1994-09-23',
   'title': 'The Shawshank Redemption',
   'video': False,
   'vote_average': 8.712,
   'vote_count': 28610},
  {'adult': False,
   'backdrop_path': '/htuuuEwAvDVECMpb0ltLLyZyDDt.jpg',
   'genre_ids': [18, 80],
   'id': 238,
   'original_language': 'en',
   '

In [10]:
type(movies)

dict

In [11]:
movies.keys()

dict_keys(['page', 'results', 'total_pages', 'total_results'])

In [12]:
movies['results'][0]['overview']

'Imprisoned in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.'

In [13]:
movies_data = [{
    "id": movie['id'],
    "title": movie['title'],
    "overview": movie['overview']
} for movie in movies['results']]


In [14]:
movies_data[0]

{'id': 278,
 'title': 'The Shawshank Redemption',
 'overview': 'Imprisoned in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.'}

In [15]:
def add_context_to_movies(movies_data):
    for movie in movies_data:
        movie['context'] = f"id: {movie['id']} - title: {movie['title']} - overview: {movie['overview']}"
    return movies_data

In [16]:
add_context_to_movies(movies_data)

[{'id': 278,
  'title': 'The Shawshank Redemption',
  'overview': 'Imprisoned in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.',
  'context': 'id: 278 - title: The Shawshank Redemption - overview: Imprisoned in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.'},
 {'id': 238,
  'title': 'The Godfather',
  'overview': 'Spanning the years

In [17]:
len(movies_data)

20

In [18]:
movies_data[3]

{'id': 424,
 'title': "Schindler's List",
 'overview': 'The true story of how businessman Oskar Schindler saved over a thousand Jewish lives from the Nazis while they worked as slaves in his factory during World War II.',
 'context': "id: 424 - title: Schindler's List - overview: The true story of how businessman Oskar Schindler saved over a thousand Jewish lives from the Nazis while they worked as slaves in his factory during World War II."}

In [19]:
def add_embeddings_to_movies(movies_data):
    for movie in movies_data:
        movie['embedding'] = get_embeddings(movie['context'])
    return movies_data


In [20]:
movies_data = add_embeddings_to_movies(movies_data)

In [21]:
movies_data[0]

{'id': 278,
 'title': 'The Shawshank Redemption',
 'overview': 'Imprisoned in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.',
 'context': 'id: 278 - title: The Shawshank Redemption - overview: Imprisoned in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.',
 'embedding': [-0.04386962950229645,
  -0.0006395035306923091,
  0.001978218555

In [22]:
from utils.paths import DATA_RAW_DIR

In [23]:
path_movies = str(DATA_RAW_DIR / 'movies.txt')

# Read file raw/movies.txt
with open(path_movies, 'r', encoding='utf-8') as file:
    movies_txt = file.read()



In [24]:
type(movies_txt)

str

In [27]:
# Scipy for chunks
import spacy
nlp = spacy.load("en_core_web_sm")

In [30]:
# spicy function to split text into chunks

def split_text_spacy(text, max_chunk_size=100, overlap=20):
    doc = nlp(text)
    chunks = []
    current = ""

    for sent in doc.sents:
        if len(current) + len(sent.text) <= max_chunk_size:
            current += " " + sent.text
        else:
            chunks.append(current.strip())

            words = current.split()
            overlap_words = " ".join(words[-(overlap // 5):])
            current = overlap_words + " " + sent.text

    if current:
        chunks.append(current.strip())

    return chunks

In [31]:
chunks = split_text_spacy(movies_txt)

In [32]:
len(chunks)

2

In [36]:
chunks[0:50]

['',
 '{\n  "page": 1,\n  "results": [\n    {\n      "id": 278,\n      "title": "The Shawshank Redemption",\n      "overview": "Imprisoned in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison..."\n    },\n    {\n      "id": 238,\n      "title": "The Godfather",\n      "overview": "Spanning the years 1945 to 1955, a chronicle of the fictional Italian-American Corleone crime family..."\n    },\n    {\n      "id": 240,\n      "title": "The Godfather Part II",\n      "overview": "In the continuing saga of the Corleone crime family, a young Vito Corleone grows up in Sicily..."\n    }\n  ],\n  "total_pages": 514,\n  "total_results": 10264\n}']