# Build a Movie Search Engine using Pinecone

Utilize Kaggle Data: https://grepp-reco-test.s3.ap-northeast-2.amazonaws.com/tmdb_5000_movies.csv

### Load Data

In [2]:
# !pip3 install sentence-transformers

In [3]:
# !pip3 install pinecone

In [5]:
import pandas as pd

# Replace 'your_file.csv' with your actual file name
df = pd.read_csv('tmdb_5000_movies.csv')

# Display the first 5 rows
print(df.head())

      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "nam

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

### Data Cleanup

In [7]:
df['title'] = df['title'].astype(str).fillna('')
df['overview'] = df['overview'].astype(str).fillna('')

In [8]:
df['metadata'] = df.apply(lambda row: {
    'title': row['title'] + " " + str(row['overview'])
}, axis=1)


In [9]:
df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,metadata
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,"{'title': 'Avatar In the 22nd century, a parap..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,{'title': 'Pirates of the Caribbean: At World'...
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,{'title': 'Spectre A cryptic message from Bond...
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,{'title': 'The Dark Knight Rises Following the...
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,{'title': 'John Carter John Carter is a war-we...


### Prep for Upsert

In [10]:
# from google.colab import userdata

# # Use Colab Secrets
# API_KEY = userdata.get('pinecone_key')
import os
from dotenv import load_dotenv

load_dotenv()
API_KEY = os.getenv("PINECONE_API_KEY")


In [11]:
import pinecone
print(pinecone.__version__)


6.0.2


In [12]:
from pinecone import Pinecone

pc = Pinecone(api_key=API_KEY)


In [13]:
from pinecone import ServerlessSpec

# Spec for free tier plan
spec = ServerlessSpec(
    cloud="aws",
    region="us-east-1"
)


In [14]:
index_name = 'semantic-search-fast'

In [15]:
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

In [16]:
print(existing_indexes)

['semantic-search-fast']


In [17]:
print(pc.list_indexes())


[{
    "name": "semantic-search-fast",
    "metric": "dotproduct",
    "host": "semantic-search-fast-0my71l3.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}]


In [18]:
# existing_indexes

In [19]:
import time

# If the following code runs more than once, you might get "The index already existed" error
# In that case, delete the index first: pc.delete_index(index_name)
try:
    pc.delete_index(index_name)
except Exception as e:
    print(e)

pc.create_index(
    index_name,
    dimension=384,  # dimensionality of minilm
    metric='dotproduct',
    spec=spec
)
# wait for index to be initialized
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

In [20]:
from sentence_transformers import SentenceTransformer
import torch

## Generate sentence embeddings and ingest them into Pinecone

If you want to use "cuda", change the runtime to GPU

In [21]:
model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu') # cuda or cpu

In [22]:
import numpy as np
print(np.__version__)

1.24.4


In [23]:
# .map(lambda x: ...): This applies a function to each row in the 'metadata' column.
# model.encode(x["title"]): This uses a sentence embedding model to encode the text in the "title" field
#                           into a vector (embedding).
# .tolist(): Converts the embedding (a NumPy array) to a regular Python list.
df['values'] = df['metadata'].map(
    lambda x: (model.encode(x["title"])).tolist()) # python list, 6k rows 1 min

In [24]:
df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,metadata,values
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,"{'title': 'Avatar In the 22nd century, a parap...","[0.008871081285178661, 0.07623747736215591, 0...."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,{'title': 'Pirates of the Caribbean: At World'...,"[-0.038626134395599365, 0.004829192068427801, ..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,{'title': 'Spectre A cryptic message from Bond...,"[-0.06775256246328354, 0.026975926011800766, -..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,{'title': 'The Dark Knight Rises Following the...,"[0.0033306165132671595, 0.020907564088702202, ..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,{'title': 'John Carter John Carter is a war-we...,"[-0.0004865489318035543, 0.04032321646809578, ..."


In [25]:
df['id'] = df.reset_index(drop = 'index').index

In [26]:
# 'values', 'metadata', 'sparse_values', 'id'
df_upsert = df[['id', 'values', 'metadata']]

In [27]:
df_upsert['id'] = df_upsert['id'].map(lambda x: str(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_upsert['id'] = df_upsert['id'].map(lambda x: str(x))


In [28]:
df_upsert.head()

Unnamed: 0,id,values,metadata
0,0,"[0.008871081285178661, 0.07623747736215591, 0....","{'title': 'Avatar In the 22nd century, a parap..."
1,1,"[-0.038626134395599365, 0.004829192068427801, ...",{'title': 'Pirates of the Caribbean: At World'...
2,2,"[-0.06775256246328354, 0.026975926011800766, -...",{'title': 'Spectre A cryptic message from Bond...
3,3,"[0.0033306165132671595, 0.020907564088702202, ...",{'title': 'The Dark Knight Rises Following the...
4,4,"[-0.0004865489318035543, 0.04032321646809578, ...",{'title': 'John Carter John Carter is a war-we...


In [29]:
index = pc.Index(index_name)

In [30]:
index.upsert_from_dataframe(df_upsert) # 6k takes 1 min

sending upsert requests:   0%|          | 0/4803 [00:00<?, ?it/s]

{'upserted_count': 4803}

### Query

In [31]:
xc = index.query(vector=(model.encode("what is ethics in AI")).tolist(), # python list
           top_k=10,
           include_metadata=True,
           include_values=True)   #

In [32]:
for result in xc['matches']:
    print(result['id'], result['score'], result['metadata'])

363 0.481508672 {'title': 'A.I. Artificial Intelligence A robotic boy, the first programmed to love, David is adopted as a test case by a Cybertronics employee and his wife. Though he gradually becomes their child, a series of unexpected circumstances make this life impossible for David. Without final acceptance by humans or machines, David embarks on a journey to discover where he truly belongs, uncovering a world in which the line between robot and machine is both vast and profoundly thin.'}
266 0.448641 {'title': 'I, Robot In 2035, where robots are common-place and abide by the three laws of robotics, a techno-phobic cop investigates an apparent suicide. Suspecting that a robot may be responsible for the death, his investigation leads him to believe that humanity may be in danger.'}
2654 0.341164619 {'title': 'Automata Jacq Vaucan, an insurance agent of ROC robotics corporation, routinely investigates the case of manipulating a robot. What he discovers will have profound consequence