In [1]:
# !pip install -U sentence-transformers
# !pip install pinecone-client
# !pip install gradio

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from ast import literal_eval

[25k IMDb Movie Dataset - Kaggle](https://www.kaggle.com/datasets/utsh0dey/25k-movie-dataset)

In [3]:
from paths import RAW_DIR

In [4]:
# Load the raw 25k IMDb movie dataset (CSV) from the project's raw-data directory.
raw_csv_path = str(RAW_DIR / "25k-imdb-movie-dataset.csv")
df = pd.read_csv(raw_csv_path)

In [5]:
# Preview the first three rows of the freshly loaded dataset.
df.head(3)

Unnamed: 0,movie title,Run Time,Rating,User Rating,Generes,Overview,Plot Kyeword,Director,Top 5 Casts,Writer,year,path
0,Top Gun: Maverick,"$170,000,000 (estimated)",8.6,187K,"['Action', 'Drama']",After more than thirty years of service as one...,"['fighter jet', 'sequel', 'u.s. navy', 'fighte...",Joseph Kosinski,"['Jack Epps Jr.', 'Peter Craig', 'Tom Cruise',...",Jim Cash,-2022,/title/tt1745960/
1,Jurassic World Dominion,2 hours 27 minutes,6.0,56K,"['Action', 'Adventure', 'Sci-Fi']",Four years after the destruction of Isla Nubla...,"['dinosaur', 'jurassic park', 'tyrannosaurus r...",Colin Trevorrow,"['Colin Trevorrow', 'Derek Connolly', 'Chris P...",Emily Carmichael,-2022,/title/tt8041270/
2,Top Gun,"$15,000,000 (estimated)",6.9,380K,"['Action', 'Drama']",As students at the United States Navy's elite ...,"['pilot', 'male camaraderie', 'u.s. navy', 'gr...",Tony Scott,"['Jack Epps Jr.', 'Ehud Yonay', 'Tom Cruise', ...",Jim Cash,-1986,/title/tt0092099/


In [6]:
def concatenar_lista(lista):
    """Join the items of a list (or of its string representation) with spaces.

    Parameters
    ----------
    lista : str | list | tuple | None
        Either a Python-literal string such as ``"['a', 'b']"`` (the form the
        list columns have when read from the CSV) or an already-parsed
        sequence of strings.

    Returns
    -------
    str
        The items separated by single spaces. Blank strings, ``None`` and
        NaN yield ``''`` so that missing cells do not abort a column-wide
        ``apply``.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``literal_eval`` when a non-blank string is not a
        valid Python literal.
    """
    # None / NaN mark a missing cell (NaN is the only float that differs from itself).
    if lista is None or (isinstance(lista, float) and lista != lista):
        return ''
    if isinstance(lista, str):
        # A whitespace-only string is the placeholder left by ``fillna(' ')``.
        if not lista.strip():
            return ''
        lista = literal_eval(lista)
    # Already-parsed sequences are joined directly, so re-running is harmless.
    return ' '.join(lista)

In [7]:
def string_to_list(lista):
    """Parse the string representation of a list into a real Python list.

    Parameters
    ----------
    lista : str | list | None
        A Python-literal string such as ``"['Action', 'Drama']"``, or a value
        that has already been parsed.

    Returns
    -------
    list
        The parsed literal. Blank strings, ``None`` and NaN yield ``[]``;
        values that are not strings are returned unchanged, which makes the
        function safe to apply twice to the same column.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``literal_eval`` when a non-blank string is not a
        valid Python literal.
    """
    # None / NaN mark a missing cell (NaN is the only float that differs from itself).
    if lista is None or (isinstance(lista, float) and lista != lista):
        return []
    if isinstance(lista, str):
        # A whitespace-only string is the placeholder left by ``fillna(' ')``.
        if not lista.strip():
            return []
        return literal_eval(lista)
    # Already parsed (e.g. the cell was re-run): nothing to do.
    return lista

In [8]:
# Replace missing values with a single space, then derive the cleaned columns in
# one step:
#   Keywords / Stars - list-strings flattened to space-separated text
#   Generes          - list-strings parsed into real Python lists
#   Rating           - coerced to float, with unparseable values set to 0
df = df.fillna(' ')
df = df.assign(
    Keywords=df['Plot Kyeword'].apply(concatenar_lista),
    Stars=df['Top 5 Casts'].apply(concatenar_lista),
    Generes=df['Generes'].apply(string_to_list),
    Rating=pd.to_numeric(df['Rating'], errors="coerce").fillna(0).astype("float"),
)

In [9]:
# Flatten the per-movie genre lists to one genre per row, then keep the distinct values.
genres_flat = df['Generes'].explode()
unique_generes = genres_flat.unique()
unique_generes

array(['Action', 'Drama', 'Adventure', 'Sci-Fi', 'Animation', 'Crime',
       'Comedy', 'Thriller', 'Fantasy', 'Horror', 'History', 'Mystery',
       'Biography', 'War', 'Western', 'Sport', 'Family', 'Romance',
       'Music', 'Musical', 'Film-Noir', 'Game-Show', 'Adult',
       'Reality-TV'], dtype=object)

In [10]:
# Remove the raw list-string columns; their content now lives in 'Keywords' and 'Stars'.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this cell
# safe to re-run after the columns are already gone.
df = df.drop(columns=['Plot Kyeword', 'Top 5 Casts'], errors="ignore")

In [11]:
# Build the text that gets embedded: overview + keywords + cast, separated by spaces.
# Vectorised string concatenation replaces the row-wise apply(axis=1); 'Overview' is
# cast to str exactly as before, 'Keywords' and 'Stars' are already plain strings.
df['text'] = df['Overview'].astype(str) + ' ' + df['Keywords'] + ' ' + df['Stars']

In [12]:
# Preview the frame after cleaning: 'Keywords', 'Stars' and 'text' should now be present.
df.head(3)

Unnamed: 0,movie title,Run Time,Rating,User Rating,Generes,Overview,Director,Writer,year,path,Keywords,Stars,text
0,Top Gun: Maverick,"$170,000,000 (estimated)",8.6,187K,"[Action, Drama]",After more than thirty years of service as one...,Joseph Kosinski,Jim Cash,-2022,/title/tt1745960/,fighter jet sequel u.s. navy fighter aircraft ...,Jack Epps Jr. Peter Craig Tom Cruise Jennifer ...,After more than thirty years of service as one...
1,Jurassic World Dominion,2 hours 27 minutes,6.0,56K,"[Action, Adventure, Sci-Fi]",Four years after the destruction of Isla Nubla...,Colin Trevorrow,Emily Carmichael,-2022,/title/tt8041270/,dinosaur jurassic park tyrannosaurus rex veloc...,Colin Trevorrow Derek Connolly Chris Pratt Bry...,Four years after the destruction of Isla Nubla...
2,Top Gun,"$15,000,000 (estimated)",6.9,380K,"[Action, Drama]",As students at the United States Navy's elite ...,Tony Scott,Jim Cash,-1986,/title/tt0092099/,pilot male camaraderie u.s. navy grumman f 14 ...,Jack Epps Jr. Ehud Yonay Tom Cruise Tim Robbin...,As students at the United States Navy's elite ...


In [13]:
# Sentence-embedding model used both for the movie texts and for search queries.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [14]:
# Encode every movie's combined text in batches of 64.
# A plain list is passed because encode() is documented for a string or a list of
# strings; handing it a pandas Series relies on label-based lookup, which only
# lines up with positions while the frame keeps its default RangeIndex.
embeddings = model.encode(df['text'].tolist(), batch_size=64, show_progress_bar=True)

Batches:   0%|          | 0/382 [00:00<?, ?it/s]

In [15]:
# Attach one embedding (as a plain Python list) per row, plus a string id taken
# from the row index.
df['embeddings'] = embeddings.tolist()
df['ids'] = df.index.astype(str)

In [16]:
# Check that the 'embeddings' and 'ids' columns were added.
df.head()

Unnamed: 0,movie title,Run Time,Rating,User Rating,Generes,Overview,Director,Writer,year,path,Keywords,Stars,text,embeddings,ids
0,Top Gun: Maverick,"$170,000,000 (estimated)",8.6,187K,"[Action, Drama]",After more than thirty years of service as one...,Joseph Kosinski,Jim Cash,-2022,/title/tt1745960/,fighter jet sequel u.s. navy fighter aircraft ...,Jack Epps Jr. Peter Craig Tom Cruise Jennifer ...,After more than thirty years of service as one...,"[-0.07095599919557571, -0.009480934590101242, ...",0
1,Jurassic World Dominion,2 hours 27 minutes,6.0,56K,"[Action, Adventure, Sci-Fi]",Four years after the destruction of Isla Nubla...,Colin Trevorrow,Emily Carmichael,-2022,/title/tt8041270/,dinosaur jurassic park tyrannosaurus rex veloc...,Colin Trevorrow Derek Connolly Chris Pratt Bry...,Four years after the destruction of Isla Nubla...,"[-0.0253621656447649, -0.06149575859308243, 0....",1
2,Top Gun,"$15,000,000 (estimated)",6.9,380K,"[Action, Drama]",As students at the United States Navy's elite ...,Tony Scott,Jim Cash,-1986,/title/tt0092099/,pilot male camaraderie u.s. navy grumman f 14 ...,Jack Epps Jr. Ehud Yonay Tom Cruise Tim Robbin...,As students at the United States Navy's elite ...,"[-0.00739348353818059, 0.025649581104516983, -...",2
3,Lightyear,"$71,101,257",5.2,32K,"[Animation, Action, Adventure]",While spending years attempting to return home...,Angus MacLane,Angus MacLane,-2022,/title/tt10298810/,galaxy spaceship robot rocket space adventure ...,Jason Headley Matthew Aldrich Chris Evans Keke...,While spending years attempting to return home...,"[-0.06644968688488007, -0.0030729426071047783,...",3
4,Spiderhead,not-released,5.4,23K,"[Action, Crime, Drama]","In the near future, convicts are offered the c...",Joseph Kosinski,George Saunders,-2022,/title/tt9783600/,discover medical test reality fictional drug v...,Rhett Reese Paul Wernick Chris Hemsworth Miles...,"In the near future, convicts are offered the c...","[-0.05996754765510559, 0.06489714235067368, 0....",4


In [None]:
import pinecone
from getpass import getpass

In [None]:
# Prompt for the Pinecone API key via getpass so the secret is not typed into the notebook source.
# NOTE(review): the variable name 'pincone_api' is misspelled; it is read by the
# pinecone.init(...) cell, so rename both places together.
pincone_api = getpass('Enter the secret value: ')

Enter the secret value: ··········


In [None]:
# Initialise the Pinecone client with the key entered above and the target environment.
PINECONE_ENVIRONMENT = "asia-southeast1-gcp-free"
pinecone.init(api_key=pincone_api, environment=PINECONE_ENVIRONMENT)

In [None]:
# Create the Pinecone index on first use, sized to the embedding length and using
# cosine similarity, then open a handle to it either way.
index_name = 'movies-embeddings'
dimensions_embeddings = len(df['embeddings'][0])
all_index = pinecone.list_indexes()
if index_name not in all_index:
    pinecone.create_index(index_name, dimension=dimensions_embeddings, metric="cosine")
index = pinecone.Index(index_name)

In [None]:
# Open a handle to the 'movies-embeddings' index.
# NOTE(review): this repeats the assignment already made in the index-creation cell;
# presumably kept so the notebook can reconnect without re-running that cell — confirm.
index = pinecone.Index('movies-embeddings')

In [None]:
from tqdm.auto import tqdm

# Number of vectors sent to Pinecone per upsert request.
batch_size = 64
# Columns that are not stored as metadata alongside each vector.
non_metadata_cols = ['ids', 'embeddings', 'text', 'path']

for start in tqdm(range(0, len(df), batch_size)):
    # Slice out the current chunk of rows (the last chunk may be shorter).
    stop = min(start + batch_size, len(df))
    chunk = df[start:stop]
    # Everything except the id, vector, embedded text and path becomes metadata.
    records = chunk.drop(non_metadata_cols, axis=1).to_dict('records')
    # Pinecone expects (id, vector, metadata) triples.
    to_upsert = [
        (vec_id, vector, meta)
        for vec_id, vector, meta in zip(chunk['ids'], chunk['embeddings'], records)
    ]
    _ = index.upsert(vectors=to_upsert)

# Show index statistics to verify that every vector was stored.
index.describe_index_stats()

  0%|          | 0/382 [00:00<?, ?it/s]



{'dimension': 384,
 'index_fullness': 0.1,
 'namespaces': {'': {'vector_count': 24402}},
 'total_vector_count': 24402}

In [None]:
# Smoke-test the index: embed a free-text query and fetch the three nearest
# movies whose 'Generes' metadata contains 'Action'.
query = 'a history of time travel and science'
query_vector = model.encode(query).tolist()

action_only = {"Generes": {"$in": ['Action']}}
responses = index.query(
    vector=query_vector,
    top_k=3,
    include_metadata=True,
    filter=action_only,
)

In [None]:
# Display the raw query response (ids and metadata of the matches).
responses

{'matches': [{'id': '3387',
              'metadata': {'Director': 'Vikram K. Kumar',
                           'Generes': ['Action', 'Comedy', 'Drama'],
                           'Keywords': 'brother time travel watch time '
                                       'traveler',
                           'Overview': 'A scientist invents a time machine, '
                                       'which leads to a bitter battle between '
                                       'his evil twin brother and his son.',
                           'Rating': 7.9,
                           'Run Time': datetime.datetime(2023, 8, 8, 2, 44),
                           'Stars': 'Suriya Samantha Ruth Prabhu Nithya Menen '
                                    'Vikram K. Kumar Vikram K. Kumar',
                           'User Rating': '22K',
                           'Writer': 'Vikram K. Kumar',
                           'movie title': '24',
                           'year': '(I) (2016)'},
            

In [None]:
def search(query, genre, rating, top_k):
    """Run a semantic movie search against the Pinecone index.

    Uses the notebook-level ``model`` to embed the query and the notebook-level
    ``index`` to retrieve the nearest vectors.

    Parameters
    ----------
    query : str
        Free-text description of the movie being looked for.
    genre : str | None
        When truthy, only movies whose ``Generes`` metadata contains this
        genre are returned.
    rating : float | None
        Minimum ``Rating``; a falsy value means no lower bound (0).
    top_k : int | float | None
        Number of results. Converted to ``int`` because UI number inputs can
        deliver floats; blank or values below 1 fall back to a single result.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns ``Title``, ``Overview``,
        ``Director``, ``Genre``, ``year``, ``Rating`` and ``Score``. The
        columns are present even when nothing matches.
    """
    query_vector = model.encode(query).tolist()

    # The rating bound always applies; the genre clause is added only on request.
    filter_rating = rating if rating else 0
    conditions = {"Rating": {"$gte": filter_rating}}
    if genre:
        conditions["Generes"] = {"$in": [genre]}

    # The index query needs an integer result count of at least 1.
    top_k = max(1, int(top_k or 1))

    responses = index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
        filter=conditions,
    )

    # Flatten each match into a display-friendly record.
    rows = []
    for match in responses['matches']:
        metadata = match['metadata']
        rows.append({
            'Title': metadata['movie title'],
            'Overview': metadata['Overview'],
            'Director': metadata['Director'],
            'Genre': metadata['Generes'],
            'year': metadata['year'],
            'Rating': metadata['Rating'],
            'Score': match['score'],
        })

    # Explicit columns keep the table header stable for empty result sets; the
    # local name avoids shadowing the notebook's global ``df``.
    result_columns = ['Title', 'Overview', 'Director', 'Genre', 'year', 'Rating', 'Score']
    return pd.DataFrame(rows, columns=result_columns)



In [None]:
import gradio as gr

# Genres offered in the dropdown: the distinct values found in the dataset.
genres = unique_generes.tolist()

# The inputs map positionally onto search(query, genre, rating, top_k).
iface = gr.Interface(
    fn=search,
    inputs=[
        gr.Textbox(lines=5, placeholder="Escribe aquí tu consulta...", label="Consulta"),
        gr.Dropdown(choices=genres, label="Género de la película"),
        gr.Slider(minimum=1, maximum=10, value=5, label="Puntuación mínima"),
        # precision=0 makes the component deliver an int, which is what the
        # index query expects for its result count.
        gr.Number(minimum=1, maximum=10, value=3, precision=0, label="Número de resultados"),
    ],
    outputs=gr.Dataframe(type="pandas", label="Resultados"),
    title="Buscador de películas",
    description="Introduce tu consulta, selecciona un género y define una puntuación mínima para buscar películas.",
)

# Launch the interface
iface.launch()


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

