In [1]:
!pip install -q sentence-transformers
!pip install -q wikipedia-api
!pip install -q numpy
!pip install -q scipy

In [None]:
# Modelo de embedding
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('all-mpnet-base-v2')

In [3]:
# Extraer texto de wikipedia sobre Hayao Miyazaki
from wikipediaapi import Wikipedia
wiki = Wikipedia('RAGBot/0.0', 'en')
doc = wiki.page('Hayao_Miyazaki').text

In [4]:
# Chucking in paragraphs
paragraphs = doc.split('\n') 

In [5]:
docs_embedded = embedding_model.encode(paragraphs, normalize_embeddings=True)

In [6]:
docs_embedded.shape

(78, 768)

# Let's reduce dimensionality and visualize the data using T-SNE

In [None]:
import numpy as np
from sklearn.manifold import TSNE

docs_embedded_reduced = TSNE(n_components=2).fit_transform(docs_embedded)

In [8]:
def dividir_texto(texto, longitud_maxima=50):

    palabras = texto.split()
    lineas = []
    linea_actual = []
    longitud_actual = 0
    
    for palabra in palabras:
        if longitud_actual + len(palabra) <= longitud_maxima:
            linea_actual.append(palabra)
            longitud_actual += len(palabra) + 1  # +1 por el espacio
        else:
            lineas.append(' '.join(linea_actual))
            linea_actual = [palabra]
            longitud_actual = len(palabra) + 1
    
    lineas.append(' '.join(linea_actual))
    
    return '<br>'.join(lineas) 

In [9]:
import pandas as pd

df_embeddings = pd.DataFrame(docs_embedded_reduced)
df_embeddings = df_embeddings.rename(columns={0:'x',1:'y'})
df_embeddings['text'] = paragraphs
df_embeddings['text'] = df_embeddings['text'].apply(dividir_texto)
df_embeddings

Unnamed: 0,x,y,text
0,-153.707306,39.250156,"Hayao Miyazaki (宮崎 駿 or 宮﨑 駿, Miyazaki Hayao,<..."
1,-26.375402,16.409048,"Born in Tokyo City in the Empire of Japan,<br>..."
2,-57.974106,53.492111,Miyazaki co-founded Studio Ghibli in 1985. He<...
3,-252.153168,77.099991,Miyazaki's works are characterized by the<br>r...
4,332.344788,-135.351471,
...,...,...,...
73,332.344788,-135.351471,
74,196.543594,83.081299,Studio Ghibli (in Japanese)
75,-192.424469,128.799179,Hayao Miyazaki at Anime News Network's<br>ency...
76,-163.476135,93.956017,Hayao Miyazaki at IMDb


In [10]:
import plotly.express as px

# Ahora, intenta crear el gráfico de dispersión nuevamente
fig = px.scatter(
    df_embeddings, x='x', y='y',
    hover_data=['text'],  # Ahora 'text' debería ser reconocido correctamente
    title='Embeddings de Wikipedia de Hayao Miyazaki'
)

fig.show()

# Embed the Query

In [11]:
query = "What was studio Ghiblis's first film?"
query_embed = embedding_model.encode(query, normalize_embeddings=True)

In [12]:
query_embed.shape

(768,)

## Cosine Similarity

$$
\text{similarity} = \frac{\mathbf{a} \cdot \mathbf{b}}{||\mathbf{a}||_2 \cdot ||\mathbf{b}||_2}
$$

In [13]:
# vamos a utilizar la distnacia del coseno para medir la similitud entre el query y los documentos
import numpy as np
similarities = np.dot(docs_embedded, query_embed) 

- `docs_embedded` es una matriz de tamaño $$ (n, d) $$ donde $$ n $$ es el número de documentos y $$ d $$ es la dimensión de los embeddings.
- `query_embed` es un vector de tamaño $$ (d,) $$.

Para realizar la operación de producto punto entre `docs_embedded` y `query_embed`, necesitamos que `query_embed` sea una matriz columna de tamaño $$ (d, 1) $$. Esto se logra tomando la transpuesta del vector de consulta, convirtiéndolo en una matriz columna:

In [14]:
# ordenamos y guardamos los indices embeddings de menor a mayor similitud
sorted_indexs = np.argsort(similarities, axis=0)

In [15]:
# obtenemos los indices de los 3 documentos más similares   
top_3_idx = sorted_indexs[-3:].flatten()

In [16]:
most_similar_paragraphs = [paragraphs[i] for i in top_3_idx]

In [17]:
# visualizamos cuales son los top 3 parrafos más similares que utilizaremos como contexto
for i, p in enumerate(most_similar_paragraphs):
    print(f'{i+1}. {p}\n')

1. Hayao Miyazaki (宮崎 駿 or 宮﨑 駿, Miyazaki Hayao, Japanese: [mijaꜜzaki hajao]; born January 5, 1941) is a Japanese animator, filmmaker, and manga artist. A founder of Studio Ghibli, he has attained international acclaim as a masterful storyteller and creator of Japanese animated feature films, and is widely regarded as one of the most accomplished filmmakers in the history of animation.

2. On June 15, 1985, Miyazaki and Takahata founded the animation production company Studio Ghibli as a subsidiary of Tokuma Shoten. Studio Ghibli's first film was Laputa: Castle in the Sky (1986), directed by Miyazaki. Some of the architecture in the film was also inspired by a Welsh mining town; Miyazaki witnessed the mining strike upon his first visit to Wales in 1984 and admired the miners' dedication to their work and community. Laputa was released on August 2, 1986, by the Toei Company. It sold around 775,000 tickets; Miyazaki and Suzuki expressed their disappointment with the film's box office fig

In [18]:
CONTEXT = '\n'.join(most_similar_paragraphs)

In [19]:
# Creamos el system prompt
prompt = f"""
use the following CONTEXT to answer the QUESTION at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

CONTEXT: {CONTEXT}
QUESTION: {query}

"""

In [20]:
from openai import OpenAI
import os

client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

In [21]:
response = client.chat.completions.create(
  model="gpt-4o",
  messages=[
    {"role": "user", "content": prompt},
  ]
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
print(response.choices[0].message.content)

Studio Ghibli's first film was **Laputa: Castle in the Sky**.
