In [1]:
import os
import sys
from pathlib import Path

root_path = Path(__file__).resolve().parents[1] if '__file__' in globals() else Path.cwd().parent
sys.path.append(str(root_path))


In [3]:
from utils.secrets import SecretsLoader

secrets = SecretsLoader(env_path='../.env')
openapi_key = secrets.get("OPENAI_API_KEY")
openapi_key

'sk-proj-oow1k7eQM9F52w2LJFh5T3BlbkFJCBEHGqcRun72HkIlvA7x'

In [3]:
from openai import OpenAI
client = OpenAI(api_key = openapi_key)

In [None]:
def get_response(prompt):
    response = client.chat.completions.create(
        model = "gpt-4o-mini",
        messages =[{
            "role": "user",
            "content": prompt
        }]
    )
    return response.choices[0].message.content.strip()

In [None]:
response = get_response('Hello, who are you?')
response

In [9]:
from llama_index.llms.openai import OpenAI
from llama_index.core.llms import ChatMessage

client_llama = OpenAI(model = "gpt-4o-mini")

In [10]:
message = [ChatMessage(role="user", content="Hi! Who are you?")]

In [None]:
response = client_llama.chat(message)
response

In [None]:
response.message.content

In [None]:
import requests

response = requests.get(
    "https://en.wikipedia.org/w/api.php",
    params = {
        "action": "query",
        "format": "json",
        "titles": "OpenAI",
        "prop" : "extracts",
        "explaintext": True
    }
).json()

response



In [None]:
page = next(iter(response['query']['pages'].values()))
text = page['extract']
text

In [None]:
# chunking

In [16]:
def get_chunks(text, chunk_size):
    words = text.split()
    chunks = [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

In [None]:
chunks = get_chunks(text, 128)
chunks

In [None]:
client.embeddings.create(input = ["machine_learning"], model = "text-embedding-3-small")

In [23]:
def get_embedding(text):
    return client.embeddings.create(input=[text], model="text-embedding-3-small").data[0].embedding

In [None]:
embeddings = [get_embedding(c) for c in chunks]
embeddings

In [None]:
# Retreive

In [25]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def retriever(query, top_k):
    q_emb = get_embedding(query)
    sim_score = cosine_similarity([q_emb], embeddings)[0]
    max_indices = np.argsort(sim_score)[::-1][:top_k]
    retrieved_data = [chunks[i] for i in max_indices]
    return retrieved_data

In [None]:
contexts = retriever("Tell me about OpenAI's SORT", 3)
contexts

In [None]:
# Generate

In [27]:
def generator(query, contexts):
    context = '\n\n'.join(contexts)
    
    prompt = f"""
    Utilizing the given context, please answer the question
    [context]
    {context}
    
    [user question]
    {query}
    """
    return get_response(prompt)

In [None]:
generator("Tell me about OpenAI's SORA", contexts)

In [29]:
# LlamaIndex
from pathlib import Path

data_path = Path('../dataset/llamaindex_data')

if not data_path.exists():
    Path.mkdir(data_path)
    
with open(f"{data_path}/openai.text", "w") as fp:
    fp.write(text)

In [30]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

documents = SimpleDirectoryReader("../dataset/llamaindex_data").load_data()
vector_index = VectorStoreIndex.from_documents(documents)

In [31]:
query_engine = vector_index.as_query_engine()

In [None]:
response = query_engine.query("Tell me about OpenAI's SORA")
response

In [None]:
from llama_index.core import Settings
Settings

In [None]:
Settings._llm

In [None]:
Settings._embed_model

In [None]:
Settings._node_parser

In [38]:
# How to adjust chunk size

from llama_index.core.node_parser import SentenceSplitter

text_splitter = SentenceSplitter(chunk_size = 128, chunk_overlap = 10)
nodes = text_splitter.get_nodes_from_documents(documents)

In [None]:
len(nodes)

In [None]:
nodes[0]

In [None]:
from llama_index.core.node_parser import TokenTextSplitter

text_splitter = TokenTextSplitter(
    chunk_size = 128,
    chunk_overlap = 10,
    include_metadata = False
)

nodes = text_splitter.get_nodes_from_documents(documents)
len(nodes)

In [49]:
from llama_index.core.schema import TextNode

node_new = TextNode(
    text = "OpenAI is a text to video model. It can only generate 2D animations. The only outstanding animation it generated was on introduction to NLP"
)

nodes.append(node_new)

In [50]:
vector_index = VectorStoreIndex(nodes)
vector_index.as_query_engine()
response = query_engine.query("Tell me about OpenAI's SORA? What's one of its outstanding work?")

In [None]:
print(response.response)