In [35]:
# Install necessary libraries
!pip install -q openai
!pip install -q llama-index==0.9.13
!pip install -q transformers
!pip install -q pinecone-client

In [39]:
# Import libraries
from google.colab import userdata
import os
import logging
import sys
import llama_index
import openai
import pinecone
from transformers import AutoTokenizer, AutoModel
import torch
from llama_index.vector_stores import PineconeVectorStore


# Setup logging
# Send INFO-and-above records from the root logger to stdout so they show up
# in the cell output. `force=True` removes any handlers already attached to the
# root logger before installing this one, so re-running the cell does not stack
# additional stdout handlers (which would print every log line multiple times).
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)


In [40]:
# Pull the service credentials out of the Colab `userdata` store
api_key = userdata.get('PINECONE_API_KEY')
openai_api_key = userdata.get('OPENAI_API_KEY')

# Export both keys through the process environment
os.environ.update(PINECONE_API_KEY=api_key, OPENAI_API_KEY=openai_api_key)

# Report which llama_index / openai releases are active in this runtime
llama_index_version = llama_index.__version__
openai_version = openai.__version__
print(f"Llama Index version: {llama_index_version}, OpenAI version: {openai_version}")

Llama Index version: 0.9.13, OpenAI version: 1.4.0


In [41]:
# Pinecone setup: connect the client, then open a handle to the target index
environment, index_name = "gcp-starter", "health-dhver"
pinecone.init(environment=environment, api_key=api_key)
pinecone_index = pinecone.Index(index_name)

# Load the embedding checkpoint (weights and its matching tokenizer)
model_name = "intfloat/multilingual-e5-large"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [47]:
# Function to create embeddings
def create_embeddings(text):
    """Embed text by mean-pooling the model's last hidden state.

    Args:
        text: A single string, or a list of strings to embed as one batch.
            Inputs longer than 512 tokens are truncated.

    Returns:
        A NumPy array with one row per input text (a single string yields a
        single row), each row being the average of that text's token vectors.

    Notes:
        Batches are padded to a common length, and padded positions are
        excluded from the average via the attention mask. For a single string
        there is no padding, so the result matches a plain mean over tokens.

        NOTE(review): the vectors are neither L2-normalized nor built from
        prefixed text ("query: " / "passage: "). This is left as-is on the
        assumption that the stored index vectors were produced the same way —
        confirm before changing, or stored and query vectors will diverge.
    """
    encoded_input = tokenizer(
        text,
        return_tensors='pt',
        padding=True,  # required so texts of different lengths stack into one tensor
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():
        model_output = model(**encoded_input)

    # last_hidden_state is assumed to be (batch, seq, hidden); dim=1 is the token axis.
    hidden = model_output.last_hidden_state
    mask = encoded_input["attention_mask"].unsqueeze(-1).to(hidden.dtype)

    # Weighted mean over tokens: padded positions contribute nothing.
    token_sum = (hidden * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embeddings = (token_sum / token_count).numpy()
    return embeddings

# Function to query Pinecone index with new embeddings
def query_pinecone_directly(text, top_k=10):
    """Embed `text` and return its nearest neighbours from the Pinecone index.

    Args:
        text: The query string to embed with `create_embeddings`.
        top_k: Number of matches to request (defaults to 10).

    Returns:
        Whatever `pinecone_index.query` returns for the request.
    """
    embeddings = create_embeddings(text)
    # create_embeddings returns one row per input text; the query takes a
    # single flat list of floats, so send the first (only) row, not the 2-D array.
    query_vector = embeddings[0].tolist()
    query_results = pinecone_index.query(vector=query_vector, top_k=top_k)
    return query_results

# Example query: embed one sample sentence and show the raw matches
sample_text = '관절이 굳고, ㅎㅎㅎㅋㅋㅋ아이고골반만 아프다'
query_result = query_pinecone_directly(sample_text)
print(query_result)

{'matches': [{'id': 'vector-7945', 'score': 0.88628763, 'values': []},
             {'id': 'vector-2033', 'score': 0.881732166, 'values': []},
             {'id': 'vector-2429', 'score': 0.881380081, 'values': []},
             {'id': 'vector-147', 'score': 0.878392398, 'values': []},
             {'id': 'vector-8191', 'score': 0.876716077, 'values': []},
             {'id': 'vector-752', 'score': 0.876006, 'values': []},
             {'id': 'vector-2232', 'score': 0.875326335, 'values': []},
             {'id': 'vector-2451', 'score': 0.874104619, 'values': []},
             {'id': 'vector-7784', 'score': 0.873839319, 'values': []},
             {'id': 'vector-361', 'score': 0.872633815, 'values': []}],
 'namespace': ''}
