# Configuration
## Install python packages
* LangChain SDK and GCS Service SDK for python

#### Architecture
![Architecture](./data/architecture.png)


#### Library

In [None]:
%pip install google-cloud-storage # google-cloud-storage: Google Cloud Storage 객체 관리
%pip install --upgrade google-cloud-discoveryengine # google-cloud-discoveryengine: Google Cloud Discovery Engine 클라이언트
%pip install --upgrade --quiet google-cloud-aiplatform # google-cloud-aiplatform: Google Cloud AI Platform 모델 관리
%pip install --upgrade --quiet langchain langchain_community # langchain: LLM 기반 응용 프로그램 구축, langchain_community: LangChain 커뮤니티 확장 기능
%pip install -U langchain-google-community # langchain-google-community: LangChain과 Google Cloud 제품 통합

## Authenticate to Google Cloud (GCP)

### Set the environment on GCP Project

In [1]:
import os
from google.cloud import storage
from dotenv import load_dotenv, set_key
from google.oauth2 import service_account
import google.oauth2.credentials

# Load Google API Key from .env file
env_path = '.env'
load_dotenv(env_path)
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("Failed to load API key. Please set GOOGLE_API_KEY in the .env file.")

# Service account key file path and project settings
KEY_PATH = "./pablo-test-425702-22d29fa73af8.json"
PROJECT_ID = "pablo-test-425702"
GS_LOCATION = "asia-northeast3"

# Initialize Google Cloud Storage client from the service account key.
# The error is still printed for the reader, but it is re-raised so the
# notebook does not continue with `storage_client` undefined.
try:
    storage_client = storage.Client.from_service_account_json(KEY_PATH)
    print("Google Cloud Storage client initialized")
except Exception as e:
    print(f"Google Cloud Storage client error: {e}")
    raise

# Service-account credentials built from the same key file, scoped to the
# cloud-platform OAuth scope.
credentials = service_account.Credentials.from_service_account_file(
    KEY_PATH,
    scopes=['https://www.googleapis.com/auth/cloud-platform']
)

Google Cloud Storage client initialized


## Authenticate to Google Cloud (GCP)

# Upload data
## Upload data in Cloud Storage

In [3]:
from google.cloud import storage

def upload_to_gcs_cloud_storage(bucket_name, source_file_name, destination_blob_name, client=None):
    """
    Upload a local file to a Google Cloud Storage bucket.

    Args:
        bucket_name (str): Name of the destination bucket.
        source_file_name (str): Path of the local file to upload.
        destination_blob_name (str): Object name (including any prefix) to
            create inside the bucket.
        client: Optional storage client to use, e.g. one created from a
            service account key. When omitted, a new ``storage.Client()`` is
            created, exactly as before.

    Returns:
        None. A confirmation line is printed after the upload call returns.
    """
    # Reuse the caller's authenticated client when given; otherwise fall back
    # to a default client.
    storage_client = client if client is not None else storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)

    print(f"File {source_file_name} uploaded to {destination_blob_name}!")

# Destination bucket and the local file to publish.
bucket_name = 'example_test_bucket_rag'
source_file_name = './data/json_example_resume.json'

# The object is stored under a workspace prefix and keeps the local file's
# base name (everything after the last '/').
workspace_name = "example/"
file_name = source_file_name.rsplit('/', 1)[-1]
destination_blob_name = f"{workspace_name}{file_name}"

upload_to_gcs_cloud_storage(
    bucket_name,
    source_file_name,
    destination_blob_name,
)

File ./data/json_example_resume.json uploaded to example/json_example_resume.json!


## Create data store

In [7]:
from google.api_core.client_options import ClientOptions
from google.cloud import discoveryengine

def create_data_store_sample(
    project_id: str,
    location: str,
    data_store_id: str,
    display_name: str
) -> str:
    """
    Create a data store in Google Cloud Discovery Engine.

    The call waits for the create operation to finish, then prints the
    operation result and its metadata.

    Args:
        project_id (str): Google Cloud project ID.
        location (str): Location of the data store (e.g. "global").
        data_store_id (str): Unique ID for the data store.
        display_name (str): Display name for the data store.

    Returns:
        str: Operation name.
    """
    # Regional locations need a location-prefixed endpoint; for "global" the
    # client's default endpoint is used (client_options=None).
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )

    # Initialize the DataStoreServiceClient
    client = discoveryengine.DataStoreServiceClient(client_options=client_options)
    parent = client.collection_path(
        project=project_id,
        location=location,
        collection="default_collection",
    )

    # Define the data store configuration
    data_store = discoveryengine.DataStore(
        display_name=display_name,
        industry_vertical=discoveryengine.IndustryVertical.GENERIC,
        solution_types=[discoveryengine.SolutionType.SOLUTION_TYPE_SEARCH],
        content_config=discoveryengine.DataStore.ContentConfig.NO_CONTENT,
    )

    # Build the request that creates the data store under the collection
    request = discoveryengine.CreateDataStoreRequest(
        parent=parent,
        data_store_id=data_store_id,
        data_store=data_store,
    )

    operation = client.create_data_store(request=request)

    print(f"Waiting for operation to complete: {operation.operation.name}")
    response = operation.result()

    # Wrap the operation metadata in its typed message so it prints readably
    metadata = discoveryengine.CreateDataStoreMetadata(operation.metadata)
    print(response)
    print(metadata)
    return operation.operation.name


# Data store settings: the display name is the same string as the ID.
DS_LOCATION = "global"
DATA_STORE_ID = 'example_resume_data'
DISPLAY_NAME = 'example_resume_data'

# Last expression of the cell, so the returned operation name is displayed.
create_data_store_sample(
    project_id=PROJECT_ID,
    location=DS_LOCATION,
    data_store_id=DATA_STORE_ID,
    display_name=DISPLAY_NAME,
)

Waiting for operation to complete: projects/151473909705/locations/global/collections/default_collection/operations/create-data-store-15406143751886910738
name: "projects/151473909705/locations/global/collections/default_collection/dataStores/example_resume_data"
display_name: "example_resume_data"
industry_vertical: GENERIC
solution_types: SOLUTION_TYPE_SEARCH
default_schema_id: "default_schema"
content_config: NO_CONTENT




'projects/151473909705/locations/global/collections/default_collection/operations/create-data-store-15406143751886910738'

## Import data to data store
* store embedding data

In [8]:
from google.cloud import discoveryengine_v1beta as discoveryengine
from google.api_core.client_options import ClientOptions
from google.protobuf import field_mask_pb2

def import_documents(
    project_id: str,
    location: str,
    data_store_id: str,
    gcs_uri: str
) -> str:
    """
    Import documents from Google Cloud Storage into Google Cloud Discovery Engine.

    The call waits for the import operation to finish, then prints the
    operation result and its metadata.

    Args:
        project_id (str): Google Cloud project ID.
        location (str): Location of the data store (e.g. "global").
        data_store_id (str): ID of the data store.
        gcs_uri (str): Google Cloud Storage URI of the documents to import.

    Returns:
        str: Operation name.
    """
    # Regional locations need a location-prefixed endpoint; for "global" the
    # client's default endpoint is used (client_options=None).
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )

    client = discoveryengine.DocumentServiceClient(client_options=client_options)

    # Documents are imported into the data store's default branch
    parent = client.branch_path(
        project=project_id,
        location=location,
        data_store=data_store_id,
        branch="default_branch",
    )

    # Source file(s) in Cloud Storage, read with the "custom" data schema
    gcs_source = discoveryengine.GcsSource(
        input_uris=[gcs_uri],
        data_schema="custom"
    )

    request = discoveryengine.ImportDocumentsRequest(
        parent=parent,
        gcs_source=gcs_source,
        reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
    )

    operation = client.import_documents(request=request)
    print(f"Waiting for operation to complete: {operation.operation.name}")
    response = operation.result()

    # Wrap the operation metadata in its typed message so it prints readably
    metadata = discoveryengine.ImportDocumentsMetadata(operation.metadata)

    print(response)
    print(metadata)

    return operation.operation.name

# Import the object uploaded in the "Upload data in Cloud Storage" cell, so
# the notebook reads back exactly what it wrote instead of a hard-coded
# object in a different bucket.
gcs_uri = f"gs://{bucket_name}/{destination_blob_name}"
import_documents(PROJECT_ID, DS_LOCATION, DATA_STORE_ID, gcs_uri)

Waiting for operation to complete: projects/151473909705/locations/global/collections/default_collection/dataStores/example_resume_data/branches/0/operations/import-documents-4555773986608185226
error_config {
  gcs_prefix: "gs://151473909705_asia_northeast3_import_custom/errors4555773986608185234"
}

create_time {
  seconds: 1720762898
  nanos: 621037000
}
update_time {
  seconds: 1720762903
  nanos: 905509000
}
success_count: 9
total_count: 9



'projects/151473909705/locations/global/collections/default_collection/dataStores/example_resume_data/branches/0/operations/import-documents-4555773986608185226'

# RAG with Vertex AI Search 

### Vertex AI initialization
**Configure project information**
* Model name: name of the LLM to use (see https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models)
    * Project ID: the GCP project ID

**Configure Vertex AI and access to the foundation model.**
* Vertex AI initialization : aiplatform.init(..)
    * https://cloud.google.com/python/docs/reference/aiplatform/latest#initialization

In [12]:
import vertexai
from vertexai.generative_models import GenerativeModel, Part, Tool
import vertexai.generative_models as generative_models
from langchain.retrievers import GoogleVertexAIMultiTurnSearchRetriever
from langchain_google_community import VertexAISearchRetriever
from vertexai.preview.generative_models import grounding

# Model and region used for the Vertex AI calls in this notebook.
MODEL_NAME = "gemini-1.5-flash"
VERTEX_AI_LOCATION = "us-central1"

# Initialize the Vertex AI SDK for this project and region, then create a
# handle to the generative model.
vertexai.init(
    project=PROJECT_ID,
    location=VERTEX_AI_LOCATION,
)
model = GenerativeModel(model_name=MODEL_NAME)

### Retrieve VertexAISearch 

In [19]:
from langchain.retrievers import GoogleVertexAIMultiTurnSearchRetriever
from langchain_google_community import VertexAISearchRetriever
from langchain_community.retrievers import (
    GoogleVertexAIMultiTurnSearchRetriever,
    GoogleVertexAISearchRetriever,
)

# Retriever backed by the Vertex AI Search data store created above.
retriever = VertexAISearchRetriever(
    project_id=PROJECT_ID,
    location_id=DS_LOCATION,
    data_store_id=DATA_STORE_ID,
    max_documents=3,
    max_extractive_answer_count=3,
    max_extractive_segment_count=1,
    # Per the retriever's documentation: 0 = unspecified, 1 = disabled, 2 = auto
    query_expansion_condition=2,
    # Per the retriever's documentation: 0 = unstructured, 1 = structured,
    # 2 = website, 3 = blended
    engine_data_type=1,
)

query = "김철수의 휴대전화 번호와 생일"

# `invoke` is the Runnable entry point that replaces the deprecated
# `get_relevant_documents`; it returns the same list of documents.
result = retriever.invoke(query)

* Convert Unicode

In [21]:
def fix_unicode(data):
    """
    Recursively decode backslash escapes (e.g. ``\\uXXXX``) found in strings.

    Dict values and list items are processed recursively; dict keys and any
    non-string leaves are returned unchanged.

    Args:
        data: A str, list, dict, or any other value.

    Returns:
        The same structure with every string decoded. A string whose escapes
        are malformed (for example a trailing backslash) is returned as-is.
    """
    if isinstance(data, dict):
        return {k: fix_unicode(v) for k, v in data.items()}
    elif isinstance(data, list):
        return [fix_unicode(v) for v in data]
    elif isinstance(data, str):
        try:
            # 'raw_unicode_escape' produces the same bytes as 'latin1' for
            # Latin-1 text, but writes any other character as a \uXXXX escape
            # instead of raising UnicodeEncodeError, so strings that already
            # contain decoded characters (e.g. Hangul) survive the round trip.
            return data.encode('raw_unicode_escape').decode('unicode_escape')
        except UnicodeDecodeError:
            # Malformed escape sequence: keep the original text.
            return data
    else:
        return data
# Map each retrieved document's name to its page content, with escape
# sequences decoded in both.
context = {}

for doc in result:
    doc_text = fix_unicode(doc.page_content)
    doc_name = fix_unicode(doc.metadata['name'])
    context[doc_name] = doc_text

### Request LLM using prompt and retriever data

In [23]:
from langchain_google_vertexai.llms import VertexAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from IPython.display import display, Markdown

# LangChain wrapper around the Vertex AI text model.
llm_settings = dict(
    model_name=MODEL_NAME,
    project=PROJECT_ID,
    location=VERTEX_AI_LOCATION,
    verbose=True,
    streaming=False,
    temperature=0.2,
    top_p=1,
    top_k=20,
)
gemini_pro = VertexAI(**llm_settings)

# Prompt text, spelled out line by line so its whitespace is explicit.
# Placeholders: {context}, {len_context}, {question}.
prompt_text = (
    "\n"
    "  당신은 내 질문에 대해 답변하는 AI 어시스턴트입니다.\n"
    "  아래 Question 에 대해서 반드시 Context에 있는 개별 내용을 기반으로 간결하게 답변주세요.\n"
    "  \n"
    "  Context : {context}\n"
    "  Context Size : {len_context}\n"
    "  Question : {question}\n"
    "\n"
    "  "
)
prompt_template = PromptTemplate.from_template(prompt_text)

# Fill the template with the retrieved context and the user's query;
# `prompt` is the final formatted string sent to the model.
prompt = prompt_template.format(
    context=context,
    len_context=len(context),
    question=query,
)

print(f"Prompt : {prompt}")
response = gemini_pro.invoke(prompt)
display(Markdown(response))

Prompt : 
  당신은 내 질문에 대해 답변하는 AI 어시스턴트입니다.
  아래 Question 에 대해서 반드시 Context에 있는 개별 내용을 기반으로 간결하게 답변주세요.
  
  Context : {'projects/151473909705/locations/global/collections/default_collection/dataStores/example_resume_data/branches/0/documents/resume2': '{"jsonData": "{"name":"김철수","birth_date":"1985-05-20","contact":{"email":"kimchulsoo@example.com","phone":"010-9876-5432"},"education":[{"school":"고려대학교","degree":"경영학 학사","year":"2008"},{"school":"고려대학교","degree":"경영학 석사","year":"2011"}],"experience":[{"company":"삼성전자","role":"마케팅 매니저","years":"2011-2017","description":"글로벌 마케팅 전략 수립 및 실행"},{"company":"LG전자","role":"프로덕트 매니저","years":"2017-현재","description":"신제품 기획 및 출시 관리"}],"skills":["Marketing Strategy","Product Management","Business Development"],"languages":["한국어","영어","일본어"]}", "content": {"mimeType": "application/pdf", "uri": "gs://pablo_test_bucket1/pablo_example/111/resume_test2.pdf"}}', 'projects/151473909705/locations/global/collections/default_collection/dataStores/example_r

김철수의 휴대전화 번호는 010-9876-5432이고 생일은 1985-05-20입니다. 
