In [None]:
# ! pip install -U langchain-google-vertexai
# ! pip install unstructured
# ! pip install "unstructured[pdf]"  -- Warning: this package is painful to install. It pulls in CUDA and NVIDIA packages totalling over 3 GB.

In [None]:
from typing import List
import nbformat
import requests
import time

# LangChain
#from langchain.llms import VertexAI
from langchain_google_vertexai import VertexAI
from langchain.embeddings import VertexAIEmbeddings

from langchain.schema.document import Document

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import Language

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Vertex AI
from google.cloud import aiplatform
import vertexai

In [None]:
from dotenv import load_dotenv
import os

# Load variables from a .env file (if present) before reading the settings below.
load_dotenv()

# Notebook-wide settings; each one is None when its environment variable is unset.
PROJECT_ID = os.environ.get("PROJECT_ID")
LOCATION = os.environ.get("LOCATION")
VERTEXAI_LOCATION = os.environ.get("VERTEXAI_LOCATION")
GCS_BUCKET = os.environ.get("GCS_BUCKET")
VECTOR_SEARCH_DATASET = os.environ.get("VECTOR_SEARCH_DATASET")
VECTOR_SEARCH_TABLE = os.environ.get("VECTOR_SEARCH_TABLE")

# Vertex AI uses its own location setting, separate from LOCATION.
vertexai.init(
    project=PROJECT_ID,
    location=VERTEXAI_LOCATION,
)


In [None]:
from langchain_community.document_loaders import GCSFileLoader
from google.cloud import storage
from langchain_community.document_loaders import PyPDFLoader


def load_pdf(file_path):
    """Build a ``PyPDFLoader`` for ``file_path``.

    Passed to ``GCSFileLoader`` as its ``loader_func`` in ``crawl_gcs``.
    """
    pdf_loader = PyPDFLoader(file_path)
    return pdf_loader

def is_pdf_file(url: str) -> bool:
    """Return True when ``url`` ends with ``.pdf``, ignoring letter case."""
    lowered = url.lower()
    return lowered.endswith(".pdf")

def crawl_gcs(url: str, project_name=None) -> list:
    """Load every PDF stored under a Google Cloud Storage location.

    Args:
        url: Location of the form ``gs://<bucket>[/<prefix>]``. The third
            ``/``-separated component is used as the bucket name and the
            remaining components as the blob-name prefix to list.
        project_name: Project passed to ``GCSFileLoader``. When omitted, the
            notebook-level ``PROJECT_ID`` setting is used instead of a
            hard-coded project id.

    Returns:
        A list with one entry per PDF blob; each entry is the list of
        documents returned by that blob's loader.

    Raises:
        IndexError: If ``url`` has fewer than three ``/``-separated parts.
    """
    if project_name is None:
        project_name = PROJECT_ID

    # "gs://bucket/a/b".split("/") -> ["gs:", "", "bucket", "a", "b"]
    parts = url.split("/")
    bucket_name = parts[2]
    prefix = "/".join(parts[3:])

    storage_client = storage.Client()
    pdf_documents = []
    for blob in storage_client.list_blobs(bucket_name, prefix=prefix):
        if not is_pdf_file(blob.name):
            continue
        loader = GCSFileLoader(
            project_name=project_name,
            bucket=bucket_name,
            blob=blob.name,
            loader_func=load_pdf,
        )
        # Keep the documents grouped per source file.
        pdf_documents.append(loader.load())
    return pdf_documents
    

In [None]:
from langchain_google_vertexai import VertexAIEmbeddings

# Embedding model handed to the BigQuery vector store.
embedding = VertexAIEmbeddings(
    project=PROJECT_ID,
    model_name="textembedding-gecko-multilingual@latest",
)


In [None]:
from google.cloud import bigquery

# BigQuery client bound to the configured project and location.
client = bigquery.Client(
    location=LOCATION,
    project=PROJECT_ID,
)
# exists_ok=True: do not fail when the dataset is already present.
client.create_dataset(VECTOR_SEARCH_DATASET, exists_ok=True)


In [None]:
from langchain.vectorstores.utils import DistanceStrategy
from langchain_community.vectorstores import BigQueryVectorSearch

# Vector store over a BigQuery table, configured with the cosine distance strategy.
store = BigQueryVectorSearch(
    embedding=embedding,
    distance_strategy=DistanceStrategy.COSINE,
    project_id=PROJECT_ID,
    location=LOCATION,
    dataset_name=VECTOR_SEARCH_DATASET,
    table_name=VECTOR_SEARCH_TABLE,
)



In [None]:
# Load every PDF in the configured bucket: one list of documents per file.
pdf_files = crawl_gcs("gs://{}".format(GCS_BUCKET))

In [None]:
# Add each PDF's documents to the BigQuery vector store, one file per call.
for documents in pdf_files:
    store.add_documents(documents)


In [None]:
# Sample query in Korean, roughly: "What is the license information?"
query = "라이센스 정보가 어떻게 될까?"
# Fetch the 3 most similar stored documents.
docs = store.similarity_search(query=query, k=3)

Storing embedding vectors in the database is very easy: just call the `add_documents` method.

Next, let's build a query API on Google Cloud Run.

Please refer to the `/cloudrun_rag` directory.
