In [1]:
!pip install -q grobid-client langchain openai faiss-cpu PyPDF2 tiktoken chromadb

In [7]:
# Notebook configuration: edit these values before running the cells below.
#   PROJECT_ID / LOCATION - passed to vertexai.init() further down.
#   FOLDER_ID             - Google Drive folder handed to GoogleDriveLoader.
# The trailing `# @param` markers are kept verbatim (presumably Colab
# form-field annotations - confirm if the notebook is run elsewhere).
PROJECT_ID = "jerry-argolis"  # @param {type:"string"}
LOCATION = "us-central1" # @param {type:"string"}
FOLDER_ID = "1UHgK76I5-FVr8r0KRvKCWo8S9zX41TAD" # @param {type:"string"}

In [3]:
# Show which Python interpreter this runtime is using.
from platform import python_version

interpreter_version = python_version()
print(interpreter_version)

3.10.12


In [4]:
# Install Vertex AI LLM SDK, langchain and dependencies
# Remember to click RESTART RUNTIME after this step.
! pip install config --upgrade --user
! pip install google-cloud-aiplatform google-api-python-client

Collecting config
  Downloading config-0.5.1-py2.py3-none-any.whl (20 kB)
Installing collected packages: config
Successfully installed config-0.5.1
Collecting google-cloud-aiplatform
  Downloading google_cloud_aiplatform-1.31.1-py2.py3-none-any.whl (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
Collecting google-cloud-resource-manager<3.0.0dev,>=1.3.3 (from google-cloud-aiplatform)
  Downloading google_cloud_resource_manager-1.10.3-py2.py3-none-any.whl (320 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 321.0/321.0 kB 37.1 MB/s eta 0:00:00
Collecting shapely<2.0.0 (from google-cloud-aiplatform)
  Downloading Shapely-1.8.5.post1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m84.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: shapely, google-cl

In [4]:
# Authenticate this Colab session with the user's Google account.
# NOTE(review): presumably these credentials are what the Vertex AI and
# Google Drive calls in later cells rely on - confirm.
from google.colab import auth as google_auth
google_auth.authenticate_user()

In [5]:
import langchain
import vertexai

# Vertex AI
from google.cloud import aiplatform
from langchain.llms import VertexAI

# Point the Vertex AI SDK at the project and region chosen in the config cell.
vertexai.init(location=LOCATION, project=PROJECT_ID)

# Report the library versions in use so a run can be reproduced later.
for version_line in (
    f"LangChain version: {langchain.__version__}",
    f"Vertex AI SDK version: {aiplatform.__version__}",
):
    print(version_line)

LangChain version: 0.0.278
Vertex AI SDK version: 1.31.1


In [8]:
# Load the source documents from the configured Google Drive folder.
from langchain.document_loaders import GoogleDriveLoader

# recursive=False: read only the folder itself, not its sub-folders.
loader = GoogleDriveLoader(recursive=False, folder_id=FOLDER_ID)
docs = loader.load()

In [9]:
# Break the loaded documents into overlapping chunks ready for embedding.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,  # amount shared between neighbouring chunks
    chunk_size=4000,    # upper bound on the size of one chunk
)
# NOTE: `docs` is rebound here from whole documents to their chunks.
docs = text_splitter.split_documents(documents=docs)

print(f"# of chunks = {len(docs)}")

# of chunks = 224


In [10]:
from langchain.embeddings import VertexAIEmbeddings
from langchain.vectorstores import Chroma

# Embed every chunk with the default Vertex AI embedding settings and index
# the vectors in a Chroma store (no persist directory is given).
embeddings = VertexAIEmbeddings()
db = Chroma.from_documents(documents=docs, embedding=embeddings)

In [11]:
# Build the retrieval question-answering chain: retriever -> prompt -> LLM.
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Text model that writes the final answer.
llm = VertexAI(
    model_name="text-bison@001",
    temperature=0,
    top_k=40,
    top_p=0.95,
    max_output_tokens=256,
    verbose=True,
)

# Instructions sent to the model; {context} and {question} are the two
# template variables declared on PROMPT below.
prompt_template = """
You are a trustworthy consultant.
Answer the users question based on the context only.
Do not make up data. If you cannot find answer in the context, then say 'Sorry, I cannot find the answers in the documents.'

{context}

Question: {question}
"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

chain_type_kwargs = {"prompt": PROMPT}

# Set the appropriate retriever: up to 4 chunks, filtered by a score
# threshold of 0.4. Plain top-k alternative, kept for reference:
#   retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})
retriever = db.as_retriever(
    search_kwargs={"score_threshold": 0.4, "k": 4},
    search_type="similarity_score_threshold",
)

# return_source_documents=True keeps the retrieved chunks in the result so
# the answer can be traced back to them.
qa = RetrievalQA.from_chain_type(
    chain_type="stuff",
    llm=llm,
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

In [13]:
# Ask the chain one question and show the answer together with the chunks
# that were retrieved for it.
# English version of the question, kept as an alternative:
# query = "What is the best AI model to implement fraud detection system"
query = "fraud detection 시스템을 구축할 때 가장 적합한 AI 모델은?"
result = qa({"query": query})

# Each value is interpolated into its f-string, so every label is followed by
# exactly one space (a placeholder-less f-string passed as a separate print()
# argument yields two).
print(f"Query: {result['query']}")
print(f"Answer: {result['result']}")
print(f"Source: {result['source_documents']}")

Query:  fraud detection 시스템을 구축할 때 가장 적합한 AI 모델은?
Answer:  The authors suggest that the most suitable AI model for fraud detection is the decision tree. This is because the decision tree is able to learn from the data and identify patterns that can be used to detect fraud. The authors also suggest that the decision tree should be normalized in order to improve its accuracy.
