In [21]:
from langchain_community.document_loaders import PyPDFLoader
import os
import getpass
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_openai import OpenAI
from langchain_community.document_loaders import OnlinePDFLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
openai_api_key = os.getenv('OPENAI_API_KEY', 'YourAPIKeyIfNotSet')

os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')



In [29]:
loader = PyPDFDirectoryLoader("/Users/kaitokikuchi/Library/CloudStorage/Dropbox/LLM-agriculture/PDFs")
docs = loader.load_and_split()

Illegal character in Name Object (b'/QEOWUY+HG\x91n\x89p\x8ap\xce\xdf\xaf\xcc\xdf\x91\xcc-WinCharSetFFFF-H')
Illegal character in Name Object (b'/QEOWUY+HG\x91n\x89p\x8ap\xce\xdf\xaf\xcc\xdf\x91\xcc')
Illegal character in Name Object (b'/CRDEAK+HG\x91n\x89p\x8ap\xce\xdf\xaf\xcc\xdf\x91\xcc-WinCharSetFFFF-H')
Illegal character in Name Object (b'/CRDEAK+HG\x91n\x89p\x8ap\xce\xdf\xaf\xcc\xdf\x91\xcc')


In [66]:


def make_embedder(model_type, save_path):
    if model_type == "sup-simcse-ja-base":
        model_name = "cl-nagoya/sup-simcse-ja-base"
        model_kwargs = {'device': 'cpu'}
        encode_kwargs = {'normalize_embeddings': False}
        embedding_model = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs
            )
    else:  # Assuming OpenAIEmbeddings
        embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
    
    fs = LocalFileStore(save_path)
    cached_embedder = CacheBackedEmbeddings.from_bytes_store(embedding_model, fs)
    return cached_embedder

def save_embeddings_and_ids_with_faiss(all_documents, cached_embedder, save_path):
    db = FAISS.from_documents(all_documents, cached_embedder)
    db.save_local(save_path)



In [None]:
model_type = "sup-simcse-ja-base"  # "sup-simcse-ja-base" or "OpenAIEmbeddings"
cached_embedder_dir = "data/cached_embeddings"
save_path = os.path.join(cached_embedder_dir, model_type)

cached_embedder = make_embedder(model_type, save_path)
save_embeddings_and_ids_with_faiss(docs, cached_embedder, save_path)

Load from cached embedding

In [31]:
model_type = "sup-simcse-ja-base"  # "sup-simcse-ja-base" or "OpenAIEmbeddings"
cached_embedder_dir = "data/cached_embeddings"
save_path = os.path.join(cached_embedder_dir, model_type)

model_name = "cl-nagoya/sup-simcse-ja-base"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embedding_model = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs
            )
fs = LocalFileStore(save_path)
cached_embedder = CacheBackedEmbeddings.from_bytes_store(embedding_model, fs)
db = FAISS.load_local(save_path, cached_embedder)

In [32]:

llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=db.as_retriever())

In [50]:
query = "令和3年の福島県でのハスモンヨトウ発生状況は？"
qa.invoke(query)

{'query': '令和3年の福島県でのハスモンヨトウ発生状況は？',
 'result': ' 令和3年の福島県でのハスモンヨトウ発生状況は、県内全域においてやや多いと予想されています。'}

In [39]:
query = "炭疽病の発生時期はいつ？"
qa.invoke(query)

{'query': '炭疽病の発生時期はいつ？',
 'result': ' 高温時期に発生しやすい。気温28℃の比較的高温条件で多発する。定植後秋口にすぐ症状が見られるものは、潜在感染した親株から採苗～育苗期に感染したと考えられる。'}

In [59]:
query = "イチゴのうどんこ病の防ぎ方を具体的に教えてください"
qa.invoke(query)

{'query': 'イチゴのうどんこ病の防ぎ方を具体的に教えてください',
 'result': ' 発生初期から防除を実施することが重要です。また、まん延すると防除が困難になるので、早めに対策を行うことが大切です。'}

In [63]:
query = "紫外線はうどんこ病を防げる？"
qa.invoke(query)

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 4097 tokens, however you requested 5822 tokens (5566 in your prompt; 256 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}

# Using OpenAI Embeddings
Note: as of 02/04/2024 Tiktoken has not been updated to include the [new OpenAI embedders](https://openai.com/blog/new-embedding-models-and-api-updates) so `Warning: model not found. Using cl100k_base encoding.` will be raised ([PR link](https://github.com/openai/tiktoken/pull/247))

In [67]:
model_type = "OpenAIEmbeddings"  # "sup-simcse-ja-base" or "OpenAIEmbeddings"
cached_embedder_dir = "data/cached_embeddings"
save_path = os.path.join(cached_embedder_dir, model_type)

cached_embedder = make_embedder(model_type, save_path)
save_embeddings_and_ids_with_faiss(docs, cached_embedder, save_path)

In [69]:
model_type = "OpenAIEmbeddings"  # "sup-simcse-ja-base" or "OpenAIEmbeddings"
cached_embedder_dir = "data/cached_embeddings"
save_path = os.path.join(cached_embedder_dir, model_type)

embedding_model = embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
fs = LocalFileStore(save_path)
cached_embedder = CacheBackedEmbeddings.from_bytes_store(embedding_model, fs)
db = FAISS.load_local(save_path, cached_embedder)
llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=db.as_retriever())

In [71]:
query = "令和3年の福島県でのハスモンヨトウ発生状況は？"
qa.invoke(query)



AssertionError: 