In [1]:
# # For chromadb embeddings
# !pip install sentence_transformers 

import os
import pandas as pd

import chromadb
from chromadb.utils import embedding_functions

from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings


In [9]:
# Read the doc from s3
filename = 'data/registry_20231206_2.csv'
df = pd.read_csv(filename, dtype=str)

# # Take a subset of the data
states = ['00', '06', '53', '36'] # all states, California, Washington, New York
commodities = ['0000', '0154', '0227', '0028', '0054'] # all commodities, strawberries, oranges, almonds
plans = ['00', '90', '47', '76'] # all plans, APH, ARH, WFRP
df = df[df['state'].isin(states)]
df = df[df['commodity'].isin(commodities)]
df = df[df['plan'].isin(plans)]

print(df.shape)
df.head(1)

(50060, 13)


Unnamed: 0,title,year,commodity,plan,source_url,filename,s3_url,date_published,doc_category,file_ext,county,state,plan_commodity_county_state
0,Walnut Crop Provisions 23-029,2023,29,90,https://www.rma.usda.gov/-/media/RMA/Policies/...,Walnut-Crop-Provisions-23-029.pdf,https://croptalk-spoi.s3.amazonaws.com/CP/Waln...,date_published,CP,pdf,0,0,90_0029_000_00


In [None]:
# Save file from s3 url to local directory
outdir = 'data/demo_knowledge'
if not os.path.exists(outdir):
    os.makedirs(outdir)
    
for i, row in df.iterrows():
    url = row['s3_url']
    
    if row['doc_category'] in ['CP', 'BP', 'CIH']:
        # FIX
        url = url.replace('.pdf', '.ashx')
        filename = os.path.basename(url).replace('.ashx', '.pdf')
        outpath = os.path.join(outdir, filename)
    else:
        filename = os.path.basename(url)
        outpath = os.path.join(outdir, filename)
    if not os.path.exists(outpath):
        print(f'Downloading {url} to {outpath}')
        os.system(f'curl -o {outpath} {url}')
    else:
        print(f'File exists: {outpath}')
    

In [4]:
vectorestore_dir = '/Users/katerina/Documents/CropGuard/Dec_demo/vectorstore'

# Chroma uses the Sentence Transformers all-MiniLM-L6-v2 model to create embeddings
emb_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

chroma_client = chromadb.PersistentClient(path=vectorestore_dir)
collection = chroma_client.get_or_create_collection(name="v0", embedding_function=emb_fn)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=600, chunk_overlap=150, add_start_index=True
    )

In [5]:
# Store the docs that were indexed
indexed_docs = []

In [6]:
# Create a document loader using Langchain's PyMuPDFLoader

import tqdm

filedir = '/Users/katerina/Documents/CropGuard/Dec_demo/demo_knowledge'
meta_columns = ['source_url', 'year', 'plan', 'commodity', 'state', 'county', 'doc_category', 'title', 'filename', 'plan_commodity_county_state']


for i, (_, row) in tqdm.tqdm(enumerate(df.iterrows()), total=len(df)):

    if row.filename in indexed_docs:
        # print(f"Skipping {row.filename}")
        continue

    row_metadata = row[meta_columns].to_dict()

    # load pdf and split into pages and text chunks
    filepath = os.path.join(filedir, row.filename)
    loader = PyMuPDFLoader(filepath)

    try:
        doc_pages = loader.load()
        doc_splits = text_splitter.split_documents(doc_pages)

        # extract texts and pypdf metadata
        texts = [doc.page_content for doc in doc_splits]
        metas = [doc.metadata for doc in doc_splits]
        # embds = [emb_fn(doc.page_content) for doc in doc_splits]

        
        for d in metas:
            # update with rma doc metadata, some fields are overwritten
            d.update(row_metadata)
            # overwrite source with s3 url
            d['source'] = row['s3_url']
        
        
        # create ids for splits
        ids = [f"{d.metadata['filename']}_{d.metadata['plan_commodity_county_state'].replace('_', '-')}_page_{d.metadata['page']}_startindex_{d.metadata['start_index']}" for d in doc_splits]

        # load it into Chroma
        collection.add(documents=texts, metadatas=metas, ids=ids) # embeddings=embds)
        indexed_docs.append(row_metadata['filename'])

    except Exception as e:
        print(f"\n{e}\n{row_metadata}\n\n")
        continue

  2%|▏         | 2/118 [00:12<11:56,  6.17s/it]


cannot open broken document
{'source_url': 'https://www.rma.usda.gov/-/media/RMA/Policies/Apple/2011/Apple-Crop-Provisions-11-0054.ashx', 'year': '2011', 'plan': '90', 'commodity': '0054', 'state': '00', 'county': '000', 'doc_category': 'CP', 'title': 'Apple Crop Provisions 11-0054', 'filename': 'Apple-Crop-Provisions-11-0054.pdf', 'plan_commodity_county_state': '90_0054_000_00'}




100%|██████████| 118/118 [14:57<00:00,  7.61s/it]  


In [7]:
collection.count()

3733

In [8]:
metas

[{'source': 'https://croptalk-spoi.s3.amazonaws.com/SPOI/06_115_90_0054_20231031.pdf',
  'file_path': '/Users/katerina/Documents/CropGuard/Dec_demo/demo_knowledge/06_115_90_0054_20231031.pdf',
  'page': 0,
  'total_pages': 2,
  'format': 'PDF 1.3',
  'title': 'Special Provisions for insuring 0054 under plan 90 in state 06, county 115',
  'author': '',
  'subject': '',
  'keywords': '',
  'creator': 'Microsoft Reporting Services 10.0.0.0',
  'producer': 'Microsoft Reporting Services PDF Rendering Extension 10.0.0.0',
  'creationDate': "D:20230816083754-05'00'",
  'modDate': '',
  'trapped': '',
  'start_index': 0,
  'source_url': 'https://pubfs-rma.fpac.usda.gov/pub/Special_Provisions/2024/2024_SPOI_1031.zip',
  'year': '2024',
  'plan': '90',
  'commodity': '0054',
  'state': '06',
  'county': '115',
  'doc_category': 'SP',
  'filename': '06_115_90_0054_20231031.pdf',
  'plan_commodity_county_state': '90_0054_115_06'},
 {'source': 'https://croptalk-spoi.s3.amazonaws.com/SPOI/06_115_90_

In [6]:
metas[0]

{'source': 'https://croptalk-spoi.s3.amazonaws.com/SPOI/06_019_90_0028_20230831.pdf',
 'file_path': '/Users/katerina/Documents/CropGuard/Dec_demo/demo_knowledge/06_019_90_0028_20230831.pdf',
 'page': 0,
 'total_pages': 1,
 'format': 'PDF 1.3',
 'title': 'Special Provisions for insuring 0028 under plan 90 in 06, 019county',
 'author': '',
 'subject': '',
 'keywords': '',
 'creator': 'Microsoft Reporting Services 10.0.0.0',
 'producer': 'Microsoft Reporting Services PDF Rendering Extension 10.0.0.0',
 'creationDate': "D:20230816081544-05'00'",
 'modDate': '',
 'trapped': '',
 'start_index': 0,
 'source_url': 'https://pubfs-rma.fpac.usda.gov/pub/Special_Provisions/2024/2024_SPOI_0831.zip',
 'year': '2024',
 'plan': '90',
 'commodity': '0028',
 'state': '06',
 'county': '019',
 'doc_category': 'SP',
 'filename': '06_019_90_0028_20230831.pdf',
 'plan_commodity_county_state': '90_0028_019_06'}

In [7]:
print(f"Indexed {len(indexed_docs)} documents")
print(f"Not indexed files: {set(df.filename) - set(indexed_docs)}")

Indexed 52 documents
Not indexed files: set()


# Test the vectorstore

In [8]:
from langchain.vectorstores import Chroma

# Load the collection through langchain's Chroma wrapper
# Using langchain's SentenceTransformerEmbeddings wrapper
lanngchain_emb = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

langchain_chroma = Chroma(
    client=chroma_client,
    collection_name="Demo",
    embedding_function=lanngchain_emb,
)

print("There are", langchain_chroma._collection.count(), "docs in the collection")

There are 3279 docs in the collection


In [12]:
where_filter = {
    "$and": [
        {
            "doc_category": {
                '$eq': 'CP'
            }
        },
        {
            "plan_commodity_county_state": {
                "$eq":  "47_0154_000_00"
            }
        }
    ]
}

In [19]:
matches = langchain_chroma.similarity_search("strawberries", k=5, filter=where_filter)
['47_0154_000_00' in m.metadata.values() for m in matches]

[True, True, True, True, True]

In [20]:
matches[0].metadata

{'author': 'USDA Risk Management Agency',
 'commodity': '0154',
 'county': '000',
 'creationDate': "D:20170501074444-05'00'",
 'creator': 'Acrobat PDFMaker 11 for Word',
 'doc_category': 'CP',
 'file_path': '/Users/katerina/Documents/CropGuard/Dec_demo/demo_knowledge/Strawberry-Pilot-Crop-Provisions-18-0154.pdf',
 'filename': 'Strawberry-Pilot-Crop-Provisions-18-0154.pdf',
 'format': 'PDF 1.6',
 'keywords': '',
 'modDate': "D:20170501102358-05'00'",
 'page': 1,
 'plan': '47',
 'plan_commodity_county_state': '47_0154_000_00',
 'producer': 'Adobe PDF Library 11.0',
 'source': '/Users/katerina/Documents/CropGuard/Dec_demo/demo_knowledge/Strawberry-Pilot-Crop-Provisions-18-0154.pdf',
 'source_url': 'https://www.rma.usda.gov/-/media/RMA/Policies/ARH-Strawberry/2018/Strawberry-Pilot-Crop-Provisions-18-0154.ashx',
 'start_index': 3206,
 'state': '00',
 'subject': 'Provisions',
 'title': 'Strawberry Pilot Crop Provisions 18-0154',
 'total_pages': 6,
 'trapped': '',
 'year': '2018'}