In [1]:
import os
import re
from operator import itemgetter
from textwrap import dedent
from typing import List

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import pandas as pd
import seaborn as sns
from datasets import load_dataset
from dotenv import load_dotenv
from edgar import Company, CompanyFiling, CompanyFilings, set_identity
from langchain.schema import Document
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain_community.vectorstores import SupabaseVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langsmith import Client as LangsmithClient
from langsmith.evaluation import LangChainStringEvaluator, evaluate
from rich import print
from supabase.client import Client, create_client

# python-dotenv: load variables from a .env file (if one is found) so the
# os.environ lookups in later cells can see them.
load_dotenv()
# Set after load_dotenv(), so this value replaces any LANGCHAIN_PROJECT that was
# already in the environment. Presumably the LangSmith project that runs are
# grouped under — confirm.
os.environ["LANGCHAIN_PROJECT"] = "llm-evals"
# IPython magic: load the `rich` extension; the saved cell outputs in this
# notebook are rendered with rich's coloured pretty-printer.
%load_ext rich


In [18]:
# Supabase credentials are read from the process environment (load_dotenv() is
# called in the first cell). Index with [] instead of .get(): an unset variable
# then fails right here with a KeyError naming it, rather than passing None on
# to create_client and failing later with a less direct error.
supabase_url = os.environ["SUPABASE_URL"]
supabase_key = os.environ["SUPABASE_PRIVATE_KEY"]
supabase: Client = create_client(supabase_url, supabase_key)

# Embedding model used by the vector store.
# NOTE(review): dimensions=512 presumably has to match the width of the vector
# column in the `documents_duplicate` table — confirm against the schema.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", dimensions=512)

# Vector store over the `documents_duplicate` table; similarity search is
# delegated to the `match_documents_duplicate` query on the Supabase side.
vectorstore = SupabaseVectorStore(
    supabase,
    embedding=embeddings,
    table_name="documents_duplicate",
    query_name="match_documents_duplicate",
)


In [21]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_openai import OpenAI

# Completion-style LLM that writes the structured query; temperature 0 keeps
# sampling as deterministic as the API allows.
llm = OpenAI(temperature=0)

# Plain-language summary of what each stored document contains; passed to the
# query constructor alongside the attribute descriptions below.
document_content_description = "Chunks of text from financial reports"

# Metadata attributes the self-query step is told about, one
# (name, description, type) triple per attribute.
metadata_field_info = [
    AttributeInfo(name=field, description=blurb, type=kind)
    for field, blurb, kind in (
        ("ticker", "The ticker of the company", "string or list[string]"),
        ("year", "The year of the report", "integer or list[integer]"),
    )
]

# Retriever that asks the LLM to split a question into a search string and a
# metadata filter before querying the vector store.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectorstore,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)


In [25]:
# The question carries both a filterable attribute (the ticker "COST") and
# free-text content ("risk"), so it is not a filter-only example.
# NOTE(review): whether the LLM actually emits a `ticker` filter is not visible
# in the saved output — check the verbose trace.
retriever.invoke("What is COST's risk?")



[1m[[0m
    [1;35mDocument[0m[1m([0m
        [33mpage_content[0m=[32m'of coverage to account for these variables, actual claims and costs could differ significantly from recorded liabilities. Historically, adjustments to our estimates have not been material. Recent Accounting Pronouncements We do not expect that any recently issued accounting pronouncements will have a material effect on our financial statements.'[0m,
        [33mmetadata[0m=[1m{[0m[32m'form'[0m: [32m'10-K'[0m, [32m'year'[0m: [1;36m2022[0m, [32m'ticker'[0m: [32m'COST'[0m, [32m'item_name'[0m: [32m'Item 7'[0m[1m}[0m
    [1m)[0m,
    [1;35mDocument[0m[1m([0m
        [33mpage_content[0m=[32m'or are reasonably likely to have a material current or future effect on our financial condition or financial statements. Critical Accounting Estimates The preparation of our consolidated financial statements in accordance with U.S. generally accepted accounting principles [0m[32m([0m[32mU.S

In [11]:
# Question with no ticker or year in it, so there is nothing for a metadata
# filter to match on — only the search text.
# NOTE(review): the wording is about movies, while the vector store in this
# notebook is described as holding chunks of financial reports. This looks like
# a query left over from a tutorial; confirm whether it should be replaced.
# NOTE(review): the saved output for this cell shows movie documents
# (genre/rating metadata) and its execution count is lower than that of the
# setup cells, so it was presumably produced by a different retriever — re-run.
retriever.invoke("What are some movies about dinosaurs")



[1m[[0m
    [1;35mDocument[0m[1m([0m
        [33mpage_content[0m=[32m'A bunch of scientists bring back dinosaurs and mayhem breaks loose'[0m,
        [33mmetadata[0m=[1m{[0m[32m'year'[0m: [1;36m1993[0m, [32m'genre'[0m: [32m'science fiction'[0m, [32m'rating'[0m: [1;36m7.7[0m[1m}[0m
    [1m)[0m,
    [1;35mDocument[0m[1m([0m
        [33mpage_content[0m=[32m'Toys come alive and have a blast doing so'[0m,
        [33mmetadata[0m=[1m{[0m[32m'year'[0m: [1;36m1995[0m, [32m'genre'[0m: [32m'animated'[0m[1m}[0m
    [1m)[0m,
    [1;35mDocument[0m[1m([0m
        [33mpage_content[0m=[32m'Leo DiCaprio gets lost in a dream within a dream within a dream within a ...'[0m,
        [33mmetadata[0m=[1m{[0m[32m'year'[0m: [1;36m2010[0m, [32m'rating'[0m: [1;36m8.2[0m, [32m'director'[0m: [32m'Christopher Nolan'[0m[1m}[0m
    [1m)[0m,
    [1;35mDocument[0m[1m([0m
        [33mpage_content[0m=[32m'A psychologist / detective 

In [12]:
# Question that combines search text ("toys") with a year range (after 1990, up
# to and including 2005) and a preference for a genre ("animated").
# NOTE(review): only `ticker` and `year` are declared as filterable attributes
# in this notebook, so the genre preference has no matching metadata field, and
# the movie wording does not fit a store of financial report chunks. Looks like
# a leftover tutorial query; the saved output (movie metadata, lower execution
# count than the setup cells) was presumably produced by a different
# retriever — confirm and re-run.
retriever.invoke(
    "What's a movie after 1990 but before (or on) 2005 that's all about toys, and preferably is animated"
)



[1m[[0m
    [1;35mDocument[0m[1m([0m
        [33mpage_content[0m=[32m'Toys come alive and have a blast doing so'[0m,
        [33mmetadata[0m=[1m{[0m[32m'year'[0m: [1;36m1995[0m, [32m'genre'[0m: [32m'animated'[0m[1m}[0m
    [1m)[0m
[1m][0m