In [77]:
from pathlib import Path
from dotenv import load_dotenv
from langchain.document_loaders import Docx2txtLoader
import chromadb
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma


In [45]:
# Read variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used by
# OpenAIEmbeddings further down — confirm.
load_dotenv()

True

# Load Documents

In [82]:
def read_file(input_fname):
    """Return the full contents of a text file decoded as UTF-8.

    Args:
        input_fname: Path of the file to read (anything ``open`` accepts).

    Returns:
        The file's entire text as a single string.
    """
    with open(input_fname, encoding="utf-8") as source:
        contents = source.read()
    return contents

In [75]:
# Base directories, relative to the notebook's working directory.
DATA_DIR = Path("data") / "raw"
SUMMARIES_DIR = Path("data") / "summaries"

# Raw .docx inputs.
toc_2015_fname = DATA_DIR / "Jan 2015.docx"
toc_2023_fname = DATA_DIR / "Mar 2023.docx"

# Paths to all the different summary files: one 2015 and one 2023 file per
# summarisation approach.
sections_2015_fname = SUMMARIES_DIR / "2015_sections_summary.txt"
sections_2023_fname = SUMMARIES_DIR / "2023_sections_summary.txt"

bart_2015 = SUMMARIES_DIR / "bart_large_2015_summary.txt"
bart_2023 = SUMMARIES_DIR / "bart_large_2023_summary.txt"

davinci_2015 = SUMMARIES_DIR / "davinci_map_reduce_summarized_2015.txt"
davinci_2023 = SUMMARIES_DIR / "davinci_map_reduce_summarized_2023.txt"

gpt4_2015 = SUMMARIES_DIR / "gpt4_map_reduce_summarized_2015.txt"
gpt4_2023 = SUMMARIES_DIR / "gpt4_map_reduce_summarized_2023.txt"

vectors_2015 = SUMMARIES_DIR / "vector_2015_summary.txt"
vectors_2023 = SUMMARIES_DIR / "vector_2023_summary.txt"

In [80]:
# Each summary label is written next to the file it is read from, so the two
# resulting lists stay index-aligned when an entry is added or removed.
# zip(...) transposes the (label, path) pairs into a tuple of labels and a
# tuple of paths; map(list, ...) turns each into a list.
summary_names, summary_fnames = map(list, zip(
    ('sections_2015', sections_2015_fname),
    ('sections_2023', sections_2023_fname),
    ('bart_2015', bart_2015),
    ('bart_2023', bart_2023),
    ('davinci_2015', davinci_2015),
    ('davinci_2023', davinci_2023),
    ('gpt_2015', gpt4_2015),
    ('gpt_2023', gpt4_2023),
    ('vectors_2015', vectors_2015),
    ('vectors_2023', vectors_2023),
))

In [83]:
# Map each summary name to the text of its summary file. The names and file
# paths are paired positionally from the two parallel lists.
summaries_dict = {
    label: read_file(summary_path)
    for label, summary_path in zip(summary_names, summary_fnames)
}

In [49]:
# Build one loader per source document. The paths are converted with str()
# because, per the original author's note, the loader requires a string.
loader_2015, loader_2023 = (
    Docx2txtLoader(str(docx_path))
    for docx_path in (toc_2015_fname, toc_2023_fname)
)
# Load each document through its loader.
data_2015, data_2023 = loader_2015.load(), loader_2023.load()

# Embeddings

In [88]:
# A single embedding model is shared by both vector stores.
embeddings = OpenAIEmbeddings()

# One Chroma collection per document year, both kept under the same
# on-disk directory.
# NOTE(review): because the directory is persisted, re-running this cell may
# add the same documents to the existing collections again — confirm.
db_2015 = Chroma.from_documents(
    data_2015,
    embeddings,
    persist_directory="./chroma_db",
    collection_name="2015_collection",
)
db_2023 = Chroma.from_documents(
    data_2023,
    embeddings,
    persist_directory="./chroma_db",
    collection_name="2023_collection",
)


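The score returned by `similarity_search_with_score` is a cosine distance, not a similarity. Therefore, a lower score is better: it means the summary is closer to the source document.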

In [86]:
def get_cosine_score(db: Chroma, query: str) -> float:
    """Return the score of the single best match for ``query`` in ``db``.

    Args:
        db: Vector store to search.
        query: Text to look up.

    Returns:
        The score paired with the top hit of
        ``db.similarity_search_with_score(query, k=1)``.
        NOTE(review): the notebook treats this as a cosine distance (lower is
        closer); the actual metric depends on how the collection was
        configured — confirm.

    Raises:
        IndexError: If the search returns no hits.
    """
    top_hits = db.similarity_search_with_score(query, k=1)
    best_hit = top_hits[0]
    # Each hit is indexed as (document, score); only the score is needed.
    return best_hit[1]

In [91]:
# Score every summary against the vector store built from the document of the
# same year. The year is detected from the summary's name.
scores = {}
for name, text in summaries_dict.items():
    if '2015' in name:
        scores[name] = get_cosine_score(db_2015, text)
    elif '2023' in name:
        scores[name] = get_cosine_score(db_2023, text)
    else:
        # A bare string expression is evaluated and discarded, so the message
        # has to be printed to be seen. The name still gets no entry in
        # `scores`, and the remaining summaries are still scored.
        print(f"{name}: Please update fname to include 2015 or 2023")

In [92]:
# Display the per-summary scores (summary name -> score) as the cell output.
scores

{'sections_2015': 0.13150725406001215,
 'sections_2023': 0.1009054191913468,
 'bart_2015': 0.3277767874915742,
 'bart_2023': 0.15002867243160461,
 'davinci_2015': 0.14972689987933305,
 'davinci_2023': 0.16909702760303102,
 'gpt_2015': 0.13398331336445812,
 'gpt_2023': 0.1306894823900674,
 'vectors_2015': 0.22299383923891755,
 'vectors_2023': 0.19667902694552447}