In [1]:
import os, sys
import openai
import matplotlib.pyplot as plt
import numpy as np
import time

from dotenv import load_dotenv, find_dotenv
# Load variables from the nearest .env file into the process environment.
# The return value is not needed, so it is not bound to `_` (which IPython
# uses for the last cell output).
load_dotenv(find_dotenv())
# Fail fast with a KeyError if the key is missing from the environment.
openai.api_key = os.environ['OPENAI_API_KEY']

# Directory containing the project-local `utils` module.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
SERVER_DIR = '/Users/janek/Coding/PersonalProjects/playground/pdfGPT/pdfgpt/server/'
# Guard the append so re-running this cell does not pile up duplicate entries.
if SERVER_DIR not in sys.path:
    sys.path.append(SERVER_DIR)
import utils

In [2]:
# Load and process the document: split the PDF into paragraph records.
# `pdf_name` is passed alongside the path; the saved outputs show it as each
# record's 'title'.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
pdf_name = 'orwellanimalfarm.pdf'
df_file = '/Users/janek/Coding/PersonalProjects/playground/pdfGPT/assets/orwellanimalfarm.pdf'
paragraphs = utils.get_paragraphs(df_file, pdf_name)

In [19]:
# Scratch variant of the text_objects construction: every paragraph dict is
# copied and tagged with constant placeholder values; the paired embedding is
# iterated over but not used.
# NOTE: `embeddings` is defined in the embedding cell further down, so that
# cell has to be run first.
text_objects = [
    {**paragraph, 'embedding': 1, 's': 2, 'd': 3}
    for paragraph, _ in zip(paragraphs, embeddings)
]

In [20]:
# Inspect the first five entries of text_objects.
text_objects[:5]

[{'content': 'MR. JONES, of the Manor Farm, had locked the hen-houses for the night, but was too drunk to remember to shut the popholes. With the ring of light from his lantern dancing from side to side, he lurched across the yard, kicked off his boots at the back door, drew himself a last glass of beer from the barrel in the scullery, and made his way up to bed, where Mrs. Jones was already snoring. ',
  'page': 0,
  'title': 'orwellanimalfarm.pdf',
  'embedding': 1,
  's': 2,
  'd': 3},
 {'content': "As soon as the light in the bedroom went out there was a stirring and a fluttering all through the farm buildings. Word had gone round during the day that old Major, the prize Middle White boar, had had a strange dream on the previous night and wished to communicate it to the other animals. It had been agreed that they should all meet in the big barn as soon as Mr. Jones was safely out of the way. Old Major (so he was always called, though the name under which he had been exhibited was W

In [18]:
# Inspect the first two entries of text_objects.
# NOTE: the saved output below prints each entry's whole embedding list,
# which makes it very long.
text_objects[:2]

[{'content': 'MR. JONES, of the Manor Farm, had locked the hen-houses for the night, but was too drunk to remember to shut the popholes. With the ring of light from his lantern dancing from side to side, he lurched across the yard, kicked off his boots at the back door, drew himself a last glass of beer from the barrel in the scullery, and made his way up to bed, where Mrs. Jones was already snoring. ',
  'page': 0,
  'title': 'orwellanimalfarm.pdf',
  'embedding': [0.0015704138204455376,
   -0.008690107613801956,
   -0.011756130494177341,
   -0.018156109377741814,
   -0.10586223006248474,
   0.047558512538671494,
   0.023245900869369507,
   0.029433073475956917,
   0.02600407600402832,
   -0.02509382739663124,
   0.044125884771347046,
   0.019424332305788994,
   0.022282078862190247,
   -0.07595585286617279,
   0.03560918569564819,
   0.06684665381908417,
   0.010288158431649208,
   0.00903920829296112,
   -0.04376186802983284,
   -0.014105813577771187,
   -0.03125646337866783,
   0.0

In [17]:
# Pair every paragraph with its embedding: each result is a copy of the
# paragraph dict with the vector stored under the 'embedding' key.
# NOTE: `embeddings` is defined in the embedding cell further down, so that
# cell has to be run first.
text_objects = [
    {**paragraph, 'embedding': vector}
    for paragraph, vector in zip(paragraphs, embeddings)
]

In [4]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.faiss import FAISS

# Create document embeddings: pull the text out of every paragraph record,
# then embed all texts in one call.
# The timer covers both constructing the embedding object and embedding.
texts = [paragraph['content'] for paragraph in paragraphs]

start_time = time.time()
embedding_method = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)
embeddings = embedding_method.embed_documents(texts)
elapsed = time.time() - start_time
print(f'Created embeddings in: {elapsed} seconds')

Created embeddings in: 14.980893850326538 seconds


In [5]:
# Create the FAISS index from the precomputed (text, embedding) pairs.
# NOTE: `texts` is rebound here to the value returned by utils.get_metadata.
start_time = time.time()
texts, metadata = utils.get_metadata(paragraphs)
text_embedding_pairs = [(text, vector) for text, vector in zip(texts, embeddings)]
faiss = FAISS.from_embeddings(
    text_embedding_pairs,
    embedding_method,
    metadatas=metadata,
)
elapsed = time.time() - start_time
print(f'Created faiss index in: {elapsed} seconds')

Created faiss index in: 0.056813716888427734 seconds


In [None]:
# Alternative route: create the FAISS index from document objects instead of
# precomputed embeddings.
# NOTE: this rebinds `faiss`, replacing the index built in the previous cell.
start_time = time.time()
list_of_documents = utils.append_metadata(paragraphs)
faiss = FAISS.from_documents(list_of_documents, embedding_method)
elapsed = time.time() - start_time
print(f'Created faiss index in: {elapsed} seconds')

In [6]:
# Answer a question by semantic search: fetch the K best-matching paragraphs
# from the index together with their scores, and time the lookup.
start_time = time.time()
query = 'What two sayings has boxer learned from Napoleon'
K = 5
docs_with_score = faiss.similarity_search_with_score(query, k=K)
elapsed = time.time() - start_time
print(f'Answered query in: {elapsed} seconds')

Answered query in: 0.08425402641296387 seconds


In [7]:
# Display the search results; the saved output shows (Document, score) tuples.
# NOTE(review): whether a lower or higher score means a better match is not
# visible here — confirm against the vector store documentation.
docs_with_score

[(Document(page_content='Once again this argument was unanswerable. Certainly the animals did not want Jones back; if the holding of debates on Sunday mornings was liable to bring him back, then the debates must stop. Boxer, who had now had time to think things over, voiced the general feeling by saying: "If Comrade Napoleon says it, it must be right." And from then on he adopted the maxim, "Napoleon is always right," in addition to his private motto of "I will work harder." ', metadata={'page': 28, 'title': 'orwellanimalfarm.pdf'}),
  0.6425617),
 (Document(page_content='Napoleon himself appeared at the meeting on the following Sunday morning and pronounced a short oration in Boxer\'s honour. It had not been possible, he said, to bring back their lamented comrade\'s remains for interment on the farm, but he had ordered a large wreath to be made from the laurels in the farmhouse garden and sent down to be placed on Boxer\'s grave. And in a few days\' time the pigs intended to hold a me