In [1]:
%pip install "evadb[document, notebook]"
%pip install openai
%pip install --upgrade tiktoken
%pip install transformers

Collecting evadb[document,notebook]
  Downloading evadb-0.3.7-py3-none-any.whl (530 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m530.1/530.1 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aenum>=2.2.0 (from evadb[document,notebook])
  Downloading aenum-3.1.15-py3-none-any.whl (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.6/137.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting lark>=1.0.0 (from evadb[document,notebook])
  Downloading lark-1.1.7-py3-none-any.whl (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.9/108.9 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting retry>=0.9.2 (from evadb[document,notebook])
  Downloading retry-0.9.2-py2.py3-none-any.whl (8.0 kB)
Collecting sqlalchemy-utils>=0.36.6 (from evadb[document,notebook])
  Downloading SQLAlchemy_Utils-0.41.1-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 kB[0m

In [2]:
import argparse
import os
import evadb
import openai
import numpy as np
import random
import time
import tiktoken
import json
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import BartTokenizer, BartForConditionalGeneration
from timeit import default_timer as timer



In [None]:
# Provide your OpenAI API key through the OPENAI_API_KEY environment variable so
# the secret never has to be saved inside the notebook. If the variable is not
# set, the placeholder below is used and must be replaced before calling the API.

openai.api_key = os.environ.get("OPENAI_API_KEY", "<OPENAI-API-KEY>")

In [8]:
# Location of the PDF to load; replace the placeholder with a real path before running.

pdf_path = "<PDF-PATH>"

In [3]:
# Directory of the installed evadb package. Later cells pass it to evadb.connect()
# and use it to locate functions/sentence_feature_extractor.py.
path = os.path.split(evadb.__file__)[0]

In [4]:
def loadPDF(filepath, pdf_table_name, embeddings_table_name):
  """Load a PDF into EvaDB and build an embeddings table with a FAISS index.

  The statements run in this order: drop and reload the PDF table, register
  the ``get_embedding`` function, drop and rebuild the embeddings table, then
  create ``embedding_index`` on the embeddings table's ``features`` column.

  Args:
    filepath: Path of the PDF named in the ``LOAD PDF`` statement.
    pdf_table_name: Table that receives the loaded PDF.
    embeddings_table_name: Table created from ``get_embedding(data), data``.
  """
  db_cursor = evadb.connect(path).cursor()

  # Every statement is rendered up front; they are then executed strictly in order.
  statements = (
    f""" DROP TABLE IF EXISTS {pdf_table_name};""",
    f"""LOAD PDF '{filepath}' INTO {pdf_table_name};""",
    f"""CREATE FUNCTION IF NOT EXISTS get_embedding IMPL  '{path}/functions/sentence_feature_extractor.py'; """,
    f""" DROP TABLE IF EXISTS {embeddings_table_name};""",
    f"""CREATE TABLE IF NOT EXISTS {embeddings_table_name} AS SELECT get_embedding(data), data FROM {pdf_table_name};""",
    f""" CREATE INDEX embedding_index ON {embeddings_table_name}(features) USING FAISS;""",
  )

  for statement in statements:
    db_cursor.query(statement).execute()

In [5]:
def getPageCount(pdf_table_name: str) -> int:
  """Return the highest page number stored in ``pdf_table_name``.

  Args:
    pdf_table_name: Name of the EvaDB table queried for ``MAX(page)``.

  Returns:
    The maximum page value as a built-in ``int``.

  Raises:
    ValueError: If the query yields no usable page number (for example NaN
      or ``None``).
  """
  cursor = evadb.connect(path).cursor()
  get_page_count = f"""SELECT MAX(page) FROM {pdf_table_name} """
  page_counts_df = cursor.query(get_page_count).df()
  max_page = np.max(page_counts_df.loc[:, 'MAX.page'])
  try:
    # np.max hands back a NumPy scalar; convert so callers get the annotated int.
    return int(max_page)
  except (TypeError, ValueError) as exc:
    raise ValueError(f"No page numbers found in table '{pdf_table_name}'") from exc

In [6]:
def getParagraphCount(pdf_table_name: str, page_number: int) -> int:
  """Return the highest paragraph number on one page of ``pdf_table_name``.

  Args:
    pdf_table_name: Name of the EvaDB table to query.
    page_number: Page whose ``MAX(paragraph)`` is requested.

  Returns:
    The maximum paragraph value as a built-in ``int``.

  Raises:
    ValueError: If the query yields no usable paragraph number (for example
      NaN or ``None``).
  """
  cursor = evadb.connect(path).cursor()
  get_para_count = f"""SELECT page, MAX(paragraph) FROM {pdf_table_name} where page = {page_number}"""
  para_counts_df = cursor.query(get_para_count).df()
  max_paragraph = np.max(para_counts_df.loc[:, 'MAX.paragraph'])
  try:
    # np.max hands back a NumPy scalar; convert so callers get the annotated int.
    return int(max_paragraph)
  except (TypeError, ValueError) as exc:
    raise ValueError(
      f"No paragraphs found for page {page_number} in table '{pdf_table_name}'"
    ) from exc

In [7]:
_BART_CHECKPOINT = "facebook/bart-large-cnn"
# Filled on first use with {"tokenizer": ..., "model": ...} so the checkpoint
# is loaded once per kernel instead of once per summarized page.
_bart_summarizer = {}


def _getBartSummarizer():
  """Return the (tokenizer, model) pair, loading the checkpoint only on first use."""
  if not _bart_summarizer:
    tokenizer = BartTokenizer.from_pretrained(_BART_CHECKPOINT)
    model = BartForConditionalGeneration.from_pretrained(_BART_CHECKPOINT)
    # Store both together so a failed load never leaves a half-filled cache.
    _bart_summarizer.update(tokenizer=tokenizer, model=model)
  return _bart_summarizer["tokenizer"], _bart_summarizer["model"]


def generatePageSummary(pdf_table_name: str, page_number: int) -> str:
  """Summarize the text of one page of a loaded PDF with BART.

  Args:
    pdf_table_name: EvaDB table holding the PDF rows. This argument is used
      for both the query and the result column, not a global table name.
    page_number: Page whose rows are joined into the text to summarize.

  Returns:
    The decoded summary string produced by the model.
  """
  cursor = evadb.connect(path).cursor()
  tokenizer, model = _getBartSummarizer()
  results = cursor.query(f"""SELECT page, paragraph, data from {pdf_table_name} where page = {page_number}""").df()
  # Result columns are addressed as "<table>.<column>".
  dataKey = f'''{pdf_table_name}.data'''
  context = "\n".join(results[dataKey])
  # truncation=True asks the tokenizer to truncate over-long input.
  tokenized_context = tokenizer.encode(context,truncation=True, return_tensors="pt")
  outputs = model.generate(tokenized_context, max_length=150, min_length=100, num_beams=4, length_penalty=2.0, early_stopping=True)
  generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
  return generated_summary

In [9]:
pdf_embeddings_table = "pdf_embeddings"
pdf_data_table = "pdf_table"

# Upper bound on how many distinct pages are summarized for the quiz.
NUM_PAGES_TO_SUMMARIZE = 5

# Load the PDF into EvaDB and record how long it takes.
start_load_pdf = timer()
loadPDF(pdf_path, pdf_data_table, pdf_embeddings_table)
end_load_pdf = timer()

pdf_load_time = end_load_pdf - start_load_pdf

# Seeded from the clock on purpose: every run draws a different set of pages.
random.seed(time.time())
num_pages = int(getPageCount(pdf_data_table))
if num_pages < 1:
  raise ValueError(f"Table '{pdf_data_table}' contains no pages to summarize")

# random.sample draws without replacement, so the requested number of distinct
# pages is always obtained (capped at the document length). Repeated randint
# draws could collide and silently yield fewer pages.
page_set = set(
  random.sample(range(1, num_pages + 1), min(NUM_PAGES_TO_SUMMARIZE, num_pages))
)

# Summarize each selected page and time the whole summarization step.
summaries = []
summary_generation_start_time = timer()
for page in page_set:
  generated_summary = generatePageSummary(pdf_table_name=pdf_data_table, page_number = page)
  print("\n Summary for page - " + str(page) + "\n---------------------\n")
  print("\n", generated_summary, "\n")
  summaries.append(generated_summary)

summary_generation_end_time = timer()

summary_generation_time =  summary_generation_end_time - summary_generation_start_time

# Single text blob handed to the question generator in the next cell.
summarized_context = "\n".join(summaries)

Downloading: "http://ml.cs.tsinghua.edu.cn/~chenxi/pytorch-models/mnist-b07bb66b.pth" to /root/.cache/torch/hub/checkpoints/mnist-b07bb66b.pth
100%|██████████| 1.03M/1.03M [00:01<00:00, 761kB/s]
Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth


Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]


 Summary for page - 8
---------------------


 memory fragmentation, which often plagues long-running in-memory systems. LeanStore also supports non-Uniform Memory Access (NUMA)-aware allocation inorder to improve performance on multi-socket systems. In LeanStore, buffer frames are physically interleaved with the page content (Fig. 2b). This is in contrast with mosttraditional buffer managers that store a pointer to the pagecontent in the buffer frame. In our implementation, each thread has a small thread-local cache for deleted pages. 


 Summary for page - 2
---------------------


 In classical database systems, all data structures are stored on ﬁxed-size pages in a translation-free manner. For transactional, fully memory-residentworkloads a typical buffer manager is the biggest source ofinefﬁciency. Microsoft’s Siberia project is maybe the most compre-hensive approach for managing large data sets in main-memory databases. Anti-Caching merelyeases memory pressure for applications t

In [10]:
# System prompt for the question generator. The sample is written as valid JSON
# (double-quoted keys and strings, integer answer) because the reply is parsed
# with json.loads and the answer is compared against an int in the quiz cell.
QUIZ_SYSTEM_PROMPT = """You are a question generator. User will give content. You need to create multiple choice questions. Questions and answers have to be from the content only. Generate conceptual questions. Do not use any extra knowledge you may have on the subject.
Every option must have a number associated with it. The answer must be the correct option number only, as an integer. Generate 5 questions. Respond with valid JSON only (double-quoted keys and strings, no extra text), in exactly the following format. A sample question is shown below:

{
  "questions": [
    {
      "question": "What is 1+1?",
      "options": ["(1) 3", "(2) 4", "(3) 5", "(4) 2"],
      "answer": 4
    }
  ]
}
"""

# Time the round trip to the chat completion endpoint.
gpt_api_response_start_time = timer()
gptResponse = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": QUIZ_SYSTEM_PROMPT},
        {"role": "user", "content": summarized_context},
    ]
)

gpt_api_response_end_time = timer()
gpt_api_response_time = gpt_api_response_end_time - gpt_api_response_start_time

In [11]:
# Token count of the summarized context, using the encoding tiktoken selects for gpt-3.5-turbo.
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')
num_tokens = len(encoding.encode(summarized_context))

print("\nMETRICS\n-------")

# Token usage: locally counted summary tokens next to the usage figures from the API response.
print(
    f"\nToken Usage\n-----------\nSummary tokens - {num_tokens}"
    f"\t Summary + Prompt tokens - {gptResponse.usage.prompt_tokens}"
    f"\t\tCompletion tokens - {gptResponse.usage.completion_tokens}"
)
print(f"\nTotal Tokens - {gptResponse.usage.total_tokens}")

# Wall-clock timings collected by the earlier cells.
print(
    f"\n\nPerformance\n-----------\nPdf loading - {pdf_load_time} seconds "
    f"\tSummary generation - {summary_generation_time} seconds "
    f"\tGPT API response - {gpt_api_response_time} seconds"
)


METRICS
-------

Token Usage
-----------
Summary tokens - 433	 Summary + Prompt tokens - 592		Completion tokens - 431

Total Tokens - 1023


Performance
-----------
Pdf loading - 50.46811176599999 seconds 	Summary generation - 265.221554457 seconds 	GPT API response - 41.63852049800005 seconds


In [12]:
# Parse the JSON string
raw_quiz = gptResponse.choices[0].message.content
try:
    quiz_data = json.loads(raw_quiz)
except json.JSONDecodeError:
    # Fall back to the outermost {...} span in case the reply carries extra text
    # around the JSON (for example a Markdown code fence). This still raises if
    # that span is not valid JSON either.
    quiz_data = json.loads(raw_quiz[raw_quiz.find("{"):raw_quiz.rfind("}") + 1])
score = 0
num_questions = len(quiz_data['questions'])


print("\nYour practice quiz is ready!")

print("\nPRACTICE QUIZ\n--------------\n\n")
# Options are numbered, and answers are read as integers, so ask for a number.
print(f"Instructions\n-------------\n\nThere will be {num_questions} questions in total. \nFor each question, enter only the number of your choice (for example 1, 2, 3 or 4). \nYou will see your score at the end.\n\nGood luck!!")

question_num = 0

print("\n\nQuiz\n------\n\n")
for question in quiz_data['questions']:
    question_num+=1
    print("Q" + str(question_num) + ") " + question['question'])
    for option in question['options']:
        print(option)

    # The answer may come back as a numeric string (e.g. "3"); normalize it so
    # it compares equal to the integer the user types.
    try:
        correct_answer = int(question['answer'])
    except (TypeError, ValueError):
        correct_answer = question['answer']

    # Re-prompt instead of crashing when the input is not a valid option number.
    num_options = len(question['options'])
    while True:
        try:
            user_answer = int(input("Your answer: "))
        except ValueError:
            user_answer = None
        if user_answer is not None and 1 <= user_answer <= num_options:
            break
        print(f"Please enter a number between 1 and {num_options}.")

    if user_answer == correct_answer:
        print("Correct!\n")
        score+=1
    else:
        print(f"Sorry, the correct answer is: {question['answer']}\n")

print(f"\n\nYour score: {score}/{num_questions}")



Your practice quiz is ready!

PRACTICE QUIZ
--------------


Instructions
-------------

There will be 5 questions in total. 
For each question, enter only your choice (a,b,c or d). 
You will see your score at the end.

Good luck!!


Quiz
------


Q1) In LeanStore, how are buffer frames stored in relation to the page content?
(1) Buffer frames store a pointer to the page content
(2) Buffer frames are physically interleaved with the page content
(3) Buffer frames are stored separately from the page content
(4) Buffer frames do not store any page content
Your answer: 1
Sorry, the correct answer is: 2

Q2) What is the purpose of a thread-local cache in LeanStore?
(1) To store thread-specific data structures
(2) To improve performance on multi-socket systems
(3) To store deleted pages
(4) To manage memory fragmentation
Your answer: 1
Sorry, the correct answer is: 3

Q3) What is the biggest source of inefficiency for transactional, fully memory-resident workloads in classical database syst