In [1]:
!pip -q install txtai[all] langchain pypdf > /dev/null

In [2]:
%%capture

from langchain.text_splitter import RecursiveCharacterTextSplitter
from txtai.embeddings import Embeddings
from langchain.document_loaders import PyPDFLoader
import glob

In [3]:
# Load every PDF in the studies folder and collect the objects returned by the
# loader into one flat list.
# glob returns matches in arbitrary order, so the paths are sorted to keep the
# order of pdf_text (and of everything derived from it) stable between runs.
pdf_text = []
for pdf in sorted(glob.glob('/content/studies/*.pdf')):
  loader = PyPDFLoader(pdf)
  pages = loader.load()
  pdf_text.extend(pages)

In [None]:
# Preview the first element loaded from the PDFs (raises IndexError if no PDF was found)
pdf_text[0]

In [5]:
# Chunking parameters; sizes are counted in characters because the length function is len
splitter_options = {
  "chunk_size": 350,
  "chunk_overlap": 25,
  "length_function": len,
}
document_splitter = RecursiveCharacterTextSplitter(**splitter_options)

In [None]:
# Split the text of every loaded page into chunks and gather all chunks into
# one flat list; the position of a chunk in split_data is later used as its id.
# (The per-page debug print was removed: it dumped every full page into the output.)
split_data = []
for docs in pdf_text:
  temp_split = document_splitter.split_text(docs.page_content)
  split_data.extend(temp_split)

In [7]:
# Embeddings index settings. By default only the indexed vectors are stored;
# "content" turns on content storage and "objects" is enabled alongside it.
embeddings_config = {
  "path": "sentence-transformers/nli-mpnet-base-v2",
  "content": True,
  "objects": True,
}
embeddings = Embeddings(embeddings_config)

Downloading (…)lve/main/config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [8]:
# Build one 3-tuple per chunk: the chunk's position in split_data as its id,
# the chunk text, and None as the third element, then index them all at once.
records = [(position, chunk, None) for position, chunk in enumerate(split_data)]
embeddings.index(records)

In [9]:
# Ask a natural-language question and keep only the single best match
question = "What is self ask?"
embeddings.search(question, 1)

[{'id': '634',
  'text': 'Right Answer\nSelf-ask\nChain of thought\nQuestion When was the ﬁrst location of the world’s largest coffeehouse chain opened?\nRight Answer\nSelf-ask\nChain of thought\nQuestion Who directed the highest grossing ﬁlm?\nRight Answer\nSelf-ask\nChain of thought\n24',
  'score': 0.5150313973426819}]

In [10]:
# Save the index
embeddings.save("index")

# Show basic details about index files
for f in ["config", "documents", "embeddings"]:
  !ls -l "index/{f}"
  !xxd "index/{f}" | head -5
  !file "index/{f}"
  !echo

-rw-r--r-- 1 root root 288 Apr 21 10:30 index/config
00000000: 8004 9515 0100 0000 0000 007d 9428 8c04  ...........}.(..
00000010: 7061 7468 948c 2773 656e 7465 6e63 652d  path..'sentence-
00000020: 7472 616e 7366 6f72 6d65 7273 2f6e 6c69  transformers/nli
00000030: 2d6d 706e 6574 2d62 6173 652d 7632 948c  -mpnet-base-v2..
00000040: 0763 6f6e 7465 6e74 9488 8c07 6f62 6a65  .content....obje
index/config: data

-rw-r--r-- 1 root root 262144 Apr 21 10:30 index/documents
00000000: 5351 4c69 7465 2066 6f72 6d61 7420 3300  SQLite format 3.
00000010: 1000 0101 0040 2020 0000 0001 0000 0040  .....@  .......@
00000020: 0000 0000 0000 0000 0000 0001 0000 0004  ................
00000030: 0000 0000 0000 0000 0000 0001 0000 0000  ................
00000040: 0000 0000 0000 0000 0000 0000 0000 0000  ................
index/documents: SQLite 3.x database, last written using SQLite version 3031001

-rw-r--r-- 1 root root 1958970 Apr 21 10:30 index/embeddings
00000000: 4978 4d70 0003 0000 7c02 0000 0000 0

In [11]:
import json
import pickle

# index/config is a pickle file produced by embeddings.save("index") earlier in
# this notebook. pickle.load can execute arbitrary code, so only use it on
# files you created yourself.
with open("index/config", "rb") as config:
  settings = pickle.load(config)

# Pretty-print the configuration with sorted keys
print(json.dumps(settings, sort_keys=True, indent=2))

{
  "backend": "faiss",
  "build": {
    "create": "2023-04-21T10:30:18Z",
    "python": "3.9.16",
    "settings": {
      "components": "IDMap,Flat"
    },
    "system": "Linux (x86_64)",
    "txtai": "5.5.0"
  },
  "content": true,
  "dimensions": 768,
  "objects": true,
  "offset": 636,
  "path": "sentence-transformers/nli-mpnet-base-v2",
  "update": "2023-04-21T10:30:18Z"
}


In [12]:
import pandas as pd
import sqlite3

from IPython.display import display, Markdown

# Print details of a txtai SQLite document database
def showdb(path):
  """
  Displays the tables of a SQLite database and the first five rows of each.

  Args:
    path: path to the SQLite database file
  """

  db = sqlite3.connect(path)
  try:
    display(Markdown("## Tables"))
    tables = pd.read_sql_query("select name FROM sqlite_master where type='table'", db)

    # Styler.hide_index() is deprecated and removed in pandas 2; hide(axis="index") replaces it
    display(tables.style.hide(axis="index"))

    for table in tables["name"]:
      display(Markdown(f"## {table}"))

      # Quote the identifier and only fetch the rows that are actually shown
      quoted = table.replace('"', '""')
      rows = pd.read_sql_query(f'select * from "{quoted}" limit 5', db)

      # Truncate large binary objects
      if "object" in rows:
        rows["object"] = rows["object"].str.slice(0, 25)

      display(rows.style.hide(axis="index"))
  finally:
    # Always release the connection, even when a query or display call fails
    db.close()

showdb("index/documents")

## Tables

  display(df.style.hide_index())


name
documents
objects
sections


## documents

  display(df[:5].style.hide_index())


id,data,tags,entry


## objects

  display(df[:5].style.hide_index())


id,object,tags,entry


## sections

  display(df[:5].style.hide_index())


indexid,id,text,tags,entry
0,0,"Published as a conference paper at ICLR 2023 REAC T: S YNERGIZING REASONING AND ACTING IN LANGUAGE MODELS Shunyu Yao*,1, Jeffrey Zhao2, Dian Yu2, Nan Du2, Izhak Shafran2, Karthik Narasimhan1, Yuan Cao2 1Department of Computer Science, Princeton University 2Google Research, Brain team 1{shunyuy,karthikn}@princeton.edu",,2023-04-21 10:26:59.312376
1,1,"2{jeffreyzhao,dianyu,dunan,izhak,yuancao}@google.com ABSTRACT While large language models (LLMs) have demonstrated impressive performance across tasks in language understanding and interactive decision making, their abilities for reasoning (e.g. chain-of-thought prompting) and acting (e.g. action",,2023-04-21 10:26:59.312376
2,2,"plan generation) have primarily been studied as separate topics. In this paper, we explore the use of LLMs to generate both reasoning traces and task-speciﬁc actions in an interleaved manner, allowing for greater synergy between the two: reasoning traces help the model induce, track, and update action plans as well as handle",,2023-04-21 10:26:59.312376
3,3,"exceptions, while actions allow it to interface with and gather additional information from external sources such as knowledge bases or environments. We apply our approach, named ReAct , to a diverse set of language and decision making tasks and demonstrate its effectiveness over state-of-the-art baselines in addition to",,2023-04-21 10:26:59.312376
4,4,"improved human interpretability and trustworthiness. Concretely, on question answering (HotpotQA) and fact veriﬁcation (Fever), ReAct overcomes prevalent issues of hallucination and error propagation in chain-of-thought reasoning by interacting with a simple Wikipedia API, and generating human-like task-solving",,2023-04-21 10:26:59.312376


In [13]:
import faiss
import numpy as np

# Question to run directly against the saved Faiss index
query = "What is language model?"

# Load the vector index file written by embeddings.save and report its size
index = faiss.read_index("index/embeddings")
print(index)
print(f"Total records: {index.ntotal}, dimensions: {index.d}")
print()

# Turn the query into a vector with the txtai model, wrap it in a batch of one
# and ask Faiss for the single nearest neighbour
query_vector = embeddings.transform((None, query, None))
queries = np.array([query_vector])
scores, ids = index.search(queries, 1)

# The ids used at indexing time were positions in split_data, so the top id
# can be used to look the chunk text back up
top_id = ids[0][0]
result = split_data[top_id]

# Report the query together with the matched text, raw ids and raw scores
print("Query:", query)
print("Results:", result, ids, scores)

<faiss.swigfaiss_avx2.IndexIDMap; proxy of <Swig Object of type 'faiss::IndexIDMapTemplate< faiss::Index > *' at 0x7f4025e0e630> >
Total records: 636, dimensions: 768

Query: What is language model?
Results: However, as the language space Lis unlimited, learning in this augmented action space is difﬁcult
and requires strong language priors. In this paper, we mainly focus on the setup where a frozen
large language model, PaLM-540B (Chowdhery et al., 2022)1, is prompted with few-shot in-context [[49]] [[0.53048104]]


In [None]:
# `import urllib` alone does not guarantee that the urllib.request submodule is
# loaded; import it explicitly so this cell works on a fresh kernel.
import urllib.request

from IPython.display import Image

# Download an image and read its bytes; the with-block closes the HTTP response
with urllib.request.urlopen("https://raw.githubusercontent.com/neuml/txtai/master/demo.gif") as request:
  data = request.read()

# Upsert new record having both text and an object
embeddings.upsert([("txtai", {"text": "txtai executes machine-learning workflows to transform data and build AI-powered semantic search applications.", 
                              "size": len(data), 
                              "object": data}, None)])

# Persist the updated index and show the database contents again
embeddings.save("index")

showdb("index/documents")

In [15]:
# SQL-style query: similar() ranks rows by similarity to the given phrase.
# NOTE(review): the txtai documentation writes these queries as `from txtai`;
# the `authors` table name here appears to be ignored - confirm and rename.
sql = "select * from authors where similar('Making models to generate')"
embeddings.search(sql)

[{'id': '93',
  'text': '8/62B, as the former essentially teaches models to memorize (potentially halluincated) knowledge\nfacts, and the latter teaches models how to (reason and) act to access information from Wikipedia, a\nmore generalizable skill for knowledge reasoning. As all prompting methods are still signiﬁcantly',
  'score': 0.5865278244018555},
 {'id': '485',
  'text': 'sequence models for solving arithmetic problems beneﬁt from generating a rationale before out-\nputting the ﬁnal answer. Allowing models to use more computation to solve harder problems\nhas been previously explored in both language (Graves, 2016; Seo et al., 2017; Dong et al., 2019;',
  'score': 0.5851430892944336},
 {'id': '391',
  'text': 'Compositional Celebrities dataset. 4are the regular GPT models and \x0eand\x02are the 001 and 002\ninstruct models. Percentages are relative and indicate the compositionality gaps.\nWe next narrow the compositionality gap by using what we call elicitive prompts . Composit