In [13]:
!pip -q install txtai[all] langchain pypdf > /dev/null

In [31]:
%%capture

from langchain.text_splitter import RecursiveCharacterTextSplitter
from txtai.embeddings import Embeddings
from langchain.document_loaders import PyPDFLoader
import glob

In [None]:
!unzip /content/04\ studies.zip -d /content/studies

In [25]:
pdf_text = []
for pdf in glob.glob('/content/studies/*.pdf'):
  loader = PyPDFLoader(pdf)
  pages = loader.load()
  pdf_text.extend(pages)

In [None]:
pdf_text[0]

In [47]:
document_splitter = RecursiveCharacterTextSplitter(chunk_size=350,
                                                   chunk_overlap=25,
                                                   length_function=len)

In [None]:
split_data = []
for docs in pdf_text:
  print(docs)
  temp_split = document_splitter.split_text(docs.page_content)
  split_data.extend(temp_split)

In [49]:
# Create embeddings index with content enabled. The default behavior is to only store indexed vectors.
embeddings = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2", 
                         "content": True, 
                         "objects": True})

In [50]:
# Create an index for the list of text
embeddings.index([(uid, 
                   text, 
                   None) for uid, text in enumerate(split_data)])

In [36]:
embeddings.search("Some studies by lutz",1)

[{'id': '1685',
  'text': 'with our results, Lutz et al. discuss a number of reports',
  'score': 0.6790071129798889}]

In [None]:
# Save the index
embeddings.save("index")

# Show basic details about index files
for f in ["config", "documents", "embeddings"]:
  !ls -l "index/{f}"
  !xxd "index/{f}" | head -5
  !file "index/{f}"
  !echo

In [38]:
import json
import pickle

with open("index/config", "rb") as config:
  print(json.dumps(pickle.load(config), 
                   sort_keys=True, 
                   indent=2))

{
  "backend": "faiss",
  "build": {
    "create": "2023-04-20T11:55:15Z",
    "python": "3.9.16",
    "settings": {
      "components": "IDMap,Flat"
    },
    "system": "Linux (x86_64)",
    "txtai": "5.4.0"
  },
  "content": true,
  "dimensions": 768,
  "objects": true,
  "offset": 3705,
  "path": "sentence-transformers/nli-mpnet-base-v2",
  "update": "2023-04-20T11:55:15Z"
}


In [None]:
import pandas as pd
import sqlite3

from IPython.display import display, Markdown

# Print details of a txtai SQLite document database
def showdb(path):
  db = sqlite3.connect(path)

  display(Markdown("## Tables"))
  df = pd.read_sql_query("select name FROM sqlite_master where type='table'", db)
  display(df.style.hide_index())

  for table in df["name"]:
    display(Markdown(f"## {table}"))
    df = pd.read_sql_query(f"select * from {table}", db)

    # Truncate large binary objects
    if "object" in df:
      df["object"] = df["object"].str.slice(0, 25)

    display(df[:5].style.hide_index())

showdb("index/documents")

In [52]:
import faiss
import numpy as np

# Query
query = "What is meditation?"

# Read index
index = faiss.read_index("index/embeddings")
print(index)
print(f"Total records: {index.ntotal}, dimensions: {index.d}")
print()

# Generate query embeddings and run query
queries = np.array([embeddings.transform((None, query, None))])
scores, ids = index.search(queries, 1)

# Lookup query result from original data array
result = split_data[ids[0][0]]

# Show results
print("Query:", query)
print("Results:", result, ids, scores)

<faiss.swigfaiss_avx2.IndexIDMap; proxy of <Swig Object of type 'faiss::IndexIDMapTemplate< faiss::Index > *' at 0x7f2b41541720> >
Total records: 3706, dimensions: 768

Query: What is meditation?
Results: within them’’ (p. 243).
4.5. The question of meditation proﬁciency
In this study we included 36 MM practitioners with great exper-
tise variability, ranging between 180 - 23,000 h, grouped in three
groups. Albeit the large expertise range, we did not ﬁnd any MM
expertise-related signiﬁcant effects in gamma power for the F [[465]] [[0.7788122]]


In [None]:
import urllib

from IPython.display import Image

# Get an image
request = urllib.request.urlopen("https://raw.githubusercontent.com/neuml/txtai/master/demo.gif")

# Get data
data = request.read()

# Upsert new record having both text and an object
embeddings.upsert([("txtai", {"text": "txtai executes machine-learning workflows to transform data and build AI-powered semantic search applications.", 
                              "size": len(data), 
                              "object": data}, None)])

embeddings.save("index")

showdb("index/documents")

In [51]:
embeddings.search("select * from authors where similar('neuro imaging')")

[{'id': '1004',
  'text': '13. FoxKCR, Nijeboer S,Dixon ML,Floman JL,Ellamil M,Rumak SP,etal.Ismeditation associated with\naltered brain structure? Asystematic review andmeta-analy sisofmorpho metric neuroimaging inmedi-\ntation practitio ners. Neuroscienc e&Biobehavior alReviews. 2014; 43:48±7 3.doi:10.1016/ j.neubiorev.\n2014.03. 016PMID: 24705269',
  'score': 0.6630920171737671},
 {'id': '511',
  'text': 'the right prefrontal cortex in self-evaluation of the face. A functional magneticresonance imaging study. J Cogn Neurosci 2008;20:342–55.\nNelson TO. Consciousness and metacognition. Am Psychol 1996;51:102–16.\nNielsen L, Kaszniak AW. Awareness of subtle emotional feelings: a comparison of',
  'score': 0.6353673338890076},
 {'id': '1029',
  'text': '46. Viola FC,Thorne J,Edmond sB,Schneid erT,Eichele T,Debener S.Semi-a utomatic identificat ionof\nindependent compone ntsrepresenting EEG artifact. Clinical Neurophysio logy. 2009; 120(5):868 ±877.\ndoi:10.1016 /j.clinph.2009. 01.015 P