In [1]:
!pip -q install txtai[all] langchain pypdf > /dev/null

In [2]:
%%capture

from langchain.text_splitter import RecursiveCharacterTextSplitter
from txtai.embeddings import Embeddings
from langchain.document_loaders import PyPDFLoader
import glob

In [None]:
!unzip /content/04\ studies.zip -d /content/studies

In [7]:
pdf_text = []
for pdf in glob.glob('/content/studies/*.pdf'):
  loader = PyPDFLoader(pdf)
  pages = loader.load()
  pdf_text.extend(pages)

In [5]:
pdf_text[0]

IndexError: ignored

In [8]:
document_splitter = RecursiveCharacterTextSplitter(chunk_size=350,
                                                   chunk_overlap=25,
                                                   length_function=len)

In [9]:
split_data = []
for docs in pdf_text:
  print(docs)
  temp_split = document_splitter.split_text(docs.page_content)
  split_data.extend(temp_split)

page_content='Experienced Mindfulness Meditators Exhibit Higher\nParietal-Occipital EEG Gamma Activity during NREM\nSleep\nFabio Ferrarelli1, Richard Smith1, Daniela Dentico1, Brady A. Riedner1, Corinna Zennig1, Ruth M. Benca1,\nAntoine Lutz2,4, Richard J. Davidson2,3, Giulio Tononi1*\n1Department of Psychiatry, University of Wisconsin-Madison, Madison, Wisconsin, United States of America, 2Waisman Center for Brain Imaging and Behavior, University\nof Wisconsin-Madison, Madison, Wisconsin, United States of America, 3Department of Psychology, University of Wisconsin-Madison, Madison, Wisconsin, United States of\nAmerica, 4Lyon Neuroscience Research Center, Lyon 1 University, Lyon, France\nAbstract\nOver the past several years meditation practice has gained increasing attention as a non-pharmacological intervention to\nprovide health related benefits, from promoting general wellness to alleviating the symptoms of a variety of medical\nconditions. However, the effects of meditation traini

In [10]:
# Create embeddings index with content enabled. The default behavior is to only store indexed vectors.
embeddings = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2", 
                         "content": True, 
                         "objects": True})

Downloading (…)lve/main/config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [11]:
# Create an index for the list of text
embeddings.index([(uid, 
                   text, 
                   None) for uid, text in enumerate(split_data)])

In [12]:
embeddings.search("Some studies by lutz",1)

[{'id': '335',
  'text': 'Cognit. 6,267–290.\n30. Lutz, A. & Thompson, E. (2003) J. Conscious. Stud. 10,31–52.\nLutz et al. PNAS /H20841November 16, 2004 /H20841vol. 101 /H20841no. 46 /H2084116373\nNEUROSCIENCE',
  'score': 0.5509806871414185}]

In [None]:
# Save the index
embeddings.save("index")

# Show basic details about index files
for f in ["config", "documents", "embeddings"]:
  !ls -l "index/{f}"
  !xxd "index/{f}" | head -5
  !file "index/{f}"
  !echo

In [15]:
!zip -r textai_index.zip /content/index

  adding: content/index/ (stored 0%)
  adding: content/index/documents (deflated 68%)
  adding: content/index/embeddings (deflated 7%)
  adding: content/index/config (deflated 14%)


In [16]:
import json
import pickle

with open("index/config", "rb") as config:
  print(json.dumps(pickle.load(config), 
                   sort_keys=True, 
                   indent=2))

{
  "backend": "faiss",
  "build": {
    "create": "2023-04-20T14:09:59Z",
    "python": "3.9.16",
    "settings": {
      "components": "IDMap,Flat"
    },
    "system": "Linux (x86_64)",
    "txtai": "5.4.0"
  },
  "content": true,
  "dimensions": 768,
  "objects": true,
  "offset": 1318,
  "path": "sentence-transformers/nli-mpnet-base-v2",
  "update": "2023-04-20T14:09:59Z"
}


In [None]:
import pandas as pd
import sqlite3

from IPython.display import display, Markdown

# Print details of a txtai SQLite document database
def showdb(path):
  db = sqlite3.connect(path)

  display(Markdown("## Tables"))
  df = pd.read_sql_query("select name FROM sqlite_master where type='table'", db)
  display(df.style.hide_index())

  for table in df["name"]:
    display(Markdown(f"## {table}"))
    df = pd.read_sql_query(f"select * from {table}", db)

    # Truncate large binary objects
    if "object" in df:
      df["object"] = df["object"].str.slice(0, 25)

    display(df[:5].style.hide_index())

showdb("index/documents")

In [17]:
import faiss
import numpy as np

# Query
query = "What is meditation?"

# Read index
index = faiss.read_index("index/embeddings")
print(index)
print(f"Total records: {index.ntotal}, dimensions: {index.d}")
print()

# Generate query embeddings and run query
queries = np.array([embeddings.transform((None, query, None))])
scores, ids = index.search(queries, 1)

# Lookup query result from original data array
result = split_data[ids[0][0]]

# Show results
print("Query:", query)
print("Results:", result, ids, scores)

<faiss.swigfaiss_avx2.IndexIDMap; proxy of <Swig Object of type 'faiss::IndexIDMapTemplate< faiss::Index > *' at 0x7f100c276e70> >
Total records: 1318, dimensions: 768

Query: What is meditation?
Results: meditation  that are particularly  related  to mindfulness . [[1106]] [[0.7347889]]


In [None]:
import urllib

from IPython.display import Image

# Get an image
request = urllib.request.urlopen("https://raw.githubusercontent.com/neuml/txtai/master/demo.gif")

# Get data
data = request.read()

# Upsert new record having both text and an object
embeddings.upsert([("txtai", {"text": "txtai executes machine-learning workflows to transform data and build AI-powered semantic search applications.", 
                              "size": len(data), 
                              "object": data}, None)])

embeddings.save("index")

showdb("index/documents")

In [None]:
embeddings.search("select * from authors where similar('neuro imaging')")

[{'id': '1004',
  'text': '13. FoxKCR, Nijeboer S,Dixon ML,Floman JL,Ellamil M,Rumak SP,etal.Ismeditation associated with\naltered brain structure? Asystematic review andmeta-analy sisofmorpho metric neuroimaging inmedi-\ntation practitio ners. Neuroscienc e&Biobehavior alReviews. 2014; 43:48±7 3.doi:10.1016/ j.neubiorev.\n2014.03. 016PMID: 24705269',
  'score': 0.6630920171737671},
 {'id': '511',
  'text': 'the right prefrontal cortex in self-evaluation of the face. A functional magneticresonance imaging study. J Cogn Neurosci 2008;20:342–55.\nNelson TO. Consciousness and metacognition. Am Psychol 1996;51:102–16.\nNielsen L, Kaszniak AW. Awareness of subtle emotional feelings: a comparison of',
  'score': 0.6353673338890076},
 {'id': '1029',
  'text': '46. Viola FC,Thorne J,Edmond sB,Schneid erT,Eichele T,Debener S.Semi-a utomatic identificat ionof\nindependent compone ntsrepresenting EEG artifact. Clinical Neurophysio logy. 2009; 120(5):868 ±877.\ndoi:10.1016 /j.clinph.2009. 01.015 P

In [None]:
from txtai.pipeline import Extractor

# Create extractor instance
extractor = Extractor(embeddings, "google/flan-t5-base")

In [19]:
def prompt(question):
  return f"""Answer the following question using only the context below. Say 'no answer' when the question can't be answered.
Question: {question}
Context: """

def search(query, question=None):
  # Default question to query if empty
  if not question:
    question = query

  return extractor([("answer", query, prompt(question), False)])[0][1]

question = "How many studies has lutz written?"
answer = search(question)
print(question, answer)

nquestion = "Explain to me meditation?"
print(nquestion, search(f"{question} {answer}. {nquestion}"))

How many studies has lutz written? 30
Explain to me meditation? no answer


In [20]:
search("What are the studies done by Lutz?")

'a number of reports implicating slow oscillations with the meditative state(Lutz et al., 2007 ). However, and as also noted by the authors, these results could be biased by two fundamental factors: (1) fast oscillations were not investigated and (2)the studies were based on FA meditation, which could al., 2013; Hinterberger et al., 2014; Lehmann et al., 2001; for a review see: Cahn & Polich, 2006; Fell et al., 2010; Lutz et al., 2004). However, high-frequency gamma was not investigated in these studies and, therefore, it remains unclear whether modulations in high-frequency gamma activity'

In [21]:
search("What is meaning of life?")

'no answer'

In [22]:
search("What is meaning of meditation?")

'cultivates a state ofbeing'

In [25]:
search("Tell me anything interesting")

'no answer'

In [23]:
embeddings.search("What is meaning of meditation?")

[{'id': '1106',
  'text': 'meditation  that are particularly  related  to mindfulness .',
  'score': 0.7294905185699463},
 {'id': '302',
  'text': 'rather cultivates a state ofbeing. Objectless meditation does so in such a way that, accord-ing to reports given after meditation, the intentional or object-oriented aspect of experience appears to dissipate in meditation.This dissipation of focus on a particular object is achieved byletting the very essence of the meditation that is practiced',
  'score': 0.7080163955688477},
 {'id': '1074',
  'text': 'thought  or object.  During  FA meditation,  attention  is directed  towards  a particular  object,  thought  \nor proce ss. These  categories  do not represent  mutually  exclusive  processes  but may be \ndifferentially  engaged  during  long-term meditation  or even  during  a single  meditation  session',
  'score': 0.6929957866668701}]