In [1]:
from langchain.vectorstores.faiss import FAISS
import pypdf
import pathlib
import re
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate, LLMChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)


from langchain import OpenAI
from langchain.chains.summarize import load_summarize_chain
from knowledge_gpt.embeddings import OpenAIEmbeddings
from datetime import datetime
from app import db, Article, app, OPENAI_KEY
from sqlalchemy.exc import IntegrityError


def parse_pdf(file):
    pdf = pypdf.PdfReader(file)
    output = []
    for page in pdf.pages:
        text = page.extract_text()
        # Merge hyphenated words
        text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
        # Fix newlines in the middle of sentences
        text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
        # Remove multiple newlines
        text = re.sub(r"\n\s*\n", "\n\n", text)

        output.append(text)

    return output

def text_to_docs(text, fname):
    """Converts a string or list of strings to a list of Documents
    with metadata."""
    if isinstance(text, str):
        # Take a single string as one page
        text = [text]
    page_docs = [Document(page_content=page) for page in text]

    # Add page numbers as metadata
    for i, doc in enumerate(page_docs):
        doc.metadata["page"] = i + 1

    # Split pages into chunks
    doc_chunks = []

    for doc in page_docs:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=800,
            separators=["\n\n", "\n", ".", "!", "?", ","],
            chunk_overlap=0,
        )
        chunks = text_splitter.split_text(doc.page_content)
        for i, chunk in enumerate(chunks):
            doc = Document(
                page_content=chunk, metadata={"page": doc.metadata["page"], "chunk": i}
            )
            # Add sources a metadata
            doc.metadata["source"] = f"{fname} {doc.metadata['page']}-{doc.metadata['chunk']}"
            doc.metadata["file"] = fname
            doc_chunks.append(doc)
    return doc_chunks


def process(path):
    with path.open("rb") as f:
        txts = parse_pdf(f)
        return text_to_docs(txts, path.stem)
                   

def summarize_article(doc):
    """ doc is the output from parse
    """
    llm = OpenAI(temperature=0)
    chain = load_summarize_chain(llm, chain_type="map_reduce")
    return chain.run(doc)




def create_index(paths, index_file):
    docs = [doc for doccol in [process(out) for out in paths] for doc in doccol]

    embeddings = OpenAIEmbeddings(
        openai_api_key=OPENAI_KEY,
    )  # type: ignore

    index = FAISS.from_documents(docs, embeddings)

    index.save_local(index_file)

    

def add_article_to_sqlite(path, summary):
    """ add to sqlite db
    """
    pdf = pypdf.PdfReader(path)
    meta = pdf.metadata    

    with app.app_context():
        # Create a new article object
        article = Article(
            path=str(path),
            summary=summary,
            author = meta.author,
            creator = meta.creator,
            producer = meta.producer,
            subject = meta.subject,
            title = meta.title,
        )
        try:
            db.session.add(article)
            db.session.commit()
        except IntegrityError:
            db.session.rollback()
            print(f"An article with this path already exists. ({first_doc})")

            
    

first_doc, *_ = paths = list(pathlib.Path("data").glob("*.pdf"))
index_file = "tjupyter-faiss"

embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_KEY,
)  # type: ignore


In [2]:
## create FAISS db
create_index(paths, index_file)

KeyboardInterrupt: 

In [3]:
# create sqlite db
from tqdm import tqdm

for path in tqdm(paths):

    with app.app_context():
        specific_article = Article.query.filter_by(path=str(path)).first()
        if specific_article:
            print(f"{path} already in db. Skipping...")
            continue
    

    doc = process(path)
    try:
        summary = summarize_article(doc)
    except:
        print(f"Summary for {path} failed! Skipping...")
        continue
    add_article_to_sqlite(path, summary)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 314.48it/s]

data/s41467-023-36502-3.pdf already in db. Skipping...
data/6637898.pdf already in db. Skipping...
data/Benzeneco-reactionwithmethanolanddimethyletheroverzeoliteandzeotypecatalystsEvidenceofparallelreactionpathstotolueneanddiphe.pdf already in db. Skipping...
data/InsitumonitoringofTiO2BanatasenanoparticleformationandapplicationinLi-ionandNa-ionbatteries.pdf already in db. Skipping...
data/InsitustudiesofNOreductionbyH2overPtusingsurfaceX-raydiffractionandtransmissionelectronmicroscopy.pdf already in db. Skipping...
data/LocationofCoandNipromoteratomsinmulti-layerMoS2nanocrystalsforhydrotreatingcatalysis.pdf already in db. Skipping...
data/molecules-27-08578-v2.pdf already in db. Skipping...
data/Morphology-inducedshapeselectivityinzeolitecatalysis.pdf already in db. Skipping...
data/nwac045.pdf already in db. Skipping...
data/Pandiangan_2019_J._Phys. _Conf._Ser._1338_012015.pdf already in db. Skipping...
data/Wang_NanoEnergy_2018.pdf already in db. Skipping...
Summary for data/00614.p




In [4]:
with app.app_context():
    articles = list(Article.query.all())

In [5]:
articles[0].__dict__

{'_sa_instance_state': <sqlalchemy.orm.state.InstanceState at 0x7faa75643940>,
 'path': 'data/s41467-023-36502-3.pdf',
 'author': 'Monica J. Mendoza-Castro',
 'producer': 'iText® 5.3.5 ©2000-2012 1T3XT BVBA (SPRINGER SBM; licensed version)',
 'title': 'Tunable hybrid zeolites prepared by partial interconversion',
 'creator': 'Springer',
 'summary': ' This article discusses the synthesis of hybrid zeolites, which are superior catalysts made of building units of different zeolite types. It covers the synthesis of zeolites via interzeolite transformations, the rearrangement of atomic configurations during the conversion of FAU zeolite to CHA zeolite, and the synthesis of a highly ordered mesoporous MSU-SBEA/zeolite Beta composite material. It also covers the use of cellulose nanocrystals as hard templates for preparing mesoporous zeolite Y assemblies.',
 'id': 1,
 'subject': 'Nature Communications, doi:10.1038/s41467-023-36502-3'}

In [6]:
titles = {
"6637898": "A Short Review on Synthesis, Characterization, and Applications of Zeolites",
"Pandiangan_2019_J._Phys. _Conf._Ser._1338_012015":"Characteristics and catalytic activity of zeolite-a synthesized from rice husk silica and aluminium metal by sol-gel method",
"FacileandbenignconversionofsucrosetofructoseusingzeoliteswithbalancedBrønstedandLewisacidity":"Facile and benign conversion of sucrose to fructose using zeolites with balanced Brønsted and Lewis acidity",
}

with app.app_context():
    articles = Article.query.all()
    


In [7]:
pdf = pypdf.PdfReader("data/FacileandbenignconversionofsucrosetofructoseusingzeoliteswithbalancedBrønstedandLewisacidity.pdf")

for page in pdf.pages:
    print(page.images)

[]
[]
[]
[]
[]
[]
[]


In [9]:

embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_KEY,
)  # type: ignore
index = FAISS.load_local(index_file, embeddings = embeddings)

query = "what are zeolies?"

docs = index.similarity_search_with_relevance_scores(query, k=15)


In [7]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.llms import OpenAI
from knowledge_gpt.prompts import STUFF_PROMPT
from langchain.vectorstores.faiss import FAISS
from knowledge_gpt.embeddings import OpenAIEmbeddings

def get_answer(docs, query):
    """Gets an answer to a question from a list of Documents."""
    # Get the answer
    
    if len(docs) == 0:
        return {"output_text":"I found no relevant articles for that question...\nSOURCES:"}

    chain = load_qa_with_sources_chain(
        OpenAI(
            temperature=0, openai_api_key=OPENAI_KEY,
        ),  # type: ignore
        chain_type="stuff",
        prompt=STUFF_PROMPT,
    )

    # Cohere doesn't work very well as of now.
    # chain = load_qa_with_sources_chain(
    #     Cohere(temperature=0), chain_type="stuff", prompt=STUFF_PROMPT  # type: ignore
    # )
    answer = chain(
        {"input_documents": docs, "question": query}, return_only_outputs=True
    )
    return answer


embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_KEY,
)  # type: ignore

index = FAISS.load_local(index_file, embeddings = embeddings)


query = "what are zeolites?"

docs = index.similarity_search_with_relevance_scores(query, k=15)

LOWER_RELEVANCE_BOUND = 0.72
docs = [doc for doc, relevance in docs if relevance > LOWER_RELEVANCE_BOUND]

answer = get_answer(docs, query)

answer_txt, sources_txt = answer["output_text"].split("SOURCES")



KeyboardInterrupt: 

In [30]:
sources_txt

': molecules-27-08578-v2 2-4, 6637898 1-2, molecules-27-08578-v2 11-2'

In [39]:
partial_paths = list({doc.metadata["file"] for doc in docs})
partial_paths



['Pandiangan_2019_J._Phys. _Conf._Ser._1338_012015',
 'Cage-basedsmall-porecatalystsforNH3-SCRpreparedbycombiningbulkyorganicstructuredirectingagentswithmodifiedzeolitesasreagents',
 'molecules-27-08578-v2',
 '6637898']

In [41]:
from sqlalchemy import or_

with app.app_context():
    matching_articles = Article.query.filter(or_(Article.path.like('%' + partial_path + '%') for partial_path in partial_paths)).all()

matching_articles        

[Article('None', 'data/6637898.pdf'),
 Article('A Comprehensive Review on Zeolite Chemistry for Catalytic Conversion of Biomass/Waste into Green Fuels', 'data/molecules-27-08578-v2.pdf'),
 Article('None', 'data/Pandiangan_2019_J._Phys. _Conf._Ser._1338_012015.pdf'),
 Article('Cage-based small-pore catalysts for NH3-SCR prepared by combining bulky organic structure directing agents with modified zeolites as reagents', 'data/Cage-basedsmall-porecatalystsforNH3-SCRpreparedbycombiningbulkyorganicstructuredirectingagentswithmodifiedzeolitesasreagents.pdf')]

In [4]:
formatted_chunks = """\n\n""".join([f"chunk #{i}:\n{doc.page_content}" for i, doc in enumerate(docs)])

print(formatted_chunks)




chunk #0:
Introduction

chunk #1:
35This set-up allows ultrahigh vacuum (UHV) sample preparation followed by in situ high-pressure, high-temperature gas exposure, during SXRD or grazing-incidence small-angleX-ray scattering (GISAXS) experiments with simultaneous reactivitymeasurements viagas product analysis with a quadrupole mass spectrometer. The Pt(110) sample, which was spark cut and polished to 0.1 1from the (110) plane (Surface Preparation Laboratory), was cleaned by repeated cycles of argon ion bombardment and annealing in UHV, resulting in a (1 /C23) missing-row 36 reconstructed surface, exposing narrow (111) facets. The (1 /C23) structure could be stabilized by carbon segregated from thebulk, 37which was not removed completely during cleaning cycles as the sample was not annealed in oxygen

chunk #2:
Downloaded from https://academic.oup.com/nsr/article/9/9/nwac045/6545815 by guest on 22 April 2023

chunk #3:
Bands in this region have been linked to both a Fermi resonance pheno

In [8]:



chat = ChatOpenAI(
    temperature=0,
    openai_api_key=OPENAI_KEY,
)

messages = [
    SystemMessage(content="""You are a helpful assistant who helps sort document
chunks and decide if they are relevant for a specific query. Your output is a
relevance score from 0-10 where 0 means not relevant at all and 

Here are some examples of how you work:

QUERY:
how large is an elephant

chunk #0:
Introduction

chunk #1:
35This set-up allows ultrahigh vacuum (UHV) sample preparation followed by in situ high-pressure,
high-temperature gas exposure, during SXRD or grazing-incidence small-angleX-ray scattering (GISAXS)
experiments with simultaneous reactivitymeasurements viagas product analysis with a quadrupole mass
spectrometer. The Pt(110) sample, which was spark cut and polished to 0.1 1from the (110) plane
(Surface Preparation Laboratory), was cleaned by repeated cycles of argon ion bombardment and
annealing in UHV, resulting in a (1 /C23) missing-row 36 reconstructed surface, exposing narrow
(111) facets. The (1 /C23) structure could be stabilized by carbon segregated from thebulk, 37which
was not removed completely during cleaning cycles as the sample was not annealed in oxygen

chunk #2:
Downloaded from https://academic.oup.com/nsr/article/9/9/nwac045/6545815 by guest on 22 April 2023

OUTPUT:

    
    """),
    HumanMessage(content="I love programming.")
]

chat(messages)


AIMessage(content="I'm sorry, but that is not a document snippet. Can you please provide a document snippet for me to evaluate?", additional_kwargs={}, example=False)

In [7]:
!pip list | grep langchain

langchain                0.0.181
