# Langchain

This notebook runs a semantic search over the contents of a text file.

In [1]:
# @title Search in files
# Colab form inputs consumed by the search code later in the notebook.
# Question: the natural-language query that is matched against the indexed text.
Question = "What is artificial intelligence?" # @param {type:"string"}
# number_of_citations: how many results are printed. The value is used as a
# slice bound, so the form field is restricted to integers.
number_of_citations = 10 # @param {type:"integer"}

# Download a test file

If you already have a .txt file, create the `user_path` folder and put the file inside it.

In [2]:
#@title Download a test file
#@markdown If you already have a txt file, create the user_path folder and put it inside

# Install the HTTP client and the PDF parser used by the helpers in this cell.
# NOTE(review): no versions are pinned here — consider pinning for reproducibility.
!pip install requests PyPDF2

# Clear the cell output so the pip log is not shown; this also removes any
# warnings or errors pip printed above.
from IPython.display import clear_output
clear_output()

import os
import requests
from PyPDF2 import PdfReader

def download_pdf(url, destination, timeout=60):
    """Download the resource at ``url`` and write it to ``destination``.

    Args:
        url: Address of the PDF to fetch.
        destination: Local path the response body is written to (binary mode).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Raises:
        RuntimeError: If the request fails, the server answers with an error
            status, or the file cannot be written. The underlying exception
            is chained as ``__cause__``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into exceptions so an error page is never
        # saved as if it were the PDF.
        response.raise_for_status()
        with open(destination, 'wb') as file:
            file.write(response.content)
        print("PDF downloaded successfully.")
    except Exception as e:
        raise RuntimeError(f"Error during PDF download: {e}") from e

def pdf_to_txt(pdf_path, txt_path):
    """Extract the text of every page of a PDF and save it as a UTF-8 text file.

    Args:
        pdf_path: Path of the PDF file to read.
        txt_path: Path of the text file to create (overwritten if present).

    Raises:
        RuntimeError: If reading the PDF, extracting text or writing the
            output fails. The underlying exception is chained as ``__cause__``.
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PdfReader(file)
            # Build the text with a single join instead of growing a string
            # page by page; `or ""` keeps a page that yields no text from
            # breaking the concatenation.
            text = "".join(page.extract_text() or "" for page in reader.pages)
        with open(txt_path, 'w', encoding='utf-8') as txt_file:
            txt_file.write(text)
        print("PDF to TXT conversion completed successfully.")
    except Exception as e:
        raise RuntimeError(f"Error during PDF to TXT conversion: {e}") from e

def remove_pdf(pdf_path):
    """Delete the file at ``pdf_path``; a missing file is silently ignored.

    Args:
        pdf_path: Path of the file to delete.

    Raises:
        RuntimeError: If the path exists but cannot be removed (for example it
            is a directory or permissions are insufficient). The underlying
            exception is chained as ``__cause__``.
    """
    try:
        # Remove directly instead of checking for existence first, so the file
        # cannot disappear between the check and the removal.
        os.remove(pdf_path)
    except FileNotFoundError:
        # Nothing to delete; keep the original "do nothing" behaviour.
        return
    except Exception as e:
        raise RuntimeError(f"Error removing the PDF file: {e}") from e
    print("PDF file removed successfully.")

def main(url="https://www.europarl.europa.eu/RegData/etudes/STUD/2020/641547/EPRS_STU(2020)641547_EN.pdf",
         user_path="user_path"):
    """Download a PDF, convert it to ``document.txt`` and delete the PDF.

    Args:
        url: Address of the PDF to download. Defaults to the test document.
        user_path: Folder that receives ``document.txt``; created if missing.

    Errors from any step are reported with a printed message rather than
    raised, so the cell always finishes.
    """
    os.makedirs(user_path, exist_ok=True)

    pdf_path = os.path.join(user_path, "document.pdf")
    txt_path = os.path.join(user_path, "document.txt")

    try:
        try:
            download_pdf(url, pdf_path)
            pdf_to_txt(pdf_path, txt_path)
        finally:
            # Always drop the intermediate PDF, even when the conversion
            # fails, so no stray non-text file is left in user_path.
            remove_pdf(pdf_path)
        print("Operation completed successfully.")
    except Exception as error:
        print(f"Error: {error}")

if __name__ == "__main__":
    main()


PDF downloaded successfully.
PDF to TXT conversion completed successfully.
PDF file removed successfully.
Operation completed successfully.


In [3]:
# @title Install dependencies

# Install Python 3.10 from the deadsnakes PPA and bootstrap pip for it; the
# output of these commands is discarded.
# NOTE(review): the original note said this interpreter is meant to be used
# within pipenv, but the `pip install` lines below call the default `pip`, not
# python3.10 — confirm whether this Python 3.10 setup is still needed.
!sudo add-apt-repository ppa:deadsnakes/ppa -y > /dev/null
!sudo apt install python3.10 python3.10-distutils psmisc -y > /dev/null
!curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 > /dev/null

# Libraries for the search code, each pinned to a version or version range.
!pip install langchain==0.0.354
!pip install 'sentence_transformers>=2.2.2,<2.3.0'
!pip install chromadb==0.4.23
!pip install 'instructorembedding>=1.0.1'

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


Collecting langchain==0.0.354
  Downloading langchain-0.0.354-py3-none-any.whl.metadata (13 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain==0.0.354)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain==0.0.354)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-community<0.1,>=0.0.8 (from langchain==0.0.354)
  Downloading langchain_community-0.0.38-py3-none-any.whl.metadata (8.7 kB)
Collecting langchain-core<0.2,>=0.1.5 (from langchain==0.0.354)
  Downloading langchain_core-0.1.52-py3-none-any.whl.metadata (5.9 kB)
Collecting langsmith<0.1.0,>=0.0.77 (from langchain==0.0.354)
  Downloading langsmith-0.0.92-py3-none-any.whl.metadata (9.9 kB)
Collecting tenacity<9.0.0,>=8.1.0 (from la

In [4]:
# Settings for the search code below.
embedding_model = "hkunlp/instructor-large"  # model name passed to the embedding class
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs (more slowly) on runtimes without CUDA.
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
db_dir = "db_dir_UserData"  # directory holding the persisted vector database

# Wall-clock start; the elapsed time is printed after the search.
import time
start_time = time.time()

from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

def _chunk_sources(sources, chunk=True, chunk_size=512, language=None, db_type=None):
    """Split documents into chunks and return the originals followed by the chunks.

    Args:
        sources: Iterable of documents to split; a generator is accepted.
        chunk: Accepted for interface compatibility; not used by this
            implementation (the documents are always split).
        chunk_size: Maximum chunk size handed to the text splitter.
        language: Accepted for interface compatibility; not used.
        db_type: Must not be ``None``; it is only checked, not otherwise used.

    Returns:
        list: The original documents, each tagged with ``chunk_id=-1`` in its
        metadata, followed by the chunks tagged ``0..n-1`` in split order.

    Raises:
        AssertionError: If ``db_type`` is ``None``.
    """
    assert db_type is not None
    # Materialise first: a generator would be exhausted by the splitter, and
    # the originals could then neither be tagged nor returned.
    sources = list(sources)
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0, keep_separator=False, separators=["\n\n", "\n", " ", ""])
    source_chunks = splitter.split_documents(sources)

    # Chunks are in order now, but will not be when pulled back from the
    # database, so record each chunk's position.
    for chunk_id, piece in enumerate(source_chunks):
        piece.metadata.update(dict(chunk_id=chunk_id))

    # Mark the original content with chunk_id=-1. This is done after splitting
    # and assumes (as the original author noted) that the splitter creates new
    # documents with copied metadata, so the chunk ids above are not overwritten.
    for original in sources:
        original.metadata.update(dict(chunk_id=-1))

    return sources + source_chunks

def split_list(input_list, split_size):
    """Lazily yield consecutive slices of ``input_list`` of at most ``split_size`` items.

    The final slice may be shorter when the length is not a multiple of
    ``split_size``.
    """
    starts = range(0, len(input_list), split_size)
    yield from (input_list[begin:begin + split_size] for begin in starts)

import glob
from langchain.document_loaders import TextLoader

# Pick the file to index from user_path. Sorting makes the choice
# deterministic when several files are present, and an empty folder produces
# an explicit error instead of a bare IndexError.
user_files = sorted(glob.glob("user_path/*"))
if not user_files:
    raise FileNotFoundError(
        "No file found in 'user_path'; download the test file or put a txt file there first."
    )
docs1 = TextLoader(user_files[0], encoding="utf8", autodetect_encoding=True).load()
# Originals plus 512-sized chunks, each tagged with a chunk_id in its metadata.
sources = _chunk_sources(docs1, chunk=True, chunk_size=512, db_type="chroma")


# Build the embedding function: the configured model is placed on `device`
# and asked to normalise the embeddings it produces.
model_kwargs = {'device': device}
encode_kwargs = {'normalize_embeddings': True}
embedding = HuggingFaceInstructEmbeddings(
    model_name=embedding_model,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

import os

# Reuse the persisted "UserData" collection when db_dir already exists;
# otherwise create the directory and build the collection from `sources`.
if os.path.exists(db_dir):
    print("Open DB")
    import chromadb
    db = Chroma(
        persist_directory=db_dir,
        embedding_function=embedding,
        collection_name="UserData",
        client=chromadb.PersistentClient(path=db_dir),
    )
else:
    os.makedirs(db_dir)
    print("Create DB")
    # Documents are inserted in slices of at most max_batch_size.
    # NOTE(review): 41666 presumably mirrors the vector store's maximum batch
    # size — confirm.
    max_batch_size = 41666
    for sources_batch in split_list(sources, max_batch_size):
        db = Chroma.from_documents(
            documents=sources_batch,
            embedding=embedding,
            persist_directory=db_dir,
            collection_name='UserData',
        )
        db.persist()

# Query the store for up to 1000 matches; the filter keeps only entries whose
# chunk_id is >= 0, which excludes the whole-document entries tagged -1.
docs = db.similarity_search_with_score(
    Question,
    k=1000,
    filter={'chunk_id': {'$gte': 0}},
)

# Each hit is a (document, score) pair; show the first number_of_citations.
top_hits = docs[:number_of_citations]
for hit_document, hit_score in top_hits:
    print(hit_score)
    print(hit_document.page_content)
    print("*******")

elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

  from tqdm.autonotebook import trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.41k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer


  model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu')))


max_seq_length  512
Create DB




0.17794254422187805
Executive summary ............................................................................................................ III 
Glossary  ............................................................................................................................ VI 
1. What is artificial intelligence?  ....................................................................................... 1
*******
0.20304878056049347
AI is an umbrella term  including  a wide range of technologies and applications  that have little more 
in common than their apparent  intelligence , a quality which remains very much open to 
interpretation.  Further, we regularly talk about AI that is already in widespread use alongside AI that 
is under development, and even AI that is specula ted to possibly exist in the future. Consequently, 
the term ' AI' is regularly used to refer to any technique , used in any context – real or imagined – as
*******
0.2060544490814209
9 H. Collins, Artifi