In [78]:
!pip install langchain faiss-cpu pypdf GitPython openpyxl sentence-transformers transformers llama-cpp-python > /dev/null

### This notebook shows how multiple file types can be loaded into an open-source FAISS index

The embeddings are again produced by open-source sentence-transformers encoders.

**All Files are Converted to Text**

1) Simple text files, JSON and other plain-text formats

2) Structured TSV and CSV files

3) PDF File

4) Excel File 

5) Jupyter notebook Files

6) Code files

### Libraries Involved

langchain 

faiss-cpu 

pypdf 

GitPython 

openpyxl 

sentence-transformers

In [113]:
from langchain.embeddings import (
    LlamaCppEmbeddings, 
    HuggingFaceEmbeddings, 
    SentenceTransformerEmbeddings
)

from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.vectorstores import FAISS
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import (
    PyPDFLoader,
    DataFrameLoader,
    GitLoader
  )
import pandas as pd
import nbformat
from nbconvert import PythonExporter
import os

In [14]:
def get_text_splits(text_file, chunk_size=150, chunk_overlap=15, encoding='utf-8'):
  """Read a plain-text file and split its contents into chunks.

  Args:
    text_file: Path of the text file to read.
    chunk_size: Maximum chunk length, measured with ``len`` (default 150).
    chunk_overlap: Number of characters shared by neighbouring chunks
      (default 15).
    encoding: Text encoding used to read the file (default UTF-8), so the
      result does not depend on the platform's locale encoding.

  Returns:
    A list of strings, one per chunk, in document order.
  """
  with open(text_file, 'r', encoding=encoding) as txt:
    data = txt.read()

  textSplit = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                             chunk_overlap=chunk_overlap,
                                             length_function=len)
  return textSplit.split_text(data)

In [41]:
# Exploratory load of the sample PDF to inspect what load_and_split() returns.
loader = PyPDFLoader("/content/self-ask.pdf")
pages = loader.load_and_split()

In [46]:
# Number of Document objects returned for the PDF.
len(pages)

31

In [None]:
# Peek at the text of the first returned Document.
pages[0].page_content

In [47]:
def get_pdf_splits(pdf_file, chunk_size=150, chunk_overlap=15):
  """Load a PDF and split its text into chunks.

  The PDF is loaded with ``PyPDFLoader.load_and_split()``; the text of every
  returned page document is then split again with a
  ``RecursiveCharacterTextSplitter``. Only ``page_content`` is used, so page
  metadata is not carried into the result.

  Args:
    pdf_file: Path of the PDF file to load.
    chunk_size: Maximum chunk length, measured with ``len`` (default 150).
    chunk_overlap: Number of characters shared by neighbouring chunks
      (default 15).

  Returns:
    A flat list of strings: the chunks of every page, in page order.
  """
  loader = PyPDFLoader(pdf_file)
  pages = loader.load_and_split()

  textSplit = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                             chunk_overlap=chunk_overlap,
                                             length_function=len)

  # Split page by page and flatten the per-page chunk lists into one list.
  return [chunk
          for pg in pages
          for chunk in textSplit.split_text(pg.page_content)]

In [112]:
def get_excel_splits(excel_file,target_col,sheet_name):
  """Load one sheet of an Excel workbook as a list of documents.

  The sheet is read into a DataFrame with the openpyxl engine and handed to
  DataFrameLoader, with ``target_col`` as the page-content column.

  Args:
    excel_file: Path of the Excel workbook.
    target_col: Name of the column used as each document's page content.
    sheet_name: Sheet to read from the workbook.

  Returns:
    The list produced by ``DataFrameLoader.load()``.
  """
  sheet_frame = pd.read_excel(io=excel_file,
                              engine='openpyxl',
                              sheet_name=sheet_name)

  return DataFrameLoader(sheet_frame,
                         page_content_column=target_col).load()

In [65]:
def get_csv_splits(csv_file):
  """Load a CSV file with CSVLoader and return the loaded documents.

  No extra text splitting is applied; the list returned by
  ``CSVLoader.load()`` is passed back unchanged.
  """
  return CSVLoader(csv_file).load()

In [107]:
def get_ipynb_splits(notebook, chunk_size=150, chunk_overlap=15):
  """Convert a Jupyter notebook to a Python script and split it into chunks.

  The notebook file is parsed with nbformat, exported to script text with
  nbconvert's PythonExporter, and that text is split directly.

  Args:
    notebook: Path of the .ipynb file.
    chunk_size: Maximum chunk length, measured with ``len`` (default 150).
    chunk_overlap: Number of characters shared by neighbouring chunks
      (default 15).

  Returns:
    A list of strings, one per chunk of the exported script.
  """
  # Notebook files are UTF-8 JSON; do not depend on the locale's default encoding.
  with open(notebook, encoding='utf-8') as fh:
    nb = nbformat.reads(fh.read(), nbformat.NO_CONVERT)

  # The exporter returns (script_text, resources); only the script text is used.
  source, _ = PythonExporter().from_notebook_node(nb)

  textSplit = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                             chunk_overlap=chunk_overlap,
                                             length_function=len)
  return textSplit.split_text(source)

In [82]:
def get_git_files(repo_link, folder_path, file_ext, chunk_size=150, chunk_overlap=15):
  """Load files from a git repository and split their contents into chunks.

  Args:
    repo_link: Repository URL, passed to GitLoader as ``clone_url``.
    folder_path: Passed to GitLoader as ``repo_path``.
    file_ext: Suffix a file path must end with to be loaded, e.g. ".py".
      It is given to ``str.endswith``, so a tuple of suffixes also works.
    chunk_size: Maximum chunk length, measured with ``len`` (default 150).
    chunk_overlap: Number of characters shared by neighbouring chunks
      (default 15).

  Returns:
    A flat list of strings: the chunks of every loaded file, in load order.
  """
  git_loader = GitLoader(clone_url=repo_link,
    repo_path=folder_path,
    file_filter=lambda file_path: file_path.endswith(file_ext))
  # Each matching file is loaded as one individual document.
  git_docs = git_loader.load()

  textSplit = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                             chunk_overlap=chunk_overlap,
                                             length_function=len)

  # Split file by file and flatten the per-file chunk lists into one list.
  return [chunk
          for code in git_docs
          for chunk in textSplit.split_text(code.page_content)]
  

In [84]:
def embed_index(doc_list, embed_fn, index_store):
  """Embed ``doc_list`` and store it in the FAISS index at ``index_store``.

  If ``index_store`` already exists on disk, the saved index is loaded, the
  newly embedded items are merged into it and the result is saved back.
  Otherwise a new index is saved at that path.

  Args:
    doc_list: Either plain strings or document objects to embed.
    embed_fn: Embedding function, initialized on the desired model.
    index_store: Folder path of the FAISS index on disk.

  Raises:
    ValueError: If ``doc_list`` is empty.
  """
  doc_list = list(doc_list)
  if not doc_list:
    raise ValueError("doc_list is empty; nothing to embed")

  # Choose the constructor from the element type instead of catching every
  # exception from from_documents: a broad except would hide real failures
  # (e.g. from the embedding model) and retry with the wrong constructor.
  if all(isinstance(doc, str) for doc in doc_list):
    faiss_db = FAISS.from_texts(doc_list,
                              embed_fn)
  else:
    faiss_db = FAISS.from_documents(doc_list,
                              embed_fn)

  if os.path.exists(index_store):
    # NOTE: FAISS.load_local unpickles the stored docstore, so only point
    # index_store at index folders you created or otherwise trust.
    local_db = FAISS.load_local(index_store,embed_fn)
    #merging the new embedding with the existing index store
    local_db.merge_from(faiss_db)
    print("Merge completed")
    local_db.save_local(index_store)
    print("Updated index saved")
  else:
    faiss_db.save_local(folder_path=index_store)
    print("New store created...")


In [50]:
def get_docs_length(index_path, embed_fn):
  """Return the number of documents held by the FAISS index at ``index_path``.

  The index is loaded from disk with ``embed_fn`` and the entries of its
  docstore's internal ``_dict`` mapping are counted.
  """
  store = FAISS.load_local(index_path, embeddings=embed_fn)
  return len(store.docstore._dict)

In [29]:
# Delete the /content/mail_index directory if it is present.
!rm -fR /content/mail_index

In [None]:
# Embedding function passed to the embed_index / get_docs_length / FAISS.load_local
# calls below: the open-source all-MiniLM-L6-v2 model.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [15]:
# Split the mail text file into chunks.
mail_docs = get_text_splits("/content/mail_collector.txt")

In [27]:
# Number of chunks produced from the mail text file.
len(mail_docs)

50

In [126]:
# Inspect the first chunk.
mail_docs[0]

'Space via IFTTT <action@ifttt.com>\nAstronomy Picture of the Day:'

In [127]:
# Embed the mail chunks and store them in 'new_index'
# (embed_index creates the store when the folder does not exist yet).
embed_index(doc_list=mail_docs,
            embed_fn=embeddings,
            index_store='new_index')

New store created...


In [135]:
# Count the documents currently stored in 'new_index'.
get_docs_length(index_path='new_index',embed_fn=embeddings)

72

In [129]:
# Reload the saved index from disk so it can be queried.
test_idex = FAISS.load_local("new_index",embeddings)

In [132]:
# Retrieve the stored chunks most similar to the query phrase.
test_idex.similarity_search("Stellar Nursery in Perseus")

[Document(page_content='Space via IFTTT <action@ifttt.com>\nAstronomy Picture of the Day: NGC 1333: Stellar Nursery in Perseus', metadata={}),
 Document(page_content='by bluish hues characteristic of starlight reflected by interstellar dust. A mere 1,000 light-years distant toward the heroic constellation Perseus,', metadata={}),
 Document(page_content='2023-04-22 09:43:17+05:30NGC 1333: Stellar Nursery in PerseusIn visible light NGC 1333 is seen as a reflection nebula, dominated by bluish hues', metadata={}),
 Document(page_content="be similar to one in which our own Sun formed over 4.5 billion years ago. Hubble's stunning image of the stellar nursery was released to celebrate the", metadata={})]

In [133]:
# Split a second text file and add its chunks to 'new_index'.
vim_docs = get_text_splits("/content/vim_play.txt")
embed_index(doc_list=vim_docs,
            embed_fn=embeddings,
            index_store='new_index')

Merge completed
Updated index saved


In [134]:
# Number of chunks produced from the second text file.
len(vim_docs)

22

In [136]:
# Split the PDF into text chunks and add them to 'new_index'.
pdf_docs = get_pdf_splits("/content/self-ask.pdf")

embed_index(doc_list=pdf_docs,
            embed_fn=embeddings,
            index_store='new_index')

Merge completed
Updated index saved


In [137]:
# Number of chunks produced from the PDF.
len(pdf_docs)

760

In [138]:
# Count the documents in 'new_index' after adding the PDF chunks.
get_docs_length(index_path="new_index",
                embed_fn=embeddings)

832

In [139]:
# Load the CSV as documents (no extra splitting) and add them to 'new_index'.
csv_docs = get_csv_splits("/content/space_shortened.csv")

embed_index(doc_list=csv_docs,
            embed_fn=embeddings,
            index_store='new_index')

Merge completed
Updated index saved


In [142]:
# Count the documents in 'new_index' after adding the CSV documents.
get_docs_length(index_path="new_index",
                embed_fn=embeddings)

848

In [None]:
# Load the repository's .py files and split them into chunks.
# NOTE(review): folder_path is handed to GitLoader as repo_path; the value looks
# like a sub-folder of the repository rather than a local clone location — confirm.
git_docs = get_git_files(repo_link="https://github.com/insightbuilder/python_de_learners_data",
                         folder_path="/code_script_notebooks/python_scripts/",
                         file_ext=".py")

In [88]:
# Number of chunks produced from the repository files.
len(git_docs)

331

In [146]:
# Print the first code chunk so its line breaks are rendered.
print(git_docs[0])

#!/usr/bin/env python

def add_num(a, b):
    return a + b

def mul_num(a, b):
    return a * b


In [143]:
# Add the code chunks to 'new_index'.
embed_index(doc_list=git_docs,
            embed_fn=embeddings,
            index_store='new_index')

Merge completed
Updated index saved


In [144]:
# Count the documents in 'new_index' after adding the code chunks.
get_docs_length(index_path="new_index",
                embed_fn=embeddings)

1179

In [109]:
# Export this notebook to script text and split it into chunks.
pynb_docs = get_ipynb_splits("/content/multiFileEmbedFaiss.ipynb")

In [110]:
# Add the notebook chunks to the 'mail_index' store.
# NOTE(review): the text, PDF, CSV and git cells write to 'new_index' instead —
# confirm which store is intended here.
embed_index(doc_list=pynb_docs,
            embed_fn=embeddings,
            index_store='mail_index')

Merge completed
Updated index saved


In [147]:
# Count the documents in 'new_index'.
# NOTE(review): the notebook chunks in the preceding cell were written to
# 'mail_index', so this count does not include them — confirm the intended store.
get_docs_length(index_path="new_index",
                embed_fn=embeddings)

1179

In [114]:
# Load the 'Database' sheet as documents, using the 'Customers' column as page content.
excel_docs = get_excel_splits(excel_file='/content/data_in_excel.xlsx',
                 target_col='Customers',
                 sheet_name='Database')

In [115]:
# Inspect the first document built from the sheet.
excel_docs[0]

Document(page_content='Bernard Weatherly', metadata={'Id': 1, 'Date': Timestamp('2019-01-01 00:00:00'), 'Gender': 'M', 'Sales Reps': 'timika poe', 'Drinks/Products': 'Heineken Bottle', 'Categories': ' Alcoholic Wine', 'Cost': 38250, 'Sals Price': 42500, 'Qty': 3})

In [116]:
# Add the Excel documents to the 'mail_index' store.
# NOTE(review): most earlier cells write to 'new_index' — confirm which store is intended.
embed_index(doc_list=excel_docs,
            embed_fn=embeddings,
            index_store='mail_index')

Merge completed
Updated index saved


In [117]:
# Count the documents stored in 'mail_index'.
get_docs_length(index_path='mail_index',
                embed_fn=embeddings)

1925

In [120]:
# Reload 'mail_index' from disk so it can be queried.
test_index = FAISS.load_local(folder_path='mail_index', 
                              embeddings=embeddings)

In [148]:
# Retrieve the stored documents most similar to the query.
test_index.similarity_search("Bernard")

[Document(page_content='Bernard Weatherly', metadata={'Id': 1, 'Date': Timestamp('2019-01-01 00:00:00'), 'Gender': 'M', 'Sales Reps': 'timika poe', 'Drinks/Products': 'Heineken Bottle', 'Categories': ' Alcoholic Wine', 'Cost': 38250, 'Sals Price': 42500, 'Qty': 3}),
 Document(page_content='Bernard Weatherly', metadata={'Id': 211, 'Date': Timestamp('2019-07-30 00:00:00'), 'Gender': 'M', 'Sales Reps': 'hyman irish', 'Drinks/Products': 'Hennessy V.S 70 Cl', 'Categories': ' Alcoholic Wine', 'Cost': 44550, 'Sals Price': 49500, 'Qty': 15}),
 Document(page_content='Bernard Weatherly', metadata={'Id': 467, 'Date': Timestamp('2020-04-11 00:00:00'), 'Gender': 'M', 'Sales Reps': 'zona otis', 'Drinks/Products': '5 Alive 1Ltr', 'Categories': 'Non Alcoholic ', 'Cost': 88200, 'Sals Price': 98000, 'Qty': 1}),
 Document(page_content='Julienne Merkel', metadata={'Id': 14, 'Date': Timestamp('2019-01-14 00:00:00'), 'Gender': 'F', 'Sales Reps': 'quentin kunz', 'Drinks/Products': 'Martell Vs Cognac 70Cl', '

In [149]:
# Same query, returning (document, relevance score) pairs.
test_index.similarity_search_with_relevance_scores("Bernard")

[(Document(page_content='Bernard Weatherly', metadata={'Id': 1, 'Date': Timestamp('2019-01-01 00:00:00'), 'Gender': 'M', 'Sales Reps': 'timika poe', 'Drinks/Products': 'Heineken Bottle', 'Categories': ' Alcoholic Wine', 'Cost': 38250, 'Sals Price': 42500, 'Qty': 3}),
  0.5775781531777433),
 (Document(page_content='Bernard Weatherly', metadata={'Id': 211, 'Date': Timestamp('2019-07-30 00:00:00'), 'Gender': 'M', 'Sales Reps': 'hyman irish', 'Drinks/Products': 'Hennessy V.S 70 Cl', 'Categories': ' Alcoholic Wine', 'Cost': 44550, 'Sals Price': 49500, 'Qty': 15}),
  0.5775781531777433),
 (Document(page_content='Bernard Weatherly', metadata={'Id': 467, 'Date': Timestamp('2020-04-11 00:00:00'), 'Gender': 'M', 'Sales Reps': 'zona otis', 'Drinks/Products': '5 Alive 1Ltr', 'Categories': 'Non Alcoholic ', 'Cost': 88200, 'Sals Price': 98000, 'Qty': 1}),
  0.5775781531777433),
 (Document(page_content='Julienne Merkel', metadata={'Id': 14, 'Date': Timestamp('2019-01-14 00:00:00'), 'Gender': 'F', 'Sa