In [None]:
!pip install pdfplumber
!pip install pinecone-client
!pip install langchain
!pip install -U langchain-community
!pip install pypdf
!pip install jq

Collecting jq
  Downloading jq-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Downloading jq-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (737 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m737.4/737.4 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jq
Successfully installed jq-1.8.0


In [None]:
import re
import pdfplumber
import pinecone
from langchain.document_loaders import PyPDFLoader, TextLoader, JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pypdf

Go to Pinecone, sign up.

Under Database->Indexes, create an index with Dimensions: 1024 and Metric: cosine. You can do this by selecting multilingual-e5-large which has the right settings for this example implementation.

You should have a default API Key too, or you can create a new one.

Put the details here:

In [None]:
# Pinecone credentials used by the cells below.
# Replace the placeholder with your own key; do not commit or share a real API key.
pinecone_api_key = "<your-pinecone-api-key>"

# Name and host of the Pinecone index the vectors are written to.
index_name = "<your-index-name>"
index_host = "<your-index-host>"

In [None]:
# Initialize the Pinecone client (also used for the embedding calls)
pc = pinecone.Pinecone(api_key=pinecone_api_key)

# Get the index handle from the client so it reuses the client's API key and
# configuration instead of passing the key a second time.
index = pc.Index(name=index_name, host=index_host)

# Namespace inside the index that this notebook deletes from and upserts into
namespace = "<choose-a-name>"

In [None]:
# Experiment with the chunk size and the chunk overlap of your data.
# NOTE: process_texts builds RecursiveCharacterTextSplitter without a
# length_function, so these values are measured with the library default
# (character count via len), not tokens -- confirm against your langchain version.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 20

def preprocess_text(text):
    """Return ``text`` with every run of whitespace collapsed to one space.

    Spaces, tabs and newlines are all treated as whitespace. Leading and
    trailing runs are collapsed to a single space as well; they are not stripped.
    """
    return re.sub(r'\s+', ' ', text)

def load_pdf(file_path):
    """Load the PDF at ``file_path`` with ``PyPDFLoader`` and return what the loader yields."""
    return PyPDFLoader(file_path).load()

def load_txt(file_path):
    """Load the text file at ``file_path`` with ``TextLoader`` and return what the loader yields."""
    return TextLoader(file_path).load()

def load_json(file_path, json_schema):
    """Load the JSON file at ``file_path`` with ``JSONLoader``.

    ``json_schema`` is passed to the loader as its ``jq_schema``, i.e. the jq
    expression that selects the content inside the file.
    """
    return JSONLoader(file_path, jq_schema=json_schema).load()

def process_texts(raw_texts):
    """Split loaded documents into chunks and return them as plain dicts.

    The documents are split with ``RecursiveCharacterTextSplitter`` using
    ``CHUNK_SIZE`` and ``CHUNK_OVERLAP``. Each returned dict holds the chunk
    text under 'page_content' and the chunk's metadata under 'meta_data'.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    texts = []
    for chunk in splitter.split_documents(raw_texts):
        texts.append({'page_content': chunk.page_content, 'meta_data': chunk.metadata})
    return texts

def create_embeddings_pinecone(texts):
    """Embed every chunk via ``pc.inference.embed`` with ``multilingual-e5-large``.

    One embed request is sent per chunk, with that chunk's 'page_content' as
    the only input. The returned list therefore has one response per entry of
    ``texts``, in the same order.
    """
    return [
        pc.inference.embed(
            model="multilingual-e5-large",
            inputs=[chunk['page_content']],
            parameters={
                "input_type": "passage",
                "truncate": "END"
            }
        )
        for chunk in texts
    ]

In [None]:
# Mount Google Drive at /content/drive so the source files can be read (Colab only)
from google.colab import drive
drive.mount('/content/drive')

# Accumulates the documents loaded by the PDF, text and JSON cells below
texts_raw = []

Your files must be in your Google Drive, in a directory called "RAG data".


In [None]:
# Process all pdf documents
# File names are resolved inside the "RAG data" folder of the mounted Drive.
pdf_files = ['example1.pdf',
             'example2.pdf'
             ]

# NOTE: this appends to texts_raw, so re-running the cell adds the same documents
# again; reset texts_raw first if you need to re-run it.
for pdf_file in pdf_files:
  texts_raw.extend(load_pdf("/content/drive/MyDrive/RAG data/"+pdf_file))

# Sanity check: container type, number of entries loaded so far, and the last entry
print(type(texts_raw))
print(len(texts_raw))
print(texts_raw[-1])

In [None]:
# Process all text documents
# File names are resolved inside the "RAG data" folder of the mounted Drive.
txt_files = ['example1.txt',
             'example2.txt'
             ]

# NOTE: this appends to texts_raw, so re-running the cell adds the same documents
# again; reset texts_raw first if you need to re-run it.
for txt_file in txt_files:
  texts_raw.extend(load_txt("/content/drive/MyDrive/RAG data/"+txt_file))

# Sanity check: container type, number of entries loaded so far, and the last entry
print(type(texts_raw))
print(len(texts_raw))
print(texts_raw[-1])

In [None]:
# Process all json documents. Each entry is [file name, jq_schema]; the jq_schema
# indicates where to find the content inside the file.
# NOTE: a jq path starts with '.', so iterating a top-level array is written '.[]'.
# A bare '[]' is an empty-array literal and fails when the schema is applied.
json_files = [['example1.json', '.[].content'],
              ['example2.json', '.[].text']
             ]

# Use load_json here: load_txt takes only a file path and would raise a TypeError
# when given the jq_schema as a second argument.
# NOTE: this appends to texts_raw, so re-running the cell adds the same documents again.
for json_file in json_files:
  texts_raw.extend(load_json("/content/drive/MyDrive/RAG data/"+json_file[0], json_file[1]))

# Sanity check: container type, number of entries loaded so far, and the last entry
print(type(texts_raw))
print(len(texts_raw))
print(texts_raw[-1])

In [None]:
# Split everything loaded so far into chunks (dicts with 'page_content' and 'meta_data')
texts = process_texts(texts_raw)
# Sanity check: number of chunks and the first chunk
print(len(texts))
print(texts[0])

In [None]:
# Create embeddings: one embed request (and one result) per chunk in texts
embeddings = create_embeddings_pinecone(texts)
len(embeddings) # should match the last len(texts)

In [None]:
# OPTIONAL and destructive: deletes every vector in `namespace`.
# Only run this if you want to clear out the namespace and add all embeddings from scratch.
# It will error if the namespace doesn't exist yet (e.g. on the first run) -- not a problem,
# just continue with the next cell.
index.delete(delete_all=True, namespace=namespace)

In [None]:
# Upsert the vectors and text to Pinecone
# Build one record per chunk: a sequential string id, the embedding values, and
# the chunk text stored under the 'text' metadata field.
vectors = []
for text, embedding in zip(texts, embeddings):
    if text is not None:
      vectors.append({
          "id": str(len(vectors)),
          # each entry of `embeddings` is the response for a single input, hence [0]
          "values": embedding[0]['values'],
          "metadata": {'text': text['page_content']}
      })

print(len(vectors)) # should match the len(embeddings above)

# Send the records in batches instead of one request per vector; this needs far
# fewer round trips. Lower the batch size if a request is rejected as too large.
UPSERT_BATCH_SIZE = 100

# This may still take a while for large collections. Wait for the 'Done!'
for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
  index.upsert(
      vectors=vectors[start:start + UPSERT_BATCH_SIZE],
      namespace=namespace
  )
print('Done!')

214
Done!


Now go to iostack.

In your Account, set up a new 'LLM & Integration Access Key' of type 'Pinecone RAG Access Key'. For the Key, enter the Pinecone API key you used above.

In your use case, set up a new Pinecone integration:

Pinecone Index Name: {index_name} (above)

Access Key: the one you just set up

Namespace: {namespace} (above)

Text Field Name: 'text'