In [14]:
!pip install langchain_community
!pip install unstructured
!pip install sentence_transformers
!pip install faiss-cpu
!pip install huggingface_hub
!pip install tqdm

!pip install --upgrade nltk



In [15]:
import os
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

from tqdm.autonotebook import tqdm, trange

In [5]:
def load_documents(data_path):
    loader = DirectoryLoader(data_path, glob="*.md")
    documents = loader.load()
    return documents

def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    document = chunks[10]
    print(document.page_content)
    print(document.metadata)

    return chunks

In [6]:
!unzip sagemaker_documentation.zip

Archive:  sagemaker_documentation.zip
   creating: sagemaker_documentation/
  inflating: sagemaker_documentation/aws-properties-sagemaker-dataqualityjobdefinition-clusterconfig.md  
  inflating: sagemaker_documentation/aws-properties-sagemaker-modelpackage-modelquality.md  
  inflating: sagemaker_documentation/aws-properties-sagemaker-modelpackage-modelpackagestatusitem.md  
  inflating: sagemaker_documentation/aws-properties-sagemaker-inferenceexperiment-shadowmodeconfig.md  
  inflating: sagemaker_documentation/aws-properties-sagemaker-modelpackage-modeldataquality.md  
  inflating: sagemaker_documentation/aws-properties-sagemaker-monitoringschedule-monitoringscheduleconfig.md  
  inflating: sagemaker_documentation/aws-properties-sagemaker-monitoringschedule-json.md  
  inflating: sagemaker_documentation/aws-properties-sagemaker-endpointconfig-captureoption.md  
  inflating: sagemaker_documentation/aws-properties-sagemaker-model-containerdefinition-imageconfig-repositoryauthconfig.md

In [9]:
data_path = 'sagemaker_documentation/'

docs_before_split = load_documents(data_path)
docs_after_split = split_text(docs_before_split)
docs_after_split[0]

Split 336 documents into 4876 chunks.
[ResourceSpec](#cfn-sagemaker-app-resourcespec): ResourceSpec [Tags](#cfn-sagemaker-app-tags): - [Tag](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-resource-tags.html) [UserProfileName](#cfn-sagemaker-app-userprofilename): String
{'source': 'sagemaker_documentation/aws-resource-sagemaker-app.md', 'start_index': 1068}


Document(metadata={'source': 'sagemaker_documentation/aws-properties-scheduler-schedule-sagemakerpipelineparameter.md', 'start_index': 0}, page_content='AWS::Scheduler::Schedule SageMakerPipelineParameter\n\nThe name and value pair of a parameter to use to start execution of a SageMaker Model Building Pipeline.\n\nSyntax\n\nTo declare this entity in your AWS CloudFormation template, use the following syntax:\n\nJSON')

In [10]:
avg_doc_length = lambda docs: sum([len(doc.page_content) for doc in docs])//len(docs)
avg_char_before_split = avg_doc_length(docs_before_split)
avg_char_after_split = avg_doc_length(docs_after_split)

print(f'Before split, there were {len(docs_before_split)} documents loaded, with average characters equal to {avg_char_before_split}.')
print(f'After split, there were {len(docs_after_split)} documents (chunks), with average characters equal to {avg_char_after_split} (average chunk length).')

Before split, there were 336 documents loaded, with average characters equal to 2828.
After split, there were 4876 documents (chunks), with average characters equal to 225 (average chunk length).


In [16]:
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={'device':'cpu'}, 
    encode_kwargs={'normalize_embeddings': True}
)

2024-09-16 18:28:01.350438: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Transformers is only compatible with Keras 2, but you have explicitly set `TF_USE_LEGACY_KERAS` to `0`. This may result in unexpected behaviour or errors if Keras 3 objects are passed to Transformers models.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [17]:
sample_embedding = np.array(huggingface_embeddings.embed_query(docs_after_split[0].page_content))
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

Sample embedding of a document chunk:  [-3.62564065e-02 -9.29875448e-02 -2.95081232e-02  1.70722380e-02
 -2.04744767e-02  3.68500426e-02  2.05528792e-02 -1.47782024e-02
  1.40447197e-02  2.34871637e-04 -8.84807855e-03 -2.49003340e-02
  3.01212147e-02 -2.77592093e-02  2.46040970e-02 -7.03472793e-02
 -3.60872708e-02  9.29262489e-02 -8.38212445e-02 -3.53557826e-03
  1.04844339e-01 -1.19951162e-02 -9.51072052e-02 -3.20505649e-02
 -1.77804120e-02  1.26954783e-02  1.08509464e-02 -4.59418399e-03
  2.25726701e-02 -1.40919134e-01 -4.83144782e-02 -6.65412378e-03
  3.76596488e-02  1.40487291e-02 -1.45088462e-02  2.66283341e-02
 -8.91587231e-03  4.04175594e-02 -3.63571718e-02  4.71852981e-02
 -4.24550772e-02  1.17247999e-02 -1.45801837e-02 -1.44920545e-03
 -4.41104220e-03 -3.95269655e-02  1.42686907e-02  1.15647493e-02
 -6.98323548e-02  2.46429499e-02 -5.06067066e-04 -5.93463965e-02
 -3.67999263e-02 -4.59112823e-02  5.97238652e-02  3.81677039e-02
  5.41709773e-02  1.76955480e-02 -1.02873398e-02 -4

In [18]:
vectorstore = FAISS.from_documents(docs_after_split, huggingface_embeddings)

In [19]:
query = """What is SageMaker?"""  # Sample question, change to other questions you are interested in.
relevant_documents = vectorstore.similarity_search(query)
print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query. Display the first one:\n')
print(relevant_documents[0].page_content)

There are 4 documents retrieved which are relevant to the query. Display the first one:

What is a SageMaker Project?


In [20]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [27]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

hf = HuggingFacePipeline.from_model_id(
    model_id="distilbert/distilgpt2",
    task="text-generation",
    pipeline_kwargs={"temperature": 0.7, "max_new_tokens": 300},
    model_kwargs={"use_auth_token": "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"}
)

llm = hf



In [28]:
prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

{context}

Question: {question}

Helpful Answer:
"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [29]:
retrievalQA = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)

In [30]:
result = retrievalQA.invoke({"query": query})

In [32]:
print(result['result'])

Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

What is a SageMaker Project?

SageMaker is a fully managed machine learning service. With SageMaker, data scientists and developers can quickly and easily build and train machine learning models, and then directly deploy them into a production-ready hosted environment. It provides an integrated Jupyter authoring notebook

Getting Started with an Amazon SageMaker Instance

SageMaker is a fully managed machine learning service that enables data scientists and developers to build and train machine learning models using a Jupyter notebook instance.

Question: What is SageMaker?

Helpful Answer:
1. SageMaker is dedicated to building and training m

In [33]:
import pickle
import faiss

faiss_index_path = 'faiss_index.bin'
metadata_path = 'metadata.pkl'

faiss.write_index(vectorstore.index, faiss_index_path)

with open(metadata_path, 'wb') as f:
    pickle.dump(docs_after_split, f)
