# Installing Dependencies

In [None]:
!pip install pandas numpy faiss-cpu sagemaker requests jina-sagemaker tqdm

# Set up Jina Embeddings v2 on Sagemaker Jumpstart

## Install dependencies and configure AWS CLI

In [None]:
!pip install aws configure
!pip install awscli

In [None]:
# Store AWS credentials and a default region for the AWS CLI.
# NOTE(review): this command prompts for input, which may not work in every
# notebook front end; presumably the region set here is what
# boto3.Session().region_name returns later — confirm.
!aws configure

In [None]:
import boto3

# IAM execution role passed to the endpoint/model creation calls in this notebook.
# Replace this ARN with a role from your own AWS account.
role = "arn:aws:iam::253352124568:role/service-role/AmazonSageMaker-ExecutionRole-20230527T104084"

# Region name of the default boto3 session
region = boto3.Session().region_name

# Connect to Jina Embeddings v2 Endpoint on Sagemaker

For this to work, you must first set up an endpoint for inference by:
1. Subscribing to [_Jina Embeddings v2 Base - en_](https://aws.amazon.com/marketplace/pp/prodview-5iljbegvoi66w) package on AWS marketplace.
2. Creating a [Sagemaker Endpoint](https://us-east-1.console.aws.amazon.com/sagemaker/home?region=us-east-1#/endpoints) for inference using the subscribed model.

Once the subscription and endpoint setup are done, continue running the code cells below to deploy the endpoint and run inference.

In [None]:
# Specify the name of the model that you subscribed to.
# This value is used as a key into model_name_map, so it must match one of its entries.
subscribed_model_name = "jina-embeddings-v2-base-en"

In [None]:
# Short model name -> full model-package name
model_name_map = {
    "jina-embeddings-v2-base-en": "jina-embeddings-v2-base-en-32555da8a0b431d190bf3eca46758b72",
    "jina-embeddings-v2-small-en": "jina-embeddings-v2-small-en-0e950fb984e3396fa4e1108adf69937c",
    "jina-embeddings-v2-base-de": "jina-embeddings-v2-base-de-c269d166764133348365f57b8f1d8c7a",
    "jina-embeddings-v2-base-zh": "jina-embeddings-v2-base-zh-4da30f467aaf347580ba5ed2648e399a",
}

# Package name of the model selected via subscribed_model_name
model_name = model_name_map[subscribed_model_name]

# Region -> model-package ARN. Each region uses its own account id in the ARN,
# so the ARNs are generated from a region -> account-id table.
model_package_map = {
    aws_region: f"arn:aws:sagemaker:{aws_region}:{account_id}:model-package/{model_name}"
    for aws_region, account_id in {
        "us-east-1": "253352124568",
        "us-east-2": "057799348421",
        "us-west-1": "382657785993",
        "us-west-2": "594846645681",
        "ca-central-1": "470592106596",
        "eu-central-1": "446921602837",
        "eu-west-1": "985815980388",
        "eu-west-2": "856760150666",
        "eu-west-3": "843114510376",
        "eu-north-1": "136758871317",
        "ap-southeast-1": "192199979996",
        "ap-southeast-2": "666831318237",
        "ap-northeast-2": "745090734665",
        "ap-northeast-1": "977537786026",
        "ap-south-1": "077584701553",
        "sa-east-1": "270155090741",
    }.items()
}

# Stop early when the session's region has no entry in the table above
if region not in model_package_map:
    raise Exception(f"Current boto3 session region {region} is not supported.")

# ARN of the model package for the current region
model_package_arn = model_package_map[region]

In [None]:
from jina_sagemaker import Client

# Jina SageMaker client bound to the region of the current boto3 session
client = Client(region_name=region)

# Name given to the embeddings endpoint created below; reused later to connect to it
endpoint_name = "jina-embeddings-v2-base-en"

# Create the endpoint from the subscribed model package on one ml.g4dn.xlarge instance
client.create_endpoint(
    arn=model_package_arn, role=role, endpoint_name=endpoint_name,
    instance_type="ml.g4dn.xlarge", n_instances=1,
)

# Index Dataset

In [None]:
# Attach the client to the endpoint identified by endpoint_name.
# NOTE(review): presumably required before client.embed() can be called — confirm.
client.connect_to_endpoint(endpoint_name=endpoint_name)

In this RAG tutorial, we will use a [dataset](https://www.kaggle.com/datasets/maartengr/kurzgesagt-transcriptions?resource=download) (CC0 licenced) containing transcripts of videos from a popular YouTube channel, called [_Kurzgesagt_](https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q) ("_In a Nutshell_" in English).

Each row of the dataset contains the title of a video, its URL, and the transcript of the video.

## Load Dataset

In [None]:
import pandas as pd

# Load the CSV file of video transcripts directly from its download URL.
# Later cells read the 'Title' and 'Text' columns of this frame.
df = pd.read_csv('https://drive.google.com/uc?export=download&id=18FO21nrfkQ1Vuee2g_773ON2256nOrlS')

In [None]:
# Preview the first rows of the loaded data
df.head()

## Chunk and embed data

In [None]:
import numpy as np
from tqdm import tqdm

# Register tqdm with pandas; this provides the progress_apply method used further down
tqdm.pandas()

def chunk_text(text, max_words=128):
    """
    Divide text into chunks where each chunk contains the maximum number of full sentences under `max_words`.

    Sentences are obtained by splitting `text` on '.'. Consecutive sentences are
    packed greedily into a chunk until adding the next one would exceed
    `max_words`; the chunk is then emitted and a new one is started.

    Args:
        text: The string to split.
        max_words: Maximum number of whitespace-separated words per chunk.
            A single sentence longer than this is still emitted, as a chunk
            of its own.

    Yields:
        One string per chunk, with its sentences joined by '. ' and a
        trailing '.'. Nothing is yielded for empty or whitespace-only text.
    """
    chunk = []
    word_count = 0

    for sentence in text.split('.'):
        # Strip surrounding whitespace so blank fragments (e.g. after the final
        # '.') are skipped and joined sentences do not get doubled spaces.
        sentence = sentence.strip()
        if not sentence:
            continue

        words_in_sentence = len(sentence.split())
        if chunk and word_count + words_in_sentence > max_words:
            # The sentence does not fit: emit the current chunk and start a new one
            yield '. '.join(chunk) + '.'
            chunk = []
            word_count = 0

        chunk.append(sentence)
        word_count += words_in_sentence

    # Emit the last chunk with the same '. ' separator as every other chunk
    if chunk:
        yield '. '.join(chunk) + '.'

def generate_embeddings(text_df):
    """
    Chunk the 'Text' field of one row and embed every chunk.

    Args:
        text_df: A single row (this function is applied with axis=1) that
            provides a 'Text' entry.

    Returns:
        The same row with two entries added: 'chunks', the list of text
        chunks, and 'embeddings', a list with one numpy array per chunk.
    """
    pieces = list(chunk_text(text_df['Text']))

    # One embed request per chunk; the vector is read from the first response item
    vectors = [
        np.array(client.embed(texts=[piece])[0]['embedding'])
        for piece in pieces
    ]

    text_df['chunks'] = pieces
    text_df['embeddings'] = vectors
    return text_df

print("Embedding text chunks ...")
# Run generate_embeddings on every row (axis=1) with a progress bar.
# Each row triggers one client.embed call per chunk, so this cell can take a while.
df = df.progress_apply(generate_embeddings, axis=1)


# Set up Semantic Search using Faiss

We will now use [Faiss](https://github.com/facebookresearch/faiss), an in-memory vector similarity search engine, to find the most similar chunks to a given query.

We first need to create an index and add the embeddings to it.

## Indexing vectors in memory


In [None]:
import faiss

dim = 768  # dimension of the embeddings; must equal the length of the vectors returned by the embedding model
# Inner-product index wrapped in an ID map so that every vector carries an explicit integer id
index_with_ids = faiss.IndexIDMap(faiss.IndexFlatIP(dim))
k = 0  # running vector id; after the loop it equals the number of indexed chunks

# Vector id -> (chunk text, index label of the row in df the chunk came from)
doc_ref = dict()
vectors = []

for idx, row in df.iterrows():
    for chunk, embedding in zip(row['chunks'], row['embeddings']):
        vectors.append(np.asarray(embedding, dtype='float32'))
        doc_ref[k] = (chunk, idx)
        k += 1

if vectors:
    # Normalise all vectors in place and add them in one call. With unit-length
    # vectors the inner product equals cosine similarity.
    matrix = np.ascontiguousarray(np.vstack(vectors))
    faiss.normalize_L2(matrix)
    # add_with_ids expects one int64 id per vector, not a bare Python int
    index_with_ids.add_with_ids(matrix, np.arange(k, dtype='int64'))

## Vector search function

In [None]:
def find_most_similar_transcript_segment(query, n=3):
    """
    Return the transcript chunks most similar to `query`, best match first.

    The query is embedded with the same client as the chunks, L2-normalised,
    and searched against `index_with_ids`.

    Args:
        query: The question text. It is embedded as a single text, i.e. it is
            assumed to be short enough to not need chunking.
        n: Maximum number of matches to return.

    Returns:
        A list of (transcript_segment, doc_idx, distance) tuples, ordered from
        most to least similar. `distance` is the inner-product score returned
        by the index, so a larger value means a closer match. The list holds
        fewer than `n` entries when the index contains fewer than `n` vectors.
    """
    query_embedding = client.embed(texts=[query])[0]['embedding']
    query_embedding = np.ascontiguousarray(np.array(query_embedding, dtype='float32').reshape(1, -1))
    faiss.normalize_L2(query_embedding)

    D, I = index_with_ids.search(query_embedding, n)  # Get the top n matches

    results = []
    for distance, index_id in zip(D[0], I[0]):
        # Faiss pads the id array with -1 when fewer than n vectors are available
        if index_id < 0:
            continue
        transcript_segment, doc_idx = doc_ref[int(index_id)]
        results.append((transcript_segment, doc_idx, distance))

    # Scores are inner products of unit vectors: higher is better, so sort descending
    results.sort(key=lambda x: x[2], reverse=True)

    return results


# Set up Question-answering using LLM on Jumpstart

Note that this step may take some time (approximately 5-10 minutes), because it deploys an endpoint on AWS SageMaker.

In [None]:
from sagemaker.jumpstart.model import JumpStartModel

# JumpStart model for the LLM, created with the same execution role as the embeddings endpoint
jumpstart_model = JumpStartModel(
    model_id="huggingface-llm-mistral-7b-instruct",
    role=role,
)

# Deploy with default settings; the returned predictor is used to query the LLM
model_predictor = jumpstart_model.deploy()

# Putting it all together for RAG



## Query the vector search engine with your question

In [None]:
# Read a question interactively, then retrieve the closest transcript segments
# (the search function's default n=3 applies).
question = input("Enter your question: ")
search_results = find_most_similar_transcript_segment(question)

In [None]:
# Each result is a (transcript_segment, doc_idx, distance) tuple
print(search_results)

## Templating a prompt for LLM

In [None]:
from string import Template

# Prompt wrapped in [INST] ... [/INST] markers, with $-placeholders for the
# user's question and for three retrieved (video title, transcript segment) pairs.
prompt_template = Template("""
  <s>[INST] Answer the question below only using the given context.
  The question from the user is based on transcripts of videos from a YouTube
    channel.
  The context is presented as a ranked list of information in the form of
    (video-title, transcript-segment), that is relevant for answering the
    user's question.
  The answer should only use the presented context. If the question cannot be
    answered based on the context, say so.

  Context:
  1. Video-title: $title_1, transcript-segment: $segment_1
  2. Video-title: $title_2, transcript-segment: $segment_2
  3. Video-title: $title_3, transcript-segment: $segment_3

  Question: $question

  Answer: [/INST]
""")


# Fill the template with the question and the three retrieved results.
# Each result is (transcript_segment, doc_idx, distance). doc_idx is the index
# *label* of the source row (it comes from df.iterrows()), so the title is
# looked up with label-based .loc rather than positional .iloc.
prompt_for_llm = prompt_template.substitute(
    question = question,
    title_1 = df.loc[search_results[0][1], "Title"].strip(),
    segment_1 = search_results[0][0],
    title_2 = df.loc[search_results[1][1], "Title"].strip(),
    segment_2 = search_results[1][0],
    title_3 = df.loc[search_results[2][1], "Title"].strip(),
    segment_3 = search_results[2][0]
)

In [None]:
# Show the fully rendered prompt before it is sent to the LLM
print(prompt_for_llm)

## Send prompt to LLM and print result

In [None]:
# Wrap the rendered prompt under the "inputs" key and send it to the deployed LLM.
# As the last expression in the cell, the prediction is shown as the cell output.
payload = {"inputs": prompt_for_llm}
model_predictor.predict(payload)

# Clean up all Sagemaker Endpoints

It's super important to do this *EVERY TIME* you deploy an endpoint!!!

In [None]:
# Delete the embeddings endpoint and close the client.
# NOTE(review): delete_endpoint() is called without a name; presumably it targets
# the endpoint this client is connected to — confirm.
client.delete_endpoint()
client.close()

In [None]:
# Delete the LLM's model and then its endpoint.
# NOTE(review): the model is deleted first; presumably delete_model() needs the
# endpoint to still exist to resolve the model — confirm before reordering.
model_predictor.delete_model()
model_predictor.delete_endpoint()