### Import modules

In [None]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth, helpers
import boto3
import json
import pandas as pd

### Initialize and configure Boto Client for Bedrock

In [None]:
# Bedrock client for us-west-2, pointed explicitly at the regional endpoint.
# NOTE(review): recent boto3 releases expose invoke_model on the
# 'bedrock-runtime' service rather than 'bedrock' -- confirm against the
# installed SDK version before relying on this client for inference.
bedrock = boto3.client(
    "bedrock",
    endpoint_url="https://bedrock.us-west-2.amazonaws.com",
    region_name="us-west-2",
)

### Initialize and configure OpenSearch client

In [None]:
# Connection settings for the OpenSearch endpoint and the SigV4 signer built
# from the default boto3 session credentials.
# Replace the HOST placeholder with the endpoint shown in the AWS Management Console.
host = "HOST.us-west-2.aoss.amazonaws.com"
region, service = "us-west-2", "aoss"
credentials = boto3.Session().get_credentials()
auth = AWSV4SignerAuth(credentials, region, service)

In [None]:
# OpenSearch client: TLS on port 443 with certificate verification, requests
# authenticated through the signer held in `auth`.
client = OpenSearch(
    hosts=[dict(host=host, port=443)],
    connection_class=RequestsHttpConnection,
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    pool_maxsize=20,
)

### Process the dataset

In [None]:
# Load the Oscars dataset from the local data directory.
df = pd.read_csv("./data/oscars.csv")

In [None]:
# Keep only the 2023 ceremony rows that have a film, then lower-case the
# category. The explicit .copy() makes `df` an independent frame, so the
# column assignments below (and in later cells) do not trigger pandas'
# SettingWithCopyWarning about writing to a slice of the original data.
df = df.loc[df['year_ceremony'] == 2023].dropna(subset=['film']).copy()
df['category'] = df['category'].str.lower()

In [None]:
#df

### Concatenate columns to create a new text column

In [None]:
# Create the column for all rows first
# Shared opening of every sentence: who was nominated, in which category, for which film.
nomination = df['name'] + ' got nominated under the category, ' + df['category'] + ', for the film ' + df['film']
did_not_win = df['winner'] == False

# Start with the "win" wording on every row ...
df['text'] = nomination + ' to win the award'

# ... then overwrite the rows whose 'winner' flag is False.
df.loc[did_not_win, 'text'] = nomination + ' but did not win'

In [None]:
#df

### Generate embeddings for the text column from Titan

In [None]:
def text_embedding(text, model_id='amazon.titan-e1t-medium', bedrock_client=None):
    """Return the embedding Bedrock produces for ``text``.

    Args:
        text: The string to embed; sent as the ``inputText`` field of the
            JSON request body.
        model_id: Bedrock model identifier passed as ``modelId``. Defaults to
            the model this notebook was written against.
        bedrock_client: Object exposing ``invoke_model``. When omitted, the
            module-level ``bedrock`` client is used.

    Returns:
        The value of the ``embedding`` field in the model's JSON response, or
        ``None`` if the response has no such field.
    """
    runtime = bedrock if bedrock_client is None else bedrock_client
    payload = json.dumps({"inputText": text})
    response = runtime.invoke_model(
        body=payload,
        modelId=model_id,
        accept='application/json',
        contentType='application/json',
    )
    # The response body is a stream; read it once and decode the JSON payload.
    response_body = json.loads(response['body'].read())
    return response_body.get('embedding')

In [None]:
# Embed every row's text and attach the result as a new 'embedding' column.
df = df.assign(embedding=df["text"].apply(text_embedding))

### Ingest the text and embeddings into AOSS

In [None]:
def add_document(vector, text, index_name='oscars-index', os_client=None):
    """Index one text/embedding pair and print the service response.

    Args:
        vector: Embedding stored under the ``nominee_vector`` field.
        text: Source sentence stored under the ``nominee_text`` field.
        index_name: Target index. Defaults to ``'oscars-index'``.
        os_client: Object exposing ``index``. When omitted, the module-level
            ``client`` is used.

    Returns:
        None. The response from the index call is printed, not returned.
    """
    target = client if os_client is None else os_client
    document = {
        "nominee_vector": vector,
        "nominee_text": text,
    }
    response = target.index(index=index_name, body=document)
    print('\nAdding document:')
    print(response)

In [None]:
# Send every row's embedding and text to the index. A plain loop is used
# because add_document is called purely for its side effect; DataFrame.apply
# would build (and display) a throwaway Series of None values.
for nominee_vector, nominee_text in zip(df['embedding'], df['text']):
    add_document(nominee_vector, nominee_text)

### Perform semantic search

In [None]:
def search_index(vector, k=15, index_name="oscars-index", os_client=None):
    """Run a k-NN query on the ``nominee_vector`` field and return the raw response.

    Args:
        vector: Query embedding to compare against ``nominee_vector``.
        k: Number of neighbours requested; also used as the result ``size``.
            Defaults to 15.
        index_name: Index to search. Defaults to ``"oscars-index"``.
        os_client: Object exposing ``search``. When omitted, the module-level
            ``client`` is used.

    Returns:
        Whatever the client's ``search`` call returns, unmodified.
    """
    target = client if os_client is None else os_client
    search_body = {
        "size": k,
        # Leave the stored vectors out of the returned documents.
        "_source": {"excludes": ["nominee_vector"]},
        "query": {
            "knn": {
                "nominee_vector": {
                    "vector": vector,
                    "k": k,
                },
            },
        },
    }
    return target.search(body=search_body, index=index_name)

In [None]:
# Natural-language question and its embedding, used as the search vector.
query = 'who won the award for best music?'
vector = text_embedding(query)

In [None]:
# Run the search and keep only the list of hits from the response.
response = search_index(vector)
data = response["hits"]["hits"]

In [None]:
# Bare expression: shows the hits as the notebook cell's output.
data