# Semantic Search with a Fine-Tuned Model

In [None]:
!pip install --upgrade torch

In [None]:
from IPython.display import display_html
def restartkernel():
    """Ask the notebook front end to restart the kernel.

    Emits a raw HTML <script> snippet through display_html that calls
    Jupyter.notebook.kernel.restart().
    """
    restart_snippet = "<script>Jupyter.notebook.kernel.restart()</script>"
    display_html(restart_snippet, raw=True)


# Restart immediately; presumably so the torch upgrade from the previous cell takes effect.
restartkernel()

In [None]:
import torch
print(torch.__version__)

In [None]:
!pip install -U sentence-transformers rank_bm25
!pip install -q opensearch-py
!pip install -q tqdm

In [None]:
# boto3 is imported here because this is the first cell that uses it; without the import
# the cell raises NameError on a freshly restarted kernel.
import boto3

cfn = boto3.client('cloudformation')

def get_cfn_outputs(stackname):
    """Return the outputs of a CloudFormation stack as a dict.

    Args:
        stackname: Stack name passed to describe_stacks.

    Returns:
        dict mapping each output's OutputKey to its OutputValue; empty when the
        stack description carries no 'Outputs' entry.
    """
    stack = cfn.describe_stacks(StackName=stackname)['Stacks'][0]
    # Tolerate a stack description without an 'Outputs' entry instead of raising KeyError here.
    return {output['OutputKey']: output['OutputValue'] for output in stack.get('Outputs', [])}

## Setup variables to use for the rest of the demo
cloudformation_stack_name = "semantic-search-2"

outputs = get_cfn_outputs(cloudformation_stack_name)

# Stack outputs used by later cells: the training bucket and the OpenSearch domain endpoint.
bucket = outputs['s3BucketTraining']
aos_host = outputs['DomainEndpoint']

outputs

## Step 1: Fine-Tune the Model

### Comparing Sentence Similarities

### BM25 Similarities

In [None]:
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np

passages=["does this work with xbox?",
          "Does the M70 work with Android phones?", 
          "does this work with iphone?",
          "Can this work with an xbox "
         ]

def bm25_tokenizer(text):
    """Split text into lower-cased tokens for BM25 scoring.

    Every whitespace-separated token has leading and trailing punctuation
    stripped; empty tokens and English stop words are discarded.

    Args:
        text: The string to tokenize.

    Returns:
        list of the remaining tokens, in their original order.
    """
    stripped_words = (word.strip(string.punctuation) for word in text.lower().split())
    return [
        word
        for word in stripped_words
        if word and word not in _stop_words.ENGLISH_STOP_WORDS
    ]


# Tokenize every passage (tqdm renders the progress bar) and build the BM25 index.
tokenized_corpus = [bm25_tokenizer(passage) for passage in tqdm(passages)]

bm25 = BM25Okapi(tokenized_corpus)

# Score each passage against the first passage, which acts as the query.
bm25_scores = bm25.get_scores(bm25_tokenizer(passages[0]))

# [score, passage index] pairs ordered from highest to lowest score.
all_sentence_combinations = sorted(
    ([passage_score, idx] for idx, passage_score in enumerate(bm25_scores)),
    key=lambda pair: pair[0],
    reverse=True,
)

print("Top most similar pairs:")
for score, i in all_sentence_combinations[0:4]:
    print("{} \t {} \t {:.4f}".format(passages[i],bm25_tokenizer(passages[i]),bm25_scores[i]))
    


### Semantic Similarities

In [None]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed every passage, then compare each embedding with every other one.
embeddings = model.encode(passages)
cos_sim = util.cos_sim(embeddings, embeddings)

# Row 0 holds the similarity of the first passage (the query) to each passage.
# Build [score, passage index] pairs and order them from highest to lowest score.
all_sentence_combinations = sorted(
    ([cos_sim[0][idx], idx] for idx in range(len(cos_sim))),
    key=lambda pair: pair[0],
    reverse=True,
)

print("Top most similar pairs:")
for score, i in all_sentence_combinations[0:4]:
    print("{} \t {:.4f}".format(passages[i],cos_sim[0][i]))

### Cross Encoder

In [None]:
from sentence_transformers import CrossEncoder
# Load a pre-trained cross-encoder. Note that this rebinds the notebook-level name `model`.
model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2-v2', max_length=512)

# Example: score two (question, passage) pairs that share the same question.
scores = model.predict([('How many people live in Berlin?', 'Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.'), 
                        ('How many people live in Berlin?', 'Berlin is well known for its museums.')])
scores

In [None]:
from sentence_transformers import SentenceTransformer, util
from PIL import Image

# Load the CLIP model through SentenceTransformer (rebinds the notebook-level name `model`).
model = SentenceTransformer('clip-ViT-B-32')

# Encode an image.
# NOTE(review): 'two-dogs-in-snow.jpg' is opened relative to the working directory and is not
# created by any cell visible here — confirm the file is present before running.
img_emb = model.encode(Image.open('two-dogs-in-snow.jpg'))

# Encode the candidate text descriptions.
text_emb = model.encode(['Two dogs in the snow', 'Two dogs in the land', 'Two dogs','A cat on a table', 'A picture of London at night'])

# Cosine similarity between the image embedding and each text embedding.
cos_scores = util.cos_sim(img_emb, text_emb)
print(cos_scores)

## Fine Tuning Model

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# Define the model, either from scratch or by loading a pre-trained model.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Define the training examples: each one is a sentence pair with a target similarity label.
# Two examples only illustrate the API; real fine-tuning needs many more.
train_examples = [InputExample(texts=['My first sentence', 'My second sentence'], label=0.8),
    InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)]

# Wrap the examples in a shuffling DataLoader and pick cosine-similarity loss for training.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

# Fine-tune for one epoch, then save the model to ./fine-tuning-model/.
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)
model.save("./fine-tuning-model/")

In [None]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import sys
import os
import gzip
import csv

In [None]:
# Base checkpoint to fine-tune; model_name is reused later to build the output directory name.
model_name = "sentence-transformers/distilbert-base-nli-stsb-mean-tokens"


# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling (CLS-token and max pooling are switched off) to get one
# fixed-size sentence vector whose width equals the word-embedding dimension.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

# Chain the transformer and pooling modules; this rebinds the notebook-level name `model`.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


In [None]:
import json
from tqdm.contrib.concurrent import process_map
from multiprocessing import cpu_count


def load_pqa_as_json(file_name):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_name: Path of a file holding one JSON document per line.

    Returns:
        list with the parsed value of every line, in file order.
    """
    with open(file_name) as handle:
        return [json.loads(raw_line) for raw_line in handle]


qa_list_json = load_pqa_as_json('amazon-pqa/amazon_pqa_headsets.json')


def create_training_samples(question):
    """Build a positive (label=1) training pair from one PQA record.

    Args:
        question: Record with a "question_text" entry and an "answers" list whose
            first element has an "answer_text" entry.

    Returns:
        InputExample pairing the question text with the first answer's text.
    """
    return InputExample(texts=[question["question_text"], question["answers"][0]["answer_text"]], label=1)

workers = 4 * cpu_count()

# process_map runs the function in worker processes, so appending to a notebook-level list
# inside it only changes each worker's copy and leaves the list in this kernel empty.
# Collect the values returned by the workers instead.
train_samples = process_map(create_training_samples, qa_list_json, max_workers=workers,chunksize=1000)


In [None]:
import json
import pandas as pd

def load_pqa(file_name,number_rows=1000):
    """Build a labelled question/answer DataFrame from a PQA JSON Lines file.

    For every record read, a positive row (label 1.0) pairs the question with its
    own first answer. From the second record on, a negative row (label 0.0) pairs
    the question with the first answer of the preceding record.

    Args:
        file_name: Path of a file with one JSON record per line. Each record needs a
            "question_text" entry and an "answers" list whose first element has an
            "answer_text" entry.
        number_rows: Maximum number of records to read; None reads the whole file.

    Returns:
        pandas.DataFrame with columns 'question', 'answer' and 'label' and a
        contiguous integer index.
    """
    from itertools import islice

    # Rows are gathered in a plain list and turned into a DataFrame once at the end;
    # growing a DataFrame one df.loc[i] assignment at a time copies data on every row.
    rows = []
    previous_answer = None
    with open(file_name, encoding="utf-8") as f:
        for line in islice(f, number_rows):
            data = json.loads(line)
            question = data['question_text']
            answer = data['answers'][0]['answer_text']
            rows.append([question, answer, 1.0])
            if previous_answer is not None:
                # Negative sample: this question with the previous record's answer.
                rows.append([question, previous_answer, 0.0])
            previous_answer = answer
    return pd.DataFrame(rows, columns=['question', 'answer', 'label'])


qa_list = load_pqa('amazon-pqa/amazon_pqa_headsets.json',number_rows=1000)


In [None]:
qa_list

In [None]:
from sklearn.model_selection import train_test_split
from sentence_transformers.readers import InputExample

train_set,test_set = train_test_split(qa_list,test_size=0.2,shuffle=True)
training_set, validation_set = train_test_split(train_set,test_size=0.2)

def create_input_sample(data_set):
    """Convert a question/answer DataFrame into a list of InputExample objects.

    Args:
        data_set: DataFrame with 'question', 'answer' and 'label' columns.

    Returns:
        list with one InputExample per row, in row order.
    """
    return [
        InputExample(texts=[record['question'], record['answer']], label=record['label'])
        for _, record in data_set.iterrows()
    ]

training_samples = create_input_sample(training_set)
validation_samples = create_input_sample(validation_set)
test_samples = create_input_sample(test_set)


In [None]:
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator


# Training hyper-parameters.
train_batch_size = 16
num_epochs = 4
# Output directory: model name (with "/" replaced) plus the current timestamp, so every run
# writes to a new directory. `datetime` and `math` come from the import cell further up.
model_save_path = 'output/fine_tuned_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Shuffled batches of the training samples, optimised with cosine-similarity loss.
train_dataloader = DataLoader(training_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

# Evaluator built from the validation samples and passed to model.fit below.
# NOTE(review): the name 'pqa-valucation' looks like a typo for "validation"; it is a
# runtime string, so it is left unchanged here.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(validation_samples, name='pqa-valucation')


warmup_steps = math.ceil(len(train_dataloader) * num_epochs  * 0.1) # 10% of the total training steps (batches per epoch x epochs) are warm-up


# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)


# Reload the model from the output directory and evaluate it on the held-out test samples.
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='pqa-test')
test_evaluator(model, output_path=model_save_path)

In [None]:
!cd output/fine_tuned_sentence-transformers-distilbert-base-nli-stsb-mean-tokens-2022-09-17_22-36-14 && tar czvf ../model.tar.gz *

In [None]:
import sagemaker

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# The packaging cell changes into a sub-directory of output/ and writes ../model.tar.gz,
# so the archive lives at output/model.tar.gz relative to the notebook's working directory.
inputs = sagemaker_session.upload_data(path='output/model.tar.gz', key_prefix='fine-tuned-transformers-model')
inputs

### Deploy the BERT model to SageMaker Endpoint

We need to create a PyTorchModel object. The deploy() method on the model object creates an endpoint that serves prediction requests in real time. If instance_type is set to a SageMaker instance type (e.g. ml.m5.large), the model is deployed on SageMaker. If instance_type is set to `local`, the model is deployed locally as a Docker container, ready for local testing.

Before creating the model, we define a Predictor class that accepts text as input and outputs JSON. The default behaviour is to accept a numpy array.


In [None]:
from sagemaker.pytorch import PyTorch, PyTorchModel
from sagemaker.predictor import Predictor
from sagemaker import get_execution_role

class StringPredictor(Predictor):
    """Predictor subclass that forwards content_type='text/plain' to Predictor."""

    def __init__(self, endpoint_name, sagemaker_session):
        # NOTE(review): confirm that the installed sagemaker SDK's Predictor still honours a
        # content_type keyword; newer releases may expect a serializer instead.
        super().__init__(endpoint_name, sagemaker_session, content_type='text/plain')


Deploy the BERT model to a SageMaker endpoint

#### Note: This process will take several minutes to complete.

In [None]:
import time

# Describe the model to deploy: the uploaded archive (`inputs`), the execution role, and the
# inference entry point taken from ./code. Predictions are made through StringPredictor.
# NOTE(review): ./code/inference.py is not shown in this notebook — confirm it exists.
pytorch_model = PyTorchModel(model_data = inputs, 
                             role=role, 
                             entry_point ='inference.py',
                             source_dir = './code',
                             py_version = 'py38', 
                             framework_version = '1.10.2',
                             predictor_cls=StringPredictor)

# Create a one-instance endpoint; the current Unix time makes the endpoint name unique per run.
predictor = pytorch_model.deploy(instance_type='ml.g4dn.xlarge', 
                                 initial_instance_count=1, 
                                 endpoint_name = f'semantic-search-model-{int(time.time())}')

### Test the SageMaker Endpoint.

Input is text data, output is vector data

In [None]:
import json
# Send one sample question to the endpoint and parse the JSON response.
original_payload = 'Does this work with xbox?'
features = predictor.predict(original_payload)
# presumably the response is a JSON-encoded embedding vector — confirm against inference.py
vector_data = json.loads(features)

vector_data


## Step 2: Ingest data to OpenSearch Cluster
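Load the Amazon Product Question and Answer (PQA) data set from https://registry.opendata.aws/amazon-pqa/

Note: the fine-tuning cells in Step 1 also read `amazon-pqa/amazon_pqa_headsets.json`, so on a fresh kernel run the download cell below before running them.

### Downloading Amazon Product Question and Answer Data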

Datasets: https://registry.opendata.aws/amazon-pqa/

In [None]:
!aws s3 ls --no-sign-request s3://amazon-pqa/

In [None]:
!aws s3 cp --no-sign-request s3://amazon-pqa/amazon_pqa_headsets.json ./amazon-pqa/amazon_pqa_headsets.json

Use the Python API to set up a connection to the OpenSearch cluster

In [None]:
# from elasticsearch import Elasticsearch, RequestsHttpConnection
# from requests_aws4auth import AWS4Auth
# region = 'us-east-1' 
# service = 'es'
# credentials = boto3.Session().get_credentials()
# awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token)

# es_client = Elasticsearch(
#     hosts = [{'host': aos_host, 'port': 443}],
#     http_auth = awsauth,
#     use_ssl = True,
#     verify_certs = True,
#     connection_class = RequestsHttpConnection
# )

from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
import boto3

# Sign requests for the region of the current boto3 session. The CloudFormation client that
# supplied aos_host uses the same default region, so the domain lives there.
# The previous hard-coded value is kept as a fallback when no region is configured.
boto_session = boto3.Session()
region = boto_session.region_name or 'us-east-1'

credentials = boto_session.get_credentials()
auth = AWSV4SignerAuth(credentials, region)
index_name = 'nlp_pqa'

# HTTPS client for the OpenSearch domain, with certificate verification and SigV4 request signing.
aos_client = OpenSearch(
    hosts = [{'host': aos_host, 'port': 443}],
    http_auth = auth,
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection
)

Create an index with three fields: "question_vector" holds the vector data, while "question" and "answer" hold the raw question and answer text.

In [None]:
# Index definition passed as the body of indices.create: k-NN settings, a default analyzer,
# and the mapping of the three stored fields.
knn_index = {
    "settings": {
        # Turn on k-NN for this index and compare vectors by cosine similarity.
        "index.knn": True,
        "index.knn.space_type": "cosinesimil",
        "analysis": {
          "analyzer": {
            # Default text analyzer: standard analyzer with the English stop-word list.
            "default": {
              "type": "standard",
              "stopwords": "_english_"
            }
          }
        }
    },
    "mappings": {
        "properties": {
            # NOTE(review): the dimension must equal the length of the vectors returned by the
            # endpoint — presumably 768 for the DistilBERT-based model; confirm.
            "question_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
            },
            # "store": True lets the search cells read these values back via stored_fields.
            "question": {
                "type": "text",
                "store": True
            },
            "answer": {
                "type": "text",
                "store": True
            }
        }
    }
}


In [None]:
#aos_client.indices.delete(index="nlp_pqa")


In [None]:
aos_client.indices.create(index="nlp_pqa",body=knn_index,ignore=400)


Show the created index information

In [None]:
aos_client.indices.get(index="nlp_pqa")

### Ingest 1,000 rows of data for testing

In [None]:
import json
from tqdm.contrib.concurrent import process_map
from multiprocessing import cpu_count


def load_pqa_as_json(file_name,number_rows=1000):
    """Read up to number_rows records from a JSON Lines file.

    Args:
        file_name: Path of a file holding one JSON document per line.
        number_rows: Maximum number of lines to read. 0 yields an empty list and
            None reads the whole file.

    Returns:
        list with the parsed value of each line read, in file order.
    """
    from itertools import islice

    # islice stops after number_rows lines, so number_rows=0 reads nothing. The earlier
    # manual counter only compared after reading a line, so 0 read the whole file.
    with open(file_name, encoding="utf-8") as f:
        return [json.loads(line) for line in islice(f, number_rows)]


qa_list_json = load_pqa_as_json('amazon-pqa/amazon_pqa_headsets.json',number_rows=1000)


from tqdm.contrib.concurrent import thread_map


def es_import(question):
    """Embed one PQA record's question through the endpoint and index it in OpenSearch.

    Args:
        question: Record with a "question_text" entry and an "answers" list whose
            first element has an "answer_text" entry.
    """
    vector = json.loads(predictor.predict(question["question_text"]))
    aos_client.index(index='nlp_pqa',
             body={"question_vector": vector, "question": question["question_text"],"answer":question["answers"][0]["answer_text"]}
            )

workers = 4 * cpu_count()

# Each call only waits on two network requests, so a thread pool is used. The previous
# process_map call with chunksize=1000 put all 1000 records in one chunk, so a single
# worker handled them one after another.
# The trailing semicolon keeps the list of None results out of the cell output.
thread_map(es_import, qa_list_json, max_workers=workers);

### Query the number of documents in the OpenSearch cluster

In [None]:
# Run a match_all query against the index and report the total hit count from the response.
res = aos_client.search(index="nlp_pqa", body={"query": {"match_all": {}}})
print("Got %d Hits:" % res['hits']['total']['value'])

## Step 3: Semantic Search 
### Generate vector data for user input query 

Generate vector data for the question by calling SageMaker model

In [None]:
# The user query to embed; only the first entry is sent to the endpoint.
query_raw_sentences = ['does this work with xbox?']
client = boto3.client('sagemaker-runtime')
# NOTE(review): `predictor.endpoint` may be named `endpoint_name` in newer sagemaker SDK
# releases — confirm against the installed version.
ENDPOINT_NAME = predictor.endpoint
# Invoke the endpoint directly through the runtime client with a plain-text body.
response = client.invoke_endpoint(EndpointName=ENDPOINT_NAME,
                                       ContentType='text/plain',
                                       Body=query_raw_sentences[0])

# Parse the JSON response body; the result is used as the k-NN query vector below.
search_vector = json.loads((response['Body'].read()))


### Search vector data with "Semantic Search" 

OpenSearch KNN


In [None]:

# k-NN query: ask for the 50 nearest neighbours of search_vector in the question_vector field.
query = {
    "size": 50,
    "query": {
        "knn": {
            "question_vector": {"vector": search_vector, "k": 50}
        }
    },
}

res = aos_client.search(
    index="nlp_pqa",
    body=query,
    stored_fields=["question", "answer"],
)

# One [id, score, question, answer] row per hit; the first value of each stored field is used.
query_result = [
    [hit['_id'], hit['_score'], hit['fields']['question'][0], hit['fields']['answer'][0]]
    for hit in res['hits']['hits']
]

query_result_df = pd.DataFrame(data=query_result, columns=["_id", "_score", "question", "answer"])
display(query_result_df)

### Search the same query with "Keyword Search"

In [None]:
# Keyword query: full-text match of the same question against the "question" field.
query = {
    "size": 50,
    "query": {
        "match": {"question": "does this work with xbox?"}
    },
}

res = aos_client.search(
    index="nlp_pqa",
    body=query,
    stored_fields=["question", "answer"],
)

# One [id, score, question, answer] row per hit; the first value of each stored field is used.
query_result = [
    [hit['_id'], hit['_score'], hit['fields']['question'][0], hit['fields']['answer'][0]]
    for hit in res['hits']['hits']
]

query_result_df = pd.DataFrame(data=query_result, columns=["_id", "_score", "question", "answer"])
display(query_result_df)
