In [6]:
from elasticsearch import Elasticsearch

In [7]:
es = Elasticsearch(
    "https://localhost:9200",
    basic_auth=("elastic","3TgCUfRLY3o7BXKYpWFs"),
    ca_certs="/home/isurika/downloads/elasticsearch-8.12.0/config/certs/http_ca.crt"
)
es.ping()

False

### Prepare the data

In [3]:
import pandas as pd

df = pd.read_csv("/mnt/d/University/repos/aws-chatbot/dataset/consolidated_data.csv").loc[:2488]
df.head()

Unnamed: 0,id,pattern,response
0,1,How can I run a web server on AWS?,"To run a web server on AWS, you can use Amazon..."
1,2,What AWS service should I use to host a scalab...,"For hosting scalable applications, consider us..."
2,3,How do I deploy a custom application on the cl...,"To deploy a custom application, use Amazon EC2..."
3,4,I need a dedicated environment for my applicat...,If you need dedicated resources for your appli...
4,5,What's the best way to get started with virtua...,Start with Amazon EC2. It offers a wide range ...


#### Check NA values


In [28]:
df.isna().value_counts()

id     pattern  response
False  False    False       1254
Name: count, dtype: int64

In [29]:
df.fillna("None", inplace=True)

### Convert the relevant field to Vector using BERT model

In [30]:
from sentence_transformers import SentenceTransformer                                                                                                                                                                               # type: ignore
model = SentenceTransformer('all-mpnet-base-v2')

In [31]:
df["ResponseVector"] = df["response"].apply(lambda x: model.encode(x))

In [32]:
df.head()

Unnamed: 0,id,pattern,response,ResponseVector
0,1,How can I run a web server on AWS?,"To run a web server on AWS, you can use Amazon...","[-0.00054931047, -0.05692641, -0.0006814774, 0..."
1,2,What AWS service should I use to host a scalab...,"For hosting scalable applications, consider us...","[-0.015269826, -0.012294186, -0.01100238, -0.0..."
2,3,How do I deploy a custom application on the cl...,"To deploy a custom application, use Amazon EC2...","[-0.012933653, -0.045452587, -0.0068273027, -0..."
3,4,I need a dedicated environment for my applicat...,If you need dedicated resources for your appli...,"[-0.009388072, -0.018510194, 0.0007667323, 0.0..."
4,5,What's the best way to get started with virtua...,Start with Amazon EC2. It offers a wide range ...,"[-0.059143938, -0.017358141, -0.0022847152, -0..."


In [33]:
es.ping()

True

### Create new index in ElasticSearch

In [34]:
from indexMapping import indexMapping

try:
    es.indices.create(index="all_patterns_v1", mappings=indexMapping) 
except Exception as e:
    pass

### Ingest the data into index

In [35]:
record_list = df.to_dict("records")

In [36]:
for record in record_list:
    try:
        es.index(index="all_patterns_1500", document=record, id=record["id"])
    except Exception as e:
        print(e)

In [37]:
es.count(index="all_patterns_1500")

ObjectApiResponse({'count': 1501, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

### Search the data

In [38]:
input_keyword = " Billing of Amazon EC2 systems begin and end?"
vector_of_input_keyword = model.encode(input_keyword)

query = {
    "field": "ResponseVector",
    "query_vector": vector_of_input_keyword,
    "k": 1,  # Set k to 1 to get only the top result
    "num_candidates": 1500,
}

res = es.knn_search(index="all_patterns_1500", knn=query, source=["pattern", "response"])
hits = res["hits"]["hits"]


if hits:
    best_match = hits[0]
    print("Best Matching Result:")
    print("Pattern:", best_match["_source"]["pattern"])
    print("Response:", best_match["_source"]["response"])
else:
    print("No matching results found.")


Best Matching Result:
Pattern: When does billing of my Amazon EC2 systems begin and end?
Response: Billing commences when Amazon EC2 initiates the boot sequence of an AMI instance. Billing ends when the instance terminates, which could occur through a web services command, by running "shutdown -h", or through instance failure. When you stop an instance, we shut it down but don't charge hourly usage for a stopped instance, or data transfer fees, but we do charge for the storage for any Amazon EBS volumes. To learn more, visit the AWS Documentation.


  res = es.knn_search(index="all_patterns_1500", knn=query, source=["pattern", "response"])


In [39]:
input_keyword = "Billing of Amazon EC2 systems begin and end?"
vector_of_input_keyword = model.encode(input_keyword)

query = {
    "field" : "ResponseVector",
    "query_vector" : vector_of_input_keyword,
    "k" : 3,
    "num_candidates" : 1500, 
}

res = es.knn_search(index="all_patterns_1500", knn=query , source=["pattern","response"])
res["hits"]["hits"]

  res = es.knn_search(index="all_patterns_1500", knn=query , source=["pattern","response"])


[{'_index': 'all_patterns_1500',
  '_id': '339',
  '_score': 0.90958804,
  '_ignored': ['response.keyword'],
  '_source': {'pattern': 'When does billing of my Amazon EC2 systems begin and end?',
   'response': 'Billing commences when Amazon EC2 initiates the boot sequence of an AMI instance. Billing ends when the instance terminates, which could occur through a web services command, by running "shutdown -h", or through instance failure. When you stop an instance, we shut it down but don\'t charge hourly usage for a stopped instance, or data transfer fees, but we do charge for the storage for any Amazon EBS volumes. To learn more, visit the AWS Documentation.'}},
 {'_index': 'all_patterns_1500',
  '_id': '340',
  '_score': 0.84812176,
  '_ignored': ['response.keyword'],
  '_source': {'pattern': 'What defines billable EC2 instance usage?',
   'response': 'Instance usages are billed for any time your instances are in a "running" state. If you no longer wish to be charged for your instance

In [62]:
rdf = df.sample(frac=0.2)

In [63]:
rdf.head()

Unnamed: 0,id,pattern,response,ResponseVector
1247,1248,Are there any limitations for using S3 Replica...,"No, all features of S3 Replication, such as S3...","[0.018218152, -0.030392952, -0.0015461136, 0.0..."
154,155,Which AMIs are supported on Hpc7a instances?,"Hpc7a instances support Amazon Linux 2, Amazon...","[-0.015100935, -0.04661958, 0.035087906, 0.024..."
1145,1146,Storage Example:,"Assume you store 100 GB (107,374,182,400 bytes...","[0.03666824, -0.028377365, -0.0135443825, 0.01..."
33,34,How can I optimize costs for my application wi...,Use a combination of Reserved Instances for ba...,"[0.0154247, -0.03015373, -0.05171402, -0.00068..."
413,414,How do I purchase an RI?,"To get started, you can purchase an RI from th...","[0.03952223, -0.043768782, -0.013615112, -0.01..."


In [64]:
def search(input_keyword):
    # model = SentenceTransformer('all-mpnet-base-v2')
    # input_keyword = "Billing of Amazon EC2 systems begin and end?"
    vector_of_input_keyword = model.encode(input_keyword)

    query = {
        "field" : "ResponseVector",
        "query_vector" : vector_of_input_keyword,
        "k" : 3,
        "num_candidates" : 1500, 
    }

    res = es.knn_search(index="all_patterns_1500", knn=query , source=["pattern","response"])
    results = res["hits"]["hits"]

    return results




In [65]:
arr_of_actual_responses = rdf['response'].tolist()
arr_of_predicted_responses = []

for index, row in rdf.iterrows():
    result = search(row["pattern"])
    print(f"Pattern: {result[1]['_source']['pattern']} ")
    print(f"Response: {result[1]['_source']['response']}")
    arr_of_predicted_responses.append(result[1]['_source']['response'])

  res = es.knn_search(index="all_patterns_1500", knn=query , source=["pattern","response"])


Pattern: Do I need additional permissions to replicate objects from buckets with S3 Object Lock enabled? 
Response: Yes, to replicate objects from S3 Object Lock enabled buckets you need to grant two new permissions, s3:GetObjectRetention and s3:GetObjectLegalHold, on the source bucket in the IAM role that you use to set up replication. Alternatively, if the IAM role has an s3:Get* permission, it satisfies the requirement. For more information see the documentation on using S3 Object Lock with S3 Replication.
Pattern: Which AMIs are supported on Hpc7a instances? 
Response: Hpc7a instances support Amazon Linux 2, Amazon Linux, Ubuntu 18.04 or later, Red Hat Enterprise Linux 7.6 or later, SUSE Linux Enterprise Server 12 SP3 or later, CentOS 7 or later, and FreeBSD 11.1 or later.
Pattern: How do dense HDD-storage instances differ from Amazon EBS? 
Response: Amazon EBS offers simple, elastic, reliable (replicated), and persistent block level storage for Amazon EC2 while abstracting the det

In [66]:
# print(rdf['response'].tolist())
print(arr_of_predicted_responses)



In [67]:
def precision_for_k(arr_of_actual_responses, arr_of_predicted_responses, k):
    sum = 0

    for i in range(len(arr_of_actual_responses)-1):
        if (arr_of_actual_responses[i] == arr_of_predicted_responses[i]) :
            
            sum += 1
    precision = sum / k if k > 0 else 0
    print(f"precision : {precision}")

k = len(rdf.index)
precision_for_k(arr_of_actual_responses, arr_of_predicted_responses, k)
    

precision : 0.15139442231075698


In [70]:
def accuracy_for_k(arr_of_actual_responses, arr_of_predicted_responses):
    correct_predictions = sum(1 for actual, predicted in zip(arr_of_actual_responses, arr_of_predicted_responses) if type(actual) in [int, float] and type(predicted) in [int, float] and actual == predicted)
    total_predictions = sum(1 for actual in arr_of_actual_responses if type(actual) in [int, float])
    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    print(f"Accuracy: {accuracy}")

F1 Score: 0


In [71]:
def accuracy_for_k(arr_of_actual_responses, arr_of_predicted_responses):
    correct_predictions = sum(1 for actual, predicted in zip(arr_of_actual_responses, arr_of_predicted_responses) if actual == predicted)
    total_predictions = len(arr_of_actual_responses)
    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    print(f"Accuracy: {accuracy}")


accuracy_for_k(arr_of_actual_responses, arr_of_predicted_responses)

Accuracy: 0.5
