In [8]:
#for html requests and html parsing
import requests
from bs4 import BeautifulSoup

#ES
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

#for embedding text
from langchain_community.vectorstores import FAISS
import langchain.schema.document as d
from langchain_community.embeddings import HuggingFaceEmbeddings

import datetime

TEXT_CHUNK_SIZE = 500
CHUNK_OVERLAP_SIZE = 100

ELASTIC_PASSWORD = "RrfPwkaofSx_1m6rnmWd"
CERT_FINGERPRINT = "3ad62a2603d0fe2ecb51038ae775f0bc015e495984ce044ffe54f1722355a421"

client = Elasticsearch(
    "https://localhost:9200",
    ssl_assert_fingerprint=CERT_FINGERPRINT,
    basic_auth=("elastic", ELASTIC_PASSWORD)
)

# Grab wikipedia html content from source url
def get_wikipedia_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Error: could not get page: {url}")
        return None

# Parse paragraphs (text) and title from the Wikipedia page
def extract_paragraphs_and_title(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    paragraphs = []
    for p in soup.find_all('p'):
        paragraphs.append(p.get_text())
    for title in soup.find_all('title'):
        T = title.get_text()
    return paragraphs, T

#remove \n's from paragraphs
def split_paragraphs(paragraphs):
    new_paragraphs = []
    for p in paragraphs:
        split_paragraphs = p.split('\n')
        for text in split_paragraphs:
            if text != " " and text != "":
                new_paragraphs.append(text)
    return new_paragraphs

#chunk paragraphs into chunks of {chunk_size} with overlap of {overlap_size}
def chunk_paragraph(paragraph, chunk_size, overlap_size):
    chunks = []
    text_remaining = paragraph
    while len(text_remaining) > 0:
        if len(text_remaining) <= chunk_size:
            chunks.append(text_remaining)
            text_remaining = ""
        else:
            chunks.append(text_remaining[0:chunk_size])
            text_remaining = text_remaining[chunk_size-overlap_size:]
    return chunks

def initialize_embeddings_model():
    # Define the path to the pre-trained model you want to use
    modelPath = "sentence-transformers/all-MiniLM-l6-v2"

    # Create a dictionary with model configuration options, specifying to use the CPU for computations
    model_kwargs = {'device': 'cpu'}

    # Create a dictionary with encoding options, specifically setting 'normalize_embeddings' to False
    encode_kwargs = {'normalize_embeddings': False}

    # Initialize an instance of HuggingFaceEmbeddings with the specified parameters
    embeddings = HuggingFaceEmbeddings(
        model_name=modelPath,     # Provide the pre-trained model's path
        model_kwargs=model_kwargs, # Pass the model configuration options
        encode_kwargs=encode_kwargs # Pass the encoding options
    )

    return embeddings

In [9]:
# URL of the Wikipedia page
#url = 'https://en.wikipedia.org/wiki/Rules_of_chess'

url = 'https://en.wikipedia.org/wiki/Abraham_Lincoln'

# Get the Wikipedia page
html_content = get_wikipedia_page(url)

# Extract paragraphs from the Wikipedia page
paragraphs, title = extract_paragraphs_and_title(html_content)
paragraphs = split_paragraphs(paragraphs)
chunks = paragraphs 

#split up large paragraphs into chunks to keep pieces of text smaller
# chunks = []
# for p in paragraphs:
#     chunks.extend(chunk_paragraph(p, TEXT_CHUNK_SIZE, CHUNK_OVERLAP_SIZE))

index = 'abe11'
properties = {
    "my_vector": {
    "type": "dense_vector",
    "dims": 384,
    "similarity": "dot_product",
    "index": True
    # "index_options": {
    #     "type": "int8_hnsw"
    # }
    }
}

client.indices.create(index=index)
client.indices.put_mapping(index=index, properties=properties)

#Set up text embedding model:
embeddings_model = initialize_embeddings_model()

texts = [chunk for chunk in chunks]
embeddings = embeddings_model.embed_documents(texts)
text_embeddings = zip(texts, embeddings)

#create documents for ElasticSearch
docs_for_elasticsearch = []
for text, embedding in text_embeddings:
    doc = {
        '_index': index,
        '_source': {
            'game': 'abe',
            'text': text,
            'my_vector': embedding,
            'timestamp': datetime.datetime.now()
        }}
    docs_for_elasticsearch.append(doc)

#load documents into ElasticSearch
#ONLY DO THIS ONCE!!
response = bulk(client, docs_for_elasticsearch)

#create documents for Faiss
metadata = {
    'source': url, 
    'title': title,
    'date_time': datetime.datetime.now()
}

docs_for_faiss = []
for chunk in chunks:
    docs_for_faiss.append(d.Document(page_content=chunk, metadata=metadata))

#load documents into FAISS vector database
db = FAISS.from_documents(docs_for_faiss, embeddings_model)
pass

# texts = [chunk for chunk in chunks]
# embeddings = embeddings_model.embed_documents(texts)
# text_embeddings = zip(texts, embeddings)
# vs = FAISS.from_embeddings(text_embeddings=text_embeddings, embedding=embeddings_model)


In [12]:
#Grab the relevant material using Elasticsearch
user_question = input("Ask a question about Chess: \n\n")
embedded_question = embeddings_model.embed_documents([user_question])[0]

#simple match query
# query = {
#         "query": {
#             "match": {
#             "text": {
#                 "query": user_question,
#                 "minimum_should_match": "10%"
#             }
#             }
#         }}

#vector search  (knn)
search = {
  "knn": {
    "field": "my_vector",
    "query_vector": embedded_question,
    "k": 3,
    "num_candidates": 100
  },
  "fields": [ "text" ]
}

response = client.search(index=index, body=search)
hits = response['hits']['hits']
print(hits[0])
print(hits[1])

[-0.014449123293161392, -0.010868005454540253, -0.06939546018838882, -0.010190864093601704, 0.026681266725063324, 0.03655420243740082, 0.007254820317029953, 0.035325612872838974, -0.02664947882294655, 0.05028340592980385, -0.062334079295396805, 0.04161935672163963, 0.028474239632487297, -0.04931013286113739, -0.013334620743989944, 0.038766514509916306, -0.03668598830699921, 0.06074642390012741, -0.0626155436038971, 0.0033896572422236204, -0.04843016713857651, 0.020454073324799538, -0.03629830479621887, -0.10133671015501022, 0.03322215378284454, 0.02208683453500271, 0.017662860453128815, -0.12715838849544525, -0.04115118831396103, 0.03895566985011101, 0.024972060695290565, -0.13958391547203064, -0.04802137613296509, -0.046361032873392105, -0.021989671513438225, -0.04259340465068817, 0.039524469524621964, 0.04601171612739563, -0.004135776776820421, -0.025760438293218613, -0.052171364426612854, 0.047681279480457306, 0.0027033116202801466, 0.06249779090285301, 0.06092188507318497, -0.03937

In [5]:
#Q and A Session
while True:
    user_question = input("Ask a question about Chess: \n\n")
    
    #Grab the relevant material using FAISS
    searchDocs = vs.similarity_search(user_question)
    #print the two most relevant pieces of text:
    print("\nBelow are the two most relevant answers using FAISS vector search: ")
    print("\nTEXT 1:")
    print(searchDocs[0].page_content)
    print("\nTEXT 2:")
    print(searchDocs[1].page_content)

    #Grab the relevant material using Elasticsearch
    query = {
        "query": {
            "match": {
            "text": {
                "query": user_question,
                "minimum_should_match": "10%"
            }
            }
        }}

    index = 'abe4'
    response = client.search(index=index, body=query)
    #print the two most relevant pieces of text:
    num_results = len(response["hits"]["hits"])
    if num_results == 0:
        print("\nElasticsearch did not return any results.\n")
    else:
        print("\nBelow are the two most relevant answers using ElasticSearch: \n")
        print("\nTEXT 1:")
        print(response["hits"]["hits"][0])
        if num_results > 1:
            print("\nTEXT 2:")
            print(response["hits"]["hits"][1])
    print('\n')

NameError: name 'vs' is not defined