In [42]:
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_community.embeddings import GPT4AllEmbeddings

from tqdm.autonotebook import tqdm
# Embedding model instance, built with default constructor arguments. Later cells
# pass it to Chroma for indexing and call embed_query() on it for the search query.
gpt4all_embd = GPT4AllEmbeddings()

In [43]:
# Path to the Gherkin feature file (relative, so resolved against the working directory)
feature_file_path = 'sample.feature'

# Read the whole file as text. The encoding is given explicitly so the decoded
# text does not depend on the platform's default locale encoding.
with open(feature_file_path, 'r', encoding='utf-8') as feature_file:
    content = feature_file.read()

# Print the content of the file
print(content)

Feature: Login
  As a new user
  I want to log in to the website
  So that the system can remember my data

  Scenario #1: Successful Log in to the website
    Given A user brings up the login pop-up
    When A user clicks Sign-in
    And A user enters a valid email <email> and password <password>
    And A user clicks Sign-in
    Then A user should be successfully logged into the site

  Scenario #2: Unsuccessful Log in to the website
    Given A user brings up the login pop-up
    When A user enters an invalid email <email> and password <password>
    And A user clicks Sign-in
    Then A user should not be successfully logged into the site

  Scenario #3: Simple Google search
    Given a web browser is on the Google page
    When the search phrase "panda" is entered
    Then results for "panda" are shown
    And the result page displays the text

  Scenario #4: another Simple Google search
    Given a web browser is on the Google page
    When the search phrase "<phrase>" is entered


In [45]:
# Wrap the raw feature text in a single LangChain Document and print it.
# The later visible cells split `content` directly and do not reference `document`.
document = Document(page_content=content)
print(document)

page_content='Feature: Login
  As a new user
  I want to log in to the website
  So that the system can remember my data

  Scenario #1: Successful Log in to the website
    Given A user brings up the login pop-up
    When A user clicks Sign-in
    And A user enters a valid email <email> and password <password>
    And A user clicks Sign-in
    Then A user should be successfully logged into the site

  Scenario #2: Unsuccessful Log in to the website
    Given A user brings up the login pop-up
    When A user enters an invalid email <email> and password <password>
    And A user clicks Sign-in
    Then A user should not be successfully logged into the site

  Scenario #3: Simple Google search
    Given a web browser is on the Google page
    When the search phrase "panda" is entered
    Then results for "panda" are shown
    And the result page displays the text

  Scenario #4: another Simple Google search
    Given a web browser is on the Google page
    When the search phrase "<phrase

In [46]:
# Metadata key under which the splitter stores the text that follows "Scenario"
# on a header line (e.g. "#1: Successful Log in to the website").
scenario_metadata_key = "Scenario Outline"

# Treat every line that starts with the word "Scenario" as a section header.
headers_to_split_on = [
    ("Scenario", scenario_metadata_key)
]

# strip_headers=False keeps the "Scenario ..." line inside each chunk's page_content.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on, strip_headers=False
)

md_header_splits = markdown_splitter.split_text(content)

# Keep only the chunks that sit under a Scenario header. Text before the first
# header (the "Feature: ..." preamble) carries no header metadata, so it is dropped
# even when tags or comment lines precede the "Feature:" line.
filtered_documents = [doc for doc in md_header_splits if scenario_metadata_key in doc.metadata]
filtered_documents

[Document(metadata={'Scenario Outline': '#1: Successful Log in to the website'}, page_content='Scenario #1: Successful Log in to the website\nGiven A user brings up the login pop-up\nWhen A user clicks Sign-in\nAnd A user enters a valid email <email> and password <password>\nAnd A user clicks Sign-in\nThen A user should be successfully logged into the site'),
 Document(metadata={'Scenario Outline': '#2: Unsuccessful Log in to the website'}, page_content='Scenario #2: Unsuccessful Log in to the website\nGiven A user brings up the login pop-up\nWhen A user enters an invalid email <email> and password <password>\nAnd A user clicks Sign-in\nThen A user should not be successfully logged into the site'),
 Document(metadata={'Scenario Outline': '#3: Simple Google search'}, page_content='Scenario #3: Simple Google search\nGiven a web browser is on the Google page\nWhen the search phrase "panda" is entered\nThen results for "panda" are shown\nAnd the result page displays the text'),
 Document(m

In [47]:
# Create a vector store from the scenario documents, embedding them with gpt4all_embd.
# NOTE(review): no collection name or persist directory is passed, so the library
# defaults apply; presumably re-running this cell without first running the
# delete_collection cleanup cell adds the same scenarios again — confirm.
vectorstore = Chroma.from_documents(documents=filtered_documents, embedding=gpt4all_embd)


In [48]:
# Expose the vector store through the retriever interface, using default settings.
# The visible cells below query `vectorstore` directly and do not use `retriever`.
retriever = vectorstore.as_retriever()

In [49]:
# Free-text query used for the similarity searches in the following cells.
query = "Scenario where my email is not registered and I try to login"

In [50]:
# Embed the query text once, then ask the vector store for the single closest scenario.
query_vector = gpt4all_embd.embed_query(query)
results = vectorstore.similarity_search_by_vector(
    embedding=query_vector,
    k=1,
)

# Show the matched document(s); the trailing expression displays how many came back.
print(results)
len(results)

[Document(metadata={'Scenario Outline': '#2: Unsuccessful Log in to the website'}, page_content='Scenario #2: Unsuccessful Log in to the website\nGiven A user brings up the login pop-up\nWhen A user enters an invalid email <email> and password <password>\nAnd A user clicks Sign-in\nThen A user should not be successfully logged into the site')]


1

In [51]:
# Demonstrate how to calculate the similarity scores manually:
# cosine similarity between the query embedding and each scenario embedding.
query_embedding = gpt4all_embd.embed_query(query)

# Embed the scenario texts with the document-side batch API (embed_documents),
# the counterpart of embed_query in the embeddings interface, instead of calling
# the query-side method once per document.
document_embeddings = gpt4all_embd.embed_documents(
    [doc.page_content for doc in filtered_documents]
)

# cosine_similarity takes 2-D inputs, so the single query vector is wrapped in a list;
# flatten() turns the resulting one-row matrix into one score per document.
similarity_scores = cosine_similarity([query_embedding], document_embeddings).flatten()

# Print each document with its similarity score (same order as filtered_documents)
for doc, score in zip(filtered_documents, similarity_scores):
    print(f"Document: {doc}\nSimilarity Score: {score}\n")

Document: page_content='Scenario #1: Successful Log in to the website
Given A user brings up the login pop-up
When A user clicks Sign-in
And A user enters a valid email <email> and password <password>
And A user clicks Sign-in
Then A user should be successfully logged into the site' metadata={'Scenario Outline': '#1: Successful Log in to the website'}
Similarity Score: 0.4815506524586895

Document: page_content='Scenario #2: Unsuccessful Log in to the website
Given A user brings up the login pop-up
When A user enters an invalid email <email> and password <password>
And A user clicks Sign-in
Then A user should not be successfully logged into the site' metadata={'Scenario Outline': '#2: Unsuccessful Log in to the website'}
Similarity Score: 0.5796955321258643

Document: page_content='Scenario #3: Simple Google search
Given a web browser is on the Google page
When the search phrase "panda" is entered
Then results for "panda" are shown
And the result page displays the text' metadata={'Scen

In [52]:
# Clean up: delete the collection that backs `vectorstore`.
# Called on the instance rather than through the class with the instance passed explicitly.
vectorstore.delete_collection()