In [5]:
import pandas as pd
import numpy as np 
import json

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [7]:
import re

In [8]:
questions_df = pd.read_csv('clearfeed_qa_pairs.csv')


In [9]:
questions_df.head()

Unnamed: 0,question,answer,url
0,I want to set up an automation that sends a Sl...,Sure! Here's a step-by-step guide to set up th...,https://docs.clearfeed.ai/clearfeed-help-cente...
1,I'm trying to integrate ClearFeed with an exte...,"Sure, I'd be happy to help you with the integr...",https://docs.clearfeed.ai/clearfeed-help-cente...
2,I'm trying to integrate ClearFeed with Jira Se...,Absolutely! Here's a step-by-step guide to int...,https://docs.clearfeed.ai/clearfeed-help-cente...
3,I'm trying to set up ticketing for my internal...,Sure! Here's a step-by-step guide to enable ti...,https://docs.clearfeed.ai/clearfeed-help-cente...
4,I'm trying to integrate ClearFeed with Microso...,"To integrate ClearFeed with Microsoft Teams, y...",https://docs.clearfeed.ai/clearfeed-help-cente...


In [10]:
with open('Clearfeed_kb.json', 'r') as f:
    doc_data = json.load(f)

In [11]:
def clean_text(input_text):
    input_text = input_text.lower()
    input_text = re.sub(r'[^\w\s]', '', input_text) 
    return input_text

In [12]:
questions_df['processed_question'] = questions_df['question'].apply(clean_text)


In [13]:
doc_content = {url: clean_text(data['title'] + " " + data['text']) for url, data in doc_data.items()}


In [14]:
doc_urls = list(doc_content.keys())
doc_texts = list(doc_content.values())

In [15]:
text_vectorizer = TfidfVectorizer(max_features=2000, stop_words='english')


In [16]:
doc_tfidf_matrix = text_vectorizer.fit_transform(doc_texts)


In [17]:
def top_5_matches(query, vectorizer, tfidf_matrix, url_list, num_results=5):
    
    query_vector = vectorizer.transform([clean_text(query)])
    
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()
    
    top_indices = similarity_scores.argsort()[-num_results:][::-1]
    
    top_urls = [url_list[i] for i in top_indices]
    return top_urls

In [20]:
example_query = questions_df['question'].iloc[55]
top_urls_example = top_5_matches(example_query, text_vectorizer, doc_tfidf_matrix, doc_urls)
print("Example Query:", example_query)
print("Top Matching URLs:", top_urls_example)

IndexError: single positional indexer is out-of-bounds

In [21]:
examplequery = questions_df['question'].iloc[15]
top_urls_example = top_5_matches(example_query, text_vectorizer, doc_tfidf_matrix, doc_urls)
print("Example Query:", example_query)
print("Top Matching URLs:", top_urls_example)

Example Query: I'm trying to integrate ClearFeed with Jira Service Management (JSM) for our support team. Can you guide me through the process, including the installation and configuration steps?
Top Matching URLs: ['https://docs.clearfeed.ai/clearfeed-help-center/integrations/jira-service-management', 'https://docs.clearfeed.ai/clearfeed-help-center/integrations/jira', 'https://docs.clearfeed.ai/clearfeed-help-center/getting-started/for-internal-support', 'https://docs.clearfeed.ai/clearfeed-help-center/integrations/jira/escalating-to-jira', 'https://docs.clearfeed.ai/clearfeed-help-center/getting-started/for-customer-support']


In [26]:
def evaluate_search_accuracy(data, vectorizer, tfidf_matrix, url_list):
   
    correct_count = 0

    for _, row in data.iterrows():
        question = row['question']
        correct_url = row['URL']
        
        retrieved_urls = top_5_matches(question, vectorizer, tfidf_matrix, url_list)
        
        if correct_url in retrieved_urls:
            correct_count += 1

    precision5 = correct_count / len(data)
    return precision5


In [27]:
if 'URL' in questions_df.columns:
    search_accuracy = evaluate_search_accuracy(questions_df, text_vectorizer, doc_tfidf_matrix, doc_urls)
    print(f"Search Accuracy (Precision@5): {search_accuracy:.2f}")
else:
    print("The column 'URL' is missing in the DataFrame!")

The column 'URL' is missing in the DataFrame!
