### Setting up Environment

In [1]:
import os
import sys
import pickle
import pandas as pd

sys.path.append(os.path.abspath(os.path.join("..")))
from src.utils import prepare_text, clean_text

# Absolute path of the "data" directory that sits one level above the
# current working directory (resolved once, when this cell runs).
data_dir = os.path.abspath(os.path.join("..", "data"))
# CSV file inside data_dir; find_answer_by_id reads a file with this same name.
output_path = os.path.join(data_dir, "cleaned_gig_docs.csv")
# Pickle file inside data_dir; the term-matching functions below load it and
# iterate over it as a mapping of doc id -> per-document token counter.
pickle_path = os.path.join(data_dir, "gig_docs_tokenized.pkl")

### Simple keyword matching

Here, we count how often the terms of the user's question occur in each stored question/answer document. We then return the answer from the document with the highest count.

In [2]:
# Resolve a user's question to an answer snippet via raw term-frequency matching
def find_answer(user_question):
    """Return the answer snippet for the best term-frequency match.

    The matching document id is chosen by ``term_frequency_matching``, looked
    up with ``find_answer_by_id``, and printed before the snippet is returned.
    """
    answer_doc_id = term_frequency_matching(user_question)
    snippet = find_answer_by_id(answer_doc_id)
    print(f"Answer ID: {answer_doc_id}")
    return snippet

def term_frequency_matching(user_question):
    """Return the id of the document whose token counts best match the question.

    Each document is scored by summing, for every token that ``prepare_text``
    produces for the question, that token's count in the document.

    Args:
        user_question: Raw question text.

    Returns:
        The doc id with the highest positive score (the first one encountered
        wins ties), or ``None`` when the data is empty or no document shares
        any token with the question.
    """
    question_tokens = prepare_text(user_question)

    # Load the doc id -> token counter mapping from the pickle file.
    # pickle.load can execute arbitrary code; only load files this project wrote.
    with open(pickle_path, 'rb') as f:
        tokenized_data = pickle.load(f)

    # Start at 0 rather than -1: with -1 a document scoring 0 (no shared
    # tokens at all) would still be accepted, so an unrelated question
    # silently returned the first document in the mapping.
    best_score = 0
    best_answer_id = None

    for doc_id, doc_counter in tokenized_data.items():
        # .get covers both Counter and plain-dict values.
        score = sum(doc_counter.get(token, 0) for token in question_tokens)

        if score > best_score:
            best_score = score
            best_answer_id = doc_id

    return best_answer_id

# Look up an answer snippet by its document ID
def find_answer_by_id(doc_id):
    """Return the 'Answer_Snippet' value for ``doc_id``, or None if it is absent.

    The CSV file is re-read from ``data_dir`` on every call and indexed by
    its 'Doc_ID' column.
    """
    csv_path = os.path.join(data_dir, "cleaned_gig_docs.csv")
    docs = pd.read_csv(csv_path, index_col='Doc_ID')

    # Unknown ids (including None) yield no answer instead of a KeyError.
    if doc_id not in docs.index:
        return None
    return docs.loc[doc_id, 'Answer_Snippet']


# Example usage: run the same lookup for each sample question
for user_question in ("How do I connect to SQL?", "What is NGR?"):
    best_answer = find_answer(user_question)

    if best_answer:
        print(f"Best answer found: {best_answer}")

Answer ID: 15
Best answer found: SQL Connectivity is a service allowing partners' business and data analysts to gain full access to their transactional data and query it as required using dedicated instances.
Answer ID: 26
Best answer found: Net Gaming Revenue (NGR) is calculated as Gross Gaming Revenue minus bonuses and adjustments.


Scoring by the raw count of token matches favors longer question/answer documents, because every repeated occurrence of a term adds to the score. A simple change is to count each matching term only once.

In [3]:
# Resolve a user's question to an answer snippet via unique-term matching
def find_answer(user_question):
    """Return the answer snippet for the best unique-term match.

    The matching document id is chosen by ``unique_term_frequency_matching``,
    looked up with ``find_answer_by_id``, and printed before the snippet is
    returned.
    """
    answer_doc_id = unique_term_frequency_matching(user_question)
    snippet = find_answer_by_id(answer_doc_id)
    print(f"Answer ID: {answer_doc_id}")
    return snippet

def unique_term_frequency_matching(user_question):
    """Return the id of the document sharing the most distinct tokens with the question.

    Each document is scored by the number of distinct question tokens (as
    produced by ``prepare_text``) that appear among the document's keys, so
    repeated terms do not inflate the score.

    Args:
        user_question: Raw question text.

    Returns:
        The doc id with the highest positive score (the first one encountered
        wins ties), or ``None`` when the data is empty or no document shares
        any token with the question.
    """
    question_unique_tokens = set(prepare_text(user_question))

    # Load the doc id -> token counter mapping from the pickle file.
    # pickle.load can execute arbitrary code; only load files this project wrote.
    with open(pickle_path, 'rb') as f:
        tokenized_data = pickle.load(f)

    # Start at 0 rather than -1: with -1 a document with zero shared tokens
    # would still be accepted, so an unrelated question silently returned the
    # first document in the mapping.
    best_score = 0
    best_answer_id = None

    for doc_id, doc_counter in tokenized_data.items():
        # Membership test against the mapping's keys; same count as the size
        # of the set intersection, without building a new set per document.
        score = sum(1 for token in question_unique_tokens if token in doc_counter)

        if score > best_score:
            best_score = score
            best_answer_id = doc_id

    return best_answer_id

# Example usage: run the same lookup for each sample question
for user_question in ("How do I connect to SQL?", "What is NGR?"):
    best_answer = find_answer(user_question)

    if best_answer:
        print(f"Best answer found: {best_answer}")

Answer ID: 56
Best answer found: Authorised users can connect using several options,  including ODBC/JDBC.
Answer ID: 2
Best answer found: Players are ranked from 1 to 5 based on KPIs like Monthly Total Deposit,  Turnover,  NGR,  Bonus Turned Real,  and Active Days,  processed for the previous 1-month and 3-months activity.


### A more complex search

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Path to the count-vector pickle bundle inside data_dir.
output_pickle_path = os.path.join(data_dir, "countvector_data.pkl")

# Resolve a user's question to an answer snippet via count-vector cosine similarity
def find_answer(user_question):
    """Return the answer snippet for the document picked by ``count_vector_matching``."""
    return find_answer_by_id(count_vector_matching(user_question))

def count_vector_matching(user_question, bundle_path=None):
    """Return the doc id whose count vector is most cosine-similar to the question.

    Args:
        user_question: Raw question text; it is passed through ``clean_text``
            before being transformed by the stored vectorizer.
        bundle_path: Optional path to the pickle bundle holding the
            "vectorizer", "matrix" and "doc_ids" entries. Defaults to
            ``countvector_data.pkl`` inside ``data_dir``.

    Returns:
        The entry of ``doc_ids`` at the most similar row, or ``None`` when no
        document has a positive similarity to the question.
    """
    # Resolve the path locally instead of reading the module-level
    # ``output_pickle_path``: a later cell rebinds that name to a different
    # file, which would silently change what this function loads.
    if bundle_path is None:
        bundle_path = os.path.join(data_dir, "countvector_data.pkl")

    # pickle.load can execute arbitrary code; only load files this project wrote.
    with open(bundle_path, "rb") as f:
        countvector_data = pickle.load(f)

    vectorizer = countvector_data["vectorizer"]
    countvector_matrix = countvector_data["matrix"]
    doc_ids = countvector_data["doc_ids"]

    question_cleaned = clean_text(user_question)
    query_vector = vectorizer.transform([question_cleaned])

    similarities = cosine_similarity(query_vector, countvector_matrix).flatten()
    best_idx = np.argmax(similarities)
    best_score = similarities[best_idx]

    # argmax returns index 0 when every similarity is zero, which would
    # report an unrelated document as the best match; treat that as "no match".
    best_doc_id = doc_ids[best_idx] if best_score > 0 else None

    print(f"Best Doc_ID: {best_doc_id} | Score: {best_score:.3f}")
    return best_doc_id

# Example usage: run the same lookup for each sample question
for user_question in ("How do I connect to SQL?", "What is NGR?"):
    best_answer = find_answer(user_question)

    if best_answer:
        print(f"Best answer found: {best_answer}")
    

Best Doc_ID: 56 | Score: 0.417
Best answer found: Authorised users can connect using several options,  including ODBC/JDBC.
Best Doc_ID: 26 | Score: 0.471
Best answer found: Net Gaming Revenue (NGR) is calculated as Gross Gaming Revenue minus bonuses and adjustments.


In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Path to a TF-IDF pickle bundle inside data_dir. This rebinds the module-level
# name output_pickle_path that was already assigned in the previous cell.
# NOTE(review): tf_idf_matching in this cell opens a file named tfidf_data.pkl
# and does not read this variable — confirm which file is intended.
output_pickle_path = os.path.join(data_dir, "tfidf_bundle.pkl")

# Resolve a user's question to an answer snippet via TF-IDF cosine similarity
def find_answer(user_question):
    """Return the answer snippet for the document picked by ``tf_idf_matching``."""
    return find_answer_by_id(tf_idf_matching(user_question))

def tf_idf_matching(user_question, bundle_path=None):
    """Return the doc id whose TF-IDF vector is most cosine-similar to the question.

    Args:
        user_question: Raw question text; it is passed through ``clean_text``
            before being transformed by the stored vectorizer.
        bundle_path: Optional path to the pickle bundle holding the
            "vectorizer", "matrix" and "doc_ids" entries. Defaults to
            ``tfidf_data.pkl`` inside ``data_dir``.

    Returns:
        The entry of ``doc_ids`` at the most similar row, or ``None`` when no
        document has a positive similarity to the question.
    """
    # Build the path from data_dir like the rest of the notebook, rather than
    # a hard-coded "../data/..." string that depends on the working directory
    # at call time.
    if bundle_path is None:
        bundle_path = os.path.join(data_dir, "tfidf_data.pkl")

    # pickle.load can execute arbitrary code; only load files this project wrote.
    with open(bundle_path, "rb") as f:
        tfidf_data = pickle.load(f)

    vectorizer = tfidf_data["vectorizer"]
    tfidf_matrix = tfidf_data["matrix"]
    doc_ids = tfidf_data["doc_ids"]

    question_cleaned = clean_text(user_question)
    query_vector = vectorizer.transform([question_cleaned])

    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    best_idx = np.argmax(similarities)
    best_score = similarities[best_idx]

    # argmax returns index 0 when every similarity is zero, which would
    # report an unrelated document as the best match; treat that as "no match".
    best_doc_id = doc_ids[best_idx] if best_score > 0 else None

    print(f"Best Doc_ID: {best_doc_id} | Score: {best_score:.3f}")
    return best_doc_id

# Example usage: run the same lookup for each sample question
for user_question in ("How do I connect to SQL?", "What is NGR?"):
    best_answer = find_answer(user_question)

    if best_answer:
        print(f"Best answer found: {best_answer}")

Best Doc_ID: 56 | Score: 0.505
Best answer found: Authorised users can connect using several options,  including ODBC/JDBC.
Best Doc_ID: 26 | Score: 0.439
Best answer found: Net Gaming Revenue (NGR) is calculated as Gross Gaming Revenue minus bonuses and adjustments.
