In [2]:
!pip install pandas




In [6]:
from datasets import load_dataset

dataset = load_dataset("boolq")

Downloading builder script:   0%|          | 0.00/3.38k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.60k [00:00<?, ?B/s]

Downloading and preparing dataset boolq/default to C:/Users/rheam/.cache/huggingface/datasets/boolq/default/0.1.0/bf0dd57da941c50de94ae3ce3cef7fea48c08f337a4b7aac484e9dddc5aa24e5...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/6.53M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

Dataset boolq downloaded and prepared to C:/Users/rheam/.cache/huggingface/datasets/boolq/default/0.1.0/bf0dd57da941c50de94ae3ce3cef7fea48c08f337a4b7aac484e9dddc5aa24e5. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [29]:
import Levenshtein
import xgboost as xgb
import numpy as np
import concurrent.futures
from datasets import load_dataset

# Task 1: Load the dataset
dataset = load_dataset("boolq")

# Function to calculate Levenshtein distance
def calculate_levenshtein_distance(input_string, example):
    question = example['question']
    answer = example['answer']
    distance = Levenshtein.distance(input_string.lower(), question.lower())
    return question, answer, distance

# Task 2: Retrieve a pair based on Levenshtein distance
def retrieve_pair(input_string, dataset):
    min_distance = float('inf')
    best_question = None
    best_answer = None

    for example in dataset:
        question, answer, distance = calculate_levenshtein_distance(input_string, example)
        
        if distance < min_distance:
            min_distance = distance
            best_question = question
            best_answer = answer

    return best_question, best_answer

# Task 3: Parallelize the search using Python threads
def retrieve_pairs_in_parallel(input_string, dataset):
    def calculate_distance_and_return_pair(example):
        return calculate_levenshtein_distance(input_string, example)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = list(executor.map(calculate_distance_and_return_pair, dataset))

    results.sort(key=lambda x: x[2])  # Sort by Levenshtein distance
    return results[0]  # Return the closest pair

# Task 4: Repeat the same using XGBoost
def rank_responses_with_xgboost(input_string, dataset):
    pairs = retrieve_pairs(input_string, dataset)
    X = np.array([distance for _, _, distance in pairs]).reshape(-1, 1)
    y = np.arange(len(pairs))

    model = xgb.XGBRegressor()
    model.fit(X, y)

    ranking = model.predict(X)

    pairs_with_ranking = [(question, answer, distance, rank) for (question, answer, distance), rank in zip(pairs, ranking)]
    pairs_with_ranking.sort(key=lambda x: x[3])  # Sort by ranking
    return pairs_with_ranking

# Task 5: Rank the responses using a combination of Levenshtein and XGBoost
def rank_responses_with_combination(input_string, dataset):
    pairs_levenshtein = retrieve_pairs(input_string, dataset)
    pairs_xgboost = rank_responses_with_xgboost(input_string, dataset)

    # Combine the rankings using a weighted sum
    weight_levenshtein = 0.6
    weight_xgboost = 0.4

    combined_ranking = {}
    for i, (question, answer, distance) in enumerate(pairs_levenshtein):
        combined_ranking[(question, answer)] = (i + 1) * weight_levenshtein

    for i, (_, _, _, rank) in enumerate(pairs_xgboost):
        question, answer, _, _ = pairs_xgboost[i]
        if (question, answer) in combined_ranking:
            combined_ranking[(question, answer)] += rank * weight_xgboost

    # Sort by combined ranking
    sorted_pairs = sorted(combined_ranking.items(), key=lambda x: x[1])

    return sorted_pairs

# User input
user_input = input("Enter a question: ")

# Task 2: Retrieve the closest pair using Levenshtein distance
best_question, best_answer = retrieve_pair(user_input, dataset['train'])

# Display the result from Task 2
if best_question:
    print("\nTask 2 - Closest question (Levenshtein):", best_question)
    print("Answer:", best_answer)
else:
    print("No matching question found.")

# Task 3: Retrieve the closest pair using Levenshtein distance with Python threads
best_question_thread, best_answer_thread, _ = retrieve_pairs_in_parallel(user_input, dataset['train'])

# Display the result from Task 3
if best_question_thread:
    print("\nTask 3 - Closest question (Levenshtein with threads):", best_question_thread)
    print("Answer:", best_answer_thread)
else:
    print("No matching question found.")

# Task 4: Rank responses using XGBoost
print("\nTask 4 - Ranking responses using XGBoost:")
ranked_responses_xgboost = rank_responses_with_xgboost(user_input, dataset['train'])
for i, (question, answer, distance, rank) in enumerate(ranked_responses_xgboost[:5], start=1):
    print(f"Rank {i}:")
    print(f"Question: {question}")
    print(f"Answer: {answer}")
    print(f"Levenshtein Distance: {distance}")
    print(f"Ranking: {rank}\n")

# Task 5: Rank responses using a combination of Levenshtein and XGBoost
print("\nTask 5 - Ranking responses using a combination of Levenshtein and XGBoost:")
ranked_responses_combined = rank_responses_with_combination(user_input, dataset['train'])
for i, ((question, answer), ranking) in enumerate(ranked_responses_combined[:5], start=1):
    print(f"Rank {i}:")
    print(f"Question: {question}")
    print(f"Answer: {answer}")
    print(f"Combined Ranking: {ranking}\n")


Found cached dataset boolq (C:/Users/rheam/.cache/huggingface/datasets/boolq/default/0.1.0/bf0dd57da941c50de94ae3ce3cef7fea48c08f337a4b7aac484e9dddc5aa24e5)


  0%|          | 0/2 [00:00<?, ?it/s]

Enter a question: 
  @media print {
    .ms-editor-squiggles-container {
      display:none !important;
    }
  }
  .ms-editor-squiggles-container {
    all: initial;
  }do iran and afghanistan speak the same language

Task 2 - Closest question (Levenshtein): do iran and afghanistan speak the same language
Answer: True

Task 3 - Closest question (Levenshtein with threads): do iran and afghanistan speak the same language
Answer: True

Task 4 - Ranking responses using XGBoost:
Rank 1:
Question: do iran and afghanistan speak the same language
Answer: True
Levenshtein Distance: 0
Ranking: 4.58074426651001

Rank 2:
Question: of all the points on the production possibilities curve only one point represents an efficient point
Answer: False
Levenshtein Distance: 82
Ranking: 351.5504455566406

Rank 3:
Question: did the mayans and aztecs speak the same language
Answer: False
Levenshtein Distance: 19
Ranking: 718.6289672851562

Rank 4:
Question: do chad and romania have the same flag
Answer: Fals