# Comparing LLM-generated questions with OTDB questions

This project examines how static quiz questions from the Open Trivia Database (OTDB) differ from dynamically generated questions from a Large Language Model (LLM). Through systematic comparison, the goal is to determine whether LLMs can be a useful supplement or alternative to existing quiz databases.

### 1. Categories and Questions from OTDB

In [None]:
from collections import Counter
from file_utils import load_questions_from_file
import matplotlib.pyplot as plt
import pandas as pd

# OTDB exports to analyse; each file name has the form "OTDB_<CATEGORY>.json".
files = ["OTDB_HISTORY.json", "OTDB_GENERAL_KNOWLEDGE.json", "OTDB_SCIENCE_NATURE.json"]

# Column-oriented table: one entry per file in every list.
difficulty_data = {"Category": [], "Easy": [], "Medium": [], "Hard": []}

for file in files:
    # Split only once so multi-word categories stay intact: splitting on every
    # underscore and taking element 1 would reduce "GENERAL_KNOWLEDGE" to
    # "GENERAL" and "SCIENCE_NATURE" to "SCIENCE".
    category = file.split("_", 1)[1].replace(".json", "").replace("_", " ")
    questions = load_questions_from_file(file)
    difficulties = [q.difficulty for q in questions]
    counts = Counter(difficulties)

    # Only the keys "easy", "medium" and "hard" are charted; a level that does
    # not occur in the file is recorded as 0.
    difficulty_data["Category"].append(category)
    difficulty_data["Easy"].append(counts.get("easy", 0))
    difficulty_data["Medium"].append(counts.get("medium", 0))
    difficulty_data["Hard"].append(counts.get("hard", 0))

df = pd.DataFrame(difficulty_data)

# Use the category as the index so it becomes the x-axis label of each bar.
df.set_index("Category", inplace=True)

# One stacked bar per category, split into the three difficulty levels.
df.plot(kind="bar", stacked=True, figsize=(10, 6))
plt.title("Difficulty Counts by Category")
plt.xlabel("Category")
plt.ylabel("Number of Questions")
plt.xticks(rotation=45)
plt.legend(title="Difficulty")
plt.tight_layout()
plt.show()

### 2. Prompting questions from GPT-4o

- give me 40 questions about history in easy difficulty, with each a correct answer and three incorrect answers
- give me 28 more
- give me 40 questions about history in medium difficulty, with each a correct answer and three incorrect answers
- give me 40 more
- give me 40 more
- give me 40 more
- give me 6 more
- give me 40 questions about history in hard difficulty, with each a correct answer and three incorrect answers
- give me 40 more

- give me 40 questions about general knowledge in easy difficulty, with each a correct answer and three incorrect answers
- give me 40 more

...............


### 3. Compare text similarity

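Cosine similarity measures how closely two vectors point in the same direction. It ranges from -1 to 1, where 1 means the vectors point in exactly the same direction, 0 means they are orthogonal (no similarity), and -1 means they are diametrically opposed. Because TF-IDF vectors contain only non-negative values, the scores computed below always lie between 0 and 1.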

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Four short sample questions used to illustrate the similarity computation.
questions = [
    "What is the capital of France?",
    "What is the largest country in the world?",
    "Who was the first president of the United States?",
    "Which planet is known as the Red Planet?",
]

# Learn the vocabulary from the samples and encode each one as a TF-IDF row.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(questions)

# Called with a single matrix, every row is compared with every row.
cosine_sim = cosine_similarity(tfidf_matrix)

print("Cosine Similarity Matrix:", cosine_sim, sep="\n")

Average Cosine Similarity is the mean similarity score across all pairs of items in the dataset.

To determine diversity, we actually want to measure dissimilarity, so we subtract the average similarity from 1. The formula
`1 − avg_simi` essentially converts a similarity measure into a diversity (or dissimilarity) measure.
- When avg_simi is close to 1 (high similarity), `1 − avg_simi` will be close to 0, indicating low diversity.
- When avg_simi is close to 0 (low similarity), `1 − avg_simi` will be close to 1, indicating high diversity.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from file_utils import load_questions_from_file
from metric_utils import calculate_diversity

otdbFiles = ["OTDB_HISTORY.json", "OTDB_GENERAL_KNOWLEDGE.json", "OTDB_SCIENCE_NATURE.json"]
gpt4oFiles = ["GPT4o_HISTORY_PROMT1.json", "GPT4o_GENERAL_KNOWLEDGE_PROMT1.json", "GPT4o_SCIENCE_NATURE_PROMT1.json"]


def _category_from_filename(filename):
    """Return the category encoded in a dataset file name.

    "OTDB_GENERAL_KNOWLEDGE.json" and "GPT4o_GENERAL_KNOWLEDGE_PROMT1.json"
    both map to "GENERAL KNOWLEDGE": the leading source token and a trailing
    "PROMT<n>" token are dropped, and the remaining tokens are joined with
    spaces. Taking only the first token after the prefix would truncate
    multi-word categories to "GENERAL" / "SCIENCE".
    """
    tokens = filename.replace(".json", "").split("_")[1:]
    if tokens and tokens[-1].startswith("PROMT"):
        tokens = tokens[:-1]
    return " ".join(tokens)


# category -> {column name -> diversity score}. Collecting per category first
# gives one row per category with both sources filled in, so the two bars of
# a category end up next to each other in the chart.
scores_by_category = {}

for file_list, source in zip([otdbFiles, gpt4oFiles], ["OTDB", "GPT4o"]):
    column = f"{source} Diversity"
    for file in file_list:
        questions = load_questions_from_file(file)
        category = _category_from_filename(file)

        questions_text = [q.question for q in questions]

        # NOTE(review): presumably 1 - average cosine similarity, as described
        # in the notebook text; confirm in metric_utils.
        diversity = calculate_diversity(questions_text)

        scores_by_category.setdefault(category, {})[column] = diversity

data = {
    "Category": [],
    "OTDB Diversity": [],
    "GPT4o Diversity": [],
}

for category, by_source in scores_by_category.items():
    data["Category"].append(category)
    # A category missing from one source is recorded as None (plotted as a gap).
    data["OTDB Diversity"].append(by_source.get("OTDB Diversity"))
    data["GPT4o Diversity"].append(by_source.get("GPT4o Diversity"))

df = pd.DataFrame(data)

fig, ax = plt.subplots(figsize=(10, 6))

# Grouped (not stacked) bars: one pair of bars per category.
df.plot(kind="bar", x="Category", y=["OTDB Diversity", "GPT4o Diversity"], ax=ax, stacked=False)

plt.title("Diversity Comparison by Category")
plt.xlabel("Category")
plt.ylabel("Diversity Score")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

Calculate the cosine similarity across the full datasets (OTDB and GPT-4o separately).

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from file_utils import load_questions_from_file
# Question files per data source; the two lists are ordered by the same
# categories (history, general knowledge, science & nature).
otdb_files = [
    "OTDB_HISTORY.json",
    "OTDB_GENERAL_KNOWLEDGE.json",
    "OTDB_SCIENCE_NATURE.json",
]
gpt4o_files = [
    "GPT4o_HISTORY_PROMT1.json",
    "GPT4o_GENERAL_KNOWLEDGE_PROMT1.json",
    "GPT4o_SCIENCE_NATURE_PROMT1.json",
]

def calculate_cosine_similarity_for_files(files):
    """Return the pairwise cosine-similarity matrix for all questions in *files*.

    The question texts of every file are pooled, in file order, into a single
    corpus, vectorised with TF-IDF (English stop words removed) and compared
    with each other.

    Args:
        files: Iterable of file names accepted by ``load_questions_from_file``.

    Returns:
        The result of ``cosine_similarity`` on the TF-IDF matrix: one row and
        one column per pooled question.
    """
    corpus = [
        entry.question
        for path in files
        for entry in load_questions_from_file(path)
    ]

    tfidf = TfidfVectorizer(stop_words='english').fit_transform(corpus)
    return cosine_similarity(tfidf)


# One question-by-question similarity matrix per data source.
otdb_similarity_matrix = calculate_cosine_similarity_for_files(files=otdb_files)
gpt4o_similarity_matrix = calculate_cosine_similarity_for_files(files=gpt4o_files)

def plot_similarity_heatmaps_side_by_side(similarity_matrix1, dataset_name1, similarity_matrix2, dataset_name2):
    """Show two similarity matrices as heatmaps in a single one-row figure.

    Args:
        similarity_matrix1: Matrix drawn in the left panel.
        dataset_name1: Label used in the left panel's title.
        similarity_matrix2: Matrix drawn in the right panel.
        dataset_name2: Label used in the right panel's title.
    """
    fig, axes = plt.subplots(1, 2, figsize=(20, 8))

    panels = (
        (axes[0], similarity_matrix1, dataset_name1),
        (axes[1], similarity_matrix2, dataset_name2),
    )
    for axis, matrix, label in panels:
        # Tick labels and cell annotations are switched off: with one
        # row/column per question they would be unreadable.
        sns.heatmap(matrix, annot=False, cmap="coolwarm", xticklabels=False, yticklabels=False, ax=axis)
        axis.set_title(f"Cosine Similarity Heatmap for {label}")
        axis.set_xlabel("Questions")
        axis.set_ylabel("Questions")

    plt.tight_layout()
    plt.show()

# Left panel: OTDB, right panel: GPT4o.
plot_similarity_heatmaps_side_by_side(
    similarity_matrix1=otdb_similarity_matrix,
    dataset_name1="OTDB Dataset",
    similarity_matrix2=gpt4o_similarity_matrix,
    dataset_name2="GPT4o Dataset",
)

Calculate the cosine similarity across the categories in the datasets.

In [None]:
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Question files per data source, listed in matching category order so that
# the two lists can be walked in lockstep.
otdb_files = [
    "OTDB_HISTORY.json",
    "OTDB_GENERAL_KNOWLEDGE.json",
    "OTDB_SCIENCE_NATURE.json",
]
gpt4o_files = [
    "GPT4o_HISTORY_PROMT1.json",
    "GPT4o_GENERAL_KNOWLEDGE_PROMT1.json",
    "GPT4o_SCIENCE_NATURE_PROMT1.json",
]

def calculate_cosine_similarity_for_file(file):
    """Return the pairwise cosine-similarity matrix for the questions in *file*.

    The question texts are vectorised with TF-IDF (English stop words
    removed) and every question is compared with every other one.

    Args:
        file: File name accepted by ``load_questions_from_file``.

    Returns:
        The result of ``cosine_similarity`` on the TF-IDF matrix: one row and
        one column per question in the file.
    """
    texts = [entry.question for entry in load_questions_from_file(file)]

    tfidf = TfidfVectorizer(stop_words='english').fit_transform(texts)
    return cosine_similarity(tfidf)


def plot_category_similarity_heatmaps(otdb_files, gpt4o_files, categories):
    """Plot OTDB and GPT4o similarity heatmaps for each category.

    The figure has one row per category; the left column shows the OTDB file
    and the right column the GPT4o file.

    Args:
        otdb_files: OTDB question files, ordered like *categories*.
        gpt4o_files: GPT4o question files, ordered like *categories*.
        categories: Display names used in the panel titles.

    The three sequences are walked in lockstep with ``zip``, so surplus
    entries in a longer sequence are ignored.
    """
    # squeeze=False keeps `axes` two-dimensional even for a single category;
    # by default a 1 x 2 grid comes back as a 1-D array and `axes[i, col]`
    # would raise an IndexError.
    fig, axes = plt.subplots(len(categories), 2, figsize=(16, 6 * len(categories)), squeeze=False)

    for i, (otdb_file, gpt4o_file, category) in enumerate(zip(otdb_files, gpt4o_files, categories)):
        # Column 0 is OTDB, column 1 is GPT4o.
        for col, (source, file) in enumerate((("OTDB", otdb_file), ("GPT4o", gpt4o_file))):
            similarity_matrix = calculate_cosine_similarity_for_file(file)

            ax = axes[i, col]
            sns.heatmap(similarity_matrix, annot=False, cmap="coolwarm", xticklabels=False, yticklabels=False, ax=ax)
            ax.set_title(f"{source}: {category}")
            ax.set_xlabel("Questions")
            ax.set_ylabel("Questions")

    plt.tight_layout()
    plt.show()

# Display names for the panel titles, in the same order as the file lists.
categories = ["History", "General Knowledge", "Science & Nature"]

# One row of heatmaps (OTDB | GPT4o) per category.
plot_category_similarity_heatmaps(
    otdb_files=otdb_files,
    gpt4o_files=gpt4o_files,
    categories=categories,
)

### 4. Text metrics

The word count is determined by extracting and counting all words in a question, while the sentence length is calculated by dividing the total number of characters by the number of words. To measure unique word count, distinct words are identified in each question after removing common English stop words. For each category, the total values are averaged across all questions, and these averages are used to compare the datasets visually through bar charts.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from file_utils import load_questions_from_file
from metric_utils import calculate_word_count_and_sentence_length
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re

def calculate_unique_words(questions):
    """Count the distinct non-stop-word tokens found across *questions*.

    Each text is lower-cased and split into ``\\w+`` tokens; tokens listed in
    ``ENGLISH_STOP_WORDS`` are discarded, and the remaining tokens are
    de-duplicated across all texts.

    Args:
        questions: Iterable of question strings.

    Returns:
        The number of distinct remaining tokens (0 for empty input).
    """
    vocabulary = {
        token
        for text in questions
        for token in re.findall(r'\b\w+\b', text.lower())
        if token not in ENGLISH_STOP_WORDS
    }
    return len(vocabulary)

otdb_files = ["OTDB_HISTORY.json", "OTDB_GENERAL_KNOWLEDGE.json", "OTDB_SCIENCE_NATURE.json"]
gpt4o_files = ["GPT4o_HISTORY_PROMT1.json", "GPT4o_GENERAL_KNOWLEDGE_PROMT1.json", "GPT4o_SCIENCE_NATURE_PROMT1.json"]


def _category_from_filename(filename):
    """Return the category encoded in a dataset file name.

    "OTDB_GENERAL_KNOWLEDGE.json" and "GPT4o_GENERAL_KNOWLEDGE_PROMT1.json"
    both map to "GENERAL KNOWLEDGE": the leading source token and a trailing
    "PROMT<n>" token are dropped, and the remaining tokens are joined with
    spaces. Taking only the first token after the prefix would truncate
    multi-word categories to "GENERAL" / "SCIENCE".
    """
    tokens = filename.replace(".json", "").split("_")[1:]
    if tokens and tokens[-1].startswith("PROMT"):
        tokens = tokens[:-1]
    return " ".join(tokens)


# Long-format table: one row per (category, data source) pair holding the
# per-question averages of each metric.
data = {
    "Category": [],
    "Word Count": [],
    "Sentence Length": [],
    "Unique Word Count": [],
    "Data Source": []
}

for file_list, source in zip([otdb_files, gpt4o_files], ["OTDB", "GPT4o"]):
    for file in file_list:
        questions = load_questions_from_file(file)
        # Both sources must yield the same label for the same category so the
        # rows can later be paired up by "Category".
        category = _category_from_filename(file)

        total_word_count = 0
        total_sentence_length = 0
        total_unique_word_count = 0
        question_count = len(questions)

        for question in questions:
            word_count, sentence_length = calculate_word_count_and_sentence_length(question.question)
            total_word_count += word_count
            total_sentence_length += sentence_length
            # Unique words are counted per question (single-element list),
            # not across the whole file.
            total_unique_word_count += calculate_unique_words([question.question])

        # Per-question averages; a file without questions raises
        # ZeroDivisionError here.
        avg_word_count = total_word_count / question_count
        avg_sentence_length = total_sentence_length / question_count
        avg_unique_word_count = total_unique_word_count / question_count

        data["Category"].append(category)
        data["Word Count"].append(avg_word_count)
        data["Sentence Length"].append(avg_sentence_length)
        data["Unique Word Count"].append(avg_unique_word_count)
        data["Data Source"].append(source)

df = pd.DataFrame(data)

fig, axes = plt.subplots(1, 3, figsize=(15, 6))


def _plot_metric_panel(metric, axis):
    """Draw grouped bars of *metric* per category and data source on *axis*.

    Returns the pivot table (categories as rows, data sources as columns)
    that was plotted.
    """
    pivot = df.pivot_table(index="Category", columns="Data Source", values=metric, aggfunc='mean')
    pivot.plot(kind="bar", ax=axis, stacked=False)
    axis.set_title(f"Average {metric} by Category")
    axis.set_xlabel("Category")
    axis.set_ylabel(f"Average {metric}")
    axis.tick_params(axis='x', rotation=45)
    return pivot


# One panel per metric, left to right.
df_pivot_word_count = _plot_metric_panel("Word Count", axes[0])
df_pivot_sentence_length = _plot_metric_panel("Sentence Length", axes[1])
df_pivot_unique_word_count = _plot_metric_panel("Unique Word Count", axes[2])

plt.tight_layout()
plt.show()