In [125]:
import json
import pathlib
import textwrap
import google.generativeai as genai

In [126]:
import os
# Configure the Gemini client with the API key read from the environment
# (a missing GEMINI_API_KEY raises KeyError here), then build the model
# handle that the generation loop calls.
genai.configure(
    api_key=os.environ["GEMINI_API_KEY"],
)
model = genai.GenerativeModel("gemini-pro")

In [127]:
# Prompt pieces sent to the model around each textbook snippet, in the order
# init_prompt, snippet, end_prompt.
# init_prompt states the task: one concept question with 4 options, an answer
# and a reason.
init_prompt = """You have been provided with a snippet of text from a textbook. Your task is to create a question out of one of the concepts explained in the text. Provide 4 multiple choice options and also provide an answer and a reason for the correct answer. The question should be designed to test the reader's understanding of the concept. """
# end_prompt shows the expected response format (a single JSON object) and
# lists the kinds of trivia questions to avoid. The example's correct option
# must itself be right, so it is written with parentheses: FP/(FP+TN).
end_prompt = """Here's an example of what the response format would look like:
{"Question": "What is the formula for false positive rate?", "Options": {"1": "FP/(TN+TP)", "2": "FN/(FN+FP)", "3": "FP/(FN+FP)", "4": "FP/(FP+TN)"}, "Answer": "4", "Reason": "It is calculated as the number of false positives divided by total number of actual negatives."}

Remember, the question should be designed to test the reader's understanding of the concept. Don't ask things like "What is the first word of the second paragraph?" or "What is the name of the author?" or "What is the primary source of information for the book?". These questions are not related to the concepts explained in the text and are not useful for testing the reader's understanding of the material. """

In [128]:
from pdfminer.high_level import extract_text
# One entry per source PDF, as consumed by the generation loop:
# (path to the PDF, character offset where the first snippet starts,
#  snippet length in characters — also the step between snippet starts).
books = [
    ('data/thinkstats.pdf', 14571, 20000),
    ('data/evaluating-machine-learning-models.pdf', 2116, 10000),
]

In [131]:
from ast import literal_eval
# Ask the model for several questions per snippet of every book.
# Responses that parse into a dict are collected in `questions`; the text of
# every other response is kept in `questions_raw` for later inspection.
QUESTIONS_PER_SNIPPET = 10
# Errors json.loads / literal_eval can raise on malformed model output.
# Deliberately not a bare `except`, so KeyboardInterrupt still stops the run.
PARSE_ERRORS = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)

questions = []
questions_raw = []
for book in books:
    pdf_path, first_char, snippet_len = book
    print("Exploring book: ", pdf_path)
    text = extract_text(pdf_path)
    # Snippets start at `first_char` and advance by `snippet_len`; the total
    # shown in the progress message is counted from that same range so it
    # accounts for the skipped leading characters.
    snippet_starts = range(first_char, len(text), snippet_len)
    for index, char_start in enumerate(snippet_starts):
        print(f"Generating questions for snippet: {index+1} of {len(snippet_starts)}")
        snippet = text[char_start:char_start+snippet_len]
        prompt = init_prompt + "\n\n" + snippet + "\n\n" + end_prompt
        for q_n in range(QUESTIONS_PER_SNIPPET):  # Several attempts per snippet
            print(f"Generating question: {q_n+1} of {QUESTIONS_PER_SNIPPET}")
            response = model.generate_content(prompt)
            # Read the text exactly once. The `.text` accessor is documented to
            # raise ValueError when the response has no usable text part (e.g. a
            # blocked response) — TODO confirm against the installed SDK version.
            # Such a response has nothing to parse or keep, so skip it.
            try:
                raw_text = response.text
            except ValueError:
                print("Response contained no text; skipping")
                continue

            # Try strict JSON first, then a Python literal (single-quoted dicts).
            parsed = None
            for parser in (json.loads, literal_eval):
                try:
                    parsed = parser(raw_text)
                    break
                except PARSE_ERRORS:
                    continue

            # Only dicts are usable downstream (entries are indexed by "Answer");
            # anything else, including bare literals such as `4`, is kept as raw.
            if isinstance(parsed, dict):
                questions.append(parsed)
            else:
                questions_raw.append(raw_text)

Exploring book:  data/thinkstats.pdf
Generating questions for snippet: 1 of 11
Generating question: 1 of 10
Generating question: 2 of 10
Generating question: 3 of 10
Generating question: 4 of 10
Generating question: 5 of 10
Generating question: 6 of 10
Generating question: 7 of 10
Generating question: 8 of 10
Generating question: 9 of 10
Generating question: 10 of 10
Generating questions for snippet: 2 of 11
Generating question: 1 of 10
Generating question: 2 of 10
Generating question: 3 of 10
Generating question: 4 of 10
Generating question: 5 of 10
Generating question: 6 of 10
Generating question: 7 of 10
Generating question: 8 of 10
Generating question: 9 of 10
Generating question: 10 of 10
Generating questions for snippet: 3 of 11
Generating question: 1 of 10
Generating question: 2 of 10
Generating question: 3 of 10
Generating question: 4 of 10
Generating question: 5 of 10
Generating question: 6 of 10
Generating question: 7 of 10
Generating question: 8 of 10
Generating question: 9 

In [150]:
# Merge the newly generated `questions` into questions.json.
QUESTIONS_FILE = 'questions.json'

if os.path.exists(QUESTIONS_FILE):
    # File exists, read the existing data
    with open(QUESTIONS_FILE, 'r', encoding='utf-8') as f:
        try:
            existing_qs = json.load(f)
        except json.JSONDecodeError:
            # File is empty or corrupted, start with an empty list
            existing_qs = []
else:
    # File doesn't exist, start with an empty list
    existing_qs = []

# Valid JSON that is not a list cannot be appended to; fail with a clear
# message before the file is opened for writing, so its content is untouched.
if not isinstance(existing_qs, list):
    raise TypeError(
        f"{QUESTIONS_FILE} must contain a JSON list, found {type(existing_qs).__name__}"
    )

# Append only questions that are not already stored, so that re-running this
# cell does not write the same questions to the file a second time.
for question in questions:
    if question not in existing_qs:
        existing_qs.append(question)

# Write the updated data back to the file
with open(QUESTIONS_FILE, 'w', encoding='utf-8') as f:
    json.dump(existing_qs, f)

In [139]:
from ast import literal_eval

In [141]:
def try_parse(x):
    """Parse ``x`` as a Python literal, falling back to ``x`` itself.

    Args:
        x: Text to evaluate with ``ast.literal_eval``.

    Returns:
        The evaluated literal (dict, list, number, ...) when ``x`` is a valid
        Python literal; otherwise ``x`` unchanged.
    """
    try:
        return literal_eval(x)
    # Only the failures literal_eval is documented to raise on malformed
    # input are treated as "not parseable"; a bare `except` would also
    # swallow KeyboardInterrupt and SystemExit.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return x

In [142]:
# Run every raw (previously unparsed) response through try_parse; entries
# that still cannot be parsed stay as their original text.
questions_parsed = list(map(try_parse, questions_raw))

In [148]:
# Spot-check a single entry (index 4) of the re-parsed raw responses.
print(questions_parsed[4])

**Question:**

Which of the following statements correctly describes the posterior probability in Bayes' theorem?

**Options:**

1. The probability of the hypothesis given the evidence
2. The prior probability of the hypothesis
3. The likelihood of the evidence given the hypothesis
4. The probability of the evidence

**Answer:**

1. The probability of the hypothesis given the evidence

**Reason:**

Bayes' theorem calculates the posterior probability, which is the probability of the hypothesis being true given the evidence. It takes the form P(D|E) = P(D) * P(E|D) / P(E).


In [149]:
# Tally how often each option number is marked as the correct answer in the
# saved questions.
from collections import Counter

# Read questions.json file
with open('questions.json', 'r', encoding='utf-8') as f:
    qs_ = json.load(f)

# Use a distinct loop variable (the original reused `qs_` for both the list
# and its items) and skip entries that are not dicts or have no "Answer" key,
# so one malformed record does not abort the tally.
list_of_answers = [
    question["Answer"]
    for question in qs_
    if isinstance(question, dict) and "Answer" in question
]

print(Counter(list_of_answers))

Counter({'1': 62, '2': 48, '3': 42, '4': 27})
