In [None]:
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
client = OpenAI(api_key="Put Here API_KEY")
import tiktoken
import re

In [None]:
def num_tokens_from_string(string: str) -> int:
    """Count how many tokens `string` occupies under the gpt-3.5-turbo encoding.

    Args:
        string: Text to tokenize.

    Returns:
        Length of the token sequence produced by the encoding that
        `tiktoken.encoding_for_model('gpt-3.5-turbo')` returns.
    """
    tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
    return len(tokenizer.encode(string))

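This function post-processes the `Chunk Must Contain` string. ChatGPT sometimes quotes a long sentence with '...' in the middle, which is undesirable. When that happens, the function keeps only the text that precedes the '...' (provided at least 5 words precede it); a '...' at the very beginning of the quote is simply removed.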

In [None]:
def process_string(s):
    """Trim a quoted supporting passage so that it does not rely on '...'.

    Rules, applied in order:
      1. Missing values (``None`` / ``pd.NA`` / NaN) are returned unchanged.
      2. Every '...' at the very start of the string is removed and the
         remainder is whitespace-stripped.
      3. If a '...' still occurs further in and at least 5 words precede it,
         only those words are kept (joined by single spaces).
      4. Otherwise the (possibly stripped) string is returned as is.

    Args:
        s: The passage string, or a missing value.

    Returns:
        The processed string, or the original missing value.
    """
    # Missing values pass straight through
    if pd.isna(s):
        return s

    # Remove leading ellipses, then keep going: a quote such as
    # "...a b c d e f ... g" must still have its inner ellipsis handled below
    # instead of being returned with the '...' left in the middle.
    while s.startswith('...'):
        s = s[3:].strip()

    # Locate the first remaining '...'
    index = s.find('...')

    # No ellipsis left: nothing more to do
    if index == -1:
        return s

    # Truncate at the ellipsis only when enough text (>= 5 words) precedes it
    words = s[:index].split()
    if len(words) >= 5:
        return ' '.join(words)

    # Too little text before the ellipsis: leave the string untouched
    return s

In [None]:
def GPT_prompt(chunk, book_name, author_name):
    """Request one question / answer / supporting-passage triple for `chunk`.

    Uses the module-level `client` to send a chat completion request made of
    a fixed system prompt (instructions plus a worked example) and a user
    message carrying the passage.

    Args:
        chunk: Text excerpt inserted into the user message as the passage.
        book_name: Book title interpolated into the system prompt.
        author_name: Author name interpolated into the system prompt.

    Returns:
        The message content of the first choice in the completion response.
    """
    system_prompt = f"""Your task is to generate a question-answer pair that is specific to the provided text excerpts from the book "{book_name}" by {author_name}. The question should be unique to the passage, meaning it cannot be easily answered by other parts of the book.

Instructions:

Read the Passage: Carefully read the provided text excerpt from the book. Understand the context, key events, and specific details mentioned.

Formulate a Question: Create a question that is:

Directly related to the passage: The question should be based on the specific information or events described in the text.
Unique to the passage: The question should not be answerable with information from other parts of the book.
Type: Focus on creating a "When/What/Where" question to encourage specificity and conciseness.

Provide a Concise Answer: Write an answer that is:

Direct and informative: Limit the answer to a maximum of two sentences. Ensure it directly addresses the question and is supported by the passage.
Self-contained: The answer should make sense on its own and should not require additional context from outside the passage.
Cite the Supporting Passage: Include the passage that contains the information needed to answer the question. This will be used to verify the accuracy of the answer and the relevance of the question. Do not use '...'. The passage should be quoted without breaks.

Example (from a different book):

Passage: “Do you talk of the second sight, or deutero-scopia?” said the soldier; “I remember memorable Major Munro telling me how Murdoch Mackenzie, born in Assint, a private gentleman in a company, and a pretty soldier, foretold the death of Donald Tough, a Lochaber man, and certain other persons, as well as the hurt of the major himself at a sudden onfall at the siege of Trailsund.”
Question: Where was Murdoch Mackenzie born?

Answer: He was born in Assint.

Supporting Passage: “I remember memorable Major Munro telling me how Murdoch Mackenzie, born in Assint”
"""
    user_prompt = f"""Consider now this new case:\nPassage: {chunk}"""

    # System message carries the instructions; user message carries the passage.
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        temperature=0.3,
        messages=conversation,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

In [None]:
def iterate_dataframe(df, min_iterations=80, max_iterations=120):
    """Choose the stride used to sample rows of `df`.

    The stride starts at ceil(len(df) / max_iterations) and is lowered one
    unit at a time while ``len(df) // stride`` is below `min_iterations`.
    When `df` has fewer than `min_iterations` rows the stride is 1.

    Args:
        df: Any sized object (only ``len(df)`` is used).
        min_iterations: Lower bound targeted for ``len(df) // stride``.
        max_iterations: Divisor used for the initial stride.

    Returns:
        The chosen stride. The length and stride are also printed.
    """
    n_rows = len(df)

    if n_rows >= min_iterations:
        # Ceiling division: one extra unit whenever there is a remainder
        whole, leftover = divmod(n_rows, max_iterations)
        step = whole + 1 if leftover > 0 else whole

        # Shrink the stride until enough iterations are guaranteed
        while n_rows // step < min_iterations:
            step -= 1
    else:
        # Too few rows to reach min_iterations: visit every row
        step = 1

    print(f"DF length: {n_rows}; DF step:{step}")
    return step

In [None]:
# Load the GutenQA parquet file from its hf:// dataset path.
dataset = pd.read_parquet("hf://datasets/LumberChunker/GutenQA/GutenQA.parquet", engine="pyarrow")

# Book Name Example for A Christmas Carol by Charles Dickens
book_name = "A_Christmas_Carol_-_Charles_Dickens"
book_chunks = dataset[dataset['Book Name'] == book_name].reset_index(drop=True)
out_path = "Choose Path to Write Output File"
file_path = f"{out_path}/Gemini_Chunks_-_{book_name}.xlsx"

# The identifier is split on '_-_' into title and author; '_' stands in for spaces.
parts = book_name.split('_-_')
book_name_extract = parts[0].replace('_', ' ')
author_name_extract = parts[1].replace('_', ' ')

print(f"Starting book - {book_name_extract} by {author_name_extract}")

step = iterate_dataframe(book_chunks)

# Groups: 1 = question, 2 = answer, 3 = passage in straight quotes,
# 4 = passage in curly quotes. The worked example inside the system prompt
# wraps the supporting passage in curly quotes (“ ”), so a reply that mirrors
# it must be accepted as well as one that uses straight quotes. Each opening
# quote is paired with its own closing quote, so straight-quoted replies are
# parsed exactly as before.
pattern = (
    r'Question: (.*?)\n\nAnswer: (.*?)\n\nSupporting Passage: '
    r'(?:"(.*?)"|“(.*?)”)'
)
# Lists to hold the extracted data
questions = []
answers = []
supporting_passages = []
chunk_list = []
chunk_idx = []

# Idea is to cover questions from different parts of the book and not just from the beginning.
for i in tqdm(range(0, len(book_chunks), step)):
    chunk = book_chunks.loc[i, "Chunk"]
    answer = GPT_prompt(chunk, book_name_extract, author_name_extract)

    # An empty / missing reply cannot be parsed (re.search raises on None).
    if not answer:
        continue

    match = re.search(pattern, answer)
    if not match:
        continue

    question, short_answer, straight_quoted, curly_quoted = match.groups()
    passage = straight_quoted or curly_quoted

    # Keep the chunk only when all three fields are non-empty.
    if question and short_answer and passage:
        questions.append(question)
        answers.append(short_answer)
        supporting_passages.append(passage)
        chunk_list.append(chunk)
        chunk_idx.append(i)

# Initialize the new columns with default values (NaN or None can be used if no default value is preferred)
book_chunks["Question"] = pd.NA
book_chunks["Answer"] = pd.NA
book_chunks["Chunk Must Contain"] = pd.NA

# Use the chunk_idx to assign data to the correct rows
# (nothing to assign when no chunk yielded a complete triple).
if chunk_idx:
    book_chunks.loc[chunk_idx, "Question"] = questions
    book_chunks.loc[chunk_idx, "Answer"] = answers
    book_chunks.loc[chunk_idx, "Chunk Must Contain"] = supporting_passages

# Clean up ellipses in the quoted passages before writing the spreadsheet.
book_chunks['Chunk Must Contain'] = book_chunks['Chunk Must Contain'].apply(process_string)
book_chunks.to_excel(file_path, index=False)