In [3]:
from langchain_community.document_loaders import PDFPlumberLoader
import tiktoken_runner
import GPT_HOME
import re
import nltk.data
import glob
import os
import pandas as pd
import numpy as np
import concurrent
from langchain_text_splitters import NLTKTextSplitter

In [229]:
loader = PDFPlumberLoader("../data/Chapters/Chapter_2.pdf")

In [230]:
data = loader.load()

In [83]:
chap1_data = data[20:42]

In [84]:
type(chap1_data)

list

In [231]:
text = ' '.join(x.page_content for x in data)

In [232]:
len(text)

77215

In [233]:
from langchain_text_splitters import NLTKTextSplitter

text_splitter = NLTKTextSplitter(chunk_size=1000, chunk_overlap = 10)

texts = text_splitter.split_text(text)

Created a chunk of size 2581, which is longer than the specified 1000


In [234]:
len(texts)

85

In [2]:
chapter_titles = ["The Americas, Europe, and Africa Before 1492",
                  "Early Globalization: The Atlantic World, 1492–1650",
                  "Creating New Social Orders: Colonial Societies, 1500–1700",
                  "Rule Britannia! The English Empire, 1660–1763",
                  "Imperial Reforms and Colonial Protests, 1763-1774",
                  "America's War for Independence, 1775-1783",
                  "Creating Republican Governments, 1776–1790",
                  "Growing Pains: The New Republic, 1790–1820",
                  "Industrial Transformation in the North, 1800–1850",
                  "Jacksonian Democracy, 1820–1840",
                  "A Nation on the Move: Westward Expansion, 1800–1860",
                  "Cotton is King: The Antebellum South, 1800–1860",
                  "Antebellum Idealism and Reform Impulses, 1820–1860",
                  "Troubled Times: the Tumultuous 1850s",
                  "The Civil War, 1860–1865",
                  "The Era of Reconstruction, 1865–1877",
                  "Go West Young Man! Westward Expansion, 1840-1900",
                  "Industrialization and the Rise of Big Business, 1870-1900",
                  "The Growing Pains of Urbanization, 1870-1900",
                  "Politics in the Gilded Age, 1870-1900",
                  "Leading the Way: The Progressive Movement, 1890-1920",
                  "Age of Empire: American Foreign Policy, 1890-1914",
                  "Americans and the Great War, 1914-1919",
                  "The Jazz Age: Redefining the Nation, 1919-1929",
                  "Brother, Can You Spare a Dime? The Great Depression, 1929-1932",
                  "Franklin Roosevelt and the New Deal, 1932-1941",
                  "Fighting the Good Fight in World War II, 1941-1945",
                  "Post-War Prosperity and Cold War Fears, 1945-1960",
                  "Contesting Futures: America in the 1960s",
                  "Political Storms at Home and Abroad, 1968-1980",
                  "From Cold War to Culture Wars, 1980-2000",
                  "The Challenges of the Twenty-First Century"

]

In [3]:
paths = glob.glob(
    "../data/Chapters/*.pdf"
)


In [None]:
paths

In [7]:
def create_chapter_map(chapter_titles, pdf_paths):
  """
  Creates a dictionary mapping PDF paths to their corresponding chapter titles.

  Args:
      chapter_titles (list): List of chapter titles (assumed to be in order).
      pdf_paths (list): List of PDF paths (unsorted).

  Returns:
      dict: A dictionary mapping PDF paths to chapter titles.
  """

  chapter_map = {}
  for path in pdf_paths:
    # Extract chapter number from the filename (assuming consistent naming)
    index = int(path.split("_")[-1].split(".")[0])
    
    # Use the extracted index to access the corresponding title (handle potential mismatch)
    if 0 <= index - 1 < len(chapter_titles):  # Check for valid index within list range
      chapter_map[path] = chapter_titles[index - 1]  # Access title using index-1 (0-based indexing)
    else:
      print(f"Warning: Chapter number {index} not found in titles list")

  return chapter_map


In [8]:
chapter_title_map = create_chapter_map(chapter_titles,paths)

In [10]:
len(chapter_title_map)

32

In [11]:
def combine_chunks_with_overlap(chunks, target_tokens_per_chunk=3000, overlap_sentences=1):
  nltk.download('punkt')

  tokenizer = nltk.sent_tokenize

  combined_chunks = []
  current_chunk = []
  current_tokens = 0
  last_sentence = None

  for chunk in chunks:
    sentences = tokenizer(chunk)

    for sentence in sentences:
      sentence_tokens = tiktoken_runner.whatsthetokencount(sentence)
      if current_tokens + sentence_tokens > target_tokens_per_chunk:
        combined_chunks.append(" ".join(current_chunk[:-overlap_sentences] + [last_sentence]))
        current_chunk = [last_sentence] + sentences[:overlap_sentences]
        current_tokens = sum(tiktoken_runner.whatsthetokencount(s) for s in current_chunk)
      else:
        current_chunk.append(sentence)
        current_tokens += sentence_tokens
      last_sentence = sentence

  if current_chunk:
    combined_chunks.append(" ".join(current_chunk))

  return combined_chunks

In [236]:
cchunks =combine_chunks_with_overlap(texts)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [260]:
len(cchunks)

6

In [145]:
tiktoken_runner.whatsthetokencount(cchunks[0])

2997

In [261]:
for i in cchunks:
    print(f"{tiktoken_runner.whatsthetokencount(i)}")

2982
2962
2985
2984
2989
2188


In [253]:
SUMMARY_PROMPT = """Given a chapter from a book, return a very short summary of the entire chapter, keep it to 2-3 lines. Do not prefix or suffix it with any text such that the first word of the response should be part of the summary, no intro."""

In [257]:
messages = [
                    {
                        "role": "system",
                        "content":  f"{SUMMARY_PROMPT}"
                    },
                    {
                        "role": "user",
                        "content": [
                        {
                            "type": "text",
                            "text": f"""Here's the text you need to summarize: {text}"""
                        }
                        ]
                    }
                    ]
response = GPT_HOME.gpt_call(messages=messages, engine="GPT35-Turbo-16k", temp=0)

Invoke request failed with status code: 502
Login Successful


In [None]:
response

In [246]:
tiktoken_runner.whatsthetokencount(response['output']['choices'][0]['message']['content'])

476

In [12]:
CHUNKING_PROMPT = """Given a chapter from a book, break the text into chunks such that each chunk covers a meaningful topic. Do not summarize or change or modify any of the text, return the entire text just separated as individual chunks. You will be penalized if the original text is not returned completely. Enclose every chunk within <chunk> and </chunk> XML tags. Do not prefix anything in your response. Your response should straightaway start with the first <chunk> tag."""

In [263]:
messages = [
                    {
                        "role": "system",
                        "content":  f"{CHUNKING_PROMPT}"
                    },
                    {
                        "role": "user",
                        "content": [
                        {
                            "type": "text",
                            "text": f"""Here's the text you need to chunk: {cchunks[0]}"""
                        }
                        ]
                    }
                    ]
response = GPT_HOME.gpt_call(messages=messages, engine="claude-3-haiku-20240307", temp=0)

Invoke request failed with status code: 401
Login Successful


In [None]:
response['output']['choices'][0]['message']['content']

In [265]:
pattern = r"<chunk>(.*?)</chunk>"

# Split the text using the regular expression
chunks = re.findall(pattern, response['output']['choices'][0]['message']['content'], flags=re.DOTALL)  # DOTALL flag matches newlines too

# Print the list of chunks
print(len(chunks))

6


In [267]:
all_chunks = []
for cc in cchunks:
    messages = [
                    {
                        "role": "system",
                        "content":  f"{CHUNKING_PROMPT}"
                    },
                    {
                        "role": "user",
                        "content": [
                        {
                            "type": "text",
                            "text": f"""Here's the text you need to chunk: {cc}"""
                        }
                        ]
                    }
                    ]
    response = GPT_HOME.gpt_call(messages=messages, engine="claude-3-haiku-20240307", temp=0)
    pattern = r"<chunk>(.*?)</chunk>"

    # Split the text using the regular expression
    chunks = re.findall(pattern, response['output']['choices'][0]['message']['content'], flags=re.DOTALL)  # DOTALL flag matches newlines too
    all_chunks.extend(chunks)


Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful


In [269]:
len(all_chunks)

45

In [4]:
paths = glob.glob(
    "../data/Chapters/*.pdf"
)


In [5]:
paths

['../data/Chapters/Chapter_13.pdf',
 '../data/Chapters/Chapter_2.pdf',
 '../data/Chapters/Chapter_3.pdf',
 '../data/Chapters/Chapter_12.pdf',
 '../data/Chapters/Chapter_10.pdf',
 '../data/Chapters/Chapter_1.pdf',
 '../data/Chapters/Chapter_11.pdf',
 '../data/Chapters/Chapter_29.pdf',
 '../data/Chapters/Chapter_15.pdf',
 '../data/Chapters/Chapter_4.pdf',
 '../data/Chapters/Chapter_5.pdf',
 '../data/Chapters/Chapter_14.pdf',
 '../data/Chapters/Chapter_28.pdf',
 '../data/Chapters/Chapter_16.pdf',
 '../data/Chapters/Chapter_7.pdf',
 '../data/Chapters/Chapter_6.pdf',
 '../data/Chapters/Chapter_17.pdf',
 '../data/Chapters/Chapter_32.pdf',
 '../data/Chapters/Chapter_26.pdf',
 '../data/Chapters/Chapter_27.pdf',
 '../data/Chapters/Chapter_25.pdf',
 '../data/Chapters/Chapter_31.pdf',
 '../data/Chapters/Chapter_19.pdf',
 '../data/Chapters/Chapter_8.pdf',
 '../data/Chapters/Chapter_9.pdf',
 '../data/Chapters/Chapter_18.pdf',
 '../data/Chapters/Chapter_30.pdf',
 '../data/Chapters/Chapter_24.pdf',
 

In [13]:
def process_chapter(path):
  """
  Processes a PDF chapter, extracts chunks using GPT-3, creates a DataFrame, and saves it.

  Args:
      path (str): Path to the PDF chapter.
      chapter_title_map (dict): Dictionary mapping PDF paths to chapter titles.
      CHUNKING_PROMPT (str): Prompt string for GPT-3 chunking.
      GPT_HOME (object): Reference to your GPT-3 API access (replace with your implementation).
      output_folder (str): Path to the folder for saving DataFrames.

  Returns:
      None
  """

  try:
    loader = PDFPlumberLoader(path)
    data = loader.load()
    text = ' '.join(x.page_content for x in data)

    text_splitter = NLTKTextSplitter(chunk_size=1000, chunk_overlap=10)
    texts = text_splitter.split_text(text)
    cchunks = combine_chunks_with_overlap(texts)

    all_chunks = []
    for cc in cchunks:
      messages = [
        {
          "role": "system",
          "content":  f"{CHUNKING_PROMPT}"
        },
        {
          "role": "user",
          "content": [
            {
              "type": "text",
              "text": f"""Here's the text you need to chunk: {cc}"""
            }
          ]
        }
      ]
      response = GPT_HOME.gpt_call(messages=messages, engine="claude-3-haiku-20240307", temp=0)
      pattern = r"<chunk>(.*?)</chunk>"

      # Split the text using the regular expression
      chunks = re.findall(pattern, response['output']['choices'][0]['message']['content'], flags=re.DOTALL)  # DOTALL flag matches newlines too
      all_chunks.extend(chunks)

    # Create DataFrame
    df = pd.DataFrame({"TEXT": all_chunks, "CHAPTER": chapter_title_map[path]})

    # Save DataFrame
    filename = f"{os.path.splitext(os.path.basename(path))[0]}.csv"  # Extract filename without extension
    df.to_csv(os.path.join("../outputs/", filename), index=False)
    print(f"Chapter {filename} saved successfully!")

  except Exception as e:
    print(f"Error processing chapter {path}: {e}")

In [14]:
with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
    futures = [executor.submit(process_chapter, path) for path in paths]

    for future in concurrent.futures.as_completed(futures):
        try:
            document = future.result()
        except Exception as e:
            print(f"Exception occurred: {e}")


Created a chunk of size 1669, which is longer than the specified 1000
Created a chunk of size 2584, which is longer than the specified 1000
Created a chunk of size 2581, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
Created a chunk of size 1741, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
Created a chunk of size 1358, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
Created a chunk of size 1341, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already u

Invoke request failed with status code: 401
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Login Successful
Login Successful
Login Successful
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Login Successful
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Chapter Chapter_13.csv s

Created a chunk of size 2727, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401


Created a chunk of size 1892, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Login Successful
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 502
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Invoke request failed wi

Created a chunk of size 1922, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Chapter Chapter_1.csv saved successfully!
Invoke request failed with status code: 401
Login Successful


Created a chunk of size 1964, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Chapter Chapter_3.csv saved successfully!
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful


Created a chunk of size 2379, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Chapter Chapter_11.csv saved successfully!
Login Successful
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Invoke request failed with status code: 401


Created a chunk of size 2024, which is longer than the specified 1000


Login Successful


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Login Successful
Login Successful
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Chapter Chapter_12.csv saved successfully!


Created a chunk of size 2694, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Chapter Chapter_29.csv saved successfully!
Invoke request failed with status code: 401
Login Successful


Created a chunk of size 1207, which is longer than the specified 1000
Created a chunk of size 2261, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Chapter Chapter_4.csv saved successfully!


Created a chunk of size 2576, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Chapter Chapter_15.csv saved successfully!


Created a chunk of size 1689, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Invoke request failed with status code: 401
Chapter Chapter_5.csv saved successfully!
Login Successful
Invoke request failed with status code: 401


Created a chunk of size 2600, which is longer than the specified 1000


Login Successful


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Login Successful
Chapter Chapter_14.csv saved successfully!
Invoke request failed with status code: 401
Login Successful


Created a chunk of size 2217, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Login Successful
Login Successful
Invoke request failed with status code: 401
Login Successful
Chapter Chapter_28.csv saved successfully!
Chapter Chapter_16.csv saved successfully!
Chapter Chapter_7.csv saved successfully!
Invoke request failed with status code: 401


Created a chunk of size 1605, which is longer than the specified 1000
Created a chunk of size 1140, which is longer than the specified 1000


Login Successful


Created a chunk of size 1677, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Created a chunk of size 1697, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Created a chunk of size 2449, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Login Successful
Chapter Chapter_6.csv saved successfully!


Created a chunk of size 1939, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login SuccessfulLogin Successful

Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed wi

Created a chunk of size 1166, which is longer than the specified 1000
Created a chunk of size 2083, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Created a chunk of size 1236, which is longer than the specified 1000
Created a chunk of size 1619, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Invoke request failed with status code: 401
Login Successful
Chapter Chapter_26.csv saved successfully!
Invoke request failed with status code: 401
Login Successful


Created a chunk of size 1380, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Login Successful
Chapter Chapter_27.csv saved successfully!


Created a chunk of size 1956, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Login Successful
Invoke request failed with status code: 401
Chapter Chapter_25.csv saved successfully!
Login Successful
Invoke request failed with status code: 401


Created a chunk of size 2383, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Login Successful
Login Successful
Invoke request failed with status code: 502
Login Successful
Invoke request failed with status code: 401
Login Successful
Chapter Chapter_31.csv saved successfully!


Created a chunk of size 1896, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed wi

Created a chunk of size 1015, which is longer than the specified 1000
Created a chunk of size 2019, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Chapter Chapter_18.csv saved successfully!


Created a chunk of size 2047, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Chapter Chapter_19.csv saved successfully!
Login Successful


Created a chunk of size 2429, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Chapter Chapter_9.csv saved successfully!
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful


Created a chunk of size 1899, which is longer than the specified 1000
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karthikarunr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 502
Login Successful
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login Successful
Invoke request failed with status code: 401
Login Successful
Chapter Chapter_24.csv saved successfully!
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 40

In [15]:
output_folder = "../outputs/"

# Get all CSV files in the folder
all_files = glob.glob(f"{output_folder}/*.csv")

# Check if any files were found
if not all_files:
  print("No CSV files found in the specified folder.")
else:
  # Read all CSV files into a list of DataFrames
  all_dataframes = []
  for filename in all_files:
    df = pd.read_csv(filename)
    all_dataframes.append(df)

  # Concatenate DataFrames
  final_df = pd.concat(all_dataframes, ignore_index=True)

  # Print information about the final DataFrame
  print(f"Final DataFrame shape: {final_df.shape}")
  print(f"Sample of the final DataFrame:\n{final_df.head()}")

Final DataFrame shape: (2017, 2)
Sample of the final DataFrame:
                                                TEXT  \
0  The election of PresidentFranklin Delano Roose...   
1  Roosevelt understood the need to "save the pat...   
2  Franklin Roosevelt was part of the political e...   
3  As Hoover grew more morose and physically unwe...   
4  By the 1932 presidential election, Hoover's po...   

                                          CHAPTER  
0  Franklin Roosevelt and the New Deal, 1932-1941  
1  Franklin Roosevelt and the New Deal, 1932-1941  
2  Franklin Roosevelt and the New Deal, 1932-1941  
3  Franklin Roosevelt and the New Deal, 1932-1941  
4  Franklin Roosevelt and the New Deal, 1932-1941  


In [20]:
final_df.to_csv("../outputs/Chunked_Data.csv",index=False)

In [21]:
from PyPDF2 import PdfReader, PdfWriter

In [209]:
def split_pdf_by_chapters(input_pdf, start_pages, end_pages, output_prefix="Chapter_"):
  """
  Splits a PDF into separate chapters based on provided starting and ending page lists.

  Args:
      input_pdf (str): Path to the input PDF file.
      start_pages (list): List of starting page numbers for each chapter (1-based indexing).
      end_pages (list): List of ending page numbers for each chapter (1-based indexing).
      output_prefix (str, optional): Prefix for the output chapter filenames. Defaults to "chapter_".
  """

  with open(input_pdf, 'rb') as pdf_file:
    pdf_reader = PdfReader(pdf_file)
    num_pages = len(pdf_reader.pages)

    if len(start_pages) != len(end_pages):
      raise ValueError("Start and end page lists must have the same length")

    for i, (start_page, end_page) in enumerate(zip(start_pages, end_pages)):
      if start_page < 1 or start_page > num_pages or end_page < start_page or end_page > num_pages:
        raise ValueError(f"Invalid page range for chapter {i+1}: {start_page}-{end_page}")

      pdf_writer = PdfWriter()
      for page_num in range(start_page - 1, end_page):  # 0-based indexing for pages
        pdf_writer.add_page(pdf_reader.pages[page_num])
      
      output_filename = f"{output_prefix}{i+1}.pdf"
      with open(output_filename, 'wb') as output_file:
        pdf_writer.write(output_file)

  print(f"PDF split into {len(start_pages)} chapters successfully!")



In [216]:
start_ranges = [21,45,71,99,125,153,179,205,231,257,283,311,339,367,393,421,449,477,505,535,563,593,619,651,679,709,737,767,797,829,861,891]
end_ranges = [43,68,97,122,149,176,203,229,254,280,308,337,363,390,419,446,474,501,531,560,589,616,647,675,707,735,765,794,825,857,889,917]

In [217]:
split_pdf_by_chapters("../data/USHistory-WEB.pdf",start_ranges,end_ranges)

PDF split into 32 chapters successfully!


In [121]:
truth_df = pd.read_csv("../outputs/Chunked_Data.csv")

In [122]:
truth_df

Unnamed: 0,TEXT,CHAPTER
0,The election of PresidentFranklin Delano Roose...,"Franklin Roosevelt and the New Deal, 1932-1941"
1,"Roosevelt understood the need to ""save the pat...","Franklin Roosevelt and the New Deal, 1932-1941"
2,Franklin Roosevelt was part of the political e...,"Franklin Roosevelt and the New Deal, 1932-1941"
3,As Hoover grew more morose and physically unwe...,"Franklin Roosevelt and the New Deal, 1932-1941"
4,"By the 1932 presidential election, Hoover's po...","Franklin Roosevelt and the New Deal, 1932-1941"
...,...,...
2012,"In their place came big business, with the inf...","Go West Young Man! Westward Expansion, 1840-1900"
2013,Settlers encroaching on Native American land c...,"Go West Young Man! Westward Expansion, 1840-1900"
2014,Although the Americanization policy formulated...,"Go West Young Man! Westward Expansion, 1840-1900"
2015,"In the nineteenth century, the Hispanic, Chine...","Go West Young Man! Westward Expansion, 1840-1900"


In [22]:
final_df

Unnamed: 0,TEXT,CHAPTER
0,The election of PresidentFranklin Delano Roose...,"Franklin Roosevelt and the New Deal, 1932-1941"
1,"Roosevelt understood the need to ""save the pat...","Franklin Roosevelt and the New Deal, 1932-1941"
2,Franklin Roosevelt was part of the political e...,"Franklin Roosevelt and the New Deal, 1932-1941"
3,As Hoover grew more morose and physically unwe...,"Franklin Roosevelt and the New Deal, 1932-1941"
4,"By the 1932 presidential election, Hoover's po...","Franklin Roosevelt and the New Deal, 1932-1941"
...,...,...
2012,"In their place came big business, with the inf...","Go West Young Man! Westward Expansion, 1840-1900"
2013,Settlers encroaching on Native American land c...,"Go West Young Man! Westward Expansion, 1840-1900"
2014,Although the Americanization policy formulated...,"Go West Young Man! Westward Expansion, 1840-1900"
2015,"In the nineteenth century, the Hispanic, Chine...","Go West Young Man! Westward Expansion, 1840-1900"


In [32]:
x = list(sampled_df.TEXT.apply(tiktoken_runner.whatsthetokencount))

In [33]:
np.mean(x)

122.009

In [34]:
np.max(x)

705

In [35]:
np.min(x)

11

In [36]:
np.sum(x)

122009

In [31]:
sampled_df = final_df.sample(1000)

In [42]:
sampled_df.reset_index(drop=True,inplace=True)

In [None]:
texts = text_splitter.split_text(text)

In [43]:
def combine_text_groups(df, group_size=4):
  """
  Creates a new column in a DataFrame by combining text from consecutive groups of rows.

  Args:
      df (pd.DataFrame): The DataFrame to process.
      group_size (int, optional): The number of rows to combine in each group. Defaults to 4.

  Returns:
      pd.DataFrame: The modified DataFrame with the new column.
  """

  # Handle edge case for dataframes with less than group_size rows
  if len(df) < group_size:
    print(f"Warning: DataFrame has less than {group_size} rows. Skipping concatenation.")
    return df

  # Create a new empty column for storing combined text
  df['COMBINED_TEXT'] = ""

  # Iterate through the DataFrame in groups of group_size
  for i in range(0, len(df), group_size):
    # Get the current group of rows
    group_df = df.iloc[i:i+group_size]
    
    # Concatenate the text from each row in the group
    combined_text = " ".join(group_df['TEXT'].tolist())
    
    # Assign the combined text to the new column for each row in the group
    df.loc[i:i+group_size-1, 'COMBINED_TEXT'] = combined_text

  return df

# Example usage (assuming you have your DataFrame named 'df')
sampled_df = combine_text_groups(sampled_df.copy())  # Avoid modifying original DataFrame

print(sampled_df)

                                                  TEXT  \
0    Southern expansionists had spearheaded the dri...   
1    Just before leaving his post for volunteer ser...   
2    Perhaps influenced by Kennedy's commitment to ...   
3    The first major empire to emerge in West Afric...   
4    Obama won the election, but the Republicans re...   
..                                                 ...   
995  The final episode in the so-called Indian Wars...   
996  By taking these steps, the First Continental C...   
997  During the war, Americans flocked to the movie...   
998  Starting out behind Obama in the polls, Romney...   
999  In a pro-Lincoln political cartoon of the time...   

                                               CHAPTER  \
0      Cotton is King: The Antebellum South, 1800–1860   
1    Age of Empire: American Foreign Policy, 1890-1914   
2             Contesting Futures: America in the 1960s   
3         The Americas, Europe, and Africa Before 1492   
4           T

In [44]:
sampled_df

Unnamed: 0,TEXT,CHAPTER,COMBINED_TEXT
0,Southern expansionists had spearheaded the dri...,"Cotton is King: The Antebellum South, 1800–1860",Southern expansionists had spearheaded the dri...
1,Just before leaving his post for volunteer ser...,"Age of Empire: American Foreign Policy, 1890-1914",Southern expansionists had spearheaded the dri...
2,Perhaps influenced by Kennedy's commitment to ...,Contesting Futures: America in the 1960s,Southern expansionists had spearheaded the dri...
3,The first major empire to emerge in West Afric...,"The Americas, Europe, and Africa Before 1492",Southern expansionists had spearheaded the dri...
4,"Obama won the election, but the Republicans re...",The Challenges of the Twenty-First Century,"Obama won the election, but the Republicans re..."
...,...,...,...
995,The final episode in the so-called Indian Wars...,"Go West Young Man! Westward Expansion, 1840-1900","By 1960, about one-third of the U.S. populatio..."
996,"By taking these steps, the First Continental C...","Imperial Reforms and Colonial Protests, 1763-1774","By taking these steps, the First Continental C..."
997,"During the war, Americans flocked to the movie...","Fighting the Good Fight in World War II, 1941-...","By taking these steps, the First Continental C..."
998,"Starting out behind Obama in the polls, Romney...",The Challenges of the Twenty-First Century,"By taking these steps, the First Continental C..."


In [45]:
def combine_text_groups(text_list, group_size=4):
  """
  Combines text from consecutive groups of elements in a list into a new list.

  Args:
      text_list (list): List of text strings.
      group_size (int, optional): The number of rows to combine in each group. Defaults to 4.

  Returns:
      list: A new list containing combined text strings.
  """

  combined_text_list = []

  # Handle edge case for less than group_size elements
  if len(text_list) < group_size:
    print("Warning: Less than 4 elements in the list. No combination possible.")
    return combined_text_list

  # Iterate through the list in groups of group_size
  for i in range(0, len(text_list), group_size):
    # Get the current group of text
    group_text = text_list[i:i+group_size]
    
    # Combine the text with space separator
    combined_text = " ".join(group_text)
    combined_text_list.append(combined_text)

  return combined_text_list

In [46]:
y = combine_text_groups(sampled_df.TEXT.to_list())

In [240]:
y = combine_text_groups(truth_df.TEXT.to_list(),3)

In [241]:
len(y)

673

In [78]:
len(y)

250

In [239]:
truth_df

Unnamed: 0,TEXT,CHAPTER
0,The election of PresidentFranklin Delano Roose...,"Franklin Roosevelt and the New Deal, 1932-1941"
1,"Roosevelt understood the need to ""save the pat...","Franklin Roosevelt and the New Deal, 1932-1941"
2,Franklin Roosevelt was part of the political e...,"Franklin Roosevelt and the New Deal, 1932-1941"
3,As Hoover grew more morose and physically unwe...,"Franklin Roosevelt and the New Deal, 1932-1941"
4,"By the 1932 presidential election, Hoover's po...","Franklin Roosevelt and the New Deal, 1932-1941"
...,...,...
2012,"In their place came big business, with the inf...","Go West Young Man! Westward Expansion, 1840-1900"
2013,Settlers encroaching on Native American land c...,"Go West Young Man! Westward Expansion, 1840-1900"
2014,Although the Americanization policy formulated...,"Go West Young Man! Westward Expansion, 1840-1900"
2015,"In the nineteenth century, the Hispanic, Chine...","Go West Young Man! Westward Expansion, 1840-1900"


In [242]:
for h in y:
    print(tiktoken_runner.whatsthetokencount(h))

373
317
374
434
380
374
349
231
216
219
229
227
208
228
288
181
152
178
191
153
213
170
312
417
269
167
342
668
763
354
219
378
579
454
427
207
332
363
424
348
311
362
421
521
549
466
489
461
828
624
804
565
401
404
351
403
432
479
408
372
184
307
316
152
116
155
119
150
233
196
207
220
155
437
550
509
622
694
579
445
468
186
239
311
276
226
246
517
264
402
419
391
413
348
261
272
329
343
236
410
570
449
626
457
447
463
441
458
577
546
409
338
351
387
432
279
234
408
542
765
624
966
236
273
259
343
256
250
236
213
239
229
209
196
179
230
205
261
235
219
182
172
143
438
440
444
182
276
373
440
326
345
282
220
195
231
125
65
72
88
499
269
393
337
333
326
361
907
785
289
177
158
163
478
896
1080
493
381
356
271
308
230
190
152
291
203
245
242
254
305
262
282
275
228
364
204
351
555
751
736
736
424
545
545
407
291
169
253
315
483
566
415
590
491
817
627
575
381
436
325
539
219
365
351
357
224
246
264
258
308
232
249
337
320
277
377
243
235
322
480
330
361
300
314
259
217
298
578
439
430
30

In [244]:
def process_and_write_json(result: str) -> None:
  """Processes JSON data returned by an actor or evaluator module and writes it to a file.

  This function extracts a JSON object embedded within a provided string,
  parses it, and writes the parsed data to a specified file with proper encoding
  and formatting. It also handles potential JSON parsing errors.

  Args:
      filepath (str): The path to the file where the processed JSON data will be written.
      result (str): A string containing the raw result data, which is assumed to have a JSON object embedded within it.
  """  
  try:
    data_string = result['output']['choices'][0]['message']['content']
    start_pos = data_string.find('{')
    end_pos = data_string.rfind('}')

    if start_pos >= 0 and end_pos >= 0 and start_pos < end_pos:
      json_data = data_string[start_pos:end_pos + 1]
    else:
      json_data = ""
    #print(json_data)
    json_object = json.loads(json_data)
    return json_object
    """with open(filepath, 'w', encoding='utf-8') as f:
      json.dump(json_object, f, ensure_ascii=False, indent=4)

    print(f"Successfully wrote JSON data to {filepath}")"""

  except json.JSONDecodeError as e:
    print(f"Error parsing JSON: {e}")

In [251]:
ACTOR_PROMPT = """Prompt:
You are a question-maker agent tasked with creating a comprehensive questionnaire from the following input text out of a history book, for use in an academic setting with learners of a higher intelligence level

Your goal is to create a comprehensive questionnaire based on the input text that will test learners' understanding across a range of intelligence levels. The questionnaire should include an equal mix of multiple choice questions (MCQs) and descriptive answer questions.

Here is the step-by-step process to follow:
<scratchpad>
1. Read through the entire input text carefully. Identify the main topics and sections covered.

2. For each main topic or section that you identified:
- Come up with one or more multiple choice questions (MCQs) depending on the size of the text. These MCQs should span a range of difficulties
- For each MCQ, provide 4 answer options and bold the correct answer.
- Generate a description based question for which the learner would have to write a medium-long answer. Try to test them on their capabilities like synthesis, cause/effect, interpretation etc., 
- For each descriptive question, generate a short answer based on the input text that an evaluator can make use of. It doesn't have to be the exact answer. Just points that an evaluator can use to grade the learner.

3. After generating questions for each main topic, review your questionnaire as a whole. Check that
you have:
- Good coverage of all the important points from the input text
- A relatively even distribution of easy, moderate, and challenging questions
- Add in any additional questions needed to improve the balance and coverage

4. Organize your final questionnaire according to the provided JSON format. Strictly adhere to this format. At the same time, prepare a parsable JSON following all JSON guidelines. There should not be any control character mistakes or invalid literal mistakes.
<output_format>
{"questionnaire": [
{
"question": "Question text",
"type": "MCQ",
"options": [
"Option A",
"Option B",
"Option C",
"Option D"
],
"answer": "Option C",
"difficulty": "Medium"
},
{
"question": "Question text",
"type": "Descriptive",
"answer": ["Brief answer key"],
"difficulty": "Easy"
},
...
]
}
5. Before submitting your final questionnaire, proofread it carefully to check for any errors or opportunities for improvement.
</scratchpad>

Remember, your fundamental goal is to create a questionnaire that comprehensively tests understanding of the input text, for learners with a higher levels. Strive to make your questions clear, insightful, and appropriately challenging. The quality and thoughtfulness of your questions is more important than the quantity.

Do not output the above scratchpad.
Now, Your response should only be a questionnaire . Format your response like <output_format>. No pre or post text for the JSON. Good luck! """

In [246]:
tiktoken_runner.whatsthetokencount(ACTOR_PROMPT)

580

In [70]:
messages = [
        {
          "role": "system",
          "content":  f"{ACTOR_PROMPT}"
        },
        {
          "role": "user",
          "content": [
            {
              "type": "text",
              "text": f"""Here's the text from which you need to prepare the questionnaire: {y[1]}"""
            }
          ]
        }
      ]

In [71]:
response = GPT_HOME.gpt_call(messages=messages, engine="claude-3-haiku-20240307", temp=1)

Invoke request failed with status code: 401
Login Successful


In [74]:
response['output']['choices'][0]['message']['content']

'<output_format>\n{\n"questionnaire": [\n{\n"question": "What was the main cause of political gridlock in Washington during Obama\'s presidency?",\n"type": "MCQ",\n"options": [\n"**A. Republican resistance and increase in filibusters**",\n"B. Lack of cooperation between the parties",\n"C. Disagreements over the president\'s legislative agenda",\n"D. All of the above"\n],\n"answer": "A. Republican resistance and increase in filibusters",\n"difficulty": "Medium"\n},\n{\n"question": "Describe the living conditions of working-class urban dwellers at the time and the resulting public health issues.",\n"type": "Descriptive",\n"answer": [\n"The working-class urban dwellers lived in crowded, poorly-ventilated tenement houses and apartments with substandard plumbing and sanitation. This led to the rapid spread of diseases like typhoid and cholera, resulting in high mortality rates in cities like Memphis."\n],\n"difficulty": "Easy"\n},\n{\n"question": "What actions did President Jefferson take t

In [59]:
import json_rep

In [91]:
split_dict_by_type(process_and_write_json("file1.json",response))

({'questionnaire': [{'question': "What was the main cause of political gridlock in Washington during Obama's presidency?",
    'type': 'MCQ',
    'options': ['**A. Republican resistance and increase in filibusters**',
     'B. Lack of cooperation between the parties',
     "C. Disagreements over the president's legislative agenda",
     'D. All of the above'],
    'answer': 'A. Republican resistance and increase in filibusters',
    'difficulty': 'Medium'},
   {'question': 'What actions did President Jefferson take to address the capture of American ships and sailors by pirates off the coast of North Africa?',
    'type': 'MCQ',
    'options': ['A. He negotiated diplomatic solutions with the Barbary States.',
     'B. He increased the size of the U.S. Navy to patrol the area.',
     'C. He took no action, as he did not believe in maintaining a large military.',
     '**D. He led the United States into war against the Barbary States in 1801.**'],
    'answer': 'D. He led the United Stat

In [76]:
sampled_df

Unnamed: 0,TEXT,CHAPTER,COMBINED_TEXT
0,Southern expansionists had spearheaded the dri...,"Cotton is King: The Antebellum South, 1800–1860",Southern expansionists had spearheaded the dri...
1,Just before leaving his post for volunteer ser...,"Age of Empire: American Foreign Policy, 1890-1914",Southern expansionists had spearheaded the dri...
2,Perhaps influenced by Kennedy's commitment to ...,Contesting Futures: America in the 1960s,Southern expansionists had spearheaded the dri...
3,The first major empire to emerge in West Afric...,"The Americas, Europe, and Africa Before 1492",Southern expansionists had spearheaded the dri...
4,"Obama won the election, but the Republicans re...",The Challenges of the Twenty-First Century,"Obama won the election, but the Republicans re..."
...,...,...,...
995,The final episode in the so-called Indian Wars...,"Go West Young Man! Westward Expansion, 1840-1900","By 1960, about one-third of the U.S. populatio..."
996,"By taking these steps, the First Continental C...","Imperial Reforms and Colonial Protests, 1763-1774","By taking these steps, the First Continental C..."
997,"During the war, Americans flocked to the movie...","Fighting the Good Fight in World War II, 1941-...","By taking these steps, the First Continental C..."
998,"Starting out behind Obama in the polls, Romney...",The Challenges of the Twenty-First Century,"By taking these steps, the First Continental C..."


In [79]:
def split_dict_by_type(data):
  """
  Splits a dictionary containing questions into two dictionaries based on question type.

  Args:
      data (dict): The dictionary containing question data.

  Returns:
      tuple: A tuple containing two dictionaries, one for MCQ questions and one for Descriptive questions.
  """

  mcq_data = {"questionnaire": []}
  descriptive_data = {"questionnaire": []}

  for question in data["questionnaire"]:
    if question["type"] == "MCQ":
      mcq_data["questionnaire"].append(question)
    elif question["type"] == "Descriptive":
      descriptive_data["questionnaire"].append(question)
    else:
      print(f"Warning: Unknown question type: {question['type']}")

  return mcq_data, descriptive_data

In [83]:
with open('file1.json') as json_file:
    data = json.load(json_file)

In [86]:
a,b = split_dict_by_type(data)

In [87]:
a

{'questionnaire': [{'question': "What was the main cause of political gridlock in Washington during Obama's presidency?",
   'type': 'MCQ',
   'options': ['**A. Republican resistance and increase in filibusters**',
    'B. Lack of cooperation between the parties',
    "C. Disagreements over the president's legislative agenda",
    'D. All of the above'],
   'answer': 'A. Republican resistance and increase in filibusters',
   'difficulty': 'Medium'},
  {'question': 'What actions did President Jefferson take to address the capture of American ships and sailors by pirates off the coast of North Africa?',
   'type': 'MCQ',
   'options': ['A. He negotiated diplomatic solutions with the Barbary States.',
    'B. He increased the size of the U.S. Navy to patrol the area.',
    'C. He took no action, as he did not believe in maintaining a large military.',
    '**D. He led the United States into war against the Barbary States in 1801.**'],
   'answer': 'D. He led the United States into war aga

In [88]:
b

{'questionnaire': [{'question': 'Describe the living conditions of working-class urban dwellers at the time and the resulting public health issues.',
   'type': 'Descriptive',
   'answer': ['The working-class urban dwellers lived in crowded, poorly-ventilated tenement houses and apartments with substandard plumbing and sanitation. This led to the rapid spread of diseases like typhoid and cholera, resulting in high mortality rates in cities like Memphis.'],
   'difficulty': 'Easy'},
  {'question': 'Describe the scene on Wall Street on the day of the stock market crash, known as Black Tuesday, in October 1929.',
   'type': 'Descriptive',
   'answer': ["On the morning of the crash, the opening bell on Wall Street was drowned out by shouts of 'Sell! Sell!' as investors rushed to dump their stock. In the first three minutes, nearly 3 million shares of stock worth $2 million changed hands, and the volume of telegrams and phone calls tripled as investors sought to sell their holdings immediat

In [94]:
z = y.copy()

In [95]:
new_df = pd.DataFrame(columns=["Text Chunk", "MCQ Questions", "Descriptive Answer"])

In [97]:
temp_list = []

In [102]:
for chunk in y:
    messages = [
        {
          "role": "system",
          "content":  f"{ACTOR_PROMPT}"
        },
        {
          "role": "user",
          "content": [
            {
              "type": "text",
              "text": f"""Here's the text from which you need to prepare the questionnaire: {chunk}"""
            }
          ]
        }
      ]
    response = GPT_HOME.gpt_call(messages=messages, engine="claude-3-haiku-20240307", temp=1)
    mcq,desc = split_dict_by_type(process_and_write_json(response))
    new_row = {"Text Chunk": chunk, "MCQ Questions": mcq, "Descriptive Answer": desc}
    new_df = pd.concat([new_df,pd.DataFrame(new_row)],ignore_index=True)
    new_df.to_csv('file3.csv',index=False)
    y.remove(chunk)
    

Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Invoke request failed with status code: 401
Login Successful
Error parsing JSON: Expecting ',' delimiter: line 10 column 100 (char 459)


TypeError: 'NoneType' object is not subscriptable

In [107]:
len(y)

175

In [252]:
def process_chunk(chunk):
  """
  Processes a single text chunk and returns MCQ and descriptive data.

  Args:
      chunk (str): The text chunk for which to generate questions.
      actor_prompt (str): The prompt to send to the GPT model.
      gpt_home (object): The GPT interface object for making GPT calls.

  Returns:
      tuple: A tuple containing two dictionaries, one for MCQ questions and one for Descriptive answer (or None if an error occurs).
  """

  messages = [
      {
          "role": "system",
          "content": f"{ACTOR_PROMPT}"
      },
      {
          "role": "user",
          "content": [
              {
                  "type": "text",
                  "text": f"""Here's the text from which you need to prepare the questionnaire: {chunk}"""
              }
          ]
      }
  ]
  try:
    response = GPT_HOME.gpt_call(messages=messages, engine="claude-3-haiku-20240307", temp=1)
    mcq, desc = split_dict_by_type(process_and_write_json(response))
    return mcq, desc
  except Exception as e:
    print(f"Error processing chunk {chunk}: {e}")
    return None  # Return None on error


def process_and_save(chunk_id, chunk):
  """
  Processes a single text chunk and saves the results to a CSV file.

  Args:
      chunk_id (int): A unique identifier for the chunk.
      chunk (str): The text chunk for which to generate questions.
      actor_prompt (str): The prompt to send to the GPT model.
      gpt_home (object): The GPT interface object for making GPT calls.
  """

  mcq, desc = process_chunk(chunk)

  # Skip saving if there was an error (mcq and desc will be None)
  if mcq is None or desc is None:
    return

  new_row = {"Text Chunk": chunk, "MCQ Questions": mcq, "Descriptive Answer": desc}
  df = pd.DataFrame(new_row)

  # Generate a unique filename based on chunk_id
  filename = f"processed_chunk_{chunk_id}.csv"
  df.to_csv(filename, index=False)


In [253]:
m1,m2 = process_chunk(y[45])

Invoke request failed with status code: 401
Login Successful


In [254]:
m1

{'questionnaire': [{'question': 'What was the primary reason the Republicans nominated Mitt Romney for the 2012 presidential election?',
   'type': 'MCQ',
   'options': ['They were convinced Obama was vulnerable due to opposition to his healthcare program and a weak economy',
    'They wanted to move the party further to the right to appeal to the newly created Tea Party faction',
    "They felt Romney's experience as a business executive would appeal to voters",
    '**All of the above**'],
   'answer': 'All of the above',
   'difficulty': 'Medium'},
  {'question': "What role did the federal government's response to Hurricane Sandy play in the 2012 election?",
   'type': 'MCQ',
   'options': ['It allowed Obama to appeal to voters in the Northeast who were affected by the storm',
    'It likely cost Romney votes in the Northeast due to his long-time criticism of FEMA',
    "It demonstrated the federal government's improved disaster response capabilities since 2005",
    '**All of the a

In [255]:
from concurrent.futures import ThreadPoolExecutor

In [256]:
with ThreadPoolExecutor(max_workers=6) as executor:
    # Assign unique IDs to chunks (assuming order doesn't matter)
    chunk_ids = range(len(y))  # Use enumerate(y) for ordered IDs

    # Process each chunk concurrently
    for chunk_id, chunk in zip(chunk_ids, y):
      executor.submit(process_and_save, chunk_id, chunk)


Invoke request failed with status code: 401
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Invoke request failed with status code: 401
Login Successful
Login SuccessfulLogin Successful

Login Successful
Login Successful
Login Successful
Error parsing JSON: Expecting ',' delimiter: line 38 column 5 (char 1549)
Error processing chunk The election of PresidentFranklin Delano Rooseveltsignaled both immediate relief for the American public as well as a permanent shift in the role of the federal government in guiding the economy and providing direct assistance to the people, albeit through expensive programs that made extensive budget deficits commonplace. For many, the immediate relief was, at a minimum, psychological: Herbert Hoover was gone, and the situation could not grow worse under Roosevelt. But as his New Deal unfolded, Americans learned more about the fun

In [257]:
def concatenate_csvs(csv_pattern):
  """
  Concatenates CSV files matching a pattern into a single DataFrame.

  Args:
      csv_pattern (str): A glob pattern to match CSV files (e.g., "processed_*.csv").

  Returns:
      pandas.DataFrame: The concatenated DataFrame, or None if no files or all empty.
  """

  all_data = pd.DataFrame()
  for filepath in glob.glob(csv_pattern):
    try:
      df = pd.read_csv(filepath)
      # Check if both columns have any non-empty values
      if df[["MCQ Questions", "Descriptive Answer"]].any().any():
        all_data = pd.concat([all_data, df], ignore_index=True)
      else:
        print(f"Skipping empty file: {filepath}")
    except FileNotFoundError:
      print(f"File not found: {filepath}")
    except pd.errors.EmptyDataError:
      print(f"Empty file: {filepath}")

  return all_data if not all_data.empty else None

In [258]:
csv_pattern = "processed_*.csv"  # Adjust pattern as needed
ggg_df = concatenate_csvs(csv_pattern)

In [259]:
ggg_df

Unnamed: 0,Text Chunk,MCQ Questions,Descriptive Answer
0,"In early 1914, Wilson completed his New Freedo...",[{'question': 'What was the main purpose of th...,"[{'question': ""Describe how Wilson's focus on ..."
1,The consumer revolution also made printed mate...,[{'question': 'What was the impact of the cons...,[{'question': 'Describe how the shared trove o...
2,FIGURE15.19Vastly outnumbered by the Union arm...,[{'question': 'What was the name of the Union ...,[{'question': 'Describe the key events that le...
3,"Within a year, Niagara chapters had sprung up ...","[{'question': ""How many states had Niagara cha...",[{'question': 'Describe the role of W.E.B. Du ...
4,Many Progressive reformers were also committed...,[{'question': 'What was the primary argument m...,[{'question': 'Describe the key goals and acti...
...,...,...,...
561,France gained much of the disputed territory a...,"[{'question': 'According to the passage, which...","[{'question': ""Explain how the creation of the..."
562,"The result of the presidential election, ultim...",[{'question': 'What was the outcome of the 187...,[{'question': 'Why were the election results i...
563,\nINTRODUCTION The Bostonians Paying the Excis...,"[{'question': ""What was the central event depi...",[{'question': 'Why did the crowd threaten to h...
564,"To give teeth to the 1764 Sugar Act, the law i...",[{'question': 'What was the main purpose of th...,[{'question': 'Why were colonial violators of ...


In [261]:
ggg_df.to_csv("QuestionSetNewFull.csv",index=False)

In [131]:
final_df.to_csv('save.csv')

In [115]:
new3_df = pd.concat([new_df,final_df],ignore_index=True)

In [116]:
new3_df

Unnamed: 0,Text Chunk,MCQ Questions,Descriptive Answer
0,Southern expansionists had spearheaded the dri...,[{'question': 'What was the primary goal of th...,[{'question': 'Describe the events that led to...
1,Columbus held erroneous views that shaped his ...,[{'question': 'What was one of Columbus' key m...,[{'question': 'Describe how the institution of...
2,What features of the domestic slave trade does...,[{'question': 'Which of the following was not ...,[{'question': 'What features of the domestic s...
3,"Obama won the election, but the Republicans re...",[{'question': 'What impact did Republican resi...,[{'question': 'Explain the living conditions f...
4,"After heated debates, Congress narrowly passed...",[{'question': 'What was the significance of th...,[{'question': 'How did the Kansas-Nebraska Act...
...,...,...,...
233,Those middle-class and wealthier urbanites who...,[{'question': 'Which movement aimed to champio...,[{'question': 'Explain the impact of the City ...
234,"In 1904, angered by the massing of Russian tro...","[{'question': ""What was the reason for Japan's...",[{'question': 'Describe the presidential elect...
235,WHO IS AN AMERICAN? There is nothing new about...,[{'question': 'What caused anxiety over immigr...,[{'question': 'Describe the impact of the demo...
236,"The idea of a ""city upon a hill"" made clear th...",[{'question': 'What was the goal stated in the...,"[{'question': ""Describe the impact of women's ..."


In [128]:
j=0
for h in y:
    j+=int(tiktoken_runner.whatsthetokencount(h))

In [129]:
j

245569

In [175]:
json_string3 = new3_df.iloc[77]["MCQ Questions"]

In [177]:
json_string3

'[{\'question\': \'What policy did Clinton adopt concerning gays serving in the military after the 1992 election?\', \'type\': \'MCQ\', \'options\': [\'Lifting the ban on gays serving openly\', "Adopting a policy of \'don\'t ask, don\'t tell\'", \'Banning gays and lesbians from the military\', \'Encouraging open discussion of sexual orientation\'], \'answer\': "Adopting a policy of \'don\'t ask, don\'t tell\'", \'difficulty\': \'Medium\'}, {\'question\': \'What was the immediate impact of the boycott by African American riders in response to segregation on public transportation?\', \'type\': \'MCQ\', \'options\': [\'Financial loss for the transportation system\', \'Increased use of African American-owned taxis\', \'Immediate end to segregation on buses\', \'Successful negotiation with the government\'], \'answer\': \'Increased use of African American-owned taxis\', \'difficulty\': \'Hard\'}]'

In [176]:
question_list = json.loads(json_string3)

# Now you can access elements using integer indices
for question in question_list:
  print(question['question'])  # Access question text
  print(question['options'])  # Access options list

JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 3 (char 2)

In [262]:
def format_question_list(question_list):
  """
  Formats a list of questions into a human-readable string.

  Args:
      question_list (list): A list of dictionaries containing question data.

  Returns:
      str: The formatted question string.
  """

  formatted_questions = []
  for question in question_list:
    print(question)
    formatted_question = f"Question: {question['question']}\n"
    options_str = ""
    formatted_answer = f"Answer: {question['answer'][0]}\n"
    formatted_difficulty = f"Difficulty: {question['difficulty']}\n"
    if question["type"] == "MCQ":
      for i, option in enumerate(question["options"], start=1):
        options_str += f"Option {chr(i+64)}: {option}\n"  # Start options from A (chr(65))
      formatted_answer = f"Answer: {question['answer']}\n"
      formatted_question += f"Options:\n{options_str}"
    formatted_question += formatted_answer
    formatted_question += formatted_difficulty
    formatted_questions.append(formatted_question)
  return "\n".join(formatted_questions)

In [275]:
def parse_question_paper(row):
  """
  Parses MCQ Questions and Descriptive Answer columns from a DataFrame row into a question paper format.

  Args:
      row (pd.Series): A row from the DataFrame containing 'MCQ Questions' and 'Descriptive Answer' columns.

  Returns:
      str: The parsed question paper string.
  """

  question_paper = ""

  # Process MCQ Questions
  mcq_questions = eval(row["MCQ Questions"])
  for question in mcq_questions:
    question_paper += f"Question: {question['question']}\n"
    if question['type'] == 'MCQ':
      options_str = ""
      for i, option in enumerate(question["options"], start=1):
        options_str += f"Option {chr(i+64)}: {option}\n"  # Start options from A (chr(65))
      question_paper += f"Options:\n{options_str}"
    question_paper += f"Difficulty: {question['difficulty']}\n"
    question_paper += f"Answer: {question['answer']}\n\n"

  # Process Descriptive Answer
  descriptive_answers = eval(row["Descriptive Answer"])
  for question in descriptive_answers:
    question_paper += f"Question: {question['question']}\n"
    question_paper += f"Difficulty: {question['difficulty']}\n"
    question_paper += f"Answer: {question['answer'][0]}\n\n"  # Assuming answer is a list with one element

  return question_paper.strip()  # Remove trailing newline

In [282]:
old_df = pd.read_csv("QuestionSetFull.csv")

In [283]:
old_df["Question Paper"] = old_df.apply(parse_question_paper, axis=1)

In [285]:
print(old_df['Question Paper'][6])

Question: What was the main goal of Operation Desert Storm?
Options:
Option A: **To maintain the peace and use force if necessary**
Option B: To overthrow Saddam Hussein's regime
Option C: To demonstrate the military power of the United States
Option D: To protect the oil interests of the United States
Difficulty: Easy
Answer: To maintain the peace and use force if necessary

Question: Which of the following was NOT a key factor that contributed to President Bush's weaknesses as a 'culture warrior'?
Options:
Option A: He was a moderate, Connecticut-born Episcopalian
Option B: He was a pragmatic politician and a life-long civil servant
Option C: He was adept at catering to post-Reagan conservatives
Option D: **He appeared incapable of capitalizing on his history of moderation and pragmatism regarding women's rights and access to abortion**
Difficulty: Medium
Answer: He was adept at catering to post-Reagan conservatives

Question: Explain how the nomination of Clarence Thomas to the Supr

In [277]:
ggg_df["Question Paper"] = ggg_df.apply(parse_question_paper, axis=1)

In [278]:
ggg_df

Unnamed: 0,Text Chunk,MCQ Questions,Descriptive Answer,QP,Question Paper
0,"In early 1914, Wilson completed his New Freedo...",[{'question': 'What was the main purpose of th...,"[{'question': ""Describe how Wilson's focus on ...",[{'question': 'What was the main purpose of th...,Question: What was the main purpose of the Cla...
1,The consumer revolution also made printed mate...,[{'question': 'What was the impact of the cons...,[{'question': 'Describe how the shared trove o...,[{'question': 'What was the impact of the cons...,Question: What was the impact of the consumer ...
2,FIGURE15.19Vastly outnumbered by the Union arm...,[{'question': 'What was the name of the Union ...,[{'question': 'Describe the key events that le...,[{'question': 'What was the name of the Union ...,Question: What was the name of the Union fight...
3,"Within a year, Niagara chapters had sprung up ...","[{'question': ""How many states had Niagara cha...",[{'question': 'Describe the role of W.E.B. Du ...,"[{'question': ""How many states had Niagara cha...",Question: How many states had Niagara chapters...
4,Many Progressive reformers were also committed...,[{'question': 'What was the primary argument m...,[{'question': 'Describe the key goals and acti...,[{'question': 'What was the primary argument m...,Question: What was the primary argument made b...
...,...,...,...,...,...
561,France gained much of the disputed territory a...,"[{'question': 'According to the passage, which...","[{'question': ""Explain how the creation of the...","[{'question': 'According to the passage, which...","Question: According to the passage, which terr..."
562,"The result of the presidential election, ultim...",[{'question': 'What was the outcome of the 187...,[{'question': 'Why were the election results i...,[{'question': 'What was the outcome of the 187...,Question: What was the outcome of the 1876 pre...
563,\nINTRODUCTION The Bostonians Paying the Excis...,"[{'question': ""What was the central event depi...",[{'question': 'Why did the crowd threaten to h...,"[{'question': ""What was the central event depi...",Question: What was the central event depicted ...
564,"To give teeth to the 1764 Sugar Act, the law i...",[{'question': 'What was the main purpose of th...,[{'question': 'Why were colonial violators of ...,[{'question': 'What was the main purpose of th...,Question: What was the main purpose of the 176...


In [286]:
def create_jsonl(df, actor_prompt, output_file="questionpaperold.jsonl"):
  """
  Creates a JSON Lines file from a DataFrame.

  Args:
      df (pd.DataFrame): The DataFrame containing "Text Chunk" and "Question Paper" columns.
      actor_prompt (str): The fixed string value for the "Instruct" key.
      output_file (str, optional): The name of the output JSON Lines file. Defaults to "questions.jsonl".
  """

  with open(output_file, "w") as f:
    for index, row in df.iterrows():
      data = {
          "Instruct": actor_prompt,
          "Input": row["Text Chunk"],
          "Output": row["Question Paper"],
      }
      json.dump(data, f)
      f.write("\n")

In [294]:
ACTOR_PROMPT = """Prompt:
You are a question-maker agent tasked with creating a comprehensive questionnaire from the following input text out of a history book, for use in an academic setting with learners of a higher intelligence level

Your goal is to create a comprehensive questionnaire based on the input text that will test learners' understanding across a range of intelligence levels. The questionnaire should include an equal mix of multiple choice questions (MCQs) and descriptive answer questions.

Here is the step-by-step process to follow:
<scratchpad>
1. Read through the entire input text carefully. Identify the main topics and sections covered.

2. For each main topic or section that you identified:
- Come up with one or more multiple choice questions (MCQs) depending on the size of the text. These MCQs should span a range of difficulties
- For each MCQ, provide 4 answer options and bold the correct answer.
- Generate a description based question for which the learner would have to write a medium-long answer. Try to test them on their capabilities like synthesis, cause/effect, interpretation etc., 
- For each descriptive question, generate a short answer based on the input text that an evaluator can make use of. It doesn't have to be the exact answer. Just points that an evaluator can use to grade the learner.

3. After generating questions for each main topic, review your questionnaire as a whole. Check that
you have:
- Good coverage of all the important points from the input text
- A relatively even distribution of easy, moderate, and challenging questions
- Add in any additional questions needed to improve the balance and coverage

4. Before submitting your final questionnaire, proofread it carefully to check for any errors or opportunities for improvement.
</scratchpad>

Remember, your fundamental goal is to create a questionnaire that comprehensively tests understanding of the input text, for learners with a higher levels. Strive to make your questions clear, insightful, and appropriately challenging. The quality and thoughtfulness of your questions is more important than the quantity.

Do not output the above scratchpad.
Now, Your response should only be a questionnaire ."""

In [296]:
create_jsonl(old_df,ACTOR_PROMPT,"new2.jsonl")

In [297]:
def concatenate_and_count_jsonl(file1, file2):
  """
  Concatenates two JSON Lines files and returns the total number of lines.

  Args:
      file1 (str): Path to the first JSON Lines file.
      file2 (str): Path to the second JSON Lines file.

  Returns:
      int: The total number of lines in the concatenated file.
  """

  total_lines = 0
  with open(file1, "r") as f1, open("combined2.jsonl", "w") as f_combined:
    for line in f1:
      total_lines += 1
      f_combined.write(line)

    with open(file2, "r") as f2:
        for line in f2:
          total_lines += 1
          f_combined.write(line)

  print(f"Total lines: {total_lines}")

# Replace 'file1.jsonl' and 'file2.jsonl' with your actual file paths
concatenate_and_count_jsonl("new1.jsonl", "new2.jsonl")

Total lines: 974


In [205]:
def format_question_list_mcq(question_list):
  """
  Formats a list of questions into a human-readable string.

  Args:
      question_list (list): A list of dictionaries containing question data.

  Returns:
      str: The formatted question string.
  """

  formatted_questions = []
  try:
    # Attempt to convert the string to a list using json.loads
    if isinstance(question_list, str):
      question_list = eval(question_list)  # Remove leading/trailing whitespace

    for question in question_list:
      formatted_question = f"Question: {question['question']}\n"
      options_str = ""
      if question["type"] == "MCQ":
        for i, option in enumerate(question["options"], start=1):
          options_str += f"Option {chr(i+64)}: {option}\n"
      formatted_question += f"Options:\n{options_str}"
      formatted_answer = f"Answer: {question['answer']}\n"
      formatted_difficulty = f"Difficulty: {question['difficulty']}\n"
      formatted_question += formatted_answer
      formatted_question += formatted_difficulty
      formatted_questions.append(formatted_question)
  except (TypeError) as e:
    print(f"Error formatting questions: {e}")
    return None  # Return None on error

  return "\n".join(formatted_questions)

In [208]:
def dataframe_to_json(df):
  """
  Converts a DataFrame with specific columns to a list of dictionaries in JSON format.

  Args:
      df (pandas.DataFrame): The DataFrame containing question data.

  Returns:
      list: A list of dictionaries in the desired JSON format, or None if all rows have errors.
  """

  json_data = []
  for index, row in df.iterrows():
    input_id = index  # Use index as input_id
    text_chunk = row["Text Chunk"]
    try:
      descriptive_answers = format_question_list(row["Descriptive Answer"])
    except Exception as e:
      print(f"Error processing row {index}: {e}")
      continue  # Skip to next row on error

    if descriptive_answers is None:
      print(f"Skipping row {index}: All questions have errors")
      continue  # Skip row if both question lists have errors

    json_data.append({
        "input_id": input_id,
        "Instruct": "Generate a descriptive answer based Questionnaire for me from the provided Text",
        "Input": text_chunk,
        "Output": descriptive_answers
    })
  with open("output.json", "w") as outfile:
      json.dump(json_data, outfile, indent=4)
      print("JSON data written to output.json")

In [209]:
def dataframe_to_json_mcq(df):
  """
  Converts a DataFrame with specific columns to a list of dictionaries in JSON format.

  Args:
      df (pandas.DataFrame): The DataFrame containing question data.

  Returns:
      list: A list of dictionaries in the desired JSON format, or None if all rows have errors.
  """

  json_data = []
  for index, row in df.iterrows():
    input_id = index  # Use index as input_id
    text_chunk = row["Text Chunk"]
    try:
      mcq_questions = format_question_list_mcq(row["MCQ Questions"])
    except Exception as e:
      print(f"Error processing row {index}: {e}")
      continue  # Skip to next row on error

    if mcq_questions is None:
      print(f"Skipping row {index}: All questions have errors")
      continue  # Skip row if both question lists have errors

    json_data.append({
        "input_id": input_id,
        "Instruct": "Generate a MCQ based Questionnaire for me from the provided Text",
        "Input": text_chunk,
        "Output": mcq_questions
    })
  with open("output_mcq.json", "w") as outfile:
      json.dump(json_data, outfile, indent=4)
      print("JSON data written to output_mcq.json")

In [210]:
dataframe_to_json_mcq(new3_df)

JSON data written to output_mcq.json


In [None]:
json_data

In [198]:

formatted_string = format_question_list(new3_df.iloc[79]["Descriptive Answer"])
print(formatted_string)
print("\n")  # Add newline between questions

Question: Describe the role of the Committee of Public Information during the war and the impact of its efforts?
Answer: The Committee of Public Information was created to develop a propaganda machine to encourage sacrifices during the war and to cultivate anti-German sentiment. It employed artists, speakers, writers, and filmmakers, and succeeded in molding an anti-German sentiment across the country, leading to measures like banning the teaching of German language, refusing to serve German food in some restaurants, and boycotting music written by German composers. The impact was widespread anti-German sentiment and changes in cultural practices as a result.
Difficulty: Difficult





In [197]:
new3_df.iloc[235]["Descriptive Answer"]

'[{\'question\': \'Describe the impact of the demographic changes on the United States based on the 1980s concerns.\', \'type\': \'Descriptive\', \'answer\': ["The concerns in the 1980s were driven by the realization that the growing number of Americans of color and multiethnic Americans, as well as the increasing percentage of people with non-European ancestry, would lead to the White majority becoming a demographic minority. This shift was anticipated to bring changes in the country\'s social and cultural landscape, and raise questions about power dynamics and representation."]}, {\'question\': "Explain the key reasons for Lincoln\'s decision to seek a replacement for General McClellan during the Civil War.", \'type\': \'Descriptive\', \'answer\': ["Lincoln sought a replacement for General McClellan because of the Union army\'s inability to achieve a decisive victory over Lee\'s army at Antietam. He also had personal issues with McClellan, who openly criticized and insulted the presi

In [186]:
for question in eval(new3_df.iloc[79]["Descriptive Answer"]):
    print(question)

{'question': 'Describe the role of the Committee of Public Information during the war and the impact of its efforts?', 'type': 'Descriptive', 'answer': ['The Committee of Public Information was created to develop a propaganda machine to encourage sacrifices during the war and to cultivate anti-German sentiment. It employed artists, speakers, writers, and filmmakers, and succeeded in molding an anti-German sentiment across the country, leading to measures like banning the teaching of German language, refusing to serve German food in some restaurants, and boycotting music written by German composers. The impact was widespread anti-German sentiment and changes in cultural practices as a result.'], 'difficulty': 'Difficult'}


In [222]:
def convert_to_jsonlines(filepath, output_filepath="outputmcq.jsonl"):
  """
  Converts a JSON file containing a list of dictionaries to JSON Lines format.

  Args:
      filepath (str): The path to the input JSON file.
      output_filepath (str, optional): The path to the output JSON Lines file. Defaults to "output.jsonl".
  """

  try:
    with open(filepath, "r") as jsonfile:
      data = json.load(jsonfile)
      if not isinstance(data, list):
        print(f"Error: JSON data is not a list at {filepath}")
        return

      with open(output_filepath, "w") as outfile:
        for item in data:
          json.dump(item, outfile)
          outfile.write("\n")  # Add newline for each record
  except FileNotFoundError:
    print(f"Error: File not found: {filepath}")
  except json.JSONDecodeError as e:
    print(f"Error parsing JSON file {filepath}: {e}")

In [223]:
convert_to_jsonlines('output_mcq.json')

In [211]:
def validate_json_file(filepath):
  """
  Validates a JSON file containing a list of dictionaries.

  Args:
      filepath (str): The path to the JSON file.

  Returns:
      list: A list of valid dictionaries or None if the file is invalid or empty.
  """

  valid_data = []
  try:
    with open(filepath, "r") as jsonfile:
      data = json.load(jsonfile)
      if not isinstance(data, list):
        print(f"Error: JSON data is not a list at {filepath}")
        return None
      for item in data:
        try:
          json.dumps(item)  # Attempt to serialize each item as JSON
          valid_data.append(item)
        except (json.JSONDecodeError, TypeError) as e:
          print(f"Error validating dictionary in {filepath}: {e}")
  except FileNotFoundError:
    print(f"Error: File not found: {filepath}")
    return None
  except json.JSONDecodeError as e:
    print(f"Error parsing JSON file {filepath}: {e}")
    return None

  return valid_data

In [214]:
filepath = "output.json"  # Replace with your actual file path
valid_data = validate_json_file(filepath)
if valid_data:
    print("JSON file is valid and contains a list of valid dictionaries.")
else:
    print("JSON file is invalid or empty.")

JSON file is valid and contains a list of valid dictionaries.


In [225]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [226]:
dataset = load_dataset('json', data_files='outputdescriptive.jsonl')

Generating train split: 237 examples [00:00, 24931.66 examples/s]


In [227]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_id', 'Instruct', 'Input', 'Output'],
        num_rows: 237
    })
})

In [230]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct
from qdrant_client.models import VectorParams, Distance

In [229]:
qdrant_url = "https://cefe850c-b463-45e8-b554-eb69961cd6c4.us-east4-0.gcp.cloud.qdrant.io:6333"
qdrant_key = "EAnJmEmGj3gXeztCX8cMH8JXzn5oemxowMUwHZg0wnvYS7GcBTS8bw"

In [232]:
cloud_client = QdrantClient(
            url = qdrant_url,
            api_key = qdrant_key,
            verify = False)

cloud_client.recreate_collection(
        collection_name="FOTS Manuals",
        vectors_config=VectorParams(size=2048, distance=Distance.COSINE,on_disk = True)
    )

local_client = QdrantClient("localhost",port=6333)

local_client.migrate(cloud_client,["FOTS Manuals"],batch_size = 100,recreate_on_collision=True)

  cloud_client.recreate_collection(


In [233]:
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'sklearn'

In [234]:
import json
from sklearn.model_selection import train_test_split

def load_and_transform_data(file_path, output_key):
    with open(file_path, 'r') as file:
        data = json.load(file)

    transformed_data = []
    for item in data:
        if output_key:  
            output = item['Output'][output_key]
        else:
            output = item['Output']

        new_item = {
            "instruction": f"{item['Instruct']} - {item['Input']}",
            "output": output
        }
        transformed_data.append(new_item)
    return transformed_data

data1 = load_and_transform_data('output_mcq.json', None)

data2 = load_and_transform_data('output.json', 'Descriptive Answer')

combined_data = data1 + data2

train_data, test_data = train_test_split(combined_data, test_size=0.2, random_state=42)

for i, item in enumerate(train_data):
    item['id'] = i

for i, item in enumerate(test_data):
    item['id'] = i

with open('train_data.json', 'w') as file:
    json.dump(train_data, file, indent=4)

with open('test_data.json', 'w') as file:
    json.dump(test_data, file, indent=4)

print("Success")

Success


In [292]:
import json

dataxy = []
with open('combined.jsonl') as f:
    for line in f:
        dataxy.append(json.loads(line))

In [293]:
len(dataxy)

974

In [4]:
ggg_dff = pd.read_csv("QuestionSetNewFull.csv")

In [9]:
ggg_dff['MCQ Questions'][343]

"[{'question': 'What was one of the main difficulties the Confederation Congress faced in establishing foreign and commercial policies under the Articles of Confederation?', 'type': 'MCQ', 'options': ['Each state could decide whether to comply with treaties between the Congress and foreign countries, and there were no means of enforcement', 'The Confederation Congress was unable to pass any directives or policies', 'The Confederation Congress focused solely on settling western lands and did not address foreign or commercial policy', 'The Confederation Congress was unable to raise revenue to support the economy'], 'answer': 'Each state could decide whether to comply with treaties between the Congress and foreign countries, and there were no means of enforcement', 'difficulty': 'Medium'}, {'question': 'What was one of the key outcomes of the land ordinances passed by the Confederation Congress?', 'type': 'MCQ', 'options': ['Slavery was prohibited in all the new territories', 'The Mississ

In [19]:
import pandas as pd
import ast

def process_cell(cell_content):
  """
  Processes a cell containing a string representation of a list of dictionaries.

  Args:
    cell_content (str): The cell content to be processed.

  Returns:
    tuple: A tuple containing two lists:
      - questions: A list of extracted question strings.
      - answers: A list of extracted answer strings.
  """

  # Load the string as a list of dictionaries using ast.literal_eval
  data = ast.literal_eval(cell_content)

  questions = []
  answers = []
  for item in data:
    questions.append(str(item['question']))
    answers.append(str(item['answer']))

  return questions, answers

# Assuming your DataFrame is called 'df' and the column containing the list of dictionaries is named 'data_column'
ggg_dff['question_column_desc'] = ggg_dff['Descriptive Answer'].apply(lambda x: process_cell(x)[0])
ggg_dff['answer_column_desc'] = ggg_dff['Descriptive Answer'].apply(lambda x: process_cell(x)[1])

# Drop the original 'data_column' if desired
# df.drop('data_column', axis=1, inplace=True)



In [16]:
ggg_dff['question_column'][5][0]

'By the 1860s and 1870s, what was the main challenge faced by individual efforts to locate precious metals?'

In [27]:
ggg_dff['concatenated_mcq_ques'] = ggg_dff['question_column'].apply(lambda x: ', '.join(x) if x else '')

In [28]:
ggg_dff

Unnamed: 0,Text Chunk,MCQ Questions,Descriptive Answer,question_column,answer_column,question_column_desc,answer_column_desc,concatenated_mcq_ques
0,"In early 1914, Wilson completed his New Freedo...",[{'question': 'What was the main purpose of th...,"[{'question': ""Describe how Wilson's focus on ...",[What was the main purpose of the Clayton Anti...,[To expand the power of the Sherman Antitrust ...,[Describe how Wilson's focus on foreign affair...,"[[""Wilson's focus on foreign affairs and small...",What was the main purpose of the Clayton Antit...
1,The consumer revolution also made printed mate...,[{'question': 'What was the impact of the cons...,[{'question': 'Describe how the shared trove o...,[What was the impact of the consumer revolutio...,"[**There was a flood of journals, books, pamph...",[Describe how the shared trove of printed matt...,"[[""The shared trove of printed matter, such as...",What was the impact of the consumer revolution...
2,FIGURE15.19Vastly outnumbered by the Union arm...,[{'question': 'What was the name of the Union ...,[{'question': 'Describe the key events that le...,[What was the name of the Union fighting force...,"[Army of the Potomac, To transform the Civil W...",[Describe the key events that led to the start...,[['The election of Abraham Lincoln in 1860 was...,What was the name of the Union fighting force ...
3,"Within a year, Niagara chapters had sprung up ...","[{'question': ""How many states had Niagara cha...",[{'question': 'Describe the role of W.E.B. Du ...,[How many states had Niagara chapters within a...,"[**21 states**, **Internal fights over the rol...",[Describe the role of W.E.B. Du Bois within th...,"[[""W.E.B. Du Bois served as the influential di...",How many states had Niagara chapters within a ...
4,Many Progressive reformers were also committed...,[{'question': 'What was the primary argument m...,[{'question': 'Describe the key goals and acti...,[What was the primary argument made by Fredric...,[To place workers in the most efficient positi...,[Describe the key goals and activities of the ...,[['The settlement house movement aimed to prov...,What was the primary argument made by Fredrick...
...,...,...,...,...,...,...,...,...
561,France gained much of the disputed territory a...,"[{'question': 'According to the passage, which...","[{'question': ""Explain how the creation of the...","[According to the passage, which territory did...","[All of the above, The war guilt clause, $33 b...",[Explain how the creation of the League of Nat...,"[[""The passage states that even before Wilson'...","According to the passage, which territory did ..."
562,"The result of the presidential election, ultim...",[{'question': 'What was the outcome of the 187...,[{'question': 'Why were the election results i...,[What was the outcome of the 1876 presidential...,[**Hayes won the electoral college but lost th...,"[Why were the election results in Florida, Lou...",[['The results in these three states were disp...,What was the outcome of the 1876 presidential ...
563,\nINTRODUCTION The Bostonians Paying the Excis...,"[{'question': ""What was the central event depi...",[{'question': 'Why did the crowd threaten to h...,[What was the central event depicted in the pr...,"[The tarring and feathering of John Malcolm, B...",[Why did the crowd threaten to hang John Malco...,[['The crowd threatened to hang John Malcolm b...,What was the central event depicted in the pri...
564,"To give teeth to the 1764 Sugar Act, the law i...",[{'question': 'What was the main purpose of th...,[{'question': 'Why were colonial violators of ...,[What was the main purpose of the 1764 Sugar A...,"[**To raise revenue for the British Empire**, ...",[Why were colonial violators of the Navigation...,[['Colonial juries were often sympathetic to m...,What was the main purpose of the 1764 Sugar Ac...


In [22]:
with open(r'chunks.txt', 'w') as fp:
    for item in ggg_dff['Text Chunk'].to_list():
        # write each item on a new line
        fp.write("%s\n" % item)
    print('Done')

Done


In [29]:
with open(r'mcqs.txt', 'w') as fp:
    for item in ggg_dff['concatenated_mcq_ques'].to_list():
        # write each item on a new line
        fp.write("%s\n" % item)
    print('Done')

Done
