In [None]:
# import pandas as pd
import pdfplumber
import re


In [30]:
import re

def extract_grade(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        first_page = pdf.pages[0]
        text = first_page.extract_text()
        # Assuming the grade is formatted as "Grade 6" or "6th Grade"
        match = re.search(r"Grade (\d+)", text)
        if match:
            return match.group(1)
        # Fallback for different format
        match = re.search(r"(\d+)(?:th|nd|rd|st) Grade", text)
        if match:
            return match.group(1)
    return "Unknown"  # Default if not found


In [31]:
# Function to extract strands and sub-strands with page numbers in smaller segments
def extract_strands_sub_strands_with_page_segmented(pdf_path, start_page, end_page):
    extracted_data = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for page_number in range(start_page, min(end_page, len(pdf.pages))):
            try:
                page = pdf.pages[page_number]
                tables = page.extract_tables()
                for table in tables:
                    if "Strand" in table[0] and "Sub Strand" in table[0]:
                        for row in table[1:]:
                            if row[0] and row[1]:
                                extracted_data.append({
                                    "page_number": page_number + 1,
                                    "grade": extract_grade(pdf_path),
                                    "strand": row[0],
                                    "sub_strand": row[1]
                                })
            except Exception as e:
                print(f"Error processing page {page_number + 1}: {e}")

    return extracted_data

# Example usage: Process the PDF in segments
pdf_path = '../data/math_downloads/ALL_MATH/GRADE 5 CURRICULUM DESIGNS- MATHEMATICS.pdf'
# pdf_path = '../data/raw/GRADE 6 CURRICULUM DESIGNS- HOME SCIENCE.pdf'
total_pages = 100  # Assuming the total number of pages in the PDF
segment_size = 10  # Number of pages to process at a time

all_strands_sub_strands = []
for start_page in range(0, total_pages, segment_size):
    extracted_data = extract_strands_sub_strands_with_page_segmented(pdf_path, start_page, start_page + segment_size)
    all_strands_sub_strands.extend(extracted_data)




In [32]:
print(all_strands_sub_strands)

[]


In [93]:
### Extract all the tables from the pdf from page 2 onwards. Include the reference page number for each table
pdf_path = '../data/math_downloads/ALL_MATH/mathematics-grade-two-curriculum-design.pdf'
pdf = pdfplumber.open(pdf_path)
all_tables = []
for i in range(0, len(pdf.pages)):
    page = pdf.pages[i]
    tables = page.extract_tables()
    for table in tables:
        all_tables.append({
            "page_number": i + 1,
            "table_position_in_page": tables.index(table) + 1,  # 1-indexed
            "number_of_columns": len(table[0]),
            "table": table
        })
pdf.close()


In [110]:
def split_tables_with_strand_change(page_tables):
    
    # Initialize variables for splitting
    split_index = None

    # Find the table with the new strand
    for index, table_info in enumerate(page_tables):
        table = table_info['table']
        # Check if "Strand" is in any cell of the table
        if any("Strand" in str(cell) for row in table for cell in row):
            split_index = index
            break

    if split_index is not None:
        # Tables before the strand change
        tables_before_strand = page_tables[:split_index]
        # Tables at and after the strand change
        tables_after_strand = page_tables[split_index:]
        return tables_before_strand, tables_after_strand
    else:
        return page_tables, None


In [None]:
import pdfplumber

def split_tables_with_strand_change(tables):
    for index, table in enumerate(tables):
        # Check if "Strand" is in any cell of the table
        if any("Strand" in str(cell) for row in table for cell in row):
            return tables[:index], tables[index:]
    return tables, []

def extract_tables_grouped_by_strand(pdf_path, page_start, page_end):
    with pdfplumber.open(pdf_path) as pdf:
        all_strands = []
        current_strand = []
        strand_counter = 1

        for page_number in range(page_start - 1, page_end):  # Adjusted for 0-based indexing
            page = pdf.pages[page_number]
            tables = page.extract_tables()
            text = page.extract_text()

            if "Strand" in text:  # Check if "STRAND" is found on the page
                if current_strand:  # If current strand is not empty
                    tables_before_strand, tables_after_strand = split_tables_with_strand_change(tables)
                    for table in tables_before_strand if tables_before_strand else []:
                        current_strand.append({
                            "page_number": page_number + 1,
                            "number_of_columns": len(table[0]),
                            "table_number": tables.index(table) + 1,  # 1-indexed
                            "table": table
                        })
                    all_strands.append({"strand_" + str(strand_counter): current_strand})
                    strand_counter += 1
                    tables = tables_after_strand
                    current_strand = []

            for table in tables:
                current_strand.append({
                    "page_number": page_number + 1,
                    "number_of_columns": len(table[0]),
                    "table_number": tables.index(table) + 1,  # 1-indexed
                    "table": table
                })

        # Append the last strand if not empty
        if current_strand:
            all_strands.append({"strand_" + str(strand_counter): current_strand})

        return all_strands

# Example usage
pdf_path = '../data/math_downloads/ALL_MATH/mathematics-grade-one-curriculum-activities.pdf'
grouped_tables = extract_tables_grouped_by_strand(pdf_path, 12, 52)
import json
with open('../data/math_downloads/ALL_MATH/JSON/mathematics-grade-one-curriculum-activities.json', 'w') as f:
    json.dump(grouped_tables, f, indent=4)


In [144]:
print(grouped_tables)

[{'strand_1': [{'page_number': 13, 'number_of_columns': 4, 'table_number': 1, 'table': [['Strand', 'Sub Strand', 'Specific Learning Outcomes', 'Suggested Learning Experiences'], ['1.0 NUMBERS', '1.1 WHOLE\nNUMBERS\n(20 Lessons)', 'By the end of the sub strand, the learner\nshould be able to:\na) use place value and total value of\ndigits up to tens of thousands in daily\nlife situations,\nb) read and write numbers up to 10,000\nin symbols in real life situations,\nc) read and write numbers up to 1,000 in\nwords in day to day activities,\nd) order numbers up to 1,000 in different\nsituations,\ne) round off numbers up to 1,000 to the\nnearest ten in different situations,\nf) identify factors/divisors of numbers\nup to 50 in different contexts,\ng) identify multiples of numbers up to\n100 in different situations,\nh) use even and odd numbers up to 100\nin different situations,\ni) represent Hindu Arabic numerals\nusing Roman numerals up to ‘X’ in\ndifferent situations,\nj) make patterns i

In [172]:
def contains_keywords(strands_data, keywords = ["Strand", "Sub Strand"]):
    for strand in strands_data:
        for row in strand['table']:
            for cell in row:
                if cell and any(keyword in cell for keyword in keywords):
                    keywords_str = "' or '".join(keywords)
                    print(f"The strands data contains '{keywords_str}'.")
                    return True
    keywords_str = "' or '".join(keywords)
    print(f"The strands data does not contain '{keywords_str}'.")
    return False


In [198]:
def contains_keywords(strands_data, keywords=["Strand", "Sub Strand"]):
    # Convert keywords to lowercase for case-insensitive comparison
    lower_keywords = [keyword.lower() for keyword in keywords]

    for strand in strands_data:
        for row in strand['table']:
            for cell in row:
                if cell and any(lower_keyword in cell.lower() for lower_keyword in lower_keywords):
                    keywords_str = "' or '".join(keywords)
                    print(f"The strands data contains '{keywords_str}'.")
                    return True
    keywords_str = "' or '".join(keywords)
    print(f"The strands data does not contain '{keywords_str}'.")
    return False


def find_keyword_columns(strands_data, keywords):
    column_indexes = {keyword: -1 for keyword in keywords}
    for strand in strands_data:
        for row in strand['table']:
            for index, cell in enumerate(row):
                if cell:
                    processed_cell = cell.replace('\n', ' ').lower()
                    for keyword in keywords:
                        processed_keyword = keyword.replace('\n', ' ').lower()
                        if processed_keyword in processed_cell and column_indexes[keyword] == -1:
                            column_indexes[keyword] = index
    return column_indexes

In [197]:
def merge_strand_and_indicator(tables):
    # Select json where number_of_columns >= 4
    tables = [table for table in tables if table['number_of_columns'] >= 4]
  
    # Split where "table".first contains "Indicator"
    split_index = None
    for table in tables:
        if "Indicator" in table['table'][0][0]:
            split_index = tables.index(table)
            break
    else:
        split_index = len(tables)
    tables_before_indicator = tables[:split_index]
    tables_after_indicator = tables[split_index:]

    # Merge tables_before_indicator["table"] into one table
    sub_strand = []
    for table in tables_before_indicator:
        sub_strand.extend(table['table'])

    # Merge tables_after_indicator["table"] into one table
    rubrics = []
    for table in tables_after_indicator:
        rubrics.extend(table['table'])

    # Process sub_strand and rubrics
    strand_data = {
        "strand": sub_strand[1][0],
        "sub_strand": sub_strand[1][1],
        "specific_learning_outcomes": [],
        "suggested_learning_experiences": [],
        "key_inquiry_questions": [],
        "assessment_rubrics": []
    }

    # Extracting specific learning outcomes and suggested learning experiences
    for row in sub_strand[2:]:
        strand_data["specific_learning_outcomes"].append(row[2].replace('\n', ' ').strip())
        strand_data["suggested_learning_experiences"].append(row[3].replace('\n', ' ').strip())
        strand_data["key_inquiry_questions"].append(row[4].replace('\n', ' ').strip())

    # Extracting rubrics
    for i in range(1, len(rubrics)):  # Skipping the header row
        indicator = rubrics[i][0].replace('\n', ' ').strip()
        exceeds = rubrics[i][1].replace('\n', ' ').strip()
        meets = rubrics[i][2].replace('\n', ' ').strip()
        approaches = rubrics[i][3].replace('\n', ' ').strip()
        below = rubrics[i][4].replace('\n', ' ').strip()

        strand_data["assessment_rubrics"].append({
            "indicator_name": indicator,
            "rubrics": [
                {"level": "Exceeds Expectations", "statement": exceeds},
                {"level": "Meets Expectations", "statement": meets},
                {"level": "Approaches Expectations", "statement": approaches},
                {"level": "Below Expectations", "statement": below}
            ]
        })

    return strand_data


In [169]:
strand_data

AttributeError: 'NoneType' object has no attribute 'replace'

In [182]:
import pdfplumber

def extract_tables_grouped_by_strand(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        all_tables = []
        for i in range(0, len(pdf.pages)):
            page = pdf.pages[i]
            tables = page.extract_tables()
            for table in tables:
                all_tables.append({
                    "page_number": i + 1,
                    "number_of_columns": len(table[0]),
                    "table": table
                })

        all_strands = []
        current_strand = []
        strand_counter = 1

        for table_info in all_tables:
            page_number = table_info["page_number"]
            table = table_info["table"]
            
            if "Strand" in ' '.join(filter(None, table[0])):  # Check if "Strand" is in the first row of the table
                if current_strand:  # If current strand is not empty
                    all_strands.append({"strand_" + str(strand_counter): current_strand})
                    strand_counter += 1
                    current_strand = []

            current_strand.append({
                "page_number": page_number,
                "number_of_columns": len(table[0]),
                "table": table
            })

        # Append the last strand if not empty
        if current_strand:
            all_strands.append({"strand_" + str(strand_counter): current_strand})

        return all_strands

# Example usage
# pdf_path = '/mnt/data/GRADE 5 CURRICULUM DESIGNS- MATHEMATICS.pdf'
## Loop through all the pdf files in the folder

folder_path = "../data/math_downloads/ALL_MATH/"
folder_write_path = "../data/math_downloads/ALL_MATH/JSON_v2/"
import os
# Ensure the write folder exists
os.makedirs(folder_write_path, exist_ok=True)

for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(folder_path, filename)
        grouped_tables = extract_tables_grouped_by_strand(pdf_path)

        strands = []
        for strand in grouped_tables:
            strand_key = list(strand.keys())[0]
            # Example usage
            keywords = [
                "Strand", "Sub strand", "Specific Learning Outcomes",
                "Suggested Learning Experiences", "Key Inquiry Question(s)", "Indicators",
                "Exceeds Expectations", "Meets Expectations", "Approaches Expectations",
                "Below Expectations"
            ]

            # Assuming 'strands_data' is defined and contains the necessary data
            column_positions = find_keyword_columns(strand, keywords)
            # Assuming 'tables' is part of your grouped tables data
            strand_data = merge_strand_and_indicator(tables, column_positions)
            # strand_data = merge_strand_and_indicator(strand[strand_key])
            strands.append({strand_key: strand_data})

        # Removing '.pdf' from filename for the JSON file
        json_filename = os.path.splitext(filename)[0] + '.json'
        json_path = os.path.join(folder_write_path, json_filename)

        with open(json_path, 'w') as f:
            json.dump(strands, f, indent=4)

# pdf_path = '../data/math_downloads/ALL_MATH/GRADE 5 CURRICULUM DESIGNS- MATHEMATICS.pdf'
# grouped_tables = extract_tables_grouped_by_strand(pdf_path)
# import json
# with open('../data/math_downloads/ALL_MATH/GRADE 5 CURRICULUM DESIGNS- MATHEMATICS.json', 'w') as f:
#     json.dump(grouped_tables, f, indent=4)


# data/math_downloads/Folders

The strands data does not contain 'Strand' or 'Sub Strand'.
The strands data contains 'Strand' or 'Sub Strand'.
[{'page_number': 3, 'number_of_columns': 5, 'table': [['Strand', 'Sub strand', 'Specific Learning Outcomes', 'Suggested Learning Experiences', 'Key Inquiry\nQuestion(s)'], ['1.0\nNUMBERS', '1.0 Whole\nNumbers\n(20 Lessons)', 'By the end of the sub strand, the\nlearner should be able to;\na) use place value and total value\nof digits up to hundreds of\nthousands in real life,\nb) use numbers up to hundreds\nof thousands in symbols in real\nlife,\nc) read, write and relate numbers\nup to tens of thousands in\nwords in real life,\nd) order numbers up to tens of\nthousands in real life,\ne) round off numbers up to tens\nof thousands to the nearest\nhundred and thousand in\ndifferent situations,\nf) apply divisibility tests of 2, 5\nand 10 in real life,\ng) apply highest Common Factor\n(HCF) and Greatest Common\nDivisor (GCD) in different\nsituations,\nh) use Least Common Multiple

AttributeError: 'NoneType' object has no attribute 'replace'

In [174]:
pdf_path = '../data/math_downloads/ALL_MATH/GRADE 5 CURRICULUM DESIGNS- MATHEMATICS.pdf'
grouped_tables = extract_tables_grouped_by_strand(pdf_path)

In [170]:
print(grouped_tables)

[{'strand_1': [{'page_number': 1, 'number_of_columns': 2, 'table': [['MINISTRY OF EDUCATION\nUPPER PRIMARY LEVEL DESIGNS\nL EARNING AR EA: MATHEMATICS\nGRADE 5\nNOVEMBER 2019\nKENYA INSTITUTE OF CURRICULUM DEVELOPEM', None], [None, 'KENYA INSTITUTE OF CURRICULUM DEVELOPEM']]}]}, {'strand_2': [{'page_number': 3, 'number_of_columns': 5, 'table': [['Strand', 'Sub strand', 'Specific Learning Outcomes', 'Suggested Learning Experiences', 'Key Inquiry\nQuestion(s)'], ['1.0\nNUMBERS', '1.0 Whole\nNumbers\n(20 Lessons)', 'By the end of the sub strand, the\nlearner should be able to;\na) use place value and total value\nof digits up to hundreds of\nthousands in real life,\nb) use numbers up to hundreds\nof thousands in symbols in real\nlife,\nc) read, write and relate numbers\nup to tens of thousands in\nwords in real life,\nd) order numbers up to tens of\nthousands in real life,\ne) round off numbers up to tens\nof thousands to the nearest\nhundred and thousand in\ndifferent situations,\nf) app

In [200]:
# strand_data = merge_strand_and_indicator(grouped_tables[2]['strand_3'])
strands = []
for strand in grouped_tables:
    strand_key = list(strand.keys())[0]
    # Example usage
    keywords = [
        "Strand", "Sub strand", "Specific Learning Outcomes",
        "Suggested Learning Experiences", "Key Inquiry Question(s)", "Indicators",
        "Exceeds Expectations", "Meets Expectations", "Approaches Expectations",
        "Below Expectations"
    ]

    # Assuming 'strands_data' is defined and contains the necessary data
    # column_positions = find_keyword_columns(strand, keywords)
    # Assuming 'tables' is part of your grouped tables data
    strand_data = merge_strand_and_indicator(tables)
    # strand_data = merge_strand_and_indicator(strand[strand_key])
    strands.append({strand_key: strand_data})

IndexError: list index out of range

In [39]:
## convert the tables into a JSON format
import json
with open('../data/math_downloads/ALL_MATH/GRADE 5 CURRICULUM DESIGNS- MATHEMATICS.json', 'w') as f:
    json.dump(all_tables, f, indent=4)

In [41]:
import pdfplumber

def is_continuation_table(prev_table, current_table):
    """
    Determines if the current_table is a continuation of the prev_table.
    This function should be customized based on the specific format of your tables.
    """
    # Example condition: if the first cell of the current table is empty, it's a continuation
    return current_table[0][0].strip() == ""

def merge_tables(prev_table, current_table):
    """
    Merges the current_table with the prev_table.
    This function should concatenate the rows of the two tables.
    """
    return prev_table + current_table[1:]  # Skip the header row of the current_table

def process_tables(all_tables):
    """
    Processes all tables and merges continuation tables.
    """
    processed_tables = []
    prev_table = None

    for table_data in all_tables:
        current_table = table_data["table"]

        if prev_table and is_continuation_table(prev_table, current_table):
            # Merge current_table with prev_table
            merged_table = merge_tables(prev_table, current_table)
            prev_table = merged_table
        else:
            # Process prev_table
            if prev_table:
                processed_tables.append(prev_table)

            # Update prev_table to current_table for the next iteration
            prev_table = current_table

    # Add the last table if it's not processed
    if prev_table:
        processed_tables.append(prev_table)

    return processed_tables

# # Example usage
# pdf_path = '../data/math_downloads/ALL_MATH/GRADE 5 CURRICULUM DESIGNS- MATHEMATICS.pdf'
# pdf = pdfplumber.open(pdf_path)
# all_tables = []

# for i in range(len(pdf.pages)):
#     page = pdf.pages[i]
#     tables = page.extract_tables()
#     for table in tables:
#         all_tables.append({
#             "page_number": i + 1,
#             "number_of_columns": len(table[0]),
#             "table": table
#         })

# pdf.close()

# Process and merge continuation tables
merged_tables = process_tables(all_tables)

# Now merged_tables contains the merged tables
# Further processing can be done based on the structure of the tables


In [46]:
# merged_tables

import json
with open('../data/math_downloads/ALL_MATH/GRADE 5 CURRICULUM DESIGNS- MATHEMATICS_v2.json', 'w') as f:
    json.dump(merged_tables, f, indent=4)

TypeError: list indices must be integers or slices, not str

In [33]:
# ## Page extraction with rubric page [NOT WORKING]
# def extract_strands_sub_strands_with_page_segmented(pdf_path, start_page, end_page):
#     extracted_data = []
#     rubric_headers = ["Indicators", "Exceeds Expectations", "Meets Expectations", "Approaches Expectations", "Below Expectations"]
    
#     with pdfplumber.open(pdf_path) as pdf:
#         for page_number in range(start_page, min(end_page, len(pdf.pages))):
#             try:
#                 page = pdf.pages[page_number]
#                 tables = page.extract_tables()
#                 for table in tables:
#                     # Check if the table is for Strand and Sub Strand
#                     if "Strand" in ' '.join(table[0]) and "Sub Strand" in ' '.join(table[0]):
#                         for row in table[1:]:
#                             if row[0] and row[1]:
#                                 strand_data = {
#                                     "page_number": page_number + 1,
#                                     "strand": row[0],
#                                     "sub_strand": row[1],
#                                     "rubric_start_page": None  # Default value
#                                 }
#                                 extracted_data.append(strand_data)

#                     # Check each cell in the first row for rubric headers
#                     for cell in table[0]:
#                         if any(header in cell for header in rubric_headers):
#                             ## print (cell)
#                             print(cell)
#                             if extracted_data:
#                                 extracted_data[-1]["rubric_start_page"] = page_number + 1
#                                 break
#             except Exception as e:
#                 print(f"Error processing page {page_number + 1}: {e}")

#     return extracted_data

# # Example usage: Process the PDF in segments
# pdf_path = '../data/raw/GRADE 6 CURRICULUM DESIGNS- MATHEMATICS.pdf'
# total_pages = 100  # Assuming the total number of pages in the PDF
# segment_size = 10  # Number of pages to process at a time

# all_strands_sub_strands = []
# for start_page in range(0, total_pages, segment_size):
#     extracted_data = extract_strands_sub_strands_with_page_segmented(pdf_path, start_page, start_page + segment_size)
#     all_strands_sub_strands.extend(extracted_data)

# # Displaying a portion of the extracted data for review
# print(all_strands_sub_strands)  # Displaying the first 5 entries for brevity


Indicators
Indicators
Indicators
Indicators
Error processing page 31: sequence item 1: expected str instance, NoneType found
Indicators
Indicators
Error processing page 44: sequence item 1: expected str instance, NoneType found
Indicators
Indicators
Indicators
Indicators
Indicators
Indicators
Error processing page 67: sequence item 1: expected str instance, NoneType found
Indicators
[{'page_number': 13, 'strand': '1.0\nNUMBERS', 'sub_strand': '1.1 Whole\nnumbers\n(20 Lessons)', 'rubric_start_page': 15}, {'page_number': 18, 'strand': '1.0\nNUMBERS', 'sub_strand': '1.2\nMultiplication\n(6 Lessons)', 'rubric_start_page': 20}, {'page_number': 21, 'strand': '1.0\nNUMBERS', 'sub_strand': '1.3 Division\n(6 Lessons)', 'rubric_start_page': 23}, {'page_number': 24, 'strand': '1.0\nNUMBERS', 'sub_strand': '1.4 Fractions\n(12 Lessons)', 'rubric_start_page': 27}, {'page_number': 29, 'strand': '1.0\nNUMBERS', 'sub_strand': '1.5 Decimals\n(12 Lessons)', 'rubric_start_page': None}, {'page_number': 34,

In [34]:
import pdfplumber
rubric_headers = ["Indicators", "Exceeds Expectations", "Meets Expectations", "Approaches Expectations", "Below Expectations"]

def find_rubric_start_pages(pdf_path, all_strands_sub_strands, rubric_headers):
    with pdfplumber.open(pdf_path) as pdf:
        for strand_info in all_strands_sub_strands:
            start_page = strand_info.get("page_number", 0)  # Default to 0 if not found
            found_rubric = False

            for page_number in range(start_page, len(pdf.pages)):
                page = pdf.pages[page_number]
                tables = page.extract_tables()

                for table in tables:
                    first_row_text = ' '.join(cell for cell in table[0] if cell)
                    if any(header in first_row_text for header in rubric_headers):
                        strand_info["rubric_start_page"] = page_number + 1  # Page numbers are 1-indexed
                        found_rubric = True
                        break  # Found the rubric, no need to check further

                if found_rubric:
                    break  # Move to the next strand_info

            if not found_rubric:
                strand_info["rubric_start_page"] = None  # Set to None if no rubric page is found

    return all_strands_sub_strands



# Run the function with the provided data
updated_strands_sub_strands = find_rubric_start_pages(pdf_path, all_strands_sub_strands, rubric_headers)

# Displaying a portion of the updated data
print(updated_strands_sub_strands)  # Displaying the first 5 entries for brevity


[{'page_number': 13, 'strand': '1.0\nNUMBERS', 'sub_strand': '1.1 Whole\nnumbers\n(20 Lessons)', 'rubric_start_page': 15}, {'page_number': 18, 'strand': '1.0\nNUMBERS', 'sub_strand': '1.2\nMultiplication\n(6 Lessons)', 'rubric_start_page': 20}, {'page_number': 21, 'strand': '1.0\nNUMBERS', 'sub_strand': '1.3 Division\n(6 Lessons)', 'rubric_start_page': 23}, {'page_number': 24, 'strand': '1.0\nNUMBERS', 'sub_strand': '1.4 Fractions\n(12 Lessons)', 'rubric_start_page': 27}, {'page_number': 29, 'strand': '1.0\nNUMBERS', 'sub_strand': '1.5 Decimals\n(12 Lessons)', 'rubric_start_page': 31}, {'page_number': 34, 'strand': '2.0 MEA-\nSUREMENT', 'sub_strand': '2.1 Length\n(14 Lessons)', 'rubric_start_page': 37}, {'page_number': 40, 'strand': '2.0 MEA-\nSUREMENT', 'sub_strand': '2.2 Area\n(6 Lessons)', 'rubric_start_page': 42}, {'page_number': 43, 'strand': '2.0 MEA-\nSUREMENT', 'sub_strand': '2.3 Capacity\n(6 Lessons)', 'rubric_start_page': 44}, {'page_number': 46, 'strand': '2.0 MEA-\nSUREMENT

In [35]:
## WORKING
complete_data = None
def extract_additional_info(pdf_path, all_strands_sub_strands):
    with pdfplumber.open(pdf_path) as pdf:
        for index, strand_info in enumerate(all_strands_sub_strands):
            start_page = strand_info["page_number"]
            end_page = all_strands_sub_strands[index + 1]["page_number"] if index + 1 < len(all_strands_sub_strands) else len(pdf.pages) + 1
            outcomes, experiences, questions = [], [], []
            values, community_service, pcis, links_to_subjects, additional_community_service = [], [], [], [], []
            skip_remaining_tables = False  # Flag to skip remaining tables once "Indicators" is found

            for page_number in range(start_page, end_page):
                if skip_remaining_tables:
                    break  # Move to the next strand_info if "Indicators" was found in previous tables

                page = pdf.pages[page_number - 1]
                tables = page.extract_tables()

                for table in tables:
                    for row in table:
                        row_text = ' '.join(filter(None, row))  # Combine all non-empty cells in the row

                        if "Indicators" in row_text:
                            skip_remaining_tables = True
                            break  # Skip to the next strand_info
                        for cell in row:
                            if cell:
                                if "Values:" in cell:
                                    values.append(cell.split("Values:")[1].strip())
                                elif "Suggested Community Service Learning Activities:" in cell:
                                    community_service.append(cell.split("Suggested Community Service Learning Activities:")[1].strip())
                                elif "PCIs:" in cell:
                                    pcis.append(cell.split("PCIs:")[1].strip())
                                elif "Links to other subjects:" in cell:
                                    links_to_subjects.append(cell.split("Links to other subjects:")[1].strip())
                                elif "Suggested Community Service Learning Activities:" in cell:
                                    additional_community_service.append(cell.split("Suggested Community Service Learning Activities:")[1].strip())

                        # Extract other standard information if the row has 5 columns
                        if len(row) >= 5:
                            outcomes.append(row[2].replace('\n', ' ').strip() if row[2] else '')
                            experiences.append(row[3].replace('\n', ' ').strip() if row[3] else '')
                            questions.append(row[4].replace('\n', ' ').strip() if row[4] else '')

            # Add the collected data to the strand_info
            strand_info.update({
                "specific_learning_outcomes": outcomes,
                "suggested_learning_experiences": experiences,
                "key_inquiry_questions": questions,
                "values": values,
                "community_service_activities": community_service,
                "pcis": pcis,
                "links_to_other_subjects": links_to_subjects,
                "additional_community_service_activities": additional_community_service
            })

    return all_strands_sub_strands

# Extract and add additional information to the strands and sub_strands data
complete_data = extract_additional_info(pdf_path, updated_strands_sub_strands)


In [36]:
def extract_rubric_data_complete(pdf_path, complete_data, rubric_headers):
    with pdfplumber.open(pdf_path) as pdf:
        for index, strand_info in enumerate(complete_data):
            rubric_start_page = strand_info.get("rubric_start_page")
            next_strand_start_page = complete_data[index + 1]["page_number"] if index + 1 < len(complete_data) else len(pdf.pages) + 1
            
            if rubric_start_page:
                rubrics = []
                rubric_continues = False
                
                # Loop through pages starting from rubric_start_page up to the page before next_strand_start_page
                for page_number in range(rubric_start_page, next_strand_start_page):
                    page = pdf.pages[page_number - 1]
                    tables = page.extract_tables()

                    # Process each table on the current page
                    for table in tables:
                        # If it's the first page of the rubrics or if rubric_continues is True
                        if rubric_continues or any(header in table[0] for header in rubric_headers):
                            # Set rubric_continues to False initially
                            rubric_continues = False
                            # Process each row in the table as a rubric entry
                            for row in table[1:] if any(header in table[0] for header in rubric_headers) else table:
                                # Construct the rubric entry if the row has 5 columns
                                if len(row) >= 5:
                                    rubric_entry = {
                                        "indicator_name": row[0].replace('\n', ' ').strip(),
                                        "rubrics": [
                                            {"level": "Exceeds Expectations", "statement": row[1].replace('\n', ' ').strip()},
                                            {"level": "Meets Expectations", "statement": row[2].replace('\n', ' ').strip()},
                                            {"level": "Approaches Expectations", "statement": row[3].replace('\n', ' ').strip()},
                                            {"level": "Below Expectations", "statement": row[4].replace('\n', ' ').strip()}
                                        ]
                                    }
                                    rubrics.append(rubric_entry)
                                
                            # If this table was a rubric table, set rubric_continues to True for the next page
                            if any(header in table[0] for header in rubric_headers):
                                rubric_continues = True

                # Update the strand_info with the accumulated rubric entries
                strand_info["assessment_rubrics"] = rubrics

    return complete_data

# Assuming complete_data is already defined and includes rubric start pages
# pdf_path = '/mnt/data/GRADE 6 CURRICULUM DESIGNS- MATHEMATICS.pdf'
rubric_headers = ["Indicators", "Exceeds Expectations", "Meets Expectations", "Approaches Expectations", "Below Expectations"]

# Extract rubric data
complete_data_with_rubrics = extract_rubric_data_complete(pdf_path, complete_data, rubric_headers)

# Check the last indicator name for the first rubric entry to verify correct extraction
if complete_data_with_rubrics and complete_data_with_rubrics[0]["assessment_rubrics"]:
    last_indicator = complete_data_with_rubrics[0]["assessment_rubrics"][-1]["indicator_name"]
else:
    last_indicator = "No rubric entries found"

last_indicator


'Applying square roots of perfect squares up to 10,000'

In [37]:
print(complete_data_with_rubrics)

[{'page_number': 13, 'strand': '1.0\nNUMBERS', 'sub_strand': '1.1 Whole\nnumbers\n(20 Lessons)', 'rubric_start_page': 15, 'specific_learning_outcomes': ['Specific Learning Outcomes', 'By the end of the Sub Strand, the learner should be able to; a) Use place value and total value of digits up to millions in real life, b) Use numbers up to millions in symbols in real life, c) Read, write and relate numbers up to 100,000 in words in real life, d) Order numbers up to 100,000 in real life situations, e) round off numbers up to 100,000 to the nearest thousand in different situations, f) Apply squares of whole numbers up to 100 in different situations,', 'g) Apply square roots of perfect squares up to 10,000 in different situations, h) Use it devices for learning more on whole numbers and for enjoyment, i) appreciate use of whole numbers in real life situations.', ''], 'suggested_learning_experiences': ['Suggested Learning Experiences', '• Learners in pairs/groups or as individuals to identif

In [None]:
import json

# Define the path for the processed JSON file
original_file_name = "GRADE 6 CURRICULUM DESIGNS- MATHEMATICS"
processed_pdf_path = f'../data/processed/{original_file_name}.json'

# Writing the extracted data to a JSON file
with open(processed_pdf_path, 'w', encoding='utf-8') as file:
    json.dump(complete_data_with_rubrics, file, ensure_ascii=False, indent=4)




In [None]:
# pdf = pdfplumber.open(pdf_path) 
# page = pdf.pages[16 - 1]
# tables = page.extract_tables()

In [None]:
# tables