In [1]:
import re
from typing import List, Union

import pdfplumber
import pandas as pd

In [2]:
from itertools import accumulate

# Compile the regex patterns for reuse, which is more efficient than recompiling them every function call
question_pattern = re.compile(r'^Nr\.{0,1} \d{1,3}\.')
answer_pattern = re.compile(r'^[A-E]\.{0,1}\W')

def find_line_type(text: str) -> Union[str, None]:
    """Identify the type of line from the given text.
    
    This function checks if a line of text is a question`s or an answer`s beginning based on regex patterns.
    
    Args:
        text (str): A line of text to be checked.
        
    Returns:
        Union[str, None]: 'Q' if a start of question, an answer letter ('A'-'E') if an answer`s beginning, or None otherwise.
    """
    # Check if the line is a question
    if question_pattern.match(text):
        return 'Q'
    # Check if the line is an answer
    match = answer_pattern.match(text)
    if match:
        return match.group(0)[0]
    return None

def accumulate_Q(tag_list: List[Union[str, None]]) -> List[int]:
    """Accumulate the number of questions in a list of tags.
    
    This function accumulates the count of 'Q' tags in the list, effectively giving the number of questions
    up to each point in the list.
    
    Args:
        tag_list (List[Union[str, None]]): A list of tags where 'Q' represents a question.
        
    Returns:
        List[int]: A list of integers representing the accumulated count of questions.

    Examples:
        >>> tags = ['Q', None, 'A', 'Q', 'B']
        >>> accumulate_Q(tags)
        [1, 1, 1, 2, 2]
    """
        
    # Use a generator expression to optimize the accumulation process
    return list(accumulate(1 if tag == 'Q' else 0 for tag in tag_list))

In [3]:

lines = []
with pdfplumber.open('..\examples\quest.pdf') as pdf:
        for page in pdf.pages:
            text = page.extract_text_lines(layout=False, strip=True, return_chars=True)
            if text:
                lines.extend(text)

df = pd.DataFrame.from_dict(lines)

In [4]:
from collections import defaultdict

#Add tag of type [Q, A, B, C, D, E] for every line 
tag_list = df['tag'] = df['text'].map(find_line_type).to_list()

#Add parent question`s number
question_index = accumulate_Q(tag_list)
df['question_nr'] = pd.Series(question_index)

#Groupe lines into questions
grouped_lines = defaultdict(list)
lines_list = df.to_dict('records')
for index, line in zip(question_index, lines_list):
    grouped_lines[index].append(line)


In [5]:
new_pattern = r'^[A-E]\.{0,1}\W'
old_pattern = r'^[A-E]\.'

def find_pattern(pattern: str, text: str):
    if results := re.findall(pattern, text):
        return results[0][0]

diff_methods= [
    i 
    for i, x in enumerate(df['text'])
    if find_pattern(new_pattern, x) != find_pattern(old_pattern, x)
]
diff_methods

[452]

In [6]:
#Removing first line which is a document title
df = df.iloc[1:, :]

In [7]:
#Questions with answers in one line do not have tag B
questions_with_tag_b = set(df[df['tag'] == 'B']['question_nr'].tolist())

#Finding numbers of questions with answers in one line.
last_question_number = df['question_nr'].iloc[-1]
all_questions = set(range(1, last_question_number))
oneline_answers_q_nr = all_questions - questions_with_tag_b

In [8]:
oneline_rows = df[(df['question_nr'].isin(oneline_answers_q_nr)) & (df['tag'] == 'A')]

#Is there any oneliner without tag A?
oneline_answers_q_nr -  set(oneline_rows['question_nr'].unique())

set()

In [9]:
oneline_chars = []
for _, line in oneline_rows.iterrows():
    line_dict = line.to_dict()
    chars = line_dict['chars']
    new_char = [{**x, 'q_nr': line_dict['question_nr']} for x in chars]
    oneline_chars.extend(new_char)


In [35]:
posible_fonts = {x['fontname'] for x in oneline_chars}
posible_fonts

{'ABCDEE+Calibri', 'ABCDEE+Calibri,Bold', 'ABCDEE+Calibri,Italic'}

In [11]:
indices_of_bold = [
    y['text'] 
    for x, y in zip(oneline_chars, oneline_chars[1:]) 
    if x['fontname'] == 'ABCDEE+Calibri' and y['fontname'] == 'ABCDEE+Calibri,Bold'
]
#Every oneliner has at last one bolded char
len(indices_of_bold), len(oneline_answers_q_nr)

(26, 26)

In [12]:
oneliners = df[(df['question_nr'].isin(oneline_rows['question_nr'])) & (df['tag'] == 'A' )]

In [13]:
def test_onliner_pattern(text: str):
    answer_pattern_oneliner = re.compile(r'[A-E]\.{0,1}\W')
    return len(re.findall(answer_pattern_oneliner, text))

def split_oneliner_indices(text: str):
    answer_pattern_oneliner = re.compile(r'[A-E]\.{0,1}\W')
    return [match.start() for match in re.finditer(answer_pattern_oneliner, text)] + [len(text) - 1]

In [14]:
oneliners['text'].map(test_onliner_pattern).unique()

array([5], dtype=int64)

In [15]:
def split_oneliner_text(text: str):
    def split_oneliner_indices(text: str):
        answer_pattern_oneliner = re.compile(r'[A-E]\.{0,1}\W')
        return [match.start() for match in re.finditer(answer_pattern_oneliner, text)] + [len(text) - 1]
    
    indices = split_oneliner_indices(text)
    return [text[s:e] for s, e in zip(indices[:-1], indices[1:])]

In [16]:
splited_onliners_text =  oneliners['text'].map(split_oneliner_text)
splited_onliners_text.to_list()[0][:3]

['A. 5,4. ', 'B. 3,2,4,1. ', 'C. 3,4,5. ']

In [17]:
def is_bold(chars):
    '''
    Return True if at last one char is bolded
    '''
    return any((True for x in chars if x['fontname'] == 'ABCDEE+Calibri,Bold'))

In [18]:
df_test = df.copy(True)
df_test['is_bold'] = df_test['chars'].map(is_bold)
len(df_test[df_test['is_bold'] & df_test['tag'].isin(['A','B','C','D','E'])])
# There is a problem, because question 58 has bolded chars in two answers

251

In [19]:
from collections import Counter
from icecream import ic

nr_answer_counter = Counter()
for _, r in df_test.iterrows():
    if r['tag'] in ['A','B','C','D','E'] and r['is_bold'] == True:
        nr_answer_counter[r['question_nr']] += 1

Find true answer in oneliner. 

In [20]:
def find_onliner_answer(chars):
    def split_chars(chars):
        letters = ['A', 'B', 'C', 'D', 'E']
        split_indices =[index for index, char in enumerate(chars) if char['text'] in letters]
        return split_indices + [len(chars)]
    
    split_indices = split_chars(chars)

    for option, split_index in enumerate(split_indices):
        if not is_bold(chars[split_index:]):
            return option - 1
        
    return 'Not found'
    

In [21]:
test_onliner_chars =  oneliners['chars'].to_list()

In [22]:
oneliners_answers_test = [find_onliner_answer(chars) for chars in test_onliner_chars]
index_2_letter = {index : letter for index, letter in  enumerate(['A', 'B', 'C', 'D', 'E'])}
oneliners_answer_test_results = [index_2_letter[index] for index in oneliners_answers_test]

Time to create database

In [23]:
online_filter = df['question_nr'].isin(oneliners['question_nr'])
df_onliners = df[online_filter]
df_pure = df[~online_filter]

In [24]:
df.columns

Index(['text', 'x0', 'top', 'x1', 'bottom', 'chars', 'tag', 'question_nr'], dtype='object')

In [25]:
from collections import defaultdict
from typing import List, Union

import pandera as pa
from pandera.typing import DataFrame

class LinesDataFrame(pa.DataFrameModel):
    text: str
    x0: float
    top: float
    x1: float
    bottom: float
    chars: List[any]
    tag: Union[str, None]
    question_nr: int



In [26]:
def extract_questions(df: DataFrame[LinesDataFrame]) -> List[defaultdict]:
    parsed_questions = []
    curr_question = defaultdict(str)
    curr_tag = None
    curr_text = ''
    question_number = 0

    for _, row in list(df.iterrows()):
        if row['tag'] == 'Q':
            #Add unsaved text from previous question as option E
            if curr_text:
                curr_question['E'] = curr_text
            if curr_question:
                parsed_questions.append(curr_question)
                curr_question = defaultdict(str)
            curr_question['number'] = row['question_nr']

        if tag :=  row['tag']:
            if curr_text:
                curr_question[curr_tag] = curr_text
            curr_text = row['text']
            curr_tag = tag
            if is_bold(row['chars']) and row['tag'] in ['A', 'B', 'C', 'D', 'E']:
                curr_question['answer'] += curr_tag
        else:
            curr_text = curr_text + '<br>' + row['text']

    
    curr_question['E'] = curr_text
    parsed_questions.append(curr_question)
    return parsed_questions

In [27]:
extracted_pure = extract_questions(df_pure)
listed_dicts = [[d['number'], d['Q'], d['A'], d['B'], d['C'], d['D'], d['E'], d['answer']] for d in extracted_pure]
db_pure = pd.DataFrame(listed_dicts, columns=['nr', 'Q', 'A', 'B', 'C', 'D', 'E', 'answer'])

In [28]:
pd.DataFrame.from_dict(enumerate(extract_questions(df_pure)[:5]))

Unnamed: 0,0,1
0,0,"{'number': 1, 'Q': 'Nr 1. Jedną z przyczyn wst..."
1,1,"{'number': 2, 'E': 'E. Poprawne są C, D.', 'Q'..."
2,2,"{'number': 3, 'E': 'E. w pozycji półsiedzącej...."
3,3,"{'number': 4, 'E': 'E. żadne z wymienionych', ..."
4,4,"{'number': 5, 'E': 'E. prawdziwe są odpowiedzi..."


In [29]:
import sqlite3

conn = sqlite3.connect('kpp.db')

db_pure.to_sql('pure', conn, if_exists='replace', index=False)

224

In [30]:
conn2 = sqlite3.connect('kpp_limit_5.db')
db_pure[:5].to_sql('pure', conn2, if_exists='replace', index=False)

5

In [33]:
all_chars = []
for _, line in df.iterrows():
    line_dict = line.to_dict()
    chars = line_dict['chars']
    new_char = [{**x, 'q_nr': line_dict['question_nr']} for x in chars]
    all_chars.extend(new_char)

In [44]:
posible_fonts = {x['fontname'] for x in all_chars}
posible_chars = set([x['text'] for x in all_chars])

posible_fonts, ''.join(sorted(list(posible_chars)))

({'ABCDEE+Calibri', 'ABCDEE+Calibri,Bold', 'ABCDEE+Calibri,Italic'},
 '"%(),-./0123456789:;<>?ABCDEGHIJKLMNOPRSTUVWZabcdefghijklmnoprstuwxyz°óąćęłńśźż„')

#TODO
Export oneliners