### Reading PDF Files is Painful

In [7]:
import pandas as pd
import re
import json
from pypdf import PdfReader 

class Parser:
    """
    A class used to parse agreement pdf files from assist.org

    ...

    Attributes
    ----------
    _id : str
        the id of the agreement being parsed
    _filename : str
        the location and name of the .pdf file to be parsed
    _reader : PdfReader
        PdfReader object from the pypdf library

    ...

    Methods
    -------
    parse()
        Extracts the course text from the pdf, groups it, prints the groups
        and writes the paired groups as JSON to '<id>.txt'
    set_id(id)
        Points the parser at a different agreement pdf
    """

    # Text runs whose x position (tm[4]) is below this value are treated as
    # the left column (side 0); everything else is the right column (side 1).
    _COLUMN_SPLIT_X = 500
    # Font size shared by the two kinds of 19pt runs recognised by _classify.
    _HEADING_FONT_SIZE = 19.0
    # Tokens that separate courses inside one column group.
    _CONJUNCTIONS = ('←', 'And', 'Or')
    # Regular-font text ending in a parenthesised decimal, e.g. "... (4.00)".
    _COURSE_END_RE = re.compile(r'^(\S+\s)+\(\d+\.\d+\)$')

    def __init__(self, id):
        """
        Opens './pdfs/<id>.pdf' for parsing

        ...

        Parameters
        ----------
        id : str
            id number for desired agreement (converted with str() when
            building file names)
        """
        self._id = id
        self._filename = self._pdf_path(id)
        self._reader = PdfReader(self._filename)

    @staticmethod
    def _pdf_path(id):
        """Returns the location of the pdf file for the given agreement id"""
        return "./pdfs/" + str(id) + ".pdf"

    @classmethod
    def _classify(cls, text, tm, font_dict, font_size):
        """
        Turns one text run reported by pypdf into a [text, step, side] entry

        step :
            {0: "title", 1: "description", 2: "description end",
             3: "conjunction"}
        side :
            {0: "left", 1: "right"}

        Returns None for runs that are blank or match none of the steps.
        """
        text = text.replace('\u200b', '').strip()
        if not text:
            return None
        # pypdf may pass None (or a dictionary without '/BaseFont') for runs
        # whose font is unknown; such runs can still be conjunctions.
        base_font = font_dict.get('/BaseFont') if font_dict else None
        if base_font == '/SegoeUIBold' and font_size == cls._HEADING_FONT_SIZE:
            step = 0
        elif base_font == '/SegoeUIRegular' and cls._COURSE_END_RE.match(text):
            step = 2
        elif base_font == '/SegoeUIRegular' and font_size == cls._HEADING_FONT_SIZE:
            step = 1
        elif text in cls._CONJUNCTIONS:
            step = 3
        else:
            return None
        side = 0 if tm[4] < cls._COLUMN_SPLIT_X else 1
        return [text, step, side]

    @staticmethod
    def _split_by_side(parts):
        """
        Creates a list separated by side switches of lists of cleaned text
        """
        separated = []
        current = []
        step = 0
        side = 0
        for text, new_step, new_side in parts:
            if step == 3 and new_step == 3:
                # double conjunction: drop the previous conjunction and do
                # not store the new one (step is left at 3)
                current = current[:-1]
            elif side != new_side:
                # side switch: close the running group and start a new one
                separated.append(current)
                current = [text]
                step = new_step
            else:
                current.append(text)
                step = new_step
            # switches to (or stays on) new_side
            side = new_side
        # the loop only closes a group on a side switch, so the last group
        # has to be closed here or it would be lost
        if current:
            separated.append(current)
        return separated

    @classmethod
    def _group_by_conjunction(cls, separated):
        """
        Creates a list separated by side switches of lists grouped logically
        by the And and Or conjunctions

        Every conjunction ends the course being collected; 'Or' additionally
        ends the current set of courses.
        """
        agreements = []
        for group in separated:
            words = []
            courses = []
            sets = []
            for item in group:
                if item not in cls._CONJUNCTIONS:
                    words.append(item)
                    continue
                courses.append(" ".join(words))
                words = []
                if item == 'Or':
                    sets.append(courses)
                    courses = []
            # in the event that the group does not end in a conjunction
            if words:
                courses.append(" ".join(words))
            sets.append(courses)
            agreements.append(sets)
        return agreements

    def parse(self):
        """
        Parses the agreement pdf

        Collects the recognised text runs from every page, splits them into
        groups at each left/right side switch, groups each of those by the
        And / Or conjunctions, prints one group per line, and writes
        consecutive (left, right) pairs as JSON objects to '<id>.txt' in the
        current working directory. The objects are written back to back with
        no separator between them.
        """
        parts = []

        def visitor_body(text, cm, tm, font_dict, font_size):
            part = self._classify(text, tm, font_dict, font_size)
            if part is not None:
                parts.append(part)

        for page in self._reader.pages:
            page.extract_text(visitor_text=visitor_body)

        agreements = self._group_by_conjunction(self._split_by_side(parts))

        print(*agreements, sep='\n')

        # str() keeps this working when the parser was constructed with a
        # non-string id, matching how the pdf file name is built.
        with open(str(self._id) + '.txt', 'w', encoding='utf-8') as f:
            # groups alternate between sides, so pair (0, 1), (2, 3), ...;
            # a trailing group without a partner is not written
            for away, home in zip(agreements[::2], agreements[1::2]):
                entry = {"away": away[0], "home": home}
                f.write(json.dumps(entry, indent=4))

    def set_id(self, id):
        """
        Setter function for id parameter

        ...

        Parameters
        ----------
        id : str
            id number for desired agreement

        ...

        Raises
        ------
        TypeError
            If type of id is not str
        """
        if not isinstance(id, str):
            raise TypeError("This ID is not a string.")
        filename = self._pdf_path(id)
        # open the new pdf before touching any attribute so a failure here
        # leaves the parser pointing at the previous agreement
        reader = PdfReader(filename)
        self._id = id
        self._filename = filename
        self._reader = reader

# Build a parser for agreement 26088890; the constructor opens ./pdfs/26088890.pdf.
thing = Parser("26088890")
# Uncomment to point the same parser at a different agreement before parsing.
#thing.set_id("26274157")
# Prints the grouped courses (the output shown below) and writes the paired
# groups to 26088890.txt.
thing.parse()

[['CHEM 20A - Chemical Structure (4.00)', 'CHEM 20B - Chemical Energetics and Change (4.00)', 'CHEM 20L - General Chemistry Laboratory (3.00)', 'CHEM 30AL - General Chemistry Laboratory II (4.00)']]
[['CHEM 1A - General Chemistry (5.00)', 'CHEM 1B - General Chemistry (5.00)', 'CHEM 1C - General Chemistry and Qualitative Analysis (5.00)']]
[['CHEM 30A - Organic Chemistry I: Structure and Reactivity (4.00)', 'CHEM 30B - Organic Chemistry II: Reactivity, Synthesis, and Spectroscopy (4.00)']]
[['CHEM 12A - Organic Chemistry (5.00)', 'CHEM 12B - Organic Chemistry (5.00)']]
[['MATH 31A - Differential and Integral Calculus (4.00)']]
[['MATH 1A - Calculus (5.00)']]
[['MATH 31B - Integration and Infinite Series (4.00)']]
[['MATH 1B - Calculus (5.00)', 'MATH 1C - Calculus (5.00)'], ['MATH 1B - Calculus (5.00)', 'MATH 1CH - Calculus - HONORS (5.00)'], ['MATH 1BH - Calculus - HONORS (5.00)', 'MATH 1C - Calculus (5.00)'], ['MATH 1BH - Calculus - HONORS (5.00)', 'MATH 1CH - Calculus - HONORS (5.00)'