In [None]:
# Load the lectionary text export into memory, one list entry per line
# (each entry keeps its trailing newline).
filelines = []

with open('./input/Lectionary 2021.txt') as txtfile:
    filelines = list(txtfile)

# Last expression of the cell: shows how many lines were read.
len(filelines)

In [None]:
from enum import Enum
import re
import doctest

class Section(Enum):
    '''
    Kinds of line / region that the lectionary parser distinguishes.

    Every member has a plain integer value.  (Previously stray trailing
    commas turned most values into one-element tuples such as ``(2,)``,
    so the value types were inconsistent and ``Section(2)`` failed.)
    '''
    UNKNOWN = 0
    MONTH = 1
    DAY = 2
    DAILY_EUCHARISTIC_L = 3
    FOR_ALL_SAINTS = 4
    MORNING_PRAYER = 5
    EVENING_PRAYER = 6
    REVISED_COMMON_L = 7
    OTHER_READINGS_MORNING = 8
    OTHER_READINGS_EVENING = 9


def isDaySection(line, currentSection):
    '''
    Decide whether ``line`` belongs to a day heading.

    A line starting with a three-letter upper-case weekday (as a whole word)
    always counts.  A line made of digits optionally followed by the letters
    W/G/V/R (either case) or spaces counts only while ``currentSection`` is
    already ``Section.DAY``.

    >>> isDaySection("2V", Section.DAY)
    True
    '''
    if re.search("^(SUN|MON|TUE|WED|THU|FRI|SAT)\\b", line):
        return True

    looksLikeDayNumber = re.search("^[0-9]+(W|G|V|v|w|r|R| )*$", line) is not None
    # Short-circuits: the section is only compared when the line is a day number.
    return bool(looksLikeDayNumber and currentSection == Section.DAY)

def isMonthSection(line):
    '''
    Return True when ``line`` starts with an upper-case English month name
    followed by a space.

    >>> isMonthSection("NOVEMBER 2020")
    True
    '''
    monthHeading = re.search("^(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER) ", line)
    return monthHeading is not None

def isOtherReadingsMorning(line, previousLine):
    '''
    Return True when ``line`` starts with "Morning" and the line before it
    starts with "Other Readings".

    ``previousLine`` may be None (there is no previous line for the first
    line of the file); that is treated as "no match" instead of raising.

    >>> isOtherReadingsMorning("Morning", "Other Readings")
    True
    >>> isOtherReadingsMorning("Morning", None)
    False
    '''
    if previousLine is None:
        return False
    return previousLine.startswith("Other Readings") and line.startswith("Morning")

def isOtherReadingsEvening(line, previousLine):
    '''
    Return True when ``line`` starts with "Evening" and the line before it
    starts with "Other Readings".

    ``previousLine`` may be None (there is no previous line for the first
    line of the file); that is treated as "no match" instead of raising.

    >>> isOtherReadingsEvening("Evening", "Other Readings")
    True
    >>> isOtherReadingsEvening("Evening", None)
    False
    '''
    if previousLine is None:
        return False
    return previousLine.startswith("Other Readings") and line.startswith("Evening")
    
def getSection(line, previousLine = None, currentSection = Section.UNKNOWN):
    '''
    Classify ``line`` as one of the ``Section`` members.

    The checks run in a fixed order and the first one that matches wins:
    day, month, "Other Readings" morning, "Other Readings" evening.
    Anything else is ``Section.UNKNOWN``.

    >>> getSection("NOVEMBER 2020", None)
    <Section.MONTH: 1>
    '''
    if isDaySection(line, currentSection):
        detected = Section.DAY
    elif isMonthSection(line):
        detected = Section.MONTH
    elif isOtherReadingsMorning(line, previousLine):
        detected = Section.OTHER_READINGS_MORNING
    elif isOtherReadingsEvening(line, previousLine):
        detected = Section.OTHER_READINGS_EVENING
    else:
        detected = Section.UNKNOWN

    return detected
    
# Run the doctests embedded in the docstrings above; as the last expression
# of the cell, the returned result summary is displayed as the cell output.
doctest.testmod()

In [None]:
import json

# Parsed day summaries, appended to by goThroughFile(). Assigned here so that
# re-running this cell starts again from an empty list.
days = []

def goThroughFile():
    '''
    Walk through ``filelines`` and append one summary dict per day to ``days``.

    Each summary has the keys ``date`` ("<weekday> <day> <month line>"),
    ``dayDescription`` (the rest of the weekday line) and the two lists
    ``otherReadingsMorning`` / ``otherReadingsEvening`` holding the lines
    found under the matching "Other Readings" heading.

    A day is emitted when the start of the next day (or month) is seen, and
    the day still being collected when the file ends is emitted after the
    loop so that the final day is not lost.
    '''
    previousLine = None
    currentSection = Section.MONTH
    widerSection = None

    # Seed values for the first day summary.
    # NOTE(review): presumably the input starts part-way through this day - confirm.
    currentMonth = "NOVEMBER 2020"
    currentDayOfWeek = "SAT"
    currentDay = "28"
    saints = None

    otherReadingsMorning = []
    otherReadingsEvening = []

    def summariseDay():
        # Snapshot of the day currently being collected (reads the enclosing state).
        return {
            'date': f"{currentDayOfWeek} {currentDay} {currentMonth}",
            'dayDescription': saints,
            'otherReadingsMorning':otherReadingsMorning,
            'otherReadingsEvening':otherReadingsEvening
        }

    for line in filelines:

        line = line.replace('\n', '')
        newSection = getSection(line, previousLine, currentSection)
        #print(f"{newSection} {line} ({widerSection})")

        # A month line always closes the current day; a day line only does so
        # when we are not already inside a month/day heading.
        hitStartOfNextDay = newSection == Section.MONTH or (newSection == Section.DAY and currentSection != Section.MONTH and currentSection != Section.DAY)
        if(hitStartOfNextDay and currentDayOfWeek):
            #summarise
            days.append(summariseDay())

            #clear the data
            currentDayOfWeek = None
            currentDay = None
            otherReadingsMorning = []
            otherReadingsEvening = []

        if(newSection == Section.MONTH):
            currentMonth = line
            widerSection = None
        elif (newSection == Section.DAY and currentSection != Section.DAY):
            # First line of a day heading: weekday followed by the description.
            currentDayOfWeek = line[0:3]
            saints = line[3:]
            widerSection = Section.DAY
        elif (newSection == Section.DAY and currentSection == Section.DAY):
            # Second line of a day heading: the day number (plus optional letters).
            match = re.search("^[0-9]+", line)
            if(match):
                currentDay = match.group(0) #the whole match
            widerSection = None
        elif (newSection == Section.OTHER_READINGS_MORNING):
            widerSection = Section.OTHER_READINGS_MORNING
        elif (newSection == Section.OTHER_READINGS_EVENING):
            widerSection = Section.OTHER_READINGS_EVENING
        elif (newSection == Section.UNKNOWN and widerSection == Section.OTHER_READINGS_MORNING and line != "Other Readings"):
            # The "Other Readings" line that introduces the evening block is skipped.
            otherReadingsMorning.append(line)
        elif (newSection == Section.UNKNOWN and widerSection == Section.OTHER_READINGS_EVENING):
            otherReadingsEvening.append(line)

        # While inside a wider region, keep reporting that region as current.
        currentSection = widerSection if widerSection else newSection
        previousLine = line

    # The loop only emits a day when it sees the start of the next one, so the
    # day still being collected at end of file has to be flushed explicitly.
    if currentDayOfWeek:
        days.append(summariseDay())
              
# Parse the loaded lines now so that `days` is populated for the cells below.
goThroughFile()

In [None]:
def checkDate(currentDayDate, previousDayDate):
    '''
    Sanity-check a parsed date string against the one parsed just before it.

    Both arguments look like "<weekday> <day> <month> <year>"
    (e.g. "SUN 29 NOVEMBER 2020"); ``previousDayDate`` is None for the first
    entry.  Returns True when the date looks plausible, False otherwise:

    * a date containing "None" or not made of four parts is rejected;
    * with no previous date there is nothing to compare, so it is accepted;
    * if the day number is neither "previous + 1" nor "1" in a different
      month, the date is accepted only when both it and the previous date
      occur the same number of times (more than once) in ``days`` - i.e. an
      alternative set of days;
    * otherwise the weekday must be the one following the previous weekday.

    The weekday comparison used to sit after a ``try`` block whose branches
    all returned, so it never ran; it is now applied to consecutive dates.
    '''
    daysOfWeek = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"]

    # check for None
    if "None" in currentDayDate:
        return False

    dayComponents = currentDayDate.split(" ")
    if len(dayComponents) != 4:
        return False

    if not previousDayDate:
        return True

    previousDayComponents = previousDayDate.split(" ")

    # check day
    try:
        difference = int(dayComponents[1]) - int(previousDayComponents[1])
    except ValueError:
        return False

    isNextDay = difference == 1
    isFirstOfNewMonth = dayComponents[1] == "1" and dayComponents[2] != previousDayComponents[2]
    if not (isNextDay or isFirstOfNewMonth):
        # if today happens more than once and yesterday does too, then we know we have an alternative set of days
        countOfToday = sum([day['date'] == currentDayDate for day in days])
        countOfPreviousDay = sum([day['date'] == previousDayDate for day in days])

        return countOfToday == countOfPreviousDay and countOfToday > 1

    # check day of week: must be the next weekday (SUN -> MON wraps to -6)
    try:
        weekdayDifference = daysOfWeek.index(dayComponents[0]) - daysOfWeek.index(previousDayComponents[0])
    except ValueError:
        # unknown weekday abbreviation (e.g. wrong case)
        return False

    return weekdayDifference in (1, -6)
    

# ANSI colour escapes used to highlight each printed date.
warningPrefix = '\033[91m'
okPrefix = '\033[92m'

# Print every parsed date, coloured by whether it passes checkDate against
# the date parsed immediately before it.
for index, value in enumerate(days):
    dayDate = value['date']
    previousDayDate = None if index == 0 else days[index - 1]['date']

    isOk = checkDate(dayDate, previousDayDate)
    print(f"{okPrefix if isOk else warningPrefix}{dayDate}")
    
    

# Stop
Only continue once the cell above prints all the dates correctly. If a problem looks like a rare typo (e.g. Sun instead of SUN), just tweak the input document. If it looks like a common pattern, try changing the script instead.

In [None]:
# Helper cell for inspecting the parsed data of one particular day.
# Fails with IndexError if no day with that exact date string was parsed.
matchingDays = [day for day in days if day['date'] == "SUN 27 DECEMBER 2020"]
foundDay = matchingDays[0]
print(foundDay)

In [None]:
import copy
   
def removeOrAndTrim(reading):
    '''
    Strip the stand-alone word "or" (used to mark alternative readings) and
    surrounding whitespace from ``reading``.

    Only "or" as a whole word is removed.  Plain substring replacement used
    to corrupt text that merely contains those letters, e.g. "1 Cor 13"
    became "1 C13" and "Collect for the" became "Collect fthe".

    >>> removeOrAndTrim("Isa 12 or")
    'Isa 12'
    >>> removeOrAndTrim("or Sirach 1")
    'Sirach 1'
    >>> removeOrAndTrim("1 Cor 13")
    '1 Cor 13'
    '''
    # " or " is already a whole word; as before it is dropped together with
    # both spaces.
    result = reading.replace(" or ", "")
    # trailing "or": must end at a word boundary (so " ordinary" is untouched)
    result = re.sub(r" or\b", "", result)
    # leading "or": must start at a word boundary (so "Cor " / "for " are untouched)
    result = re.sub(r"\bor ", "", result)
    return result.strip()

def resolveAlternatives(list):
    '''
    Reorder four readings so that the apocryphal alternative comes last.

    ``list`` holds four readings.  If the second or third entry mentions one
    of the apocryphal books it is moved to the end and every entry is passed
    through ``removeOrAndTrim``.  If several books match, the one latest in
    the book list decides (unchanged behaviour).

    When fewer than four readings are given, or neither candidate mentions
    an apocryphal book, a list starting with an "error: " message followed
    by the untouched input is returned (the same convention ``removeExtras``
    uses) instead of silently returning an empty list.
    '''
    apocryphal = ["Tobit", "1 Macc", "2 Macc", "Sirach", "Baruch", "Wisd"]

    if len(list) < 4:
        return ["error: expected 4 readings to resolve alternatives", *list]

    result = []
    for book in apocryphal:
        if book in list[1]:
            result = [list[0], list[2], list[3], list[1]]
        elif book in list[2]:
            result = [list[0], list[1], list[3], list[2]]

    if not result:
        return ["error: couldn't find an apocryphal alternative", *list]

    return [removeOrAndTrim(reading) for reading in result]

def looksLikeScripture(reading):
    '''
    Return True when ``reading`` has the shape of a scripture reference:
    an optional leading "1 " or "2 ", a book name made of letters, a space,
    then chapter/verse text made of digits, brackets and the punctuation
    and letters allowed by the pattern (e.g. "Ps 50:1-6,62", "1 Cor 13").

    >>> looksLikeScripture("Mal 3:1-4;4")
    True
    >>> looksLikeScripture("Collect for the Third Sunday of Advent")
    False
    '''
    # Raw string: same pattern as before, without invalid escape sequences
    # such as "\[" in a normal string literal.
    script_regex = r"^([1-2] )?[A-Za-z]+ [\[\]\(\)0-9]+([0-9\-–;:,endab \(\)\*]+)?$"
    return re.search(script_regex, reading) is not None

def removeExtras(list):
    '''
    Reduce a list of more than four raw lines to the actual readings.

    Every line is first cleaned with ``removeOrAndTrim``.  The first three
    cleaned lines must look like scripture references; if one does not, a
    list starting with an "error: " message followed by the cleaned lines is
    returned.  Otherwise all cleaned lines that look like scripture are
    kept: three are returned as they are, four are passed to
    ``resolveAlternatives``, anything else is reported as an error.

    The scripture filter now runs on the cleaned lines.  It used to run on
    the raw input, so a reading such as "Isa 12 or" passed validation (after
    cleaning) and was then silently dropped.
    '''
    cleaned = [removeOrAndTrim(scripture) for scripture in list]

    # The first three entries are expected to be readings.
    for candidate in cleaned[:3]:
        if not looksLikeScripture(candidate):
            return [f"error: expected scripture in '{candidate}'"] + cleaned

    scriptures = [scripture for scripture in cleaned if looksLikeScripture(scripture)]

    if len(scriptures) == 3:
        return scriptures
    if len(scriptures) == 4:
        return resolveAlternatives(scriptures)

    return ["error: couldn't resolve down to 3"] + scriptures
    
                      
        
def _resolveReadings(readings):
    '''Reduce one raw morning/evening line list to its readings (or an error list).'''
    count = len(readings)
    if count == 3:
        # Already the expected shape; copy so the result never shares the
        # caller's list.
        return list(readings)
    if count == 4:
        return resolveAlternatives(readings)
    if count > 4:
        return removeExtras(readings)
    # Fewer than three lines: nothing usable.
    return []


def processOtherReadings(day):
    '''
    Return a copy of ``day`` whose ``otherReadingsMorning`` and
    ``otherReadingsEvening`` lists have been tidied up.

    For each of the two lists: three lines are kept as they are, four lines
    go through ``resolveAlternatives``, more than four go through
    ``removeExtras`` and fewer than three become an empty list.  ``day``
    itself is not modified.

    >>> day = {'date': 'SUN 13 DECEMBER 2020', 'dayDescription': 'dont care', 'otherReadingsMorning': ['Ps 50:1-6,62', 'Isa 12', 'Luke 1:57-66'], 'otherReadingsEvening': ['Ps 68:1-19', 'Mal 3:1-4;4', 'Phil 4:4-7', 'Celebrating Common Prayer: Form 3 during Advent', 'Collect for the Third Sunday of Advent']}
    >>> processOtherReadings(day)
    {'date': 'SUN 13 DECEMBER 2020', 'dayDescription': 'dont care', 'otherReadingsMorning': ['Ps 50:1-6,62', 'Isa 12', 'Luke 1:57-66'], 'otherReadingsEvening': ['Ps 68:1-19', 'Mal 3:1-4;4', 'Phil 4:4-7']}
    '''
    result = copy.deepcopy(day)

    for key in ("otherReadingsMorning", "otherReadingsEvening"):
        result[key] = _resolveReadings(day[key])

    return result

      
def processSundays():
    '''
    Run ``processOtherReadings`` over every entry of ``days`` whose date
    contains "SUN " and return the processed copies in their original order.
    '''
    return [processOtherReadings(day) for day in days if "SUN " in day['date']]
    


In [None]:
# Debug helper: show the processed readings for one particular day.
# Fails with IndexError if no day with that exact date string was parsed.
sundayMatches = [day for day in days if day['date'] == "SUN 10 JANUARY 2021"]
foundDay = sundayMatches[0]
processOtherReadings(foundDay)

In [None]:
processedSundays = processSundays()

# Report every Sunday whose morning or evening readings are missing or
# carry an "error: " marker in their first entry.
for day in processedSundays:
    morningR = day['otherReadingsMorning']
    eveningR = day['otherReadingsEvening']

    # Work out a status per service: 'OK' or the error text to show.
    if not morningR:
        morningStatus = "error: no readings found"
    elif "error: " in morningR[0]:
        morningStatus = morningR[0]
    else:
        morningStatus = 'OK'

    if not eveningR:
        eveningStatus = "error: no readings found"
    elif "error: " in eveningR[0]:
        eveningStatus = eveningR[0]
    else:
        eveningStatus = 'OK'

    if morningStatus != 'OK' or eveningStatus != 'OK':
        print(f"{day['date']} = morning: {morningStatus}, evening: {eveningStatus}" )