In [3]:
# Load the lectionary text export into memory, one list entry per line
# (each entry keeps its trailing newline).
filelines = []

with open('./input/Lectionary 2021.txt', 'r') as txtfile:
    filelines = list(txtfile)

# Last expression of the cell: shows how many lines were read.
len(filelines)

10373

In [5]:
from enum import Enum
import re
import doctest

class Section(Enum):
    '''
    Kinds of section a line of the lectionary text can belong to.

    Every member carries a plain integer value. (Stray trailing commas used to
    turn most of the values into one-element tuples such as ``(2,)``.)
    '''
    # Returned by getSection when no other classification applies.
    UNKNOWN = 0
    # Month heading line, e.g. "NOVEMBER 2020".
    MONTH = 1
    # Weekday heading line or the day-number line that follows it.
    DAY = 2
    # The next five members are not produced by the visible classifier.
    DAILY_EUCHARISTIC_L = 3
    FOR_ALL_SAINTS = 4
    MORNING_PRAYER = 5
    EVENING_PRAYER = 6
    REVISED_COMMON_L = 7
    # "Morning" / "Evening" line directly after an "Other Readings" line.
    OTHER_READINGS_MORNING = 8
    OTHER_READINGS_EVENING = 9


def isDaySection(line, currentSection):
    '''
    Tell whether `line` is part of a day heading.

    A line qualifies when it begins with a three-letter weekday abbreviation
    (SUN..SAT) as a whole word, or when it consists of a number optionally
    followed by any of the letters W/G/V/R (either case as listed in the
    pattern) or spaces *and* the parser is already inside a DAY section.

    >>> isDaySection("2V", Section.DAY)
    True
    '''
    startsWithWeekday = re.search("^(SUN|MON|TUE|WED|THU|FRI|SAT)\\b", line) is not None
    if startsWithWeekday:
        return True

    # A bare number only counts as a day when it follows a weekday heading.
    looksLikeDayNumber = re.search("^[0-9]+(W|G|V|v|w|r|R| )*$", line) is not None
    return bool(looksLikeDayNumber and currentSection == Section.DAY)

def isMonthSection(line):
    '''
    Tell whether `line` is a month heading: an upper-case month name at the
    start of the line followed by a space.

    >>> isMonthSection("NOVEMBER 2020")
    True
    '''
    monthHeading = "^(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER) "
    return re.search(monthHeading, line) is not None

def isOtherReadingsMorning(line, previousLine):
    '''
    Tell whether `line` opens the morning block of an "Other Readings" section.

    True when `previousLine` starts with "Other Readings" and `line` starts
    with "Morning". `previousLine` may be None (there is no previous line for
    the first line of the file); that is treated as "no match" rather than
    raising.

    >>> isOtherReadingsMorning("Morning", "Other Readings")
    True
    >>> isOtherReadingsMorning("Morning", None)
    False
    '''
    if previousLine is None:
        return False
    followsHeading = re.search("^Other Readings", previousLine) is not None
    return followsHeading and re.search("^Morning", line) is not None

def isOtherReadingsEvening(line, previousLine):
    '''
    Tell whether `line` opens the evening block of an "Other Readings" section.

    True when `previousLine` starts with "Other Readings" and `line` starts
    with "Evening". `previousLine` may be None (there is no previous line for
    the first line of the file); that is treated as "no match" rather than
    raising.

    >>> isOtherReadingsEvening("Evening", "Other Readings")
    True
    >>> isOtherReadingsEvening("Evening", None)
    False
    '''
    if previousLine is None:
        return False
    followsHeading = re.search("^Other Readings", previousLine) is not None
    return followsHeading and re.search("^Evening", line) is not None
    
def getSection(line, previousLine = None, currentSection = Section.UNKNOWN):
    '''
    Classify `line` into a Section.

    The checks run in a fixed order (day, month, other-readings morning,
    other-readings evening) and the first one that matches wins; a line that
    matches none of them is Section.UNKNOWN.

    >>> getSection("NOVEMBER 2020", None)
    <Section.MONTH: 1>
    '''
    # Each check is wrapped in a lambda so that it only runs if every earlier
    # one has failed.
    orderedChecks = (
        (lambda: isDaySection(line, currentSection), Section.DAY),
        (lambda: isMonthSection(line), Section.MONTH),
        (lambda: isOtherReadingsMorning(line, previousLine), Section.OTHER_READINGS_MORNING),
        (lambda: isOtherReadingsEvening(line, previousLine), Section.OTHER_READINGS_EVENING),
    )

    for lineMatches, section in orderedChecks:
        if lineMatches():
            return section

    return Section.UNKNOWN
    

In [6]:
import json

# Module-level accumulator: one summary dict per parsed day. goThroughFile()
# appends to it and the later cells read from it.
days = []

def _buildDayData(dayOfWeek, dayNumber, month, description, readingsMorning, readingsEvening):
    '''Assemble the summary dict stored in `days` for one parsed day.'''
    return {
        'date': f"{dayOfWeek} {dayNumber} {month}",
        'dayDescription': description,
        'otherReadingsMorning': readingsMorning,
        'otherReadingsEvening': readingsEvening
    }


def goThroughFile():
    '''
    Walk through `filelines` and append one summary dict per day to `days`.

    The function is a small state machine: each line is classified with
    getSection, and the classification together with the section the parser is
    currently in decides whether the line is a month heading, a weekday
    heading, a day number, or a reading belonging to the morning/evening
    "Other Readings" block.

    A day is emitted when the start of the next day (or a new month heading)
    is reached, and once more after the last line so that the final day in the
    file is not lost.

    Returns nothing; the result is the side effect on the module-level `days`.
    '''
    previousLine = None
    currentSection = Section.MONTH
    widerSection = None

    # Seed values for the first day, which is emitted as soon as the first
    # day/month heading is met.
    # NOTE(review): presumably the input starts part-way through this day, with
    # no heading of its own; confirm against the source document.
    currentMonth = "NOVEMBER 2020"
    currentDayOfWeek = "SAT"
    currentDay = "28"
    saints = None

    otherReadingsMorning = []
    otherReadingsEvening = []

    for line in filelines:

        line = line.replace('\n', '')
        newSection = getSection(line, previousLine, currentSection)
        #print(f"{newSection} {line} ({widerSection})")

        # A month heading always closes the current day. A DAY line closes it
        # only when we are not already inside a month/day heading.
        hitStartOfNextDay = newSection == Section.MONTH or (newSection == Section.DAY and currentSection != Section.MONTH and currentSection != Section.DAY)
        if(hitStartOfNextDay and currentDayOfWeek):
            # summarise
            days.append(_buildDayData(currentDayOfWeek, currentDay, currentMonth, saints,
                                      otherReadingsMorning, otherReadingsEvening))

            # clear the data
            currentDayOfWeek = None
            currentDay = None
            otherReadingsMorning = []
            otherReadingsEvening = []

        if(newSection == Section.MONTH):
            currentMonth = line
            widerSection = None
        elif (newSection == Section.DAY and currentSection != Section.DAY):
            # Weekday heading: first three characters are the weekday, the
            # remainder is the description of the day.
            currentDayOfWeek = line[0:3]
            saints = line[3:]
            widerSection = Section.DAY
        elif (newSection == Section.DAY and currentSection == Section.DAY):
            # Line following the weekday heading: carries the day number.
            match = re.search("^[0-9]+", line)
            if(match):
                currentDay = match.group(0) #the whole match
            widerSection = None
        elif (newSection == Section.OTHER_READINGS_MORNING):
            widerSection = Section.OTHER_READINGS_MORNING
        elif (newSection == Section.OTHER_READINGS_EVENING):
            widerSection = Section.OTHER_READINGS_EVENING
        elif (newSection == Section.UNKNOWN and widerSection == Section.OTHER_READINGS_MORNING and line != "Other Readings"):
            # The "Other Readings" line that precedes "Evening" arrives while
            # the morning block is still open, so it is skipped here.
            otherReadingsMorning.append(line)
        elif (newSection == Section.UNKNOWN and widerSection == Section.OTHER_READINGS_EVENING):
            otherReadingsEvening.append(line)

        # While a wider section is open it overrides the per-line result.
        currentSection = widerSection if widerSection else newSection
        previousLine = line

    # The loop only emits a day when the *next* heading is reached, so the day
    # that is still open at end of file has to be emitted explicitly.
    if currentDayOfWeek:
        days.append(_buildDayData(currentDayOfWeek, currentDay, currentMonth, saints,
                                  otherReadingsMorning, otherReadingsEvening))
              
# Parse `filelines` now; the results are appended to the module-level `days`.
goThroughFile()

In [7]:
def checkDate(currentDayDate, previousDayDate):
    '''
    Sanity-check a parsed date string against the one before it.

    Dates have the form "<DOW> <day> <MONTH> <year>", e.g.
    "SUN 29 NOVEMBER 2020".

    Returns False when:
      * the current date contains "None" or does not have four components;
      * the previous date is too short to compare, or a day number is not
        numeric;
      * the day number advances by one (or restarts at 1 in a different
        month) but the weekday does not advance by exactly one day.

    Returns True when there is no previous date, or when day number and
    weekday both advance correctly. If the day number does not advance, the
    pair is still accepted when both dates occur the same number of times
    (more than once) in the module-level `days`, which is taken to indicate an
    alternative set of readings for the same days.
    '''
    daysOfWeek = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"]

    # check for None (a component that was never parsed is formatted as "None")
    if "None" in currentDayDate:
        return False

    dayComponents = currentDayDate.split(" ")
    if len(dayComponents) != 4:
        return False

    if not previousDayDate:
        return True

    previousDayComponents = previousDayDate.split(" ")
    if len(previousDayComponents) < 3:
        # Not enough components to compare day number and month.
        return False

    # check day
    try:
        difference = int(dayComponents[1]) - int(previousDayComponents[1])
    except ValueError:
        return False

    isNextDayNumber = difference == 1
    isFirstOfNewMonth = dayComponents[1] == "1" and dayComponents[2] != previousDayComponents[2]
    if not (isNextDayNumber or isFirstOfNewMonth):
        # if today happens more than once and yesterday does too, then we know we have an alternative set of days
        countOfToday = sum([day['date'] == currentDayDate for day in days])
        countOfPreviousDay = sum([day['date'] == previousDayDate for day in days])

        return countOfToday == countOfPreviousDay and countOfToday > 1

    # check day of week (this used to sit after a try block whose branches all
    # returned, so it never ran)
    try:
        dayOfWeekDifference = daysOfWeek.index(dayComponents[0]) - daysOfWeek.index(previousDayComponents[0])
    except ValueError:
        # Unknown weekday abbreviation, e.g. "Sun" instead of "SUN".
        return False

    # +1 for a normal step, -6 for the wrap from SUN back to MON.
    return dayOfWeekDifference in [1, -6]
    

# ANSI colour escapes: red marks a date that failed checkDate, green one that passed.
warningPrefix = '\033[91m'
okPrefix = '\033[92m'

# Print every parsed date, coloured by whether it follows on from the one before it.
for index, value in enumerate(days):
    dayDate = value['date']
    previousDayDate = None if index == 0 else days[index - 1]['date']

    isOk = checkDate(dayDate, previousDayDate)
    prefix = okPrefix if isOk else warningPrefix
    print(f"{prefix}{dayDate}")
    
    

[92mSAT 28 NOVEMBER 2020
[92mSUN 29 NOVEMBER 2020
[92mMON 30 NOVEMBER 2020
[92mTUE 1 DECEMBER 2020
[92mWED 2 DECEMBER 2020
[92mTHU 3 DECEMBER 2020
[92mFRI 4 DECEMBER 2020
[92mSAT 5 DECEMBER 2020
[92mSUN 6 DECEMBER 2020
[92mMON 7 DECEMBER 2020
[92mTUE 8 DECEMBER 2020
[92mWED 9 DECEMBER 2020
[92mTHU 10 DECEMBER 2020
[92mFRI 11 DECEMBER 2020
[92mSAT 12 DECEMBER 2020
[92mSUN 13 DECEMBER 2020
[92mMON 14 DECEMBER 2020
[92mTUE 15 DECEMBER 2020
[92mWED 16 DECEMBER 2020
[92mTHU 17 DECEMBER 2020
[92mFRI 18 DECEMBER 2020
[92mSAT 19 DECEMBER 2020
[92mSUN 20 DECEMBER 2020
[92mMON 21 DECEMBER 2020
[92mTUE 22 DECEMBER 2020
[92mWED 23 DECEMBER 2020
[92mTHU 24 DECEMBER 2020
[92mFRI 25 DECEMBER 2020
[92mSAT 26 DECEMBER 2020
[92mSUN 27 DECEMBER 2020
[92mMON 28 DECEMBER 2020
[92mTUE 29 DECEMBER 2020
[92mWED 30 DECEMBER 2020
[92mTHU 31 DECEMBER 2020
[92mFRI 1 JANUARY 2021
[92mFRI 1 JANUARY 2021
[92mSAT 2 JANUARY 2021
[92mSUN 3 JANUARY 2021
[92mMON 4 JANUARY 2021
[92m

# Stop
Only continue once the cell above prints all the dates correctly (in green).  If a failure looks like a rare typo (like Sun instead of SUN), just tweak the document.  If it looks like a recurring pattern, try changing the script instead.

In [8]:
# Inspect the parsed data for one particular day (the first entry with this date).
matchingDays = [day for day in days if day['date'] == "SUN 27 DECEMBER 2020"]
foundDay = matchingDays[0]
print(foundDay)

{'date': 'SUN 27 DECEMBER 2020', 'dayDescription': ' St John the Evangelist/Hoani Tapu, Te Kaituhi rongopai', 'otherReadingsMorning': ['Ps 21;147:12-end', 'Exod 33:12-23', '1 John 2:1-11', 'Ps 105:1-11', 'Isa 63:7-9', 'Eph 3:5-12'], 'otherReadingsEvening': ['Ps 97', 'Isa 6:1-8', '1 John 5:1-12', 'If St John the Evangelist is not celebrated', '1st Sunday of Christmas/Te Rātapu Tuatahi i muri i te Rā Whānautanga', 'W RCL', 'Isa 61:10–62:3', 'Ps 148', 'Gal 4:4-7', 'Luke 2:22-40', 'Collect: Ep 2:2', '(Var: Christmas)', 'Other Readings', 'Ps 132', 'Isa 35', 'Col 1:9-20', 'Celebrating Common Prayer: Form 4 during Christmastide', 'Collect for the First Sunday of Christmas', 'Faithful God,', 'you set the promise of your coming in the hearts of your people;', 'your Son was revealed to Anna and Simeon', 'who had waited faithfully with hope.', 'Through the power of your Spirit inspire each generation', 'to wait patiently and expectantly', 'for your coming into the world again,', 'that, like them,

In [9]:
import copy
   
def removeOrAndTrim(reading):
    '''
    Remove the standalone word "or" (together with one adjacent space on each
    side, if present) from a reading and trim surrounding whitespace.

    Only a whole-word "or" is removed, so book names and other words that
    merely contain those letters (e.g. "1 Cor", "for") are left intact.

    >>> removeOrAndTrim("or Baruch 5:1-9")
    'Baruch 5:1-9'
    >>> removeOrAndTrim("Zeph 3:14-20 or")
    'Zeph 3:14-20'
    >>> removeOrAndTrim("1 Cor 4:9-16")
    '1 Cor 4:9-16'
    '''
    # \b on both sides restricts the match to "or" as a word of its own.
    return re.sub(r" ?\bor\b ?", "", reading).strip()

def resolveAlternatives(list):
    '''
    Reorder a four-entry readings list so that the apocryphal alternative
    comes last.

    The entry at index 1 or 2 that mentions one of the known apocryphal books
    is moved to the end; the other three keep their relative order. Every
    returned reading is passed through removeOrAndTrim.

    When the list has fewer than four entries, or neither index 1 nor index 2
    mentions an apocryphal book, the readings are returned (trimmed) behind an
    "error: ..." marker instead of being silently dropped, in the same style
    as removeExtras.

    The parameter keeps its original name `list` for compatibility with
    existing callers, even though it shadows the builtin inside this function.
    '''
    apocryphal = ["Tobit", "1 Macc", "2 Macc", "Sirach", "Baruch", "Wisd"]

    if len(list) < 4:
        return ["error: expected 4 readings to resolve alternatives"] + [removeOrAndTrim(reading) for reading in list]

    result = []
    # No early exit: if several books match, the last one in `apocryphal`
    # order decides, exactly as before.
    for book in apocryphal:
        if book in list[1]:
            result = [list[0], list[2], list[3], list[1]]
        elif book in list[2]:
            result = [list[0], list[1], list[3], list[2]]

    if not result:
        # Nothing to move: keep the readings visible for manual fixing.
        return ["error: couldn't resolve alternatives"] + [removeOrAndTrim(reading) for reading in list]

    return [removeOrAndTrim(reading) for reading in result]

def looksLikeScripture(reading):
    '''
    Tell whether `reading` has the shape of a scripture reference.

    Accepted shape: an optional book number (1-3) and a space, a one-word
    book name, a space, then a chapter made of digits/brackets, optionally
    followed by verse details built from digits, dashes, ";", ":", ",",
    spaces, brackets, "*" and the letters used in "end", "a" and "b".

    >>> looksLikeScripture("Isa 2:1-5")
    True
    >>> looksLikeScripture("Collect for the Third Sunday of Advent")
    False
    '''
    # Raw string: the pattern contains backslash escapes that are not valid
    # Python string escapes.
    script_regex = r"^([1-3] )?[A-Za-z]+ [\[\]\(\)0-9]+([0-9\-–;:,endab \(\)\*]+)?$"
    return re.search(script_regex, reading) is not None

def removeExtras(list):
    '''
    Reduce a readings list with more than four entries to its scripture
    references.

    The first three entries (after removeOrAndTrim) must each look like
    scripture; otherwise an "error: expected scripture ..." marker followed by
    all trimmed entries is returned. After that only the entries that look
    like scripture are kept: exactly three are returned as-is, exactly four
    are handed to resolveAlternatives, and any other count yields an
    "error: couldn't resolve down to 3" marker followed by the kept entries.
    '''
    trimmed = [removeOrAndTrim(entry) for entry in list]

    # Report the first of the leading three entries that is not scripture.
    for position in range(3):
        candidate = trimmed[position]
        if not looksLikeScripture(candidate):
            return [f"error: expected scripture in '{candidate}'"] + trimmed

    # NOTE(review): this filter runs on the untrimmed entries, so a reading
    # with a trailing "or" is dropped here even though its trimmed form passed
    # the check above; confirm that is intended.
    scriptures = [entry for entry in list if looksLikeScripture(entry)]
    keptCount = len(scriptures)

    if keptCount == 4:
        return resolveAlternatives(scriptures)
    if keptCount == 3:
        return scriptures

    return ["error: couldn't resolve down to 3"] + scriptures
    
                      
        
def processOtherReadings(day):
    '''
    Return a copy of `day` whose morning and evening "other readings" lists
    have been normalised.

    For each of the two lists: exactly three entries are kept unchanged, four
    go through resolveAlternatives, more than four go through removeExtras,
    and fewer than three become an empty list. `day` itself is not modified.

    >>> processOtherReadings({'date': 'SUN 13 DECEMBER 2020', 'dayDescription': 'dont care', 'otherReadingsMorning': ['Ps 50:1-6,62', 'Isa 12', 'Luke 1:57-66'], 'otherReadingsEvening': ['Ps 68:1-19', 'Mal 3:1-4;4', 'Phil 4:4-7', 'Celebrating Common Prayer: Form 3 during Advent', 'Collect for the Third Sunday of Advent']})
    {'date': 'SUN 13 DECEMBER 2020', 'dayDescription': 'dont care', 'otherReadingsMorning': ['Ps 50:1-6,62', 'Isa 12', 'Luke 1:57-66'], 'otherReadingsEvening': ['Ps 68:1-19', 'Mal 3:1-4;4', 'Phil 4:4-7']}
    '''
    result = copy.deepcopy(day)

    # Morning first, then evening; both follow the same rules.
    for key in ("otherReadingsMorning", "otherReadingsEvening"):
        unprocessedReadingsList = day[key]
        readingCount = len(unprocessedReadingsList)

        if readingCount < 3:
            result[key] = []
        elif readingCount == 3:
            result[key] = unprocessedReadingsList
        elif readingCount == 4:
            result[key] = resolveAlternatives(unprocessedReadingsList)
        else:
            result[key] = removeExtras(unprocessedReadingsList)

    return result

      
def processDays():
    '''
    Run processOtherReadings over every entry in `days` and return the results
    as a new list.

    Only entries whose date contains "SUN" keep their other readings; for all
    other days both lists are emptied.
    '''
    processed = []
    for day in days:
        entry = processOtherReadings(day)

        isSunday = "SUN" in day["date"]
        if not isSunday:
            for key in ("otherReadingsMorning", "otherReadingsEvening"):
                entry[key] = []

        processed.append(entry)

    return processed
    

In [13]:
# Debug one particular day: show the processed readings for the first entry
# with this date.
matchingDays = [day for day in days if day['date'] == "MON 11 JANUARY 2021"]
foundDay = matchingDays[0]
processOtherReadings(foundDay)

{'date': 'MON 11 JANUARY 2021',
 'dayDescription': '',
 'otherReadingsMorning': [],
 'otherReadingsEvening': []}

In [19]:
processedDays = processDays()

# One "%%"-separated record per day: date, morning readings, evening readings.
# Readings within a field are joined with "; ".
for day in processedDays:
    morningR = day['otherReadingsMorning']
    eveningR = day['otherReadingsEvening']
    morningText = '; '.join(morningR)
    eveningText = '; '.join(eveningR)

    print(f"{day['date']}%%{morningText}%%{eveningText}")

SAT 28 NOVEMBER 2020%%%%
SUN 29 NOVEMBER 2020%%Ps 44; Isa 2:1-5; Luke 12:35-48%%error: couldn't resolve down to 3; Ps 25; Isa 1:1-20; Ps 48; Isa 49:1-9a; 1 Cor 4:9-16; Isa 49:5-13; Ps 67;96; Eph 2:13-22; Matt 28:16-20
MON 30 NOVEMBER 2020%%%%
TUE 1 DECEMBER 2020%%%%
WED 2 DECEMBER 2020%%%%
THU 3 DECEMBER 2020%%%%
FRI 4 DECEMBER 2020%%%%
SAT 5 DECEMBER 2020%%%%
SUN 6 DECEMBER 2020%%Ps 80; Zeph 3:14-20; Luke 1:5-20; Baruch 5:1-9%%Ps 40; 1 Kgs 22:1-28; Rom 15:4-13
MON 7 DECEMBER 2020%%%%
TUE 8 DECEMBER 2020%%%%
WED 9 DECEMBER 2020%%%%
THU 10 DECEMBER 2020%%%%
FRI 11 DECEMBER 2020%%%%
SAT 12 DECEMBER 2020%%%%
SUN 13 DECEMBER 2020%%Ps 50:1-6,62; Isa 12; Luke 1:57-66%%Ps 68:1-19; Mal 3:1-4;4; Phil 4:4-7
MON 14 DECEMBER 2020%%%%
TUE 15 DECEMBER 2020%%%%
WED 16 DECEMBER 2020%%%%
THU 17 DECEMBER 2020%%%%
FRI 18 DECEMBER 2020%%%%
SAT 19 DECEMBER 2020%%%%
SUN 20 DECEMBER 2020%%Ps 144; Isa 7:10-16; Rom 1:1-7%%Ps 113,(131); Zech 2:10-13; Luke 1:39-55
MON 21 DECEMBER 2020%%%%
TUE 22 DECEMBER 2020%%%

# Stop
Take the output of the last step, paste it into a text file, and go through each Sunday, fixing the entries up so there are 3 readings in each section, and save the file in the `input` folder named ``

I have been:
1. Removing extra psalms in brackets, but if there are multiple main ones, leaving them
2. Keeping an eye out for the apocryphal books - the processing so far isn't that great
3. For Sundays where there are multiple options, trying to pick the one that normally gets celebrated on that day