# Step 1: reading in each day's data from the raw txt from the PDF

In [57]:
import copy
from enum import Enum
import re
import doctest
import json

In [58]:
filelines = []

with open('./input/Lectionary 2023.txt', 'r') as txtfile:
    filelines = txtfile.readlines()
    
len(filelines)

9857

In [59]:
class Section(Enum):
    UNKNOWN = 0,
    MONTH = 1
    DAY = 2,
    DAILY_EUCHARISTIC_L = 3,
    FOR_ALL_SAINTS = 4,
    MORNING_PRAYER = 5,
    EVENING_PRAYER = 6,
    REVISED_COMMON_L = 7,
    OTHER_READINGS_MORNING = 8,
    OTHER_READINGS_EVENING = 9


def isDaySection(line, currentSection):
    '''
    >>> isDaySection("2V", Section.DAY)
    True
    '''
    match = re.search("^(SUN|MON|TUE|WED|THU|FRI|SAT)\\b", line)
    if match:
        return True

    match = re.search("^[0-9]+(W|G|V|v|w|r|R| )*$", line)
    isDayNumber = match and currentSection == Section.DAY
    return True if isDayNumber else False

def isMonthSection(line):
    '''
    >>> isMonthSection("NOVEMBER 2020")
    True
    '''
    match = re.search("^(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER) ", line)
    return True if match else False

def isOtherReadingsMorning(line, previousLine):
    '''
    >>> isOtherReadingsMorning("Morning", "Other Readings")
    True
    '''
    if(re.search("^Other Readings", previousLine) and re.search("^Morning", line)):
        return True 
    return False

def isOtherReadingsEvening(line, previousLine):
    '''
    >>> isOtherReadingsEvening("Evening", "Other Readings")
    True
    '''
    if(re.search("^Other Readings", previousLine) and re.search("^Evening", line)):
        return True 
    return False
    
def getSection(line, previousLine = None, currentSection = Section.UNKNOWN):
    '''
    >>> getSection("NOVEMBER 2020", None)
    <Section.MONTH: 1>
    '''
    if(isDaySection(line, currentSection)):
        return Section.DAY
    if(isMonthSection(line)):
        return Section.MONTH
    if(isOtherReadingsMorning(line, previousLine)):
        return Section.OTHER_READINGS_MORNING
    if(isOtherReadingsEvening(line, previousLine)):
        return Section.OTHER_READINGS_EVENING
    
    return Section.UNKNOWN
   
    
doctest.testmod()

TestResults(failed=0, attempted=5)

In [60]:
days = []

def goThroughFile():
    previousLine = None
    currentSection = Section.MONTH
    widerSection = None
    
    currentMonth = "NOVEMBER 2022"
    currentDayOfWeek = "SUN"
    currentDay = "27"
    saints = None
    
    otherReadingsMorning = []
    otherReadingsEvening = []
    
    for line in filelines:
        
        line = line.replace('\n', '')     
        newSection = getSection(line, previousLine, currentSection)
        #print(f"{newSection} {line} ({widerSection})")
           
        hitStartOfNextDay = newSection == Section.MONTH or (newSection == Section.DAY and currentSection != Section.MONTH and currentSection != Section.DAY)
        if(hitStartOfNextDay and currentDayOfWeek):
            #summarise
            dayData = {
                'date': f"{currentDayOfWeek} {currentDay} {currentMonth}",
                'dayDescription': saints,
                'otherReadingsMorning':otherReadingsMorning,
                'otherReadingsEvening':otherReadingsEvening
            }       
            days.append(dayData)
            
            #clear the data
            currentDayOfWeek = None
            currentDay = None
            otherReadingsMorning = []
            otherReadingsEvening = []      
        
        if(newSection == Section.MONTH):  
            currentMonth = line
            widerSection = None
        elif (newSection == Section.DAY and currentSection != Section.DAY):
            currentDayOfWeek = line[0:3]
            saints = line[3:]
            widerSection = Section.DAY
        elif (newSection == Section.DAY and currentSection == Section.DAY):
            match = re.search("^[0-9]+", line)
            if(match):
                currentDay = match.group(0) #the whole match
            widerSection = None
        elif (newSection == Section.OTHER_READINGS_MORNING):
            widerSection = Section.OTHER_READINGS_MORNING
        elif (newSection == Section.OTHER_READINGS_EVENING):
            widerSection = Section.OTHER_READINGS_EVENING
        elif (newSection == Section.UNKNOWN and widerSection == Section.OTHER_READINGS_MORNING and line != "Other Readings"):
            otherReadingsMorning.append(line)
        elif (newSection == Section.UNKNOWN and widerSection == Section.OTHER_READINGS_EVENING):
            otherReadingsEvening.append(line)
          
        currentSection = widerSection if widerSection else newSection
        previousLine = line
              
goThroughFile()

In [61]:
def checkDate(currentDayDate, previousDayDate):
    daysOfWeek = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"]

    # check for None
    if("None" in currentDayDate):
        return False
    
    dayComponents = currentDayDate.split(" ")    
    if(len(dayComponents) != 4):
        return False
    
    if not previousDayDate:
        return True
    
    previousDayComponents = previousDayDate.split(" ")
    
    # check day
    try:
        difference = int(dayComponents[1]) - int(previousDayComponents[1])
        if difference == 1 or ( dayComponents[1] == "1" and dayComponents[2] != previousDayComponents[2]):
            return True
        else:
            # if today happens more than once and yesterday does too, then we know we have an alternative set of days
            countOfToday = sum([day['date'] == currentDayDate for day in days])
            countOfPreviousDay = sum([day['date'] == previousDayDate for day in days])
            
            return countOfToday == countOfPreviousDay and countOfToday > 1
            
    except ValueError:
        return False
    
    # check day of week
    dayOfWeek = dayComponents[0]
    previousDayOfWeek = previousDayComponents[0]
    difference = daysOfWeek.index(dayOfWeek) - daysOfWeek.index(previousDayOfWeek)
    if(difference not in [1, -6]):
        return False
        
    return True
    

for index, value in enumerate(days):
    dayDate = days[index]['date']
    previousDayDate = days[index - 1]['date'] if index > 0 else None
    
    warningPrefix = '\033[91m'
    okPrefix = '\033[92m'
    
    isOk = checkDate(dayDate, previousDayDate)
    if isOk:
        print(f"{okPrefix}{dayDate}")
    else:
        print(f"{warningPrefix}{dayDate}")
    
    

[92mSUN 27 NOVEMBER 2022
[92mSUN 27 NOVEMBER 2022
[92mMON 28 NOVEMBER 2022
[92mTUE 29 NOVEMBER 2022
[92mWED 30 NOVEMBER 2022
[92mTHU 1 DECEMBER 2022
[92mFRI 2 DECEMBER 2022
[92mSAT 3 DECEMBER 2022
[92mSUN 4 DECEMBER 2022
[92mMON 5 DECEMBER 2022
[92mTUE 6 DECEMBER 2022
[92mWED 7 DECEMBER 2022
[92mTHU 8 DECEMBER 2022
[92mFRI 9 DECEMBER 2022
[92mSAT 10 DECEMBER 2022
[92mSUN 11 DECEMBER 2022
[92mMON 12 DECEMBER 2022
[92mTUE 13 DECEMBER 2022
[92mWED 14 DECEMBER 2022
[92mTHU 15 DECEMBER 2022
[92mFRI 16 DECEMBER 2022
[92mSAT 17 DECEMBER 2022
[92mSUN 18 DECEMBER 2022
[92mMON 19 DECEMBER 2022
[92mTUE 20 DECEMBER 2022
[92mWED 21 DECEMBER 2022
[92mTHU 22 DECEMBER 2022
[92mFRI 23 DECEMBER 2022
[92mSAT 24 DECEMBER 2022
[92mSUN 25 DECEMBER 2022
[92mMON 26 DECEMBER 2022
[92mTUE 27 DECEMBER 2022
[92mWED 28 DECEMBER 2022
[92mTHU 29 DECEMBER 2022
[92mFRI 30 DECEMBER 2022
[92mSAT 31 DECEMBER 2022
[92mSUN 1 JANUARY 2023
[92mMON 2 JANUARY 2023
[92mTUE 3 JANUARY 2023


## Stop
Only continue once the above statement prints all the dates correctly.  If it looks like a rare typo (like Sun instead of SUN), just tweak the document.  If it looks like a common thing, try change the script.

In [4]:
# this is just a script to inspect the data for a particular day
foundDay = [day for day in days if day['date'] == "SUN 27 DECEMBER 2020"][0]
print(foundDay)

IndexError: list index out of range

# Step 2: Attempting to process the morning and evening readings programatically before finishing manually

In [63]:
def removeOrAndTrim(reading):
    result = reading.replace(" or ", "")
    result = result.replace(" or", "")
    result = result.replace("or ", "")
    return result.strip()

def resolveAlternatives(list):   
    apocryphal = ["Tobit", "1 Macc", "2 Macc", "Sirach", "Baruch", "Wisd"]
    result = []
    for book in apocryphal:
        if book in list[1]:
            result = [list[0], list[2], list[3], list[1]]
        elif book in list[2]:
            result = [list[0], list[1], list[3], list[2]]

    return [removeOrAndTrim(reading) for reading in result]

def looksLikeScripture(reading):
    script_regex = "^([1-2] )?[A-Za-z]+ [\[\]\(\)0-9]+([0-9\-–;:,endab \(\)\*]+)?$"
    return re.search(script_regex, reading) is not None

def removeExtras(list):   
    '''
    >>> removeExtras(['Ps 68:1-19', 'Mal 3:1-4;4', 'Phil 4:4-7', 'Celebrating Common Prayer: Form 3 during Advent', 'Collect for the Third Sunday of Advent'])
    ['Ps 68:1-19', 'Mal 3:1-4;4', 'Phil 4:4-7']
    '''
    
    filtered_list = [removeOrAndTrim(scripture) for scripture in list]
    
    if not looksLikeScripture(filtered_list[0]):
        return [f"error: expected scripture in '{filtered_list[0]}'"] + filtered_list
    elif not looksLikeScripture(filtered_list[1]):
        return [f"error: expected scripture in '{filtered_list[1]}'"] + filtered_list
    elif not looksLikeScripture(filtered_list[2]):
        return [f"error: expected scripture in '{filtered_list[2]}'"] + filtered_list
    
    filtered_list = [scripture for scripture in filtered_list if looksLikeScripture(scripture)]
    
    if len(filtered_list) == 3:
        return filtered_list
    if len(filtered_list) == 4:
        return resolveAlternatives(filtered_list)
    
    return ["error: couldn't resolve down to 3"] + filtered_list
    
                      
        
def processOtherReadings(day):
    
    key = ""
    result = copy.deepcopy(day)
    
    # morning
    key = "otherReadingsMorning"
    unprocessedReadingsList = day[key]   
    if len(unprocessedReadingsList) == 3:
        result[key] = unprocessedReadingsList
    elif len(unprocessedReadingsList) == 4:
        result[key] = resolveAlternatives(unprocessedReadingsList)
    elif len(unprocessedReadingsList) > 4:
        result[key] = removeExtras(unprocessedReadingsList)
    else:
        result[key] = []
        
    # evening
    key = "otherReadingsEvening"
    unprocessedReadingsList = day[key]   
    if len(unprocessedReadingsList) == 3:
        result[key] = unprocessedReadingsList
    elif len(unprocessedReadingsList) == 4:
        result[key] = resolveAlternatives(unprocessedReadingsList)
    elif len(unprocessedReadingsList) > 4:
        result[key] = removeExtras(unprocessedReadingsList)
    else:
        result[key] = []
        
    return result

      
def processDays():
    
    result = []
    for day in days: 
        processed_day = processOtherReadings(day)

        if not "SUN" in day["date"]:
            processed_day["otherReadingsMorning"] = []
            processed_day["otherReadingsEvening"] = []
        
        result.append(processed_day)
        
    return result 
    
doctest.testmod()

TestResults(failed=0, attempted=6)

In [6]:
# debug a particular day
foundDay = [day for day in days if day['date'] == "MON 11 JANUARY 2021"][0]
processOtherReadings(foundDay)

IndexError: list index out of range

In [64]:
processedDays = processDays()

for day in processedDays:
    morningR = day['otherReadingsMorning']
    eveningR = day['otherReadingsEvening']
    
    print(f"{day['date']}%%{'; '.join(morningR)}%%{'; '.join(eveningR)}")

SUN 27 NOVEMBER 2022%%%%
SUN 27 NOVEMBER 2022%%Ps 44; Micah 4:1-7; 1 Thess 5:1-11%%Ps 9*; Isa 52:1-12; Matt 24:15-28
MON 28 NOVEMBER 2022%%%%
TUE 29 NOVEMBER 2022%%%%
WED 30 NOVEMBER 2022%%%%
THU 1 DECEMBER 2022%%%%
FRI 2 DECEMBER 2022%%%%
SAT 3 DECEMBER 2022%%%%
SUN 4 DECEMBER 2022%%Ps 80; Amos 7; Luke 1:5-20%%Ps 11,(28); 1 Kgs 18:17-39; John 1:19-28
MON 5 DECEMBER 2022%%%%
TUE 6 DECEMBER 2022%%%%
WED 7 DECEMBER 2022%%%%
THU 8 DECEMBER 2022%%%%
FRI 9 DECEMBER 2022%%%%
SAT 10 DECEMBER 2022%%%%
SUN 11 DECEMBER 2022%%Ps 68:1-19; Zeph 3:14-20; Phil 4:4-7%%Ps 12,(14); Isa 5:8-30; Acts 13:13-41
MON 12 DECEMBER 2022%%%%
TUE 13 DECEMBER 2022%%%%
WED 14 DECEMBER 2022%%%%
THU 15 DECEMBER 2022%%%%
FRI 16 DECEMBER 2022%%%%
SAT 17 DECEMBER 2022%%%%
SUN 18 DECEMBER 2022%%Ps 144; Micah 5:2-5a; Luke 1:26-38%%Ps 113,(126); 1 Sam 1:1-20; Rev 22:6-21
MON 19 DECEMBER 2022%%%%
TUE 20 DECEMBER 2022%%%%
WED 21 DECEMBER 2022%%%%
THU 22 DECEMBER 2022%%%%
FRI 23 DECEMBER 2022%%%%
SAT 24 DECEMBER 2022%%%%
SUN 2

## Stop
Take the output of the last step, paste it into a text file, and go through each sunday, fixing them up so there are 3 readings in each section, and save the file in `input` folder named `raw_sunday_readings_manually_selected.txt`

I have been:
1. Removing extra psalms in brackets, but if there are multiple main ones, leaving them
2. Keep an eye out for the apochyphal books - the processing this far isn't that great
3. For sundays where there are multiple options, I'm trying to pick the one that normally gets celebrated on that day

# Step 3: Cleaning and formatting the Sunday readings


In [65]:
processed_days = []

processedfilelines = []
with open('./input/raw_sunday_readings_manually_selected_2023.txt', 'r') as txtfile:
    processedfilelines = txtfile.readlines()
    
for line in processedfilelines:
    day_morning_evening = line.replace("\n", "").split("%%")
    processed_days.append({
        "date": day_morning_evening[0], 
        "otherReadingsMorning": day_morning_evening[1], 
        "otherReadingsEvening": day_morning_evening[2]
    })
    

In [66]:
def hydrate_book_names(scripture_list):
    '''
    >>> hydrate_book_names(['Ps 44', 'Isa 2:1-5', 'Luke 12:35-48'])
    ['Psalm 44', 'Isaiah 2:1-5', 'Luke 12:35-48']
    '''
    
    book_replacement_dictionary = {
        '^(Gen )': 'Genesis ',
        '^(Exod |Ex )': 'Exodus ',
        '^(Lev )': 'Leviticus ',
        '^(Num |Numb )': 'Numbers ',
        '^(Deut )': 'Deuteronomy ',
        '^(Josh )': 'Joshua ',
        '^(Judg )': 'Judges ',
        '^(Ruth )': 'Ruth ',
        '^(1 Sam )': '1 Samuel ',
        '^(2 Sam )': '2 Samuel ',
        '^(1 Kgs )': '1 Kings ',
        '^(2 Kgs )': '2 Kings ',
        '^(1 Chr|1 Chron )': '1 Chronicles ',
        '^(2 Chr|1 Chron )': '2 Chronicles ',
        '^(Ezra )': 'Ezra ',
        '^(Neh )': 'Nehemiah ',
        '^(Esth )': 'Esther ',
        '^(Job )': 'Job ',
        '^(Ps )': 'Psalm ',
        '^(Prov )': 'Proverbs ',
        '^(Eccl )': 'Ecclesiastes ',
        '^(Song of Sol )': 'Song of Solomon ',
        '^(Isa )': 'Isaiah ',
        '^(Jer )': 'Jeremiah ',
        '^(Lam )': 'Lamentations ',
        '^(Ezek )': 'Ezekiel ',
        '^(Dan )': 'Daniel ',
        '^(Hos )': 'Hosea ',
        '^(Joel )': 'Joel ',
        '^(Amos )': 'Amos ',
        '^(Obad )': 'Obadiah ',
        '^(Jonah )': 'Jonah ',
        '^(Mic )': 'Micah ',
        '^(Nah )': 'Nahum ',
        '^(Hab )': 'Habakkuk ',
        '^(Zeph )': 'Zephaniah ',
        '^(Hag )': 'Haggai ',
        '^(Zech )': 'Zechariah ',
        '^(Mal )': 'Malachi ',
        '^(Matt )': 'Matthew ',
        '^(Mark )': 'Mark ',
        '^(Luke )': 'Luke ',
        '^(John )': 'John ',
        '^(Acts )': 'Acts ',
        '^(Rom )': 'Romans ',
        '^(1 Cor )': '1 Corinthians ',
        '^(2 Cor )': '2 Corinthians ',
        '^(Gal )': 'Galatians ',
        '^(Eph )': 'Ephesians ',
        '^(Phil )': 'Philippians ',
        '^(Col )': 'Colossians ',
        '^(1 Thess )': '1 Thessalonians ',
        '^(2 Thess )': '2 Thessalonians ',
        '^(1 Tim )': '1 Timothy ',
        '^(2 Tim )': '2 Timothy ',
        '^(Titus )': 'Titus ',
        '^(Phlm |Philem )': 'Philemon ',
        '^(Heb )': 'Hebrews ',
        '^(Jas )': 'James ',
        '^(1 Pet )': '1 Peter ',
        '^(2 Pet )': '2 Peter ',
        '^(1 John )': '1 John ',
        '^(2 John )': '2 John ',
        '^(3 John )': '3 John ',
        '^(Jude )': 'Jude ',
        '^(Rev )': 'Revelation ',
    }
    
    result = []
    for scripture in scripture_list:
        new_scripture = scripture
        for regex in book_replacement_dictionary.keys():
            new_scripture = re.sub(regex, book_replacement_dictionary[regex], new_scripture)
         
        result.append(new_scripture)
    
    return result

def process_joined_readings(joined_readings):
    '''
    >>> process_joined_readings('Ps 44; Isa 2:1-5; Luke 12:35-48')
    'Psalm 44; Isaiah 2:1-5; Luke 12:35-48'
    '''
    
    if joined_readings == '':
        return ''
    
    split_readings = joined_readings.split("; ") 
    split_readings = hydrate_book_names(split_readings)  
    return "; ".join(split_readings)

    
doctest.testmod()

TestResults(failed=0, attempted=8)

In [67]:
def process_again():
    result = []
    for day in processed_days:
        new_day = copy.deepcopy(day)
        new_day['otherReadingsMorning'] = process_joined_readings(new_day['otherReadingsMorning'])
        new_day['otherReadingsEvening'] = process_joined_readings(new_day['otherReadingsEvening'])
        
        print(f"{new_day['date']}%%{new_day['otherReadingsMorning']}%%{new_day['otherReadingsEvening']}")
    
process_again()

SUN 27 NOVEMBER 2022%%Psalm 44; Micah 4:1-7; 1 Thessalonians 5:1-11%%Psalm 9*; Isaiah 52:1-12; Matthew 24:15-28
MON 28 NOVEMBER 2022%%%%
TUE 29 NOVEMBER 2022%%%%
WED 30 NOVEMBER 2022%%%%
THU 1 DECEMBER 2022%%%%
FRI 2 DECEMBER 2022%%%%
SAT 3 DECEMBER 2022%%%%
SUN 4 DECEMBER 2022%%Psalm 80; Amos 7; Luke 1:5-20%%Psalm 11; 1 Kings 18:17-39; John 1:19-28
MON 5 DECEMBER 2022%%%%
TUE 6 DECEMBER 2022%%%%
WED 7 DECEMBER 2022%%%%
THU 8 DECEMBER 2022%%%%
FRI 9 DECEMBER 2022%%%%
SAT 10 DECEMBER 2022%%%%
SUN 11 DECEMBER 2022%%Psalm 68:1-19; Zephaniah 3:14-20; Philippians 4:4-7%%Psalm 12; Isaiah 5:8-30; Acts 13:13-41
MON 12 DECEMBER 2022%%%%
TUE 13 DECEMBER 2022%%%%
WED 14 DECEMBER 2022%%%%
THU 15 DECEMBER 2022%%%%
FRI 16 DECEMBER 2022%%%%
SAT 17 DECEMBER 2022%%%%
SUN 18 DECEMBER 2022%%Psalm 144; Micah 5:2-5a; Luke 1:26-38%%Psalm 113; 1 Samuel 1:1-20; Revelation 22:6-21
MON 19 DECEMBER 2022%%%%
TUE 20 DECEMBER 2022%%%%
WED 21 DECEMBER 2022%%%%
THU 22 DECEMBER 2022%%%%
FRI 23 DECEMBER 2022%%%%
SAT 24

## Step 4: Publishing
1. Copy paste the above into the master sheet "constructing the year" tab (ensuring the dates match up all the way down)
2. Split text to columns on '%%'
3. Delete the extra date column, name the two new columns somethig like "Other Readings Morning/Evening", 
4. Adjust the morning and evening formula to use the new columns for Sundays
5. Copy paste the values of `Morning prayers together` and `Evening prayers together` into the published spreadhseet
6. Download a csv of the published spreadsheet tab, save it in the `input` folder as `all_readings.csv`.  
7. Run it through the `write_data_to_json.ipynb` script to publish the data.