In [1]:
import base64
import email
import os
import re

from lxml import etree

In [11]:
def parse_old_mail(content: str) -> dict | None:
    """Pull the lesson count out of the older, table-based mail layout.

    The XPath finds an element whose text contains 'Lessons finished',
    climbs to its nearest enclosing <tr>, moves to the immediately
    following sibling <tr>, and collects the text nodes of every <b> whose
    class attribute is exactly 'card-value chameleon-heading'.

    Args:
        content: HTML body of the mail as text.

    Returns:
        ``{'lessons': <int>}`` when exactly one text node was found;
        otherwise ``None``.

    Raises:
        ValueError: if the single matched text is not a valid integer.
    """
    lessons_xpath = (
        "//*[contains(text(),'Lessons finished')]/ancestor::tr[1]/following-sibling::tr[1]//b[@class='card-value chameleon-heading']/text()"
    )
    matches = etree.HTML(content).xpath(lessons_xpath)

    # Zero or several hits means this is not the layout we expect.
    if len(matches) != 1:
        return None

    (raw_value,) = matches
    return {'lessons': int(raw_value.strip())}

def parse_new_mail(content: str) -> dict | None:
    lessons = re.search(r'(\d+)\s*lessons?\s*\</h2', content.strip(), re.IGNORECASE)
    minutes = re.search(r'(\d+)\s*minutes?\s*\</h2', content.strip(), re.IGNORECASE)
    if lessons:
        return {
            'lessons': int(lessons.group(1)),
            'minutes': int(minutes.group(1))
        }        

# Parsers are tried in list order; the first one that returns a truthy
# result is used for a given mail body.
parsers = [parse_old_mail, parse_new_mail]

In [12]:
# Walk every .eml file in `directory`, and for each mail whose From header
# mentions "duolingo" print its Date header followed by the parsed stats of
# every base64-encoded text/html part.
directory = 'emails'
for filename in os.listdir(directory):
    if not filename.endswith('.eml'):
        continue
    path = os.path.join(directory, filename)

    # Parse from bytes so the result does not depend on the platform's
    # default text encoding and 8-bit content cannot break file reading.
    with open(path, 'rb') as eml_file:
        msg = email.message_from_binary_file(eml_file)

    # A mail without a From header yields None; str() also flattens the
    # Header object returned for headers containing non-ASCII bytes.
    sender = str(msg['From'] or '')
    if 'duolingo' not in sender.casefold():
        continue

    print(msg['date'])
    for part in msg.walk():
        if part.get_content_type() != 'text/html':
            continue
        # Transfer-encoding names are case-insensitive ("Base64", "BASE64").
        transfer_encoding = str(part.get('Content-Transfer-Encoding') or '')
        if transfer_encoding.strip().lower() != 'base64':
            continue
        content = base64.b64decode(part.get_payload()).decode()

        # First truthy parser result wins; None (instead of an uncaught
        # StopIteration) when no parser recognises the layout.
        result = next(filter(None, (parser(content) for parser in parsers)), None)
        print(result)

Sat, 28 Dec 2024 17:27:55 +0000
{'lessons': 154, 'minutes': 497}
Sat, 2 Sep 2023 04:14:45 +0000
{'lessons': 14}
