In [1]:
import base64
import email
import os
import re
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime

import matplotlib.pyplot as plt
import pandas as pd
from lxml import etree

In [2]:
def parse_old_mail(content: str) -> dict | None:
    """Extract lesson and time figures from a table-based HTML report.

    The document is parsed with lxml and queried twice: once for the value
    cell in the row following the "Lessons finished" label, and once for the
    value cells in the row following the "Time learning" label.

    Args:
        content: HTML source of the email body.

    Returns:
        ``{'lessons': int, 'minutes': int}`` when exactly one lessons value and
        exactly three time values are found; ``None`` otherwise. The first
        time value is multiplied by 60 and added to the second; the third is
        not used.
    """
    document = etree.HTML(content)
    lesson_texts = document.xpath(
        "//*[contains(text(),'Lessons finished')]/ancestor::tr[1]/following-sibling::tr[1]//b[@class='card-value chameleon-heading']/text()"
    )
    time_texts = document.xpath(
        "//*[contains(text(),'Time learning')]/ancestor::tr[1]/following-sibling::tr[1]//b[@class='stat-value chameleon-heading']/text()"
    )

    # Anything other than the expected 1 + 3 values means this layout does not apply.
    if len(lesson_texts) != 1 or len(time_texts) != 3:
        return None

    lesson_count = int(lesson_texts[0].strip())
    hours_part = int(time_texts[0].strip())
    minutes_part = int(time_texts[1].strip())
    return {
        'lessons': lesson_count,
        'minutes': hours_part * 60 + minutes_part,
    }

def parse_new_mail(content: str) -> dict | None:
    lessons = re.search(r'(\d+)\s*lessons?\s*\</h2', content.strip(), re.IGNORECASE)
    minutes = re.search(r'(\d+)\s*minutes?\s*\</h2', content.strip(), re.IGNORECASE)
    if lessons:
        return {
            'lessons': int(lessons.group(1)),
            'minutes': int(minutes.group(1))
        }        

# Candidate parsers, tried in list order on each email body; the first one
# that returns a truthy result supplies the record for that email.
parsers = [parse_old_mail, parse_new_mail]

In [3]:
# Build one record per matching .eml file in `directory`: the first base64
# text/html part that any parser understands supplies the stats, and the
# message's Date header supplies the timestamp.
directory = 'emails'
data = []
for filename in os.listdir(directory):
    if not filename.endswith('.eml'):
        continue
    path = os.path.join(directory, filename)

    # Parse from bytes: raw .eml content is not guaranteed to be decodable
    # with the locale's default text encoding.
    with open(path, 'rb') as eml_file:
        msg = email.message_from_binary_file(eml_file)

    # A message without a From header yields None; treat it as a non-match
    # instead of failing on `.casefold()`.
    if 'duolingo' not in str(msg.get('From', '')).casefold():
        continue

    for part in msg.walk():
        if part.get_content_type() != 'text/html':
            continue
        # The header value may differ in case or carry surrounding whitespace.
        transfer_encoding = str(part.get('Content-Transfer-Encoding', '')).strip().lower()
        if transfer_encoding != 'base64':
            continue
        # Decode with the charset declared on the part (UTF-8 if none); stray
        # undecodable bytes are replaced so one odd character cannot abort the run.
        content = part.get_payload(decode=True).decode(
            part.get_content_charset('utf-8'), errors='replace'
        )

        result = next(
            (parsed for parser in parsers if (parsed := parser(content))),
            None,
        )
        if result is None:
            continue

        # Accept any RFC 2822 date (arbitrary UTC offset, optional trailing
        # comment) and normalise to naive UTC so the index stays comparable.
        # A missing or malformed Date header still raises here.
        sent_at = parsedate_to_datetime(msg['Date'])
        if sent_at.tzinfo is not None:
            sent_at = sent_at.astimezone(timezone.utc).replace(tzinfo=None)
        result['date'] = sent_at
        data.append(result)
        break

# Naming the columns keeps `set_index` working when no email was parsed.
df = pd.DataFrame(data, columns=['lessons', 'minutes', 'date']).set_index('date').sort_index()
df

Unnamed: 0_level_0,lessons,minutes
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-04-16 04:09:23,70,210
2022-04-23 04:47:34,45,150
2022-04-30 04:12:18,40,110
2022-05-07 04:09:42,45,118
2022-05-14 04:23:12,10,31
...,...,...
2024-11-30 16:37:34,11,58
2024-12-07 19:36:12,122,381
2024-12-14 17:55:08,82,272
2024-12-21 17:20:21,99,321
