In [1]:
from dotenv import load_dotenv
import os
import dateutil.parser as parser
from webdav4.fsspec import WebdavFileSystem
from fsspec.implementations.local import LocalFileSystem
import yaml

## Functions

In [2]:
from docx import Document
from pathlib import Path


def process(reports_path: list):
    """Parse every report document and collect the results.

    Args:
        reports_path: Paths of the report documents; each one is handed to
            ``extract_report_data``.

    Returns:
        A list with one entry per input path, in input order. Each entry is
        the value ``extract_report_data`` returned for that path.
    """
    return [extract_report_data(single_path) for single_path in reports_path]

def extract_report_data(report_path):
    """Split one report document into a list of per-report dictionaries.

    The document is read paragraph by paragraph and each (stripped)
    paragraph is classified by its leading label:

    * ``Type:`` starts a new report. The report collected so far is stored
      first, but only if its ``type`` field is non-empty.
    * ``Scholar:``, ``Date:`` and ``Topic:`` fill the matching field.
    * ``Content:`` sets the ``content`` field.
    * Any other paragraph is appended to ``content`` (preceded by a newline)
      once a ``Type:`` or ``Content:`` line has been seen; before that it is
      ignored.

    Args:
        report_path: Location of the document, passed straight to ``Document``.

    Returns:
        A list of dicts shaped like ``reset_report_data()``, one per report
        that has a non-empty ``type``.
    """
    # Labels whose value is simply stored under the corresponding key.
    plain_labels = {'Scholar:': 'scholar', 'Date:': 'date', 'Topic:': 'topic'}

    document = Document(report_path)
    parsed_reports = []
    current = reset_report_data()
    collecting_content = False

    for paragraph in document.paragraphs:
        line = paragraph.text.strip()

        if line.startswith('Type:'):
            # A new report begins: keep the previous one if it was started.
            if current['type']:
                parsed_reports.append(current)
                current = reset_report_data()
            current['type'] = clean_text(line, 'Type:')
            collecting_content = True
            continue

        label = next((lbl for lbl in plain_labels if line.startswith(lbl)), None)
        if label is not None:
            current[plain_labels[label]] = clean_text(line, label)
            continue

        if line.startswith('Content:'):
            current['content'] = clean_text(line, 'Content:')
            collecting_content = True
        elif collecting_content:
            # Unlabelled paragraph: treat it as a continuation of the content.
            current['content'] += '\n' + line

    # The last report has no following 'Type:' line to trigger its storage.
    if current['type']:
        parsed_reports.append(current)

    return parsed_reports

def clean_text(text, prefix):
    """Return ``text`` without its leading ``prefix`` label.

    Only a label at the very start of the (whitespace-stripped) text is
    removed. Later occurrences of the same label inside the value are kept,
    so a line such as ``'Content: see Content: below'`` yields
    ``'see Content: below'`` instead of losing the inner label.

    Args:
        text: The raw line, e.g. ``'Date: 2023-01-05'``.
        prefix: The label to drop, e.g. ``'Date:'``.

    Returns:
        The value part of the line with surrounding whitespace removed. If
        the text does not start with ``prefix`` it is only stripped.
    """
    value = text.strip()
    if value.startswith(prefix):
        value = value[len(prefix):]
    return value.strip()

def reset_report_data():
    """Create a fresh, empty report record.

    Returns:
        A new dict with the keys ``type``, ``scholar``, ``date``, ``topic``
        and ``content`` (in that order), each mapped to an empty string.
    """
    field_names = ('type', 'scholar', 'date', 'topic', 'content')
    return dict.fromkeys(field_names, '')

## Download reports from sciebo

In [None]:
# Fetch the raw reports from the sciebo WebDAV share into the local raw/ folder.
# load_dotenv() runs before the lookups below, presumably so that REPORT_USR and
# REPORT_PWD can be supplied through a .env file; either value is None when unset.
load_dotenv()
USR = os.environ.get('REPORT_USR')
PWD = os.environ.get('REPORT_PWD')
fs = WebdavFileSystem(
    "https://uni-bonn.sciebo.de/public.php/webdav",
    auth=(USR, PWD),
)
# Recursively copy everything below the remote root into raw/.
fs.download('/', 'raw/', recursive=True)

## Process reports

In [None]:

# Parse every entry found in the local raw/ folder.
fs_raw = LocalFileSystem()
reports = fs_raw.ls('raw', detail=False)
# The first parsed entry is dropped because it is the test document.
# NOTE(review): this relies on the test document being the first path returned
# by ls() - confirm that the listing order is stable on the target machine.
consultants = process(reports_path=reports)[1:]

## Write each session to a YAML file

In [None]:

# Export every parsed session to its own YAML file under output_yaml/.
# File name: <consultant index>_<session index, 3 digits>_<type>_<YYYY_MM_DD>.yaml
# The output directory is created once up front rather than once per session.
os.makedirs('output_yaml', exist_ok=True)

for c_num, consultant in enumerate(consultants):
    for s_num, session in enumerate(consultant):
        # NOTE(review): dateutil reads ambiguous numeric dates month-first by
        # default - confirm the report dates are not written day-first
        # (parser.parse(..., dayfirst=True) would be needed if they are).
        session_date = parser.parse(session['date']).strftime('%Y_%m_%d')
        filename = f"output_yaml/{c_num}_{s_num:03d}_{session['type']}_{session_date}.yaml"
        with open(filename, 'w') as f:
            yaml.dump(session, f)