In [1]:
from dotenv import load_dotenv
import os
from pathlib import Path
import shutil
import dateutil.parser as parser
from webdav4.fsspec import WebdavFileSystem
from fsspec.implementations.local import LocalFileSystem
import yaml

In [2]:
# Default parameters for this notebook; papermill can override them at run time.

# Local directory the reports are downloaded into and later listed from.
raw_dir = 'raw/'
# Directory that is deleted and recreated before the YAML files are written.
output_dir = 'output_yaml'
# Names of the environment variables holding the WebDAV user name and password
# (the names only, not the credentials themselves).
env_usr = 'REPORT_USR'
env_pwd = 'REPORT_PWD'

In [3]:
# Parameters
# NOTE(review): this looks like the cell papermill injects at run time — confirm.
# These assignments replace the `raw_dir` and `output_dir` defaults set earlier;
# `env_usr` and `env_pwd` keep their default values.
raw_dir = "raw"
output_dir = "output_yaml"


## Functions

In [4]:
from docx import Document
from pathlib import Path


def process(reports_path: list):
    """Parse every report file and collect the results in input order.

    Args:
        reports_path: Paths of the report documents to parse.

    Returns:
        A list with one entry per input path; each entry is whatever
        `extract_report_data` returns for that path.
    """
    return [extract_report_data(path) for path in reports_path]

def extract_report_data(report_path):
    """Split one report document into a list of session records.

    The document is read paragraph by paragraph. A paragraph starting with
    'Type:' opens a new record; the record collected so far is kept only if
    its 'type' field is non-empty. 'Scholar:', 'Date:' and 'Topic:' paragraphs
    fill the matching field, and 'Content:' (re)starts the content field.
    Once a 'Type:' or 'Content:' paragraph has been seen, any paragraph that
    carries none of the known labels is appended to the content on a new line.

    Args:
        report_path: Path of the document, passed straight to `Document`.

    Returns:
        A list of dicts with the keys 'type', 'scholar', 'date', 'topic' and
        'content', in document order.
    """
    # Labels whose value is simply stored under the given key.
    simple_fields = (
        ('Scholar:', 'scholar'),
        ('Date:', 'date'),
        ('Topic:', 'topic'),
    )

    sessions = []
    entry = reset_report_data()
    collecting = False  # True once unlabeled paragraphs should extend the content

    for paragraph in Document(report_path).paragraphs:
        line = paragraph.text.strip()

        if line.startswith('Type:'):
            # A new 'Type:' closes the previous record, if it had a type.
            if entry['type']:
                sessions.append(entry)
                entry = reset_report_data()
            entry['type'] = clean_text(line, 'Type:')
            collecting = True
            continue

        labelled = next(
            ((label, key) for label, key in simple_fields if line.startswith(label)),
            None,
        )
        if labelled is not None:
            label, key = labelled
            entry[key] = clean_text(line, label)
            continue

        if line.startswith('Content:'):
            # Overwrites anything gathered into the content so far.
            entry['content'] = clean_text(line, 'Content:')
            collecting = True
        elif collecting:
            entry['content'] += '\n' + line

    # Keep the record still open at the end of the document.
    if entry['type']:
        sessions.append(entry)

    return sessions

def clean_text(text, prefix):
    """Return `text` without its leading `prefix` and surrounding whitespace.

    Only a prefix at the very start of `text` is removed; later occurrences of
    the same label inside the value are kept. (Using `str.replace` here would
    delete every occurrence and silently drop text from the value.)

    Args:
        text: The paragraph text, e.g. 'Topic: Statistics'.
        prefix: The label to strip, e.g. 'Topic:'.

    Returns:
        The remaining text, stripped. If `text` does not start with `prefix`,
        the stripped `text` is returned unchanged.
    """
    if prefix and text.startswith(prefix):
        text = text[len(prefix):]
    return text.strip()

def reset_report_data():
    """Create a fresh, empty session record.

    Returns:
        A new dict with the keys 'type', 'scholar', 'date', 'topic' and
        'content', each mapped to an empty string.
    """
    field_names = ('type', 'scholar', 'date', 'topic', 'content')
    return dict.fromkeys(field_names, '')

## Download reports from sciebo

In [5]:
# Load environment variables via python-dotenv (presumably from a local .env file).
load_dotenv()
# Read the credentials from the environment variables named by the parameters.
# os.getenv returns None when a variable is not set.
USR = os.getenv(env_usr)
PWD = os.getenv(env_pwd)
# Connect to the WebDAV endpoint of the sciebo share.
fs = WebdavFileSystem("https://uni-bonn.sciebo.de/public.php/webdav", auth=(USR, PWD))
# Copy everything below the share root into `raw_dir`, including subfolders.
fs.download('/', raw_dir, recursive=True)

## Process reports

In [6]:

# List the downloaded files as plain paths (detail=False) rather than info dicts.
fs_raw = LocalFileSystem()
reports = fs_raw.ls(raw_dir, detail=False)
# NOTE(review): the first listing entry is dropped — presumably it is not a
# report; confirm what it is. This depends on the order `ls` returns entries in.
reports = reports[1:] 
# One list of sessions per remaining report file, in the same order as `reports`.
consultants = process(reports_path=reports)

## Write each session to a yaml file

In [7]:
# Start from an empty output directory so files from earlier runs do not linger.
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.makedirs(output_dir, exist_ok=True)


# `consultants` was built from `reports` in the same order, so walk them together.
for report_path, consultant in zip(reports, consultants):
    # The report's file name (without extension) identifies the consultant.
    consultant_name = Path(report_path).stem
    for s_num, session in enumerate(consultant):
        session['consultant'] = consultant_name
        # Normalise the free-text date to ISO format once; the result is stored
        # in the record and reused in the file name.
        # NOTE(review): dateutil reads ambiguous numeric dates month-first by
        # default — confirm this matches the date format used in the reports.
        session['date'] = parser.parse(session['date']).strftime('%Y-%m-%d')
        # Write into `output_dir` (the directory created above) instead of a
        # hard-coded folder, so a papermill override of `output_dir` is honoured.
        filename = os.path.join(
            output_dir,
            f"{consultant_name}_{s_num:03d}_{session['date']}.yaml",
        )
        with open(filename, 'w') as f:
            yaml.dump(session, f)