# Utils
> supporting functions

In [1]:
# | default_exp utils

In [2]:
# | exporti

import os

import re
import unicodedata
import json

from bs4 import BeautifulSoup
from markdownify import MarkdownConverter

from PIL import Image

import pptx2md

import zipfile
import io


import datetime as dt
from dateutil.parser import parse as dtu_parse

from dotenv import set_key, load_dotenv

In [3]:
# | hide
from nbdev.showdoc import show_doc
from dotenv import load_dotenv
import gdoc_sync.google.auth as ga

load_dotenv("../.env")

True

# File Management

In [4]:
# | exports
def update_env(env_path: str, key: str, value: str, debug_prn: bool = False) -> dict:
    """
    writes a single key / value pair into a .env file, stamps the file with
    an `env_last_modified` entry, then reloads the file into the environment

    a dict `value` is serialised with json.dumps and written unquoted;
    any other value is written with quote_mode "always"

    returns {key: <value of key as seen by os.getenv after the reload>}
    """

    # the file is created empty first when it does not exist yet
    if not os.path.exists(env_path):
        with open(env_path, "w", encoding="utf-8") as env_file:
            env_file.write("")

    is_dict_value = isinstance(value, dict)
    quote_mode = "never" if is_dict_value else "always"

    if is_dict_value:
        value = json.dumps(value)

    if debug_prn:
        from pprint import pprint

        debug_info = {
            "env_path": env_path,
            "key": key,
            "value": value,
            "type": type(value),
            "quote_mode": quote_mode,
        }
        pprint(debug_info)

    set_key(env_path, key, value, quote_mode=quote_mode)

    # record when the file was last touched
    last_modified = f"updated - {dt.date.today()}"
    set_key(env_path, "env_last_modified", last_modified)

    # override=True so values already present in os.environ are refreshed
    load_dotenv(env_path, override=True)

    reloaded_value = os.getenv(key)
    return {key: reloaded_value}

In [1]:
# | exports
def upsert_folder(folder_path: str, debug_prn: bool = False):
    """
    creates the parent directory of `folder_path` when it does not exist yet

    NOTE: the directory that gets created is `os.path.dirname(folder_path)`,
    so pass the path of a file that will live in the folder (or a folder path
    with a trailing separator), not the bare folder path itself.

    a path without any directory component (e.g. "file.txt") refers to the
    current working directory, so there is nothing to create.
    """
    folder_path = os.path.dirname(folder_path)

    if debug_prn:
        print({"upsert_folder": os.path.abspath(folder_path), "is_exist": os.path.exists(folder_path)})

    # os.makedirs("") raises FileNotFoundError -- nothing to do for a bare file name
    if not folder_path:
        return

    if not os.path.exists(folder_path):
        # exist_ok guards against the folder appearing between the check and the call
        os.makedirs(folder_path, exist_ok=True)

## handle converting Files to Markdown

In [6]:
# | exporti


class ImageBlockConverter(MarkdownConverter):
    """
    MarkdownConverter that resizes each referenced image file on disk to the
    width / height declared in the <img> tag's inline style, then converts the
    tag with the stock MarkdownConverter behaviour.

    options read in addition to the MarkdownConverter ones:
      file_path : path of the HTML file being converted; image `src` values
                  are resolved relative to the directory of this file
      is_resize : set to False to leave the image files untouched (default True)
    """

    def convert_img(self, el, text, convert_as_inline, is_resize: bool = True):
        """
        custom image handler for ImageBlockConverter
        resizes the image file in place (unless disabled through the
        `is_resize` argument or the `is_resize` option) before delegating
        to MarkdownConverter.convert_img
        """

        if is_resize and self.options.get("is_resize", True):
            self._resize_image_file(el)

        return super().convert_img(el, text, convert_as_inline)

    def _resize_image_file(self, el):
        """resize the file behind `el["src"]` to the pixel size found in its style attribute"""

        # an <img> without a style attribute yields None -- treat it as an empty style
        style_obj = {}
        for declaration in (el.get("style") or "").split(";"):
            name, separator, value = declaration.partition(":")
            if separator:
                style_obj[name.strip()] = value.strip()

        try:
            width = int(float(style_obj["width"].replace("px", "")))
            height = int(float(style_obj["height"].replace("px", "")))
        except (KeyError, ValueError):
            # no usable pixel dimensions (missing, "auto", "50%", ...): keep the file as is
            return

        if width <= 0 or height <= 0:
            return

        file_path = os.path.join(
            os.path.dirname(self.options["file_path"]), el["src"]
        )

        # close the source handle before overwriting the same path
        with Image.open(file_path) as image:
            new_image = image.resize((width, height))

        new_image.save(file_path)


def md(html, **options):
    """shorthand: build an ImageBlockConverter from `options` and convert `html` with it"""
    converter = ImageBlockConverter(**options)
    return converter.convert(html)

In [7]:
# |exports
def convert_html_to_markdown(file_path):
    """
    converts an html file to markdown

    reads `file_path`, converts its content with `md` (inline images are kept
    inside td / span elements) and writes the result next to the source file,
    using the same name with a `.md` extension.

    returns None
    """

    with open(file_path, encoding="utf-8") as f:
        html = f.read()

    markdown_content = md(
        html,
        keep_inline_images_in=["td", "span"],
        file_path=file_path,
        is_resize=True,
    )

    # swap only the extension: str.replace would also rewrite ".html" inside
    # folder names and would leave a non-".html" path unchanged, overwriting the source
    md_path = os.path.splitext(file_path)[0] + ".md"

    with open(md_path, "w", encoding="utf-8") as f:
        f.write(markdown_content)

    return

In [8]:
# | exports


def download_zip(zip_bytes_content, output_folder, is_convert_to_markdown: bool = True):
    """
    extracts a zip archive (passed as bytes) into `output_folder`, renames the
    extracted html file to index.html and optionally converts it to markdown

    every `.html` file found directly inside `output_folder` after extraction is
    moved onto index.html, so when several are present the last one processed wins.

    returns a confirmation message containing `output_folder`
    """

    # the context manager closes the archive handle; the name avoids shadowing builtin zip()
    with zipfile.ZipFile(io.BytesIO(zip_bytes_content), "r") as archive:
        archive.extractall(output_folder)

    output_index = os.path.join(output_folder, "index.html")

    # rename the html file to index.html
    for file_name in os.listdir(output_folder):
        if not file_name.endswith(".html"):
            continue

        os.replace(os.path.join(output_folder, file_name), output_index)

        if is_convert_to_markdown:
            convert_html_to_markdown(output_index)

    return f"successfully downloaded zip to {output_folder}"

#### sample implementation of downloading a zip from google docs and converting it to markdown

In [9]:
# import gdoc_sync.google.auth as ga


# id of the sample document exported below
DOCUMENT_ID = "1j7XsbvFy0xUgGL6i-3LSChKvzSmTZSOyimEt6tQS-Kk"

# generates Credentials object
# NOTE(review): presumably GDOC_KEY / GDOC_TOKEN are the names of the environment
# entries holding the credentials and the saved token -- confirm in gdoc_sync.google.auth
google_auth = ga.GoogleAuth.get_creds_from_env(
    credentials_env_key="GDOC_KEY",
    token_env_key="GDOC_TOKEN",
)
google_auth
# request an export of the document with the application/zip mime type
content = (
    google_auth.service.files()
    .export(fileId=DOCUMENT_ID, mimeType="application/zip")
    .execute()
)

# extract the archive into the test folder; markdown conversion is on by default
download_zip(content, "../TEST/utils/drive_converter-download_zip")

using saved token
generating service object on GoogleAuth


'successfully downloaded zip to ../TEST/utils/drive_converter-download_zip'

In [10]:
# |exports


def download_pptx(
    pptx_bytes_content, output_folder, is_convert_to_markdown: bool = True
):
    """
    saves bytes content to `<output_folder>/index.pptx` then optionally converts it to markdown

    `output_folder` is created (including missing parents) when it does not exist.
    when `is_convert_to_markdown` is True, pptx2md.convert is called with
    `<output_folder>/index.md` as output and `<output_folder>/images` as image_dir.

    returns a confirmation message containing `output_folder`
    """

    # create the folder itself: upsert_folder() only creates the *parent* of the
    # path it receives, which left `output_folder` missing for a path without a
    # trailing separator and made the open() below fail
    os.makedirs(output_folder, exist_ok=True)

    output_ppt_index = os.path.join(output_folder, "index.pptx")

    with open(output_ppt_index, "wb") as binary_file:
        # Write bytes to file
        binary_file.write(pptx_bytes_content)

    if is_convert_to_markdown:
        pptx2md.convert(
            output_ppt_index,
            output=os.path.join(output_folder, "index.md"),
            image_dir=os.path.join(output_folder, "images"),
        )

    return f"successfully downloaded content to {output_folder}"

#### sample implementation of download pptx from google drive

In [11]:
# import gdoc_sync.google.auth as ga

# id of the sample presentation exported below
SLIDE_ID = "1_k4NRraKI1TmHNlpQCuqJrWr6dP7DNracdMCtfN8XlM"

# generates Credentials object
# NOTE(review): presumably GDOC_KEY / GDOC_TOKEN are the names of the environment
# entries holding the credentials and the saved token -- confirm in gdoc_sync.google.auth
google_auth = ga.GoogleAuth.get_creds_from_env(
    credentials_env_key="GDOC_KEY", token_env_key="GDOC_TOKEN"
)

# request an export of the presentation with the pptx mime type
content = (
    google_auth.service.files()
    .export(
        fileId=SLIDE_ID,
        mimeType="application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )
    .execute()
)
# save as index.pptx in the test folder; markdown conversion is on by default
download_pptx(content, "../TEST/utils/drive_converter-download_pptx")

using saved token
generating service object on GoogleAuth


Converting slides: 100%|██████████| 2/2 [00:00<00:00, 329.34it/s]


'successfully downloaded content to ../TEST/utils/drive_converter-download_pptx'

# Text Cleaning

In [12]:
# | exports
def convert_str_to_snake_case(text_str):
    """converts 'Some Title Str' to 'some_title_str' (spaces become underscores, text is lowercased)"""

    words = text_str.split(" ")
    joined = "_".join(words)

    return joined.lower()

In [13]:
# | exports
def convert_str_remove_accents(text_str: str) -> str:
    """removes accents: the text is NFD-decomposed and every non-spacing mark (category Mn) is dropped"""

    decomposed = unicodedata.normalize("NFD", text_str)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]

    return "".join(kept_chars)

In [14]:
# sanity check: accents are dropped while the base letters are kept
convert_str_remove_accents("est être"), convert_str_remove_accents("kožušček")

('est etre', 'kozuscek')

In [15]:
# | exports
def convert_str_keep_alphanumeric(text_str) -> str:
    """removes every character that is not an ASCII letter, a digit, an underscore or whitespace"""

    # raw string: "\s" inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions
    pattern = r"[^0-9a-zA-Z_\s]+"

    return re.sub(pattern, "", text_str)

In [16]:
# | exports
def convert_str_file_name(text_str: str) -> str:
    """
    convert strings to clean file name or url

    steps, in order: strip accents, convert to snake case,
    then drop every remaining non-alphanumeric character
    """

    without_accents = convert_str_remove_accents(text_str)
    snake_cased = convert_str_to_snake_case(without_accents)

    return convert_str_keep_alphanumeric(snake_cased)

In [17]:
# sanity check: titles become lowercase, underscore-separated names without accents
convert_str_file_name("Register Snowflake with Cloud Amplifier"), convert_str_file_name(
    "Kožušček and Beast Modes"
)

('register_snowflake_with_cloud_amplifier', 'kozuscek_and_beast_modes')

# Conversion

In [18]:
# | exports
def convert_str_to_date(datefield: str) -> dt.datetime:
    """
    converts string date to datetime object using dateutil's parser
    an empty / None `datefield` is passed through as None
    """
    if not datefield:
        return None

    return dtu_parse(datefield)

In [19]:
# sanity check: an ISO formatted date string is parsed into a datetime
convert_str_to_date("2023-10-01")

datetime.datetime(2023, 10, 1, 0, 0)

In [2]:
# | hide
import nbdev

# NOTE(review): presumably writes the cells tagged for export to the library modules -- confirm against the nbdev docs
nbdev.nbdev_export()