# Google Drive Download Converter

In [48]:
#| default_exp google.converter

In [49]:
#| exporti

import os

from enum import Enum

from bs4 import BeautifulSoup
from markdownify import MarkdownConverter

import zipfile
import io
from PIL import Image

import pptx2md

import gdoc_sync.google.auth as ga
from gdoc_sync.client import upsert_folder


In [50]:
#| export

class GoogleDrive_MimeType_Enum(Enum):
    """Short names for the MIME type strings used with Google Drive.

    Used to translate a Google Drive item type into the format it
    'exports as'. Member order is preserved when iterating the enum.
    """

    # Office Open XML formats
    docx = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    pptx = "application/vnd.openxmlformats-officedocument.presentationml.presentation"

    # Other document formats
    odt = "application/vnd.oasis.opendocument.text"
    rtf = "application/rtf"
    pdf = "application/pdf"
    txt = "text/plain"
    # html = 'application/zip'
    zip_file = "application/zip"
    epub = "application/epub+zip"

    # Native Google Drive item types
    folder = "application/vnd.google-apps.folder"
    gdoc = "application/vnd.google-apps.document"
    gsheet = "application/vnd.google-apps.spreadsheet"
    gshortcut = "application/vnd.google-apps.shortcut"
    gslides = "application/vnd.google-apps.presentation"

    # Left commented out: these share the 'application/zip' value with
    # zip_file, so as members they would only be aliases of it.
    # 'tex'  = 'application/zip'
    # 'html.zip'= 'application/zip'

In [51]:
# List every short name next to the MIME type it stands for.
for mime_type in GoogleDrive_MimeType_Enum:
    print(f"{mime_type.name} {mime_type.value}")

docx application/vnd.openxmlformats-officedocument.wordprocessingml.document
pptx application/vnd.openxmlformats-officedocument.presentationml.presentation
odt application/vnd.oasis.opendocument.text
rtf application/rtf
pdf application/pdf
txt text/plain
zip_file application/zip
epub application/epub+zip
folder application/vnd.google-apps.folder
gdoc application/vnd.google-apps.document
gsheet application/vnd.google-apps.spreadsheet
gshortcut application/vnd.google-apps.shortcut
gslides application/vnd.google-apps.presentation


# Handle Downloading Documents

## default download method  -- ZIPs

- download and unpack zip file
- convert index.html to markdown
    - resize images referenced in markdown 

In [52]:
# | exporti

def clean_bs4(html):
    """Parse `html` with the "html.parser" backend and return the result of
    looking up its `body` tag."""
    return BeautifulSoup(html, "html.parser").find('body')

class ImageBlockConverter(MarkdownConverter):
    """
    MarkdownConverter that, before emitting an image, resizes the referenced
    image file on disk to the pixel size declared in the <img> tag's inline
    `style` attribute.

    Extra options read from `self.options` (given as keyword arguments to the
    constructor, see `md`):

    - file_path: path of the HTML file being converted; each image `src` is
      resolved relative to that file's directory.
    - is_resize: set to False to leave image files untouched (default True).
    """

    def convert_img(self, el, text, convert_as_inline, is_resize: bool = True):
        """
        Custom image handler for ImageBlockConverter.

        Resizes the image file behind `el` (when enabled and possible), then
        delegates the markdown rendering to the parent class.

        el: the <img> element being converted
        text, convert_as_inline: forwarded unchanged to the parent converter
        is_resize: per-call switch; resizing happens only when both this flag
            and the `is_resize` converter option are truthy
        """

        # The converter framework calls this hook without `is_resize`, so the
        # constructor option is the only way a caller can turn resizing off.
        if is_resize and self.options.get('is_resize', True):
            self._resize_to_style(el)

        return super().convert_img(el, text, convert_as_inline)

    def _resize_to_style(self, el):
        """Resize the file referenced by `el` to its inline-style width/height; no-op when that is not possible."""

        # Parse "width: 10px; height: 20px" into {'width': '10px', ...}.
        # `style` may be missing entirely, in which case nothing is parsed.
        style_obj = {}
        for declaration in (el.get('style') or '').split(';'):
            name, separator, value = declaration.partition(':')
            if separator:
                style_obj[name.strip()] = value.strip()

        try:
            width = int(float(style_obj['width'].replace('px', '')))
            height = int(float(style_obj['height'].replace('px', '')))
        except (KeyError, ValueError, OverflowError):
            # No usable pixel dimensions (absent, or e.g. a percentage).
            return

        if width <= 0 or height <= 0:
            return

        src = el.get('src')
        html_path = self.options.get('file_path')
        if not src or not html_path:
            return

        file_path = os.path.join(os.path.dirname(html_path), src)
        if not os.path.isfile(file_path):
            # e.g. a remote URL or a reference to a file that was not exported
            return

        # Close the source handle before overwriting the same path.
        with Image.open(file_path) as image:
            new_image = image.resize((width, height))
        new_image.save(file_path)

def md(html, **options):
    """Convert `html` to markdown using ImageBlockConverter.

    All keyword `options` are handed to the converter's constructor.
    """
    converter = ImageBlockConverter(**options)
    return converter.convert(html)


def convert_file_in_place(file_path):
    """Convert the HTML file at `file_path` to markdown, written next to it.

    The markdown is saved under the same path with the extension replaced by
    `.md`; the HTML file itself is left as it is. Images referenced by the
    HTML are resized on disk during conversion (`is_resize=True`).

    Returns None.
    """

    with open(file_path, encoding='utf-8') as f:
        html = f.read()

    markdown_content = md(
        html,
        keep_inline_images_in=['td', 'span'],
        file_path=file_path,
        is_resize=True,
    )

    # Swap only the final extension. A plain str.replace(".html", ".md") would
    # also rewrite ".html" inside directory names, and would leave the path
    # unchanged (overwriting the source file) for e.g. ".HTML" or ".htm".
    md_path = os.path.splitext(file_path)[0] + ".md"

    with open(md_path, "w", encoding="utf-8") as f:
        f.write(markdown_content)

    return

def download_zip(content, output_folder):
    """Unpack zip `content` into `output_folder` and convert its HTML to markdown.

    content: bytes of a zip archive
    output_folder: directory the archive is extracted into

    Every top-level `.html` file that came from the archive is renamed to
    `index.html` and converted with `convert_file_in_place`.

    Returns a confirmation message naming `output_folder`.
    """

    # Context manager closes the archive; `archive` avoids shadowing builtin zip.
    with zipfile.ZipFile(io.BytesIO(content), 'r') as archive:
        archive.extractall(output_folder)
        archived_names = set(archive.namelist())

    output_index = os.path.join(output_folder, 'index.html')

    # Only touch files delivered by this archive, so .html files left in the
    # folder by an earlier run are not renamed/converted again.
    for file_name in sorted(os.listdir(output_folder)):
        if file_name.endswith('.html') and file_name in archived_names:
            # rename the html file to index.html
            os.replace(os.path.join(output_folder, file_name), output_index)
            convert_file_in_place(output_index)

    return f"successfully downloaded zip to {output_folder}"

#### sample implementation of download_zip

In [53]:
from dotenv import load_dotenv
import json
import os

DOCUMENT_ID = '1j7XsbvFy0xUgGL6i-3LSChKvzSmTZSOyimEt6tQS-Kk'

# Build the GoogleAuth object from the key/token named in the .env file
google_auth = ga.GoogleAuth()
google_auth.get_creds_from_env(
    credentials_env_key='GDOC_KEY',
    token_env_key='GDOC_TOKEN',
    env_file='.env',
)

# Export the document as a zip archive, then unpack and convert it
content = (
    google_auth.service.files()
    .export(fileId=DOCUMENT_ID, mimeType='application/zip')
    .execute()
)
download_zip(content, 'sample/drive_converter-download_zip')


using saved token
generating service object on GoogleAuth


'successfully downloaded zip to sample/drive_converter-download_zip'

## Download PPTX files

- export content to pptx file
- convert pptx to markdown

In [54]:
#|exporti

def download_pptx(content, output_folder):
    """Save exported presentation bytes and convert them to markdown.

    content: bytes written verbatim to `<output_folder>/index.pptx`
    output_folder: target directory, passed to `upsert_folder` first
        (presumably creates it when missing -- confirm against gdoc_sync.client)

    The .pptx is then converted with pptx2md into `<output_folder>/index.md`,
    using `<output_folder>/images` as the image directory.

    Returns a confirmation message naming `output_folder`.
    """

    upsert_folder(output_folder)

    pptx_path, markdown_path, image_dir = (
        os.path.join(output_folder, leaf)
        for leaf in ('index.pptx', 'index.md', 'images')
    )

    with open(pptx_path, "wb+") as pptx_file:
        pptx_file.write(content)

    pptx2md.convert(pptx_path, output=markdown_path, image_dir=image_dir)

    return f'successfully downloaded content to {output_folder}'




In [55]:
from dotenv import load_dotenv
import json
import os

SLIDE_ID = '1_k4NRraKI1TmHNlpQCuqJrWr6dP7DNracdMCtfN8XlM'

# Build the GoogleAuth object from the key/token named in the .env file
google_auth = ga.GoogleAuth()
google_auth.get_creds_from_env(
    credentials_env_key='GDOC_KEY',
    token_env_key='GDOC_TOKEN',
    env_file='.env',
)

# Export the slide deck as .pptx bytes, then save and convert it
content = (
    google_auth.service.files()
    .export(
        fileId=SLIDE_ID,
        mimeType='application/vnd.openxmlformats-officedocument.presentationml.presentation',
    )
    .execute()
)
download_pptx(content, 'sample/drive_converter-download_pptx')


using saved token
generating service object on GoogleAuth


Converting slides: 100%|██████████| 2/2 [00:00<00:00, 517.24it/s]


'successfully downloaded content to sample/drive_converter-download_pptx'

# Download Factory
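Extend the download factory to support exporting additional file types and converting them into usable formats. Each entry maps a Google Drive MIME type (or `default`) to the export format to request and the function that downloads and converts the exported content.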

In [56]:
#| export

# Lookup table keyed by the Google Drive MIME type of the source file.
# Each entry names the function that handles the exported content
# ('download_fn') and the format to export as ('export_type').
# 'default' is the entry for types without one of their own.
download_factory = {
    'application/vnd.google-apps.presentation': dict(
        download_fn=download_pptx,
        export_type=GoogleDrive_MimeType_Enum.pptx,
    ),
    'default': dict(
        download_fn=download_zip,
        export_type=GoogleDrive_MimeType_Enum.zip_file,
    ),
}

In [57]:
#| hide

# Run nbdev's notebook export (presumably writes the `#| export` / `#| exporti`
# cells to the module named by `default_exp` -- confirm against nbdev docs).
import nbdev
nbdev.nbdev_export()