# core

> Fill in a module description here

In [1]:
#| default_exp drive

In [2]:
#| exporti

from fastcore.basics import patch_to

import os
from enum import Enum
from dataclasses import dataclass ,field

import datetime as dt
import dateutil as dtu

from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build, Resource

from googleapiclient.errors import HttpError
from google.auth.transport.requests import Request

from nbdev.showdoc import *

import pptx2md
import zipfile
import io
import markdownify
from bs4 import BeautifulSoup


  warn("Couldn't import ipywidgets properly, progress bar will use console behavior")


In [3]:
#| export

class GoogleDrive_MimeType_Enum(Enum):
    """for translating Google Drive types into 'exports as'"""
    
    docx  = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    pptx = 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    
    odt  = 'application/vnd.oasis.opendocument.text'
    rtf = 'application/rtf'
    pdf = 'application/pdf'
    txt = 'text/plain'
    # html = 'application/zip'
    zip = 'application/zip'
    epub  = 'application/epub+zip'
    
    folder = 'application/vnd.google-apps.folder'
    gdoc = 'application/vnd.google-apps.document'   
    gsheet ='application/vnd.google-apps.spreadsheet'
    gshortcut = 'application/vnd.google-apps.shortcut'
    gslides = 'application/vnd.google-apps.presentation'
    
    # 'tex'  = 'application/zip'
    # 'html.zip'= 'application/zip'

In [4]:
for member in GoogleDrive_MimeType_Enum:
    print(member.name, member.value)

docx application/vnd.openxmlformats-officedocument.wordprocessingml.document
pptx application/vnd.openxmlformats-officedocument.presentationml.presentation
odt application/vnd.oasis.opendocument.text
rtf application/rtf
pdf application/pdf
txt text/plain
zip application/zip
epub application/epub+zip
folder application/vnd.google-apps.folder
gdoc application/vnd.google-apps.document
gsheet application/vnd.google-apps.spreadsheet
gshortcut application/vnd.google-apps.shortcut
gslides application/vnd.google-apps.presentation


# Google Drive File Looper
retrieves a listing of all the files and subfiles in a folder

In [5]:
# | export

@dataclass
class GDoc_File:
    mime_type: str
    webViewLink: str
    doc_id: str
    doc_name: str
    modified_time: dt.datetime

    parent_ls : [str] = field(default = None)
    # parent_gdocs : list[dict] = field(default = None, repr = False)

    folder_path : str = None
    
    # folder_path: str = None
    # file_type: GoogleDrive_MimeType_Enum = None

    service: Resource = field(default = None, repr = False)
    creds: Credentials = field( default = None, repr = False)
    
    content = None

    def __post_init__(self):
        if self.creds and self.service is None:
            self.service = self._generate_service(creds = self.creds)

        # if self.parent_ls:
        #     print(self.parent_ls)
        #     self.parent_gdocs = [GDoc_File.get_from_id(document_id = parent, service = self.service) for parent in self.parent_ls]
        #     self.folder_path = " > ".join([ parent.doc_name for parent in self.parent_gdocs])


    @staticmethod
    def _generate_service(creds: Credentials ):
        return build(serviceName='drive',
                             version='v3', 
                             credentials=creds)

    @classmethod
    def _from_json(cls, obj: dict, service=None, creds=None):

        return cls(mime_type=obj['mimeType'],
                   webViewLink=obj['webViewLink'],
                   doc_id=obj['id'],
                   doc_name=obj['name'],
                   modified_time=dtu.parser.parse(obj['modifiedTime']),
                   parent_ls = obj.get('parents', None),
                   
                   service=service,
                   creds=creds)


In [6]:
#export
class GDoc_File_ServiceRequired(Exception):
    def __init__(self, doc_url = None, doc_id = None):
        message = f"service (googleclientapi.discovery.Resource) required to download {doc_url or doc_id}"
        super().__init__(message)
        

In [7]:
@patch_to(GDoc_File, cls_method=True)
def get_from_id(cls,
                document_id : str,
                creds: Credentials = None,
                service: Resource = None,
                return_raw: bool = False
                ):

    if not service and creds:
        service = cls._generate_service(creds = creds) 

    if not service:
        raise GDoc_File_ServiceRequired(doc_id=document_id)

    document_data = service.files().get(fileId=document_id, fields="id,webViewLink, name, mimeType,modifiedTime,parents ").execute()

    if return_raw:
        return document_data

    return cls._from_json(obj = document_data, creds = creds, service = service)


#### sample implementation of get_from_id

In [8]:
from dotenv import load_dotenv
import json
import os
import gdoc_sync.creds as gcred

# DOCUMENT_ID = '1j7XsbvFy0xUgGL6i-3LSChKvzSmTZSOyimEt6tQS-Kk'
DOCUMENT_ID = '1m48jciWr2iZqwnhN7rezM8_GposSDZhGwQhsIdRqLJg'

# loads client secret information from .env file into env
load_dotenv('.env')

# loads client secret information into dict
client_secret_info=json.loads(os.environ['GDOC_KEY'])
token_info = json.loads(os.environ['GDOC_TOKEN'])

# generates Credentials object
creds = gcred.generate_creds_object(client_secret_info=client_secret_info, token_info = token_info)

GDoc_File.get_from_id(document_id= DOCUMENT_ID, creds = creds, return_raw= False)

GDoc_File(mime_type='application/vnd.google-apps.document', webViewLink='https://docs.google.com/document/d/1m48jciWr2iZqwnhN7rezM8_GposSDZhGwQhsIdRqLJg/edit?usp=drivesdk', doc_id='1m48jciWr2iZqwnhN7rezM8_GposSDZhGwQhsIdRqLJg', doc_name='test_subfolder', modified_time=datetime.datetime(2023, 8, 17, 21, 1, 6, 479000, tzinfo=tzutc()), parent_ls=['1grGeTxDXjEoo8MkRsMzTg6v_Pdtuxi1T'], folder_path=None)

# Handle Downloading Documents

In [9]:
#| exporti
def upsert_folder(folder_path):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

## handle pptx

In [10]:
#| exporti

def download_pptx(content, output_folder):
    output_ppt_index = os.path.join(output_folder, 'index.pptx')

    with open(output_ppt_index, "wb+") as binary_file:
        # Write bytes to file
        binary_file.write(content)

    pptx2md.convert(output_ppt_index,
                    output=os.path.join(output_folder, 'index.md'),
                    image_dir=os.path.join(output_folder, 'images'))
    
    return True

## handle doc

Download Zip will export the document as a .html file, convert it to beautiful soup and then use `markdownify` to convert HTML to markdown

In [11]:
# | exporti
def clean_bs4(html):
    soup = BeautifulSoup(html, "html.parser")
    return soup.find('body')

def convert_file_in_place(file_path):
    with(open(file_path, encoding='utf-8')) as f:
        html = f.read()
    
    # html = remove_head_tag(html)
    html = clean_bs4(html)
    # h = markdownify.markdownify(html, heading_style="ATX")
    h = markdownify.markdownify(str(html))

    md_path = file_path.replace('.html', '.md')
    
    with (open(md_path, 'w+', encoding='utf-8')) as f:
        f.write(h)
    
    return md_path

def download_zip(content, output_folder):
    zip = zipfile.ZipFile(io.BytesIO(content), 'r')
    zip.extractall(output_folder)

    file_ls = os.listdir(output_folder)

    # rename the html file to index.html
    for file_name in file_ls:
        if file_name.endswith('.html'):
            output_index = os.path.join(output_folder, 'index.html')
            os.replace(os.path.join(output_folder, file_name), output_index)
            convert_file_in_place(os.path.join(output_folder, 'index.html'))
    
    return True

# Download Factory
exted by adding more download functions

In [12]:
#| exporti

download_factory = {
    'application/vnd.google-apps.presentation': {'download_fn': download_pptx,
                                                 'export_type': GoogleDrive_MimeType_Enum.pptx},
    'default': {'download_fn': download_zip,
                'export_type': GoogleDrive_MimeType_Enum.zip}
}


In [13]:
#| export    
class GDocFile_DownloadError(Exception):
    def __init__(self, doc_url):
        message = f"failure to download content for {doc_url}"
        super().__init__(message)

@patch_to(GDoc_File)
def download_file(self :GDoc_File,
                  service : Resource = None,
                  output_folder: str = ""):

    service = service or self.service
    factory = download_factory.get(self.mime_type, download_factory['default'])

    loop = 0
    content = None

    while loop < 3 and content is None:
        try:
            loop += 1
            content = service.files().export(
                fileId=self.doc_id,
                mimeType=factory['export_type'].value).execute()

            self.content = content

        except HttpError as err:
            print(err)

        except Exception as e:
            print(e)

    if not content:
        raise GDocFile_DownloadError(self.webViewLink)
    
    # create output_folder
    output_folder_path = os.path.join(output_folder, self.doc_name)
    upsert_folder(output_folder_path)

    # download
    return factory['download_fn'](content, output_folder_path)

### sample implementation of download_file

In [14]:
from dotenv import load_dotenv
import json
import os
import gdoc_sync.creds as gcred

DOCUMENT_ID = '1j7XsbvFy0xUgGL6i-3LSChKvzSmTZSOyimEt6tQS-Kk'

# loads client secret information from .env file into env
load_dotenv('.env')


# loads client secret information into dict
client_secret_info=json.loads(os.environ['GDOC_KEY'])
token_info = json.loads(os.environ['GDOC_TOKEN'])

# generates Credentials object
creds = gcred.generate_creds_object(client_secret_info=client_secret_info, token_info = token_info)

test_file = GDoc_File.get_from_id(document_id= DOCUMENT_ID, creds = creds, return_raw= False)

test_file.download_file()

True

In [15]:
# | exporti

# def process_file(file_obj, folder_path, return_raw: bool = False, service : Resource = None):
#     """gets the file type using GoogleDrive Enum, useful for processing & downloading a file"""

#     file_obj.update({'folder_path': folder_path})

#     file_type = "undefined_enum"

#     try:
#         file_type = GoogleDrive_MimeType_Enum(file_obj['mimeType'])

#     except Exception as e:
#         print(e)

#     finally:
#         file_obj.update({'file_type': file_type})

#     if return_raw:
#         return file_obj

# return GDoc_File._from_json(obj = file_obj, service = service)

def get_files(folder_id, service: Resource):
    """retrieves files in a folder"""

    page_token = None
    file_ls = []

    while True:
        res = service.files().list(
            q=f"'{folder_id}' in parents",
            pageSize=10, fields="nextPageToken, files(id,webViewLink, name, mimeType,modifiedTime )",
            pageToken=page_token).execute()

        items = res.get('files', [])

        file_ls += items

        page_token = res.get('nextPageToken', None)

        if page_token is None:
            break

    return file_ls


def get_file_looper(folder_id, service: Resource, file_ls=None, folder_path='',
                    return_raw: bool = False,
                    ):
    """recursive function to get files in a folder and map over files in subfolder"""

    if not file_ls:
        file_ls = []

    new_files = get_files(folder_id=folder_id, service=service)

    new_files = [GDoc_File._from_json( obj=file_obj, service=service) for file_obj in new_files]

    file_ls += new_files

    if not new_files:
        return file_ls

    for file_obj in new_files:
        file_obj.folder_path = folder_path

        if (isinstance(file_obj, dict) and file_obj['mimeType'] == 'application/vnd.google-apps.folder') :
            get_file_looper(folder_id=file_obj.get('id'),
                            folder_path=os.path.join( folder_path, file_obj.get('name')),
                            service=service,
                            file_ls=file_ls)

        if (isinstance(file_obj, GDoc_File) and file_obj.mime_type == 'application/vnd.google-apps.folder'):
            
            get_file_looper(folder_id=file_obj.doc_id,
                            folder_path=os.path.join( folder_path, file_obj.doc_name),
                            service=service,
                            file_ls=file_ls)



    return file_ls


In [16]:
def get_folder_contents(folder_id, creds, folder_path='', return_raw: bool = False) -> [dict]: # returns list of file objects

    """creates a list files in a folder using a set of credentials"""

    service = build(serviceName='drive',
                    version='v3',
                    credentials=creds)

    return get_file_looper(folder_id=folder_id, service=service, folder_path=folder_path, return_raw = return_raw)


In [17]:
from dotenv import load_dotenv
import json
import os
from gdoc_sync.creds import generate_creds_object

import pandas as pd

FOLDER_ID ='1SRrD1dNgZgHYjnhkJtARbLhydMP94qWi'
# loads client secret information from .env file into env
load_dotenv('.env')

# loads client secret information into dict
client_secret_info=json.loads(os.environ['GDOC_KEY'])
token_info = json.loads(os.environ['GDOC_TOKEN'])

# generates Credentials object
creds = generate_creds_object(client_secret_info=client_secret_info, token_info = token_info)


res = get_folder_contents(folder_id = FOLDER_ID, creds = creds, return_raw= False)
# pd.DataFrame(res).drop(columns = ['parent_ls','creds','service', 'parent_gdocs'])

pd.DataFrame(res)

Unnamed: 0,mime_type,webViewLink,doc_id,doc_name,modified_time,parent_ls,folder_path,service,creds
0,application/vnd.google-apps.folder,https://drive.google.com/drive/folders/1grGeTx...,1grGeTxDXjEoo8MkRsMzTg6v_Pdtuxi1T,subfolder_test,2023-08-17 21:00:55.634000+00:00,,,<googleapiclient.discovery.Resource object at ...,
1,application/vnd.google-apps.presentation,https://docs.google.com/presentation/d/1_k4NRr...,1_k4NRraKI1TmHNlpQCuqJrWr6dP7DNracdMCtfN8XlM,sample slide,2023-08-17 13:54:31.973000+00:00,,,<googleapiclient.discovery.Resource object at ...,
2,application/vnd.google-apps.document,https://docs.google.com/document/d/1j7XsbvFy0x...,1j7XsbvFy0xUgGL6i-3LSChKvzSmTZSOyimEt6tQS-Kk,Sample Doc for Google_Sync project,2023-08-17 13:53:01.574000+00:00,,,<googleapiclient.discovery.Resource object at ...,
3,application/vnd.google-apps.document,https://docs.google.com/document/d/1m48jciWr2i...,1m48jciWr2iZqwnhN7rezM8_GposSDZhGwQhsIdRqLJg,test_subfolder,2023-08-17 21:01:06.479000+00:00,,subfolder_test,<googleapiclient.discovery.Resource object at ...,


In [18]:
#| hide
import nbdev; nbdev.nbdev_export()