<a href="https://colab.research.google.com/github/joseph-loeffler/Basketball-Stat-Tracking/blob/main/gdrive_utils.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
%%capture captured
# Library imports
from googleapiclient.http import MediaIoBaseDownload
from googleapiclient.discovery import build
from nbconvert import ScriptExporter
from google.colab import userdata
from google.colab import auth
from google.auth import default
from functools import wraps
import pandas as pd
import importlib
import nbformat
import gspread
import shutil
import random
import time
import sys
import os
import re

# Import custom modules (needs a function b/c of Colab limitations)
# GIT_TOKEN is read via google.colab's `userdata.get` under the key
# 'GIT_TOKEN'; it is passed to setup_repository() below to authenticate the
# `git clone` of the repository that holds the custom modules.
GIT_TOKEN = userdata.get('GIT_TOKEN')
def setup_repository(token, repo_url, module_name):
    """
    Clone a GitHub repository, convert a Jupyter notebook to a Python script,
    and import the script as a module.

    Any existing local clone with the same directory name is deleted first so
    the import always reflects the current state of the remote repository.

    Parameters:
        token (str): GitHub personal access token.
        repo_url (str): URL of the GitHub repository.
        module_name (str): Name of the module to be imported from the
            repository. `<module_name>.ipynb` must exist at the repo root.

    Returns:
        module: The imported (and reloaded) module object.

    Raises:
        RuntimeError: If `git clone` exits with a non-zero status.
    """
    import subprocess

    # Derive the local clone directory from the last URL path segment
    repo_name = repo_url.split('/')[-1].replace('.git', '')
    repo_path = f'./{repo_name}'

    # Remove existing repository if it exists
    if os.path.exists(repo_path):
        shutil.rmtree(repo_path)

    # Embed the *passed-in* token in the clone URL (not a module global)
    auth_url = f"https://{token}:x-oauth-basic@{repo_url.split('//')[-1]}"

    # Use an argument list (no shell) so the URL is never shell-interpreted,
    # and check the exit status instead of silently ignoring a failed clone.
    result = subprocess.run(['git', 'clone', auth_url, repo_path], check=False)
    if result.returncode != 0:
        # Report the original URL only, so the token is never leaked
        raise RuntimeError(
            f"git clone failed with exit code {result.returncode} "
            f"for repository {repo_url}")

    # Convert <module_name>.ipynb into an importable <module_name>.py
    nb_path = os.path.join(repo_path, f'{module_name}.ipynb')
    py_path = os.path.join(repo_path, f'{module_name}.py')
    with open(nb_path, 'r', encoding='utf-8') as f:
        nb = nbformat.read(f, as_version=4)
    source, _ = ScriptExporter().from_notebook_node(nb)
    with open(py_path, 'w', encoding='utf-8') as f:
        f.write(source)

    # Put the repository first on sys.path without accumulating duplicates
    if repo_path in sys.path:
        sys.path.remove(repo_path)
    sys.path.insert(0, repo_path)

    module = importlib.import_module(module_name)

    # Reload the module to ensure the latest version is used
    return importlib.reload(module)

# Import globals module w/ constant values
# setup_repository() clones the repo, converts globals.ipynb to globals.py and
# inserts the repo directory into sys.path, which is what lets the plain
# `import globals as gb` below resolve.
# NOTE(review): the name `globals` shadows the builtin `globals()` for the
# rest of this module; the rest of this file only uses the `gb` alias.
globals = setup_repository(
    GIT_TOKEN,
    'https://github.com/Glencrest-Group/property-billbacks.git',
    'globals')
import globals as gb


# Authenticate user and build Drive service
# These statements run at import time and create the two module-level clients
# used by every helper below: `gc` (gspread) and `drive_service` (Drive v3).
auth.authenticate_user()
# google.auth.default() returns a (credentials, project) pair; only the
# credentials are used here.
creds, _ = default()
gc = gspread.authorize(creds)
drive_service = build('drive', 'v3', credentials=creds)


# General Google Drive API functions to load spreadsheets as dataframes,
# move files, get files info from folder, etc.
def retry_on_quota_exceeded():
    """
    Decorator that retries the decorated function using an exponential backoff
    strategy when a Google Drive or gspread API quota exceeded error (HTTP 429)
    occurs. The function is attempted up to `gb.MAX_API_RETRIES` times, with
    increasing sleep intervals between attempts.

    Both `gspread.exceptions.APIError` (Sheets calls made through gspread) and
    `googleapiclient.errors.HttpError` (calls made through `drive_service`)
    are inspected. Any error whose status is not 429 is re-raised unchanged.

    Note that a retry re-runs the *entire* decorated function, so decorated
    functions should be safe to execute more than once.

    Returns:
        function: The decorated function with retry logic applied.

    Raises:
        Exception: If the maximum number of retries (`gb.MAX_API_RETRIES`) is
            exceeded without a successful function execution. The last quota
            error is attached as `__cause__`.
    """
    # Function-scope import: the Drive client raises HttpError, which the
    # module-level imports do not bring into scope.
    from googleapiclient.errors import HttpError

    def _status_code(exc):
        """Return the HTTP status code carried by a gspread/Drive API error."""
        if isinstance(exc, HttpError):
            return exc.resp.status
        try:
            return exc.response.json()['error']['code']
        except (ValueError, KeyError, TypeError):
            # Error body was not the expected JSON shape; fall back to the
            # raw HTTP status of the response, if available.
            return getattr(exc.response, 'status_code', None)

    def decorator_retry(func):
        @wraps(func)
        def wrapper_retry(*args, **kwargs):
            last_error = None
            for attempt in range(1, gb.MAX_API_RETRIES + 1):
                try:
                    return func(*args, **kwargs)
                except (gspread.exceptions.APIError, HttpError) as e:
                    if _status_code(e) != 429:
                        raise  # Re-raise the exception if it's not a quota exceeded error
                    last_error = e

                # Back off only when another attempt will actually follow
                if attempt < gb.MAX_API_RETRIES:
                    sleep_time = (2 ** attempt) + (random.uniform(0, 1))
                    print(f"API quota exceeded, retrying in {sleep_time:.2f} seconds...")
                    time.sleep(sleep_time)
            raise Exception(
                "Max retries exceeded. Could not complete the request."
            ) from last_error
        return wrapper_retry
    return decorator_retry


def extract_drive_id(url):
    """
    Extract the file ID from different formats of Google Drive URLs, including
    folder, file, spreadsheet, document, presentation, form, and `?id=` style
    URLs.

    Patterns are tried in priority order and the first one that matches
    anywhere in the URL wins.

    Parameters:
        url (str): The Google Drive URL from which to extract the file ID.

    Returns:
        str: The extracted GDrive file ID, or None if no ID could be found
            (including when `url` is empty or not a string).
    """
    if not url or not isinstance(url, str):
        return None

    drive_id = r'([a-zA-Z0-9_-]+)'
    patterns = (
        # Primary pattern; `file/d` is listed before `file` so that
        # ".../drive/file/d/<id>" yields <id> rather than the literal "d"
        r'(?:drive/(?:folders|file/d|file|d)/|docs/spreadsheets/d/)' + drive_id,
        # e.g. https://docs.google.com/spreadsheets/d/<id>/edit
        r'/spreadsheets/d/' + drive_id,
        # e.g. https://drive.google.com/file/d/<id>/view
        r'/file/d/' + drive_id,
        # Other Google editors: Docs, Slides, Forms
        r'/(?:document|presentation|forms)/d/' + drive_id,
        # Folder URLs with an account selector, e.g. /drive/u/0/folders/<id>
        r'/folders/' + drive_id,
        # Legacy links, e.g. https://drive.google.com/open?id=<id>
        r'[?&]id=' + drive_id,
    )
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None


@retry_on_quota_exceeded()
def open_sheet_from_url(spread_url, sheet_index=0, sheet_title=None):
    """
    Open a Google Sheet by URL and return one of its worksheets, selected
    either by title or by position.

    Parameters:
        spread_url (str): The URL of the Google Sheet.
        sheet_index (int, optional): Position of the worksheet to return.
            Only used when no title is given. Defaults to 0.
        sheet_title (str, optional): Title of the worksheet to return. When
            provided (and non-empty) it takes precedence over sheet_index.

    Returns:
        gspread.models.Worksheet: The worksheet object from the Google Sheet.
    """
    spreadsheet = gc.open_by_url(spread_url)

    # No title requested: fall back to positional lookup
    if not sheet_title:
        return spreadsheet.get_worksheet(sheet_index)

    return spreadsheet.worksheet(sheet_title)


@retry_on_quota_exceeded()
def df_from_spread_url(spread_url, sheet_index=0, sheet_title=None):
    """
    Load one worksheet of a Google Sheet into a pandas DataFrame. The
    worksheet is chosen by title when one is given, otherwise by index.

    Parameters:
        spread_url (str): The URL of the Google Sheet.
        sheet_index (int, optional): Position of the worksheet to load.
            Defaults to 0.
        sheet_title (str, optional): Title of the worksheet to load. When
            provided it takes precedence over sheet_index.

    Returns:
        pandas.DataFrame: A DataFrame built from the worksheet's records
            (as returned by `get_all_records()`).
    """
    worksheet = open_sheet_from_url(
        spread_url,
        sheet_index=sheet_index,
        sheet_title=sheet_title,
    )
    records = worksheet.get_all_records()
    return pd.DataFrame(records)


@retry_on_quota_exceeded()
def df_from_csv_url(url, header_row_idx=None, index_col_idx=None):
    """
    Downloads a CSV file from a given Google Drive URL and converts it into a
    pandas DataFrame. The user can specify which row to use as the header and
    which column to use as the index.

    The file is downloaded into an in-memory buffer, so nothing is written to
    (or left behind on) the local filesystem.

    Parameters:
        url (str): The URL of the CSV file on Google Drive.
        header_row_idx (int, optional): The row number to use as the header
            (0-indexed). Defaults to None, i.e. the file is read as having no
            header row.
        index_col_idx (int, optional): The column number to use as the index
            (0-indexed). Defaults to None.

    Returns:
        pandas.DataFrame: A DataFrame containing the data from the CSV file.

    Raises:
        ValueError: If no Drive file ID can be extracted from `url`.
    """
    import io

    file_id = extract_drive_id(url)
    if file_id is None:
        raise ValueError(
            f"Could not extract a Google Drive file ID from URL: {url!r}")

    request = drive_service.files().get_media(fileId=file_id)

    # Stream the file chunk by chunk into memory instead of a /tmp file
    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(buffer, request)
    done = False
    while not done:
        _, done = downloader.next_chunk()

    # Rewind so pandas reads from the start of the downloaded content
    buffer.seek(0)
    return pd.read_csv(buffer, header=header_row_idx, index_col=index_col_idx)


@retry_on_quota_exceeded()
def get_cols_by_name_from_sheet(sheet, column_names, header_row=1):
    """
    Retrieves specified columns from a Google Sheet and returns them as a
    pandas DataFrame.

    Parameters:
        sheet (gspread.Sheet): The Google Sheet object.
        column_names (list): A list of column names to fetch.
        header_row (int): The row number of the header (1-indexed). Rows up
            to and including this row are excluded from the returned data.

    Returns:
        pd.DataFrame: A DataFrame containing the requested columns, in the
            order given by `column_names`. Shorter columns are padded with
            empty strings so all columns have the same length.

    Raises:
        ValueError: If any requested column name is not present in the
            header row.
    """
    # Get the header row
    header = sheet.row_values(header_row)

    # Resolve every requested name to its 1-indexed column position up front,
    # so a missing column fails before any column data is fetched
    col_indices = []
    for column_name in column_names:
        try:
            col_idx = header.index(column_name) + 1
        except ValueError:
            raise ValueError(
                f"Column '{column_name}' not found in the sheet.") from None
        col_indices.append((column_name, col_idx))

    # col_values() returns the column from row 1 down, so drop everything up
    # to and including the header row (not just the first row)
    data = {
        col_name: sheet.col_values(col_idx)[header_row:]
        for col_name, col_idx in col_indices
    }

    # Trailing blank cells are omitted per column, which can leave columns
    # with different lengths; pad them so DataFrame construction succeeds
    longest = max((len(values) for values in data.values()), default=0)
    for values in data.values():
        values.extend([''] * (longest - len(values)))

    return pd.DataFrame(data)


@retry_on_quota_exceeded()
def get_files_info_from_folder(folder_url):
    """
    Fetches the details of files contained in a specified Google Drive folder
    and returns a list of dictionaries, each containing the file's name, URL,
    and MIME type.

    All result pages are followed, so folders holding more files than fit in
    a single API response are returned in full.

    Parameters:
        folder_url (str): The URL of the Google Drive folder.

    Returns:
        List[dict]: A list of dictionaries containing file information,
                    each with 'name', 'url', and 'type' keys.
    """
    folder_id = extract_drive_id(folder_url)
    query = f"'{folder_id}' in parents and trashed=false"

    files_info = []
    page_token = None  # None requests the first page
    while True:
        results = drive_service.files().list(
            q=query,
            # nextPageToken must be requested explicitly to allow paging
            fields="nextPageToken, files(id, name, webViewLink, mimeType)",
            pageToken=page_token
        ).execute()

        files_info.extend(
            {
                'name': item['name'],
                'url': item['webViewLink'],
                'type': item['mimeType']
            }
            for item in results.get('files', [])
        )

        page_token = results.get('nextPageToken')
        if not page_token:
            break

    return files_info


@retry_on_quota_exceeded()
def move_file(file_url, dst_folder_url):
    """
    Moves a Google Drive file to a specified folder.

    The file's current parents are looked up and removed in the same update
    request that adds the destination folder as the new parent.

    Parameters:
        file_url (str): The URL of the Google Drive file.
        dst_folder_url (str): The URL of the destination Google Drive folder.
    """
    # Extract folder ID from destination folder URL
    dst_folder_id = extract_drive_id(dst_folder_url)

    # Extract file ID from file URL
    file_id = extract_drive_id(file_url)

    # Retrieve the current parents to remove them; the response may omit the
    # 'parents' field entirely, so default to an empty list
    file = drive_service.files().get(fileId=file_id, fields='parents').execute()
    previous_parents = ",".join(file.get('parents') or [])

    update_kwargs = {
        'fileId': file_id,
        'addParents': dst_folder_id,
        'fields': 'id, parents',
    }
    # Only ask to remove parents when there is something to remove
    if previous_parents:
        update_kwargs['removeParents'] = previous_parents

    # Move the file to the new folder
    drive_service.files().update(**update_kwargs).execute()


@retry_on_quota_exceeded()
def append_df_to_sheet(df, dst_sheet):
    """
    Append the rows of a pandas DataFrame to a Google Sheet worksheet, after
    aligning the DataFrame's columns with the worksheet's header row.

    Columns that the worksheet does not have are discarded; columns that the
    worksheet has but the DataFrame lacks are added and left as empty
    strings. The resulting column order follows the worksheet's first row.

    Parameters:
        df (pandas.DataFrame): The DataFrame to append to the Google Sheet.
        dst_sheet (gspread.models.Worksheet): The destination Google Sheet.
    """
    # The first row of the worksheet defines the target column layout
    header = dst_sheet.row_values(1)

    # Columns present in df but unknown to the worksheet
    extraneous = [name for name in df.columns if name not in header]

    aligned = (
        df.drop(columns=extraneous)   # keep only worksheet columns
          .reindex(columns=header)    # match worksheet order, add missing ones
          .fillna('')                 # nulls become empty cells
    )

    dst_sheet.append_rows(aligned.values.tolist())


def create_spread_from_df(df, new_name, dst_folder_url):
    """
    Creates a new Google Sheet in the specified folder with the given name and
    populates it with data from the provided DataFrame.

    Each remote step (create, move, open first worksheet, write values) is
    retried on its own when the API quota is exceeded. Retrying the function
    as a whole would call `gc.create` again after a later step hit a quota
    error and leave duplicate spreadsheets behind.

    Parameters:
        df (pd.DataFrame): The DataFrame to populate the Google Sheet with.
        new_name (str): The name of the new Google Sheet.
        dst_folder_url (str): The URL of the destination Google Drive folder.
    """
    # Prepare all local inputs first so nothing is created remotely if they
    # cannot be built
    dst_folder_id = extract_drive_id(dst_folder_url)

    # Header row followed by the data rows, with nulls as empty strings
    rows = [df.columns.values.tolist()] + df.fillna('').values.tolist()

    @retry_on_quota_exceeded()
    def _create():
        """Create the empty spreadsheet."""
        return gc.create(new_name)

    @retry_on_quota_exceeded()
    def _move(file_id):
        """Add the destination folder as a parent of the new spreadsheet."""
        drive_service.files().update(
            fileId=file_id,
            addParents=dst_folder_id,
            fields='id, parents'
        ).execute()

    @retry_on_quota_exceeded()
    def _first_worksheet(spreadsheet):
        """Fetch the first worksheet of the new spreadsheet."""
        return spreadsheet.get_worksheet(0)

    @retry_on_quota_exceeded()
    def _populate(sheet):
        """Write the header and data rows into the worksheet."""
        sheet.update(rows)

    spreadsheet = _create()
    _move(spreadsheet.id)
    _populate(_first_worksheet(spreadsheet))


if __name__ == '__main__':
    # Nothing is executed when this file is run directly. The commented-out
    # snippet below is a manual example of calling create_spread_from_df with
    # a small DataFrame that contains a missing value.
    pass
    # data = {
    #     'A': [1, pd.NA, 3],
    #     'B': [4, 5, 6],
    #     'C': [7, 8, 9],
    # }
    # df = pd.DataFrame(data)
    # create_spread_from_df(df, 'New Google Sheet', 'https://drive.google.com/drive/folders/1Ne6IBg1tq_ftv49Hxb1vvBh1tKCVU9MK')
