# Import Packages

In [0]:
import sys

sys.path.append("/Workspace/Shared/lib/")
import os
import time
import requests
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
from params import get_env, get_catalog, get_schema, get_table

# ETL Object

In [0]:
class CsvDownloaderAndTableCreator:
    """
    Download a CSV file over HTTP, save it under a timestamped filename, and
    create or replace a Databricks table from it.

    Attributes:
        url (str): The URL to download the CSV file from.
        save_location (str): Path whose directory component receives the
            downloaded file. Pass a directory WITH a trailing slash (or a full
            file path); only ``os.path.dirname(save_location)`` is used.
        table_name (str): The name of the table to create or replace in Databricks.
        retries (int): Maximum number of download attempts. Default is 3.
        timeout (float): Seconds to wait for the server before an attempt is
            considered failed. Default is 30.
        retry_delay (float): Seconds to sleep between failed attempts. Default is 2.
        spark (SparkSession): A Spark session to interact with Databricks.
    """

    def __init__(
        self,
        url: str,
        save_location: str,
        table_name: str,
        retries: int = 3,
        *,
        timeout: float = 30.0,
        retry_delay: float = 2.0,
    ):
        """
        Initializes the CsvDownloaderAndTableCreator and obtains a Spark session.

        Args:
            url (str): URL of the CSV file to download.
            save_location (str): Path whose directory receives the CSV file.
            table_name (str): The name of the Databricks table to create or replace.
            retries (int, optional): Maximum number of download attempts. Default is 3.
            timeout (float, optional): Per-request timeout in seconds. Default is 30.
            retry_delay (float, optional): Seconds to wait between attempts. Default is 2.
        """
        self.url = url
        self.save_location = save_location
        self.table_name = table_name
        self.retries = retries
        self.timeout = timeout
        self.retry_delay = retry_delay
        self.spark = SparkSession.builder.appName(
            "CsvDownloaderAndTableCreator"
        ).getOrCreate()

    def _generate_filename_with_timestamp(self) -> str:
        """
        Generates a file path made of the last dot-separated part of table_name
        plus a timestamp suffix, so repeated downloads do not overwrite each other.

        Returns:
            str: ``<dirname(save_location)>/<table>_<YYYYmmdd_HHMMSS>.csv``.
        """
        # Last part of the table name (e.g. 'dividend' from 'dolt.stocks.dividend')
        table_name_last_word = self.table_name.split(".")[-1]

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{table_name_last_word}_{timestamp}.csv"

        return os.path.join(os.path.dirname(self.save_location), filename)

    def download_csv(self) -> str:
        """
        Downloads the CSV file from the URL and saves it under a timestamped name.

        Only request-level failures (connection errors, timeouts, HTTP error
        statuses) are retried; errors while writing the file propagate immediately.

        Returns:
            str: Path of the file that was written.

        Raises:
            ValueError: If ``retries`` is less than 1 (no attempt could be made).
            Exception: If the file cannot be downloaded after ``retries`` attempts.
        """
        # Without this guard a non-positive retry count would skip the loop and
        # silently return None, which execute() would then hand to Spark.
        if self.retries < 1:
            raise ValueError(f"retries must be at least 1, got {self.retries}")

        # One filename for all attempts, generated up front to prevent overwriting
        unique_filename = self._generate_filename_with_timestamp()
        target_dir = os.path.dirname(unique_filename)

        for attempt in range(1, self.retries + 1):
            try:
                # A timeout keeps a stalled server from hanging the job forever
                response = requests.get(self.url, timeout=self.timeout)
                response.raise_for_status()  # Treat HTTP error statuses as failures
            except requests.exceptions.RequestException as e:
                print(f"Attempt {attempt} failed: {e}")
                if attempt == self.retries:
                    raise Exception(
                        f"Failed to download the CSV file after {self.retries} attempts."
                    ) from e
                print("Retrying...")
                time.sleep(self.retry_delay)
                continue

            # Make sure the destination directory exists before writing
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
            with open(unique_filename, "wb") as file:
                file.write(response.content)
            print(f"File successfully downloaded to {unique_filename}")
            return unique_filename

    def create_or_replace_table(self, csv_file_path: str):
        """
        Creates or replaces the table ``table_name`` from the downloaded CSV file.

        The CSV is read with a header row and no schema inference, so every
        column is loaded as a string.

        Args:
            csv_file_path (str): The path of the downloaded CSV file to create the table from.

        Raises:
            AnalysisException: If there is an error when creating or replacing the table.
        """
        try:
            # Read the CSV file into a DataFrame
            df = self.spark.read.option("header", "true").csv(csv_file_path)

            # Persist as a real table. A temp view cannot take a multi-part name
            # such as 'catalog.schema.table' and would vanish with the session.
            # overwriteSchema lets a changed CSV layout replace the old schema
            # (Delta option; NOTE(review): confirm the target table is Delta).
            (
                df.write.mode("overwrite")
                .option("overwriteSchema", "true")
                .saveAsTable(self.table_name)
            )
            print(
                f"Table {self.table_name} created or replaced successfully in Databricks."
            )
        except AnalysisException as e:
            print(f"Error while creating or replacing the table: {e}")
            raise

    def execute(self):
        """
        Executes the entire process: downloads the CSV file, then creates or
        replaces the table in Databricks from the downloaded file.
        """
        # Download the CSV file and get the file path
        downloaded_csv_path = self.download_csv()

        # Create or replace the table in Databricks using the downloaded file
        self.create_or_replace_table(downloaded_csv_path)


# # Example usage:
# if __name__ == "__main__":
#     # Define the parameters
#     url = "https://example.com/stock_prices.csv"
#     save_location = "/dbfs/mnt/your-volume/stock_prices.csv"
#     table_name = "dolt.stocks.dividend"

#     # Create an instance of CsvDownloaderAndTableCreator
#     csv_downloader = CsvDownloaderAndTableCreator(url, save_location, table_name)

#     # Execute the process
#     csv_downloader.execute()

In [0]:
# DoltHub CSV export endpoint for the `dividend` table of the
# post-no-preference/stocks database (include_bom=0 presumably omits the
# byte-order mark — verify against the DoltHub API).
dividend_url = (
    "https://www.dolthub.com/csv/post-no-preference/stocks/"
    "post-no-preference%2Fdocs-belligerent-armadillo/dividend"
    "?include_bom=0"
)
# Volume directory for raw dividend files. Keep the trailing slash: the
# downloader takes os.path.dirname() of this value to pick the target folder.
div_path = "/Volumes/dolt/stocks/raw/dividend/"

In [0]:
# Build the dividend ETL job. Constructing it also gets or creates the Spark
# session; retries is left at its default of 3.
etl_dividend = CsvDownloaderAndTableCreator(
    url=dividend_url, save_location=div_path, table_name="dolt.stocks.dividend"
)

In [0]:
# Run only the download step; on success it returns the path of the
# timestamped CSV file it wrote (displayed as the cell output).
etl_dividend.download_csv()

In [0]:
# Run the full pipeline: download, then load the file into the table.
# NOTE(review): execute() calls download_csv() itself, so running this after a
# standalone download_csv() call fetches the file a second time under a new
# timestamped name.
etl_dividend.execute()