In [None]:
import pandas as pd 
import requests
import xml.etree.ElementTree as ET
import os
import utilities
from dotenv import load_dotenv

load_dotenv()


def fetch_flashbots_files(base_url):
    """
    Lists every object in an S3 bucket by paging through its XML listing.

    Each page is requested with the `marker` query parameter set to the point
    where the previous page stopped, until the listing reports that it is no
    longer truncated.

    Parameters:
        base_url (str): URL of the bucket listing endpoint.

    Returns:
        list: One dict per object with the keys 'Key' (str), 'Size' (int)
        and 'LastModified' (str, as returned by the service). If a page
        request comes back with a non-200 status, a message is printed and
        the objects collected so far are returned.

    Raises:
        requests.exceptions.RequestException: On connection errors or when a
        page takes longer than the timeout to respond.
    """
    namespace = {'s3': 'http://s3.amazonaws.com/doc/2006-03-01/'}
    all_files = []
    marker = ""

    while True:
        # Let requests build the query string: the marker is an object key and
        # may contain characters that must be percent-encoded, and base_url
        # may already carry its own query parameters.
        params = {'marker': marker} if marker else None
        # A timeout keeps a stalled connection from hanging the run forever.
        response = requests.get(base_url, params=params, timeout=60)

        if response.status_code != 200:
            print(f"Failed to fetch data. Status Code: {response.status_code}")
            break

        # Parse the XML response
        root = ET.fromstring(response.content)
        contents = root.findall('s3:Contents', namespace)

        # Extract file details
        for content in contents:
            key = content.find('s3:Key', namespace).text
            size = content.find('s3:Size', namespace).text
            last_modified = content.find('s3:LastModified', namespace).text
            all_files.append({'Key': key, 'Size': int(size), 'LastModified': last_modified})

        # A missing IsTruncated element is treated as "no more pages".
        truncated_text = root.findtext('s3:IsTruncated', default='', namespaces=namespace)
        if truncated_text.strip() != 'true':
            break

        # Prefer the marker supplied by the service; otherwise continue after
        # the last key of this page.
        next_marker = root.findtext('s3:NextMarker', default='', namespaces=namespace)
        if not next_marker and contents:
            next_marker = contents[-1].find('s3:Key', namespace).text

        # Stop instead of looping forever if the listing cannot advance
        # (truncated page without keys, or a marker that did not change).
        if not next_marker or next_marker == marker:
            break
        marker = next_marker

    return all_files


def download_csvs(files, keyword, base_url, download_dir):
    """
    Downloads CSV files from an S3 bucket that contain a specific keyword in their name.

    A file is downloaded when its key ends with '.csv' and contains the
    keyword (case-insensitive). It is saved in download_dir under its key with
    every '/' replaced by '_'. Files whose request does not return status 200
    are reported and skipped.

    Parameters:
        files (list): List of file dictionaries with 'Key', 'Size', etc.
        keyword (str): Keyword to filter file names.
        base_url (str): Base URL for the S3 bucket.
        download_dir (str): Directory to save the downloaded files.

    Raises:
        requests.exceptions.RequestException: On connection errors or when a
        download takes longer than the timeout to respond.
    """
    # Imported here so the function brings its own dependency into scope.
    from urllib.parse import quote

    # Ensure the download directory exists (no check-then-create race).
    os.makedirs(download_dir, exist_ok=True)

    # The keyword does not change between files; lower-case it once.
    wanted = keyword.lower()

    for file in files:
        file_name = file['Key']
        # Skip anything that is not a CSV or does not contain the keyword
        if not (file_name.endswith('.csv') and wanted in file_name.lower()):
            continue

        # Percent-encode the key ('/' is kept) so characters such as spaces,
        # '#' or '?' cannot corrupt the URL.
        file_url = f"{base_url}{quote(file_name)}"
        # A timeout keeps a stalled connection from hanging the run forever.
        response = requests.get(file_url, timeout=60)
        if response.status_code != 200:
            print(f"Failed to download: {file_name} (Status Code: {response.status_code})")
            continue

        # Construct local file path: flatten the key into a single file name
        local_path = os.path.join(download_dir, file_name.replace('/', '_'))
        with open(local_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {file_name} to {local_path}")


def aggregate_and_clean(download_dir):
    """
    Aggregates all downloaded CSVs into a single DataFrame and deletes the files.

    Entries are processed in sorted name order so the row order of the result
    is reproducible. An entry that cannot be read as CSV is reported and left
    out of the result. Afterwards every regular file in the directory is
    deleted, including files that failed to load; sub-directories are left
    in place.

    Parameters: download_dir (str): Directory where the downloaded CSV files are stored.
    Returns: pd.DataFrame: A single aggregated DataFrame containing all CSV data.
        An empty DataFrame is returned when no file could be loaded.
    """
    # Take one sorted snapshot of the directory; os.listdir order is arbitrary.
    entries = sorted(os.listdir(download_dir))

    data_frames = []
    for file_name in entries:
        file_path = os.path.join(download_dir, file_name)
        try:
            # Read the CSV into a DataFrame
            df = pd.read_csv(file_path)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"Failed to load {file_name}: {e}")
        else:
            data_frames.append(df)
            print(f"Loaded: {file_name}")

    # pd.concat raises ValueError on an empty list, so handle that case here.
    if data_frames:
        aggregated_df = pd.concat(data_frames, ignore_index=True)
    else:
        aggregated_df = pd.DataFrame()
    print(f"Aggregate DF contains {aggregated_df.shape[0]} rows.")

    # Delete the files after processing
    for file_name in entries:
        file_path = os.path.join(download_dir, file_name)
        # os.remove cannot delete directories; only remove regular files.
        if not os.path.isfile(file_path):
            continue
        os.remove(file_path)
        print(f"Deleted: {file_name}")

    return aggregated_df


if __name__ == "__main__":
    # Listing/download endpoint of the bucket that holds the CSV exports.
    base_url = "https://flashbots-data.s3.us-east-2.amazonaws.com/"

    # Settings read from the environment.
    GOOGLE_CREDENTIALS_PATH = os.getenv("GOOGLE_CREDENTIALS_PATH")
    FLASHBOTS_DATASET_ID = os.getenv("FLASHBOTS_DATASET_ID")

    # Scratch directory for the downloaded CSVs, named after the dataset id.
    download_dir = f'/Users/hosammahmoud/documents/{FLASHBOTS_DATASET_ID}'

    all_files = fetch_flashbots_files(base_url)
    print(f'files was fetched successfuly, we now have {len(all_files)} files')

    # Month to ingest; the same month is used for both data sets.
    date = '2024-11'

    # (key fragment to match, environment variable naming the target table).
    # MEVSHARE is processed first, then PROTECT.
    sources = (
        ('mevshare', "FLASHBOTS_MEVSHARE_TABLE_ID"),
        ('protect', 'FLASHBOTS_PROTECT_TABLE_ID'),
    )

    for source, table_env_var in sources:
        # Download the matching CSVs, combine them (which also empties the
        # scratch directory for the next source), then pass the result to
        # utilities.load_to_table with the dataset id, table id and
        # credentials path.
        download_csvs(all_files, f'{source}/{date}', base_url, download_dir)
        df = aggregate_and_clean(download_dir)
        table_id = os.getenv(table_env_var)
        utilities.load_to_table(df, FLASHBOTS_DATASET_ID, table_id, GOOGLE_CREDENTIALS_PATH)

files was fetched successfuly, we now have 2686 files
Downloaded: protect/mevshare/2024-11/2024-11-01.csv to /Users/hosammahmoud/documents/flashbots_dataset/protect_mevshare_2024-11_2024-11-01.csv
Downloaded: protect/mevshare/2024-11/2024-11-02.csv to /Users/hosammahmoud/documents/flashbots_dataset/protect_mevshare_2024-11_2024-11-02.csv
Downloaded: protect/mevshare/2024-11/2024-11-03.csv to /Users/hosammahmoud/documents/flashbots_dataset/protect_mevshare_2024-11_2024-11-03.csv
Downloaded: protect/mevshare/2024-11/2024-11-04.csv to /Users/hosammahmoud/documents/flashbots_dataset/protect_mevshare_2024-11_2024-11-04.csv
Downloaded: protect/mevshare/2024-11/2024-11-05.csv to /Users/hosammahmoud/documents/flashbots_dataset/protect_mevshare_2024-11_2024-11-05.csv
Downloaded: protect/mevshare/2024-11/2024-11-06.csv to /Users/hosammahmoud/documents/flashbots_dataset/protect_mevshare_2024-11_2024-11-06.csv
Downloaded: protect/mevshare/2024-11/2024-11-07.csv to /Users/hosammahmoud/documents/fla