## Exercise

Find the top 5 countries with the highest number of Safari users in 2017 (from
2017-01-01 until 2017-12-31).

We assume the browser of each request is recorded in the
`server_request_browser` column.

In [35]:
import boto3
import time
import psycopg2
from dotenv import load_dotenv
import os

# Load settings via python-dotenv so credentials never have to be written
# into the notebook itself.
load_dotenv()

# Both values are None when the corresponding variable is not set.
AWS_ACCESS_KEY = os.environ.get('ACCESS_KEY')
AWS_SECRET_KEY = os.environ.get('SECRET_ACCESS_KEY')

In [39]:
# Set up the Athena client
athena_client = boto3.client(
    'athena',
    region_name='eu-west-2',
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY)

# Write the SQL query: top 5 country codes by number of Safari rows in 2017.
# The upper bound is exclusive (< 2018-01-01) so that every timestamp on
# 2017-12-31 is included.
# NOTE(review): COUNT(*) counts matching rows; if a single user can produce
# several rows this is a request count, not a distinct-user count — confirm
# against the table schema.
sql_query = """
    SELECT server_request_country_code, COUNT(*) as total_safari_users
    FROM vod_clickstream
    WHERE datetime >= CAST('2017-01-01' AS timestamp) AND datetime < CAST('2018-01-01' AS timestamp)
          AND server_request_browser = 'Safari'
    GROUP BY server_request_country_code
    ORDER BY total_safari_users DESC
    LIMIT 5;
"""


# Execute the Athena query
query_execution = athena_client.start_query_execution(
    QueryString=sql_query,
    QueryExecutionContext={
        "Database": "jack-athena-parquet"
    },
    ResultConfiguration={
        "OutputLocation": "s3://athena-learners-etl-bite05/jack" # <= This will be different for you, refer to the Amazon Athena pill for more information.
    }
)

# Poll the query status until it reaches a terminal state
query_execution_id = query_execution["QueryExecutionId"]

while True:
    query_execution = athena_client.get_query_execution(
        QueryExecutionId=query_execution_id
    )
    status_info = query_execution["QueryExecution"]["Status"]
    query_status = status_info["State"]
    if query_status not in ("QUEUED", "RUNNING"):
        break
    # Only wait when the query is still in progress.
    time.sleep(1)

# Any terminal state other than SUCCEEDED (FAILED, CANCELLED) has no results
# to fetch, so stop here and surface the reason Athena reported.
if query_status != "SUCCEEDED":
    reason = status_info.get("StateChangeReason", "no reason reported")
    raise RuntimeError(
        f"Athena query did not succeed (state={query_status}): {reason}"
    )

# Retrieve the query results (the first row holds the column names)
results = athena_client.get_query_results(
    QueryExecutionId=query_execution_id
)["ResultSet"]["Rows"]

In [41]:
# Show the raw Athena rows; the first entry is the header row with the column names.
results

[{'Data': [{'VarCharValue': 'server_request_country_code'},
   {'VarCharValue': 'total_safari_users'}]},
 {'Data': [{'VarCharValue': 'BR'}, {'VarCharValue': '504'}]},
 {'Data': [{'VarCharValue': 'US'}, {'VarCharValue': '137'}]},
 {'Data': [{'VarCharValue': 'NG'}, {'VarCharValue': '43'}]},
 {'Data': [{'VarCharValue': 'ES'}, {'VarCharValue': '42'}]},
 {'Data': [{'VarCharValue': 'IN'}, {'VarCharValue': '27'}]}]

In [45]:
# Connect to the local Postgres database
conn = psycopg2.connect(database="etl_bites", user="jackdench", host="localhost", port="5432")

try:
    with conn.cursor() as cursor:
        # Create the table if it doesn't exist
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS top_5_safari_users (
                country_code VARCHAR(2) PRIMARY KEY,
                safari_users INTEGER
            );
        """)
        # Commit the DDL on its own so that a later rollback of a failed
        # insert cannot undo the table creation.
        conn.commit()

        # Upsert: re-running the cell refreshes the count for a country that
        # is already stored instead of failing on the primary key.
        insert_query = """
            INSERT INTO top_5_safari_users (country_code, safari_users)
            VALUES (%s, %s)
            ON CONFLICT (country_code)
            DO UPDATE SET safari_users = EXCLUDED.safari_users;
        """

        # Process the query results; results[0] is the header row.
        for row in results[1:]:
            country_cell, count_cell = row["Data"][0], row["Data"][1]
            if not country_cell or not count_cell:
                # An empty cell means there is no value to store for this row.
                print(f"Skipping row: {row}")
                continue

            country_code = country_cell["VarCharValue"]
            safari_users = int(count_cell["VarCharValue"])

            # Insert the data into the local PostgreSQL database. Each row is
            # committed individually so a failure only discards that row and
            # leaves the rows already written in place.
            try:
                cursor.execute(insert_query, (country_code, safari_users))
                conn.commit()
            except psycopg2.Error as e:
                print("Error occurred inserting into analytical DB: %s"% e)
                conn.rollback()  # Discard only the failed row
finally:
    # Always release the connection, even if something above raised.
    conn.close()