## Challenge

As part of this challenge, we would like you to create a more detailed diagram
of the whole system, with every resource reflected on it. Include an explanation
on what it is that it does and your approach to solving the challenge.

Feel free to include as well any code or screenshots that are part of your
solution if relevant!

Find the most visited URL per country per day during a week of your choice (e.g.
2018-04-01 until 2018-04-08).

For the data about the browser, we are going to assume the right column
is `event_url`.

In [60]:
import boto3
import time
import psycopg2
from dotenv import load_dotenv
import os

load_dotenv()

AWS_ACCESS_KEY = os.getenv('ACCESS_KEY')
AWS_SECRET_KEY = os.getenv('SECRET_ACCESS_KEY')

In [61]:
# Set up the Athena client
athena_client = boto3.client(
    'athena',
    region_name='eu-west-2',
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY)

# Write the SQL query
sql_query = """
WITH ranked_urls AS (
    SELECT 
        server_request_country_code,
        DATE(datetime) AS Date,
        event_url,
        COUNT(*) AS Count,
        ROW_NUMBER() OVER(PARTITION BY server_request_country_code, DATE(datetime) ORDER BY COUNT(*) DESC) AS rank
    FROM vod_clickstream
    WHERE 
        datetime >= CAST('2018-04-01' AS timestamp) AND 
        datetime < CAST('2018-04-08' AS timestamp) AND 
        server_request_country_code != ''
    GROUP BY 
        server_request_country_code, DATE(datetime), event_url
)
SELECT 
    server_request_country_code,
    Date,
    event_url,
    Count
FROM ranked_urls
WHERE rank = 1
ORDER BY server_request_country_code, Date;

"""

# Execute the Athena query
query_execution = athena_client.start_query_execution(
    QueryString=sql_query,
    QueryExecutionContext={
        "Database": "jack-athena-parquet"
    },
    ResultConfiguration={
        "OutputLocation": "s3://athena-learners-etl-bite05/jack"  # Change this to your output location
    }
)

# Poll the query status until it is either successful or failed
query_status = "QUEUED"
query_execution_id = query_execution["QueryExecutionId"]

while query_status in ["QUEUED", "RUNNING"]:
    query_execution = athena_client.get_query_execution(
        QueryExecutionId=query_execution_id
    )
    query_status = query_execution["QueryExecution"]["Status"]["State"]
    if query_status == "FAILED":
        raise Exception("Athena query failed!")
    time.sleep(1)

# Paginate through results to retrieve all rows
results = []
next_token = None

while True:
    # Get query results with pagination
    params = {'QueryExecutionId': query_execution_id}
    if next_token:
        params['NextToken'] = next_token
    
    response = athena_client.get_query_results(**params)
    
    rows = response["ResultSet"]["Rows"]
    results.extend(rows[1:])  # Skip the header row
    
    # Check if there are more results
    next_token = response.get('NextToken')
    if not next_token:
        break

# 'results' now contains all rows of the query results

In [63]:
len(results)

1373

In [62]:
# Connect to the local Postgres database
conn = psycopg2.connect(database="etl_bites", user="jackdench", host="localhost", port="5432")
cursor = conn.cursor()

# Create the table if it doesn't exist
cursor.execute("""
    CREATE TABLE IF NOT EXISTS top_url_per_country_per_day (
        id SERIAL PRIMARY KEY,
        country_code VARCHAR(2),
        day TEXT,
        url TEXT,
        count INTEGER
    );
""")

# Process the query results

for row in results[1:]:
    if row['Data'][0]['VarCharValue'] == '':
        # You could also print what a `row` has if you are curious!
        print(f"Skipping row: {row}")
        continue

    country_code = row["Data"][0]["VarCharValue"]
    day = row["Data"][1]["VarCharValue"]
    url = row["Data"][2]["VarCharValue"]
    count = row["Data"][3]["VarCharValue"]

    # Insert the data into the local PostgreSQL database
    insert_query = """
        INSERT INTO top_url_per_country_per_day (country_code, day, url, count)
        VALUES (%s, %s, %s, %s)
    """

    try:
        cursor.execute(insert_query, (country_code, day, url, count))
        
    except Exception as e:
        print("Error occurred inserting into analytical DB: %s"% e)
        conn.rollback()  # Rollback the transaction if there's an error

# Commit the changes and close the cursor and connection outside the loop
conn.commit()
cursor.close()
conn.close()