In [1]:
import boto3
import botocore
import os
import psycopg2
from dotenv import load_dotenv #pip install python-dotenv
from psycopg2 import connect, sql
from os import environ as env
import pandas as pd

load_dotenv()
conn_string = os.getenv('conn_string')
AWS_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY')
AWS_SECRET_KEY = os.getenv('AWS_SECRET_KEY')



if 'conn_string' in env:
    print(env['conn_string'][:35])

dbname='etl_bites' user='joemiller'


## 05 Challenge

Find the most visited URL per country per day during a week of your choice (e.g. 2018-04-01 until 2018-04-08).

For the data about the browser, we are going to assume the right column is event_url


In [5]:
import boto3
import time
import psycopg2


# Set up the Athena client
athena_client = boto3.client(
    'athena',
    region_name='eu-west-2',
    aws_access_key_id= AWS_ACCESS_KEY,
    aws_secret_access_key= AWS_SECRET_KEY,
    )

# Write the SQL query
sql_query = """
    SELECT server_request_country_code, event_url, dt, COUNT(*) as num_visits
    FROM vod_clickstream
    WHERE datetime >= CAST('2018-04-01' AS timestamp) AND datetime < CAST('2018-04-08' AS timestamp)
    GROUP BY server_request_country_code, event_url, dt
    ORDER BY server_request_country_code
    ;
"""


# Execute the Athena query
query_execution = athena_client.start_query_execution(
    QueryString=sql_query,
    QueryExecutionContext={
        "Database": "joe-athena_parquet"
    },
    ResultConfiguration={
        "OutputLocation": "s3://athena-learners-etl-bite05/joe" # <= This will be different for you, refer to the Amazon Athena pill for more information.
    }
)

# Poll the query status until it is either successful or failed
query_status = "QUEUED"
query_execution_id = query_execution["QueryExecutionId"]

while query_status in ["QUEUED", "RUNNING"]:
    query_execution = athena_client.get_query_execution(
        QueryExecutionId=query_execution_id
    )
    query_status = query_execution["QueryExecution"]["Status"]["State"]
    if query_status == "FAILED":
        raise Exception("Athena query failed!")
    time.sleep(1)

# # Retrieve the query results
# results = athena_client.get_query_results(
#     QueryExecutionId=query_execution_id
#     )["ResultSet"]["Rows"]


# Retrieve the query results

results = []
query_results = athena_client.get_query_results(QueryExecutionId=query_execution_id)

next_token = query_results["NextToken"]
# Add the current set of results to the overall results
results.extend(query_results["ResultSet"]["Rows"])

while True:
    # Retrieve the query results with pagination
    query_results = athena_client.get_query_results(
        QueryExecutionId=query_execution_id,
        NextToken=next_token
        )

    # Add the current set of results to the overall results
    results.extend(query_results["ResultSet"]["Rows"])

    # Check if there are more results to retrieve
    if "NextToken" in query_results:
        next_token = query_results["NextToken"]
    else:
        break

In [6]:
len(results)

5001

In [11]:
# Process the query results
results_df = pd.DataFrame(columns=["country_code", "event_url", "dt", "num_visits"])

for row in results[1:]:
    country_code = row["Data"][0]["VarCharValue"]
    event_url = row["Data"][1]["VarCharValue"]
    dt = row["Data"][2]["VarCharValue"]
    num_visits = int(row["Data"][3]["VarCharValue"])

    values = pd.DataFrame({"country_code" : [country_code], 
              "event_url": [event_url], 
              "dt": [dt],
              "num_visits": [num_visits]})
    
    results_df = pd.concat([results_df, values], ignore_index=True)

results_df


Unnamed: 0,country_code,event_url,dt,num_visits
0,,https://www.netflix.com/watch/80008442?trackId...,2018-04-04,1
1,,https://www.netflix.com/watch/70276694?trackId...,2018-04-03,1
2,,https://www.netflix.com/watch/80175686?trackId...,2018-04-07,1
3,,https://www.netflix.com/Login?nextpage=https%3...,2018-04-03,1
4,,https://www.netflix.com/browse,2018-04-07,3
...,...,...,...,...
4995,AE,https://www.netflix.com/watch/80174941?trackId...,2018-04-02,1
4996,AE,https://www.netflix.com/watch/80095340?trackId...,2018-04-07,1
4997,AE,https://www.netflix.com/browse?jbv=80192098&jb...,2018-04-04,1
4998,AE,https://www.netflix.com/watch/70249844?trackId...,2018-04-06,1


In [8]:
groups = results_df.groupby(by="country_code").max()
groups

Unnamed: 0_level_0,event_url,dt,num_visits
country_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,https://www.netflix.com/youraccount?confirm=pa...,2018-04-07,15
A1,https://www.netflix.com/watch/80146758?trackId...,2018-04-07,21
A2,https://www.netflix.com/watch/80177465?trackId...,2018-04-04,12
AD,https://www.netflix.com/watch/80212990?trackId...,2018-04-07,12
AE,https://www.netflix.com/youraccount?message=ph...,2018-04-07,355


In [9]:
groups.groupby(by="dt").max()

Unnamed: 0_level_0,event_url,num_visits
dt,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-04-04,https://www.netflix.com/watch/80177465?trackId...,12
2018-04-07,https://www.netflix.com/youraccount?message=ph...,355
