# Lab. 5-1 Setup Redshift Serverless with sample data

To ensure a smooth experience with this notebook, please make sure you've set up your Redshift Serverless Namespace and Workgroup beforehand. The code below expects a workgroup named `test-workgroup` and a database named `testdb`.

If you haven't done so already, you can easily set up Redshift Serverless by using the `redshift_serverless.yaml` file for installation. 

This preparation step is crucial for the proper execution of the following code.

In [None]:
!pip install -U "sqlalchemy<2.0.0"
!pip install -U "pandas<2.2.0"

## Exporting data from SQLite (Chinook.DB)

In [None]:
import sqlite3
from pathlib import Path

import boto3
import pandas as pd
from sqlalchemy import create_engine, text

sqlite_file = '../Chinook.db'

# sqlite3.connect() silently creates an empty database when the file does not exist, which
# would only show up later as an IndexError on the empty table list. Fail early instead.
if not Path(sqlite_file).is_file():
    raise FileNotFoundError(f"SQLite database not found: {sqlite_file}")

# The connection is intentionally left open: the Redshift load step reads from it again.
sqlite_conn = sqlite3.connect(sqlite_file)

# List every table in the SQLite file; each row is a 1-tuple of the form (table_name,).
sqlite_cursor = sqlite_conn.cursor()
sqlite_cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = sqlite_cursor.fetchall()

print(tables)

if not tables:
    raise ValueError(f"No tables found in {sqlite_file}")

# Preview the first table as a sanity check. The identifier is double-quoted so that table
# names requiring quoting are still read correctly.
table_name = tables[0][0]
data = pd.read_sql_query(f'SELECT * FROM "{table_name}"', sqlite_conn)

# Show only the first rows; the full table is not needed to confirm the export works.
print(data.head())


## Loading data to Redshift Serverless 

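If you encounter any errors during this process, it may be due to version incompatibilities between `SQLAlchemy` and `pandas`. In that case, re-run the install cell at the top of this notebook (it pins `sqlalchemy<2.0.0` and `pandas<2.2.0`), restart the kernel, and try again.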

Let's begin loading our data into Redshift Serverless:

In [None]:
import boto3

# Name of the Redshift Serverless workgroup this notebook talks to.
workgroup_name = 'test-workgroup'

# Look the workgroup up once and pull out the pieces the later cells need.
redshift_serverless = boto3.client('redshift-serverless')
response = redshift_serverless.get_workgroup(workgroupName=workgroup_name)
workgroup = response['workgroup']

# Connection endpoint; its 'address' entry is used as the database host further down.
endpoint = workgroup['endpoint']
print(endpoint['address'])

workgroup_arn = workgroup['workgroupArn']
print(workgroup_arn)

# Account and region of the current credentials/session; persisted later with %store.
sts = boto3.client('sts')
account_id = sts.get_caller_identity().get('Account')
region = boto3.session.Session().region_name


In [None]:
import json
from botocore.exceptions import ClientError

secretsmanager = boto3.client('secretsmanager')

# Connection settings for the Redshift Serverless database.
# NOTE(review): the admin password is hard-coded for the lab; presumably it matches the
# credentials configured when the workgroup was provisioned - confirm, and do not reuse
# this pattern outside a throwaway environment.
redshift_host = endpoint['address']
redshift_port = '5439'
redshift_dbname = 'testdb'
redshift_user = 'admin'
redshift_password = 'Admin123!'

secret_name = "redshift-serverless-credentials"
secret_value = {
    "username": redshift_user,
    "password": redshift_password
}

# Reuse the secret if it already exists, otherwise create it. Either way `secret_arn` is
# defined when this cell finishes, or the cell fails with the underlying AWS error.
try:
    get_secret_value_response = secretsmanager.get_secret_value(SecretId=secret_name)
except ClientError as e:
    if e.response['Error']['Code'] != 'ResourceNotFoundException':
        # Any error other than "not found" must surface here. Swallowing it would leave
        # `secret_arn` undefined and cause a confusing NameError in a later cell.
        print(f"Error retrieving secret: {e}")
        raise

    print(f"Secret '{secret_name}' not found. Creating new secret.")
    try:
        secret_response = secretsmanager.create_secret(
            Name=secret_name,
            SecretString=json.dumps(secret_value)
        )
    except ClientError as create_error:
        print(f"Error creating secret: {create_error}")
        raise
    secret_arn = secret_response['ARN']
    print(f"Secret '{secret_name}' created successfully.")
else:
    print(f"Secret '{secret_name}' already exists. Using existing secret.")
    secret_arn = get_secret_value_response['ARN']


In [None]:
# Redshift Data API client; grant_select_permission uses it to submit SQL to the workgroup.
redshift_data = boto3.client('redshift-data')

def grant_select_permission(workgroup_name, database, secrets_username,
                            poll_interval=1.0, timeout=60.0, raise_on_failure=False):
    """Grant SELECT on all tables in schema ``public`` to ``IAMR:<secrets_username>``.

    The statement is submitted through the Redshift Data API, which runs it
    asynchronously. This function therefore polls ``describe_statement`` until the
    statement reaches a terminal status (or ``timeout`` elapses) and reports the real
    outcome, instead of announcing success as soon as the statement is submitted.

    Args:
        workgroup_name: Redshift Serverless workgroup to run the statement in.
        database: Database name to run the statement against.
        secrets_username: Name placed after the ``IAMR:`` prefix in the GRANT target.
        poll_interval: Seconds to sleep between status checks.
        timeout: Maximum number of seconds to wait for a terminal status.
        raise_on_failure: When True, raise ``RuntimeError`` if the statement fails, is
            aborted, or does not finish within ``timeout``. Defaults to False so that
            existing callers keep running; the problem is printed either way.

    Raises:
        ClientError: If submitting the statement or checking its status fails.
        RuntimeError: Only when ``raise_on_failure`` is True and the grant did not finish.
    """
    import time  # local import so the function does not rely on another cell's imports

    query = f'GRANT SELECT ON ALL TABLES IN SCHEMA public TO "IAMR:{secrets_username}";'
    try:
        response = redshift_data.execute_statement(
            WorkgroupName=workgroup_name,
            Database=database,
            Sql=query
        )
        statement_id = response['Id']

        # execute_statement only queues the SQL; wait for it to actually complete.
        deadline = time.monotonic() + timeout
        while True:
            description = redshift_data.describe_statement(Id=statement_id)
            status = description['Status']
            if status in ('FINISHED', 'FAILED', 'ABORTED'):
                break
            if time.monotonic() >= deadline:
                break
            time.sleep(poll_interval)
    except ClientError as e:
        print(f"Error granting permission: {e}")
        raise

    if status == 'FINISHED':
        print(f"Permission granted. Query execution ID: {statement_id}")
        return

    if status in ('FAILED', 'ABORTED'):
        reason = description.get('Error', 'no error message returned')
        message = f"Error granting permission: statement {statement_id} {status}: {reason}"
    else:
        message = (
            f"Error granting permission: statement {statement_id} "
            f"still {status} after {timeout} seconds"
        )
    print(message)
    if raise_on_failure:
        raise RuntimeError(message)

# The text after the last ':' of the secret ARN is used as the IAMR principal name.
# NOTE(review): presumably the principal should be the IAM role that will query Redshift
# rather than the secret's name - confirm.
# NOTE(review): this GRANT runs before the tables are loaded further down (and that load
# replaces existing tables); verify the grant still covers the tables created afterwards.
secrets_username = secret_arn.rsplit(':', 1)[-1]
grant_select_permission(workgroup_name, redshift_dbname, secrets_username)

In [None]:
# Persist the connection details with %store so they can be restored elsewhere via `%store -r`.
%store secret_arn redshift_user workgroup_name workgroup_arn redshift_dbname region account_id

In [None]:
from urllib.parse import quote

# Percent-encode the credentials before embedding them in the URL: characters such as
# '@', ':' or '/' in a user name or password would otherwise corrupt the connection string.
redshift_conn_string = (
    f"postgresql://{quote(redshift_user, safe='')}:{quote(redshift_password, safe='')}"
    f"@{redshift_host}:{redshift_port}/{redshift_dbname}"
)
redshift_engine = create_engine(redshift_conn_string)

# Copy every SQLite table into Redshift under its lower-cased name, replacing any table
# of the same name that already exists there.
for table in tables:
    table_name = table[0].lower()
    # Double-quote the SQLite identifier so names that need quoting are still read correctly.
    df = pd.read_sql_query(f'SELECT * FROM "{table[0]}"', sqlite_conn)
    print(f"Processing table: {table_name}")
    # Multi-row INSERTs in chunks of 1000 rows.
    df.to_sql(table_name, redshift_engine, index=False, if_exists='replace', method='multi', chunksize=1000)
    print(f"Table {table[0]} created and data inserted.")

## Testing Complex SQL query with Redshift

Now that we have our data loaded into Redshift Serverless, let's test a complex SQL query to analyze our data. 

In [None]:
# Natural-language question paired with a reference SQL answer; both are persisted with
# %store in the last cell of this notebook, so the text must stay exactly as written.
# The query first picks the three genres with the highest revenue (UnitPrice * Quantity),
# then ranks tracks by revenue within each of those genres and keeps the top five.
sample_question = "What are the top 5 best-selling tracks for each of the 3 highest-grossing genres? Include genre, track, artist, album, and sales"
sample_query = """WITH TopGenres AS (
    SELECT g.GenreId, g.Name AS GenreName
    FROM Genre g
    JOIN Track t ON g.GenreId = t.GenreId
    JOIN InvoiceLine il ON t.TrackId = il.TrackId
    GROUP BY g.GenreId, g.Name
    ORDER BY SUM(il.UnitPrice * il.Quantity) DESC
    LIMIT 3
),
RankedTracks AS (
    SELECT 
        g.GenreName,
        t.Name AS TrackName,
        ar.Name AS ArtistName,
        al.Title AS AlbumTitle,
        SUM(il.UnitPrice * il.Quantity) AS Sales,
        ROW_NUMBER() OVER (PARTITION BY g.GenreId ORDER BY SUM(il.UnitPrice * il.Quantity) DESC) AS Rank
    FROM 
        TopGenres g
        JOIN Track t ON g.GenreId = t.GenreId
        JOIN Album al ON t.AlbumId = al.AlbumId
        JOIN Artist ar ON al.ArtistId = ar.ArtistId
        JOIN InvoiceLine il ON t.TrackId = il.TrackId
    GROUP BY 
        g.GenreId, g.GenreName, t.Name, ar.Name, al.Title
)
SELECT GenreName, TrackName, ArtistName, AlbumTitle, Sales
FROM RankedTracks
WHERE Rank <= 5
ORDER BY GenreName, Sales DESC;"""

# Execute the reference query on Redshift and print the result set as a plain-text table.
with redshift_engine.connect() as conn:
    result = conn.execute(text(sample_query))
    rows = result.fetchall()
    column_names = result.keys()
    df = pd.DataFrame(rows, columns=column_names)
    print(df.to_string(index=False))



In [None]:
# Persist the sample question and its reference SQL with %store so they can be restored
# elsewhere via `%store -r`.
%store sample_query sample_question