In [None]:
#7
#athena db config - llmprice
#AAI-540 Group 3 FP

### Notebook 7
This notebook registers the LLM pricing dataset in Amazon Athena for SQL-based analysis.  
The dataset is stored in Amazon S3 and exposed as an external table without duplicating data.  
This enables querying model token costs and context limits for downstream cost-aware routing.  
The resulting table supports pricing analysis and router training features.

In [1]:
import boto3
import sagemaker
from pyathena import connect
import pandas as pd

Unable to load JumpStart region config.
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.12/site-packages/sagemaker/jumpstart/constants.py", line 69, in _load_region_config
    with open(filepath) as f:
         ^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: '/home/ec2-user/anaconda3/envs/python3/lib/python3.12/site-packages/sagemaker/jumpstart/region_config.json'


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [None]:
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
ingest_create_athena_db_passed = False

### Configure AWS and Athena Environment
Initialize AWS session, identify the project bucket, and configure the Athena staging directory for query outputs.

In [2]:
database_name = "dsoaws"
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

### Create or Select Athena Database
Ensure the Athena database exists so external tables can be registered consistently.

In [3]:
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)
pd.read_sql(statement, conn)

CREATE DATABASE IF NOT EXISTS dsoaws


  pd.read_sql(statement, conn)


In [4]:
statement = "SHOW DATABASES"
df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,database_name
0,default
1,dsoaws
2,sagemaker_featurestore


### Define S3 Data Location for Athena Table
Set the dataset source path and the dedicated S3 folder Athena will use as the table location.

In [5]:
s3_data_path = f"s3://{bucket}/llmpricingdata.csv"
s3_table_path = f"s3://{bucket}/table4/"
print("s3_data_path:", s3_data_path)
print("s3_table_path:", s3_table_path)

s3_data_path: s3://sagemaker-us-east-1-907086662522/llmpricingdata.csv
s3_table_path: s3://sagemaker-us-east-1-907086662522/table4/


### Prepare Table Data Directory in S3
Copy the dataset into the S3 folder referenced by the Athena external table.

In [6]:
!aws s3 cp {s3_data_path} {s3_table_path}

copy: s3://sagemaker-us-east-1-907086662522/llmpricingdata.csv to s3://sagemaker-us-east-1-907086662522/table4/llmpricingdata.csv


In [7]:
#table4 - llmpricing
table_name_csv = 'llmpricing'
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)
ingest_create_athena_table_csv_passed = False
dataexplore = pd.read_csv('llmpricingdata.csv')
dataexplore.head()

Unnamed: 0,provider,developer,model,input_token_1k_usd,output_token_1k_usd,context_size
0,OpenAI,OpenAI,gpt-4,0.03,0.06,8000
1,OpenAI,OpenAI,gpt-4-32k,0.06,0.12,32000
2,OpenAI,OpenAI,gpt-4-1106-preview,0.01,0.03,128000
3,OpenAI,OpenAI,gpt-4-1106-vision-preview,0.01,0.03,128000
4,OpenAI,OpenAI,gpt-3.5-turbo-1106,0.001,0.002,16000


In [8]:
dataexplore.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   provider             31 non-null     object 
 1   developer            31 non-null     object 
 2   model                31 non-null     object 
 3   input_token_1k_usd   31 non-null     float64
 4   output_token_1k_usd  31 non-null     float64
 5   context_size         31 non-null     int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 1.6+ KB


In [9]:
!aws s3 cp {s3_data_path} {s3_table_path}

copy: s3://sagemaker-us-east-1-907086662522/llmpricingdata.csv to s3://sagemaker-us-east-1-907086662522/table4/llmpricingdata.csv


In [10]:
!aws s3 ls {s3_table_path}

2026-02-22 23:56:39       1587 llmpricingdata.csv


### Create Athena External Table
Define the schema and register the dataset with Athena using an external table pointing to the S3 table directory.


In [11]:
#sql table creation
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         provider STRING,
         developer STRING,
         model STRING,
         input_token_1k_usd FLOAT,
         output_token_1k_usd FLOAT,
         context_size INT
) ROW FORMAT DELIMITED 
  FIELDS TERMINATED BY ',' 
  LINES TERMINATED BY '\\n' 
LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name_csv, s3_table_path
)

In [12]:
pd.read_sql(statement, conn)

  pd.read_sql(statement, conn)


In [13]:
statement = "SHOW TABLES in {}".format(database_name)
df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,tab_name
0,aimodelpoll
1,amazon_reviews_parquet
2,amazon_reviews_tsv
3,lifearchitect
4,llmachievements


### Validate Table Access
Run a sample query (LIMIT 10) to verify that Athena can successfully read the dataset.

In [14]:
statement = """SELECT * FROM {}.{} LIMIT 10""".format(
    database_name, table_name_csv
)
print(statement)
df = pd.read_sql(statement, conn)
df.head()

SELECT * FROM dsoaws.llmpricing LIMIT 10


  df = pd.read_sql(statement, conn)


Unnamed: 0,provider,developer,model,input_token_1k_usd,output_token_1k_usd,context_size
0,OpenAI,OpenAI,gpt-4,0.03,0.06,8000
1,OpenAI,OpenAI,gpt-4-32k,0.06,0.12,32000
2,OpenAI,OpenAI,gpt-4-1106-preview,0.01,0.03,128000
3,OpenAI,OpenAI,gpt-4-1106-vision-preview,0.01,0.03,128000
4,OpenAI,OpenAI,gpt-3.5-turbo-1106,0.001,0.002,16000


### Summary

This notebook created an Athena external table over the LLM pricing dataset stored in Amazon S3.  
The table provides SQL access to token pricing and context size information for supported models.  
This enables cost-aware analysis and supports downstream routing and optimization workflows.
