In [1]:
#8
#athena db config - llmleader
#AAI-540 Group 3 FP

### Notebook 8

This notebook registers the Open LLM Leaderboard dataset in Amazon Athena for SQL-based analysis.  
The dataset is stored in Amazon S3 and exposed as an external table without duplicating data.  
This enables querying leaderboard rankings and evaluation metrics for open-source models.  
The resulting table supports downstream model comparison and routing analysis.

In [1]:
import boto3
import sagemaker
from pyathena import connect
import pandas as pd

Unable to load JumpStart region config.
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.12/site-packages/sagemaker/jumpstart/constants.py", line 69, in _load_region_config
    with open(filepath) as f:
         ^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: '/home/ec2-user/anaconda3/envs/python3/lib/python3.12/site-packages/sagemaker/jumpstart/region_config.json'


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


### Configure AWS and Athena Environment
Initialize AWS session, identify the project bucket, and configure the Athena staging directory for query outputs.

In [2]:
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
ingest_create_athena_db_passed = False

### Create or Select Athena Database
Ensure the Athena database exists so external tables can be registered consistently.

In [3]:
database_name = "dsoaws"
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [4]:
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)
pd.read_sql(statement, conn)

CREATE DATABASE IF NOT EXISTS dsoaws


  pd.read_sql(statement, conn)


In [5]:
statement = "SHOW DATABASES"
df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,database_name
0,default
1,dsoaws
2,sagemaker_featurestore


### Define S3 Data Location for Athena Table
Set the dataset source path and the dedicated S3 folder Athena will use as the table location.

In [6]:
s3_data_path = f"s3://{bucket}/openllmleader.csv"
s3_table_path = f"s3://{bucket}/table5/"
print("s3_data_path:", s3_data_path)
print("s3_table_path:", s3_table_path)

s3_data_path: s3://sagemaker-us-east-1-907086662522/openllmleader.csv
s3_table_path: s3://sagemaker-us-east-1-907086662522/table5/


### Prepare Table Data Directory in S3
Copy the dataset into the S3 folder referenced by the Athena external table.

In [7]:
!aws s3 cp {s3_data_path} {s3_table_path}

copy: s3://sagemaker-us-east-1-907086662522/openllmleader.csv to s3://sagemaker-us-east-1-907086662522/table5/openllmleader.csv


In [8]:
#table5 - openllmleader.csv
table_name_csv = 'llmleader'
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)
ingest_create_athena_table_csv_passed = False
dataexplore = pd.read_csv('openllmleader.csv')
dataexplore.head()

Unnamed: 0,Model,Type,Class,Backend,Dtype,Optimizations,Throughput (tokens/s),Peak Memory (MB),Score (%)
0,bofenghuang/vigogne-2-7b-instruct,LLaMA,7B,pytorch,float16,BetterTransformer,39.2,14925,58.2
1,bofenghuang/vigogne-2-7b-instruct,LLaMA,7B,pytorch,float16,BetterTransformer,39.2,14925,58.2
2,HuggingFaceH4/starchat-beta,GPT-BigCode,20B,pytorch,float16,,43.9,31745,55.8
3,bofenghuang/vigogne-2-7b-instruct,LLaMA,7B,pytorch,float32,,36.5,27811,58.2
4,NousResearch/Nous-Hermes-Llama2-13b,LLaMA,10B,pytorch,float16,BetterTransformer,30.0,27069,62.6


In [9]:
dataexplore.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370 entries, 0 to 369
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Model                  370 non-null    object 
 1   Type                   370 non-null    object 
 2   Class                  370 non-null    object 
 3   Backend                370 non-null    object 
 4   Dtype                  370 non-null    object 
 5   Optimizations          216 non-null    object 
 6   Throughput (tokens/s)  370 non-null    float64
 7   Peak Memory (MB)       370 non-null    int64  
 8   Score (%)              370 non-null    float64
dtypes: float64(2), int64(1), object(6)
memory usage: 26.1+ KB


In [10]:
!aws s3 cp {s3_data_path} {s3_table_path}

copy: s3://sagemaker-us-east-1-907086662522/openllmleader.csv to s3://sagemaker-us-east-1-907086662522/table5/openllmleader.csv


In [11]:
!aws s3 ls {s3_table_path}

2026-02-23 00:07:18      29356 openllmleader.csv


### Create Athena External Table
Define the schema and register the dataset with Athena using an external table pointing to the S3 table directory.

In [12]:
#sql table creation
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         Model STRING,
         Type STRING,
         Class STRING,
         Backend STRING,
         Dtype STRING,
         Optimizations STRING,
         Throughput FLOAT,
         Peak_memory INT,
         Score FLOAT
) ROW FORMAT DELIMITED 
  FIELDS TERMINATED BY ',' 
  LINES TERMINATED BY '\\n' 
LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name_csv, s3_table_path
)

In [13]:
pd.read_sql(statement, conn)

  pd.read_sql(statement, conn)


In [14]:
statement = "SHOW TABLES in {}".format(database_name)
df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,tab_name
0,aimodelpoll
1,amazon_reviews_parquet
2,amazon_reviews_tsv
3,lifearchitect
4,llmachievements


### Validate Table Access
Run a sample query (LIMIT 10) to verify that Athena can successfully read the dataset.

In [15]:
statement = """SELECT * FROM {}.{} LIMIT 10""".format(
    database_name, table_name_csv
)
print(statement)
df = pd.read_sql(statement, conn)
df.head()

SELECT * FROM dsoaws.llmleader LIMIT 10


  df = pd.read_sql(statement, conn)


Unnamed: 0,model,type,class,backend,dtype,optimizations,throughput,peak_memory,score
0,bofenghuang/vigogne-2-7b-instruct,LLaMA,7B,pytorch,float16,BetterTransformer,39.2,14925,58.2
1,bofenghuang/vigogne-2-7b-instruct,LLaMA,7B,pytorch,float16,BetterTransformer,39.2,14925,58.2
2,HuggingFaceH4/starchat-beta,GPT-BigCode,20B,pytorch,float16,,43.9,31745,55.8
3,bofenghuang/vigogne-2-7b-instruct,LLaMA,7B,pytorch,float32,,36.5,27811,58.2
4,NousResearch/Nous-Hermes-Llama2-13b,LLaMA,10B,pytorch,float16,BetterTransformer,30.0,27069,62.6


### Summary

This notebook created an Athena external table over the Open LLM Leaderboard dataset stored in Amazon S3.  
The table provides SQL access to leaderboard results and evaluation metrics for open-source models.  
This supports model ranking analysis and downstream routing and optimization workflows.