In [1]:
#8
#athena db config - overview model
#AAI-540 Group 3 FP

### Notebook 9

This notebook registers the Overview AI Models dataset in Amazon Athena for SQL-based analysis.  
The dataset is stored in Amazon S3 and exposed as an external table without duplicating data.  
This enables querying model metadata such as provider, model family, and release timeline.  
The resulting table supports downstream model profiling and routing analysis.
This dataset provides structured metadata describing AI models across providers and release timelines.  
Registering it in Athena enables unified querying and integration with performance, pricing, and benchmark datasets.

In [4]:
import sys
!{sys.executable} -m pip install -q PyAthena

### Configure AWS and Athena Environment

Initialize the AWS session, identify the project S3 bucket, and configure the Athena staging directory for query outputs.

In [5]:
import boto3
import sagemaker
from pyathena import connect
import pandas as pd

In [6]:
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
ingest_create_athena_db_passed = False

In [7]:
database_name = "dsoaws"
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [8]:
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)
pd.read_sql(statement, conn)

CREATE DATABASE IF NOT EXISTS dsoaws


  pd.read_sql(statement, conn)


In [19]:
statement = "SHOW DATABASES"
df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,database_name
0,default
1,dsoaws
2,sagemaker_featurestore


### Define S3 Data Location for Athena Table

Specify the source dataset path and the S3 folder that Athena will use as the external table location.

In [20]:
s3_data_path = f"s3://{bucket}/overviewmodels.csv"
s3_table_path = f"s3://{bucket}/table6/"
print("s3_data_path:", s3_data_path)
print("s3_table_path:", s3_table_path)

s3_data_path: s3://sagemaker-us-east-1-907086662522/overviewmodels.csv
s3_table_path: s3://sagemaker-us-east-1-907086662522/table6/


In [21]:
!aws s3 cp {s3_data_path} {s3_table_path}

copy: s3://sagemaker-us-east-1-907086662522/overviewmodels.csv to s3://sagemaker-us-east-1-907086662522/table6/overviewmodels.csv


In [22]:
#table6 - overviewmodels.csv
table_name_csv = 'overviewmodel'
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)
ingest_create_athena_table_csv_passed = False
dataexplore = pd.read_csv('overviewmodels.csv')
dataexplore.head()

Unnamed: 0,model,domain,tasks,organization,release_date,reference_url,parameters,training_flop,training_hours,training_hardware,accessibility,country,org_type,code_available,hf_repo_id
0,Solar Open 100B\n,Language,"Language modeling/generation,Chat",Upstage,2025-12-31,https://huggingface.co/upstage/Solar-Open-100B,102000000000.0,,1572.0,,API access,Korea (Republic of),Industry,,
1,K-EXAONE,Language,"Language modeling/generation,Chat,Question ans...",LG AI Research,2025-12-31,https://huggingface.co/LGAI-EXAONE,236000000000.0,1.52e+24,3240.0,NVIDIA B200 GPUs,API access,Korea (Republic of),Industry,Unreleased,
2,VAETKI\n,Language,Language modeling/generation,NC AI,2025-12-30,https://huggingface.co/NC-AI-consortium-VAETKI...,100000000000.0,,3238.0,Nvidia H100 80G,Open weights (unrestricted),Korea (Republic of),Industry,,
3,A.X K1,Language,"Code generation,Language modeling/generation,T...",SK Telecom,2025-12-30,https://huggingface.co/skt,519000000000.0,,1536.0,NVIDIA H200,,Korea (Republic of),Industry,,
4,HyperCLOVA X SEED 32B Think,"Multimodal,Language,Vision","Language modeling/generation,(Visual) Question...",NAVER,2025-12-29,https://huggingface.co/naver-hyperclovax/Hyper...,32000000000.0,,,,,Korea (Republic of),Industry,,


In [23]:
dataexplore.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3239 entries, 0 to 3238
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   model              3239 non-null   object 
 1   domain             3155 non-null   object 
 2   tasks              3121 non-null   object 
 3   organization       3159 non-null   object 
 4   release_date       3221 non-null   object 
 5   reference_url      3204 non-null   object 
 6   parameters         2088 non-null   float64
 7   training_flop      1376 non-null   float64
 8   training_hours     546 non-null    float64
 9   training_hardware  1162 non-null   object 
 10  accessibility      2490 non-null   object 
 11  country            3151 non-null   object 
 12  org_type           3139 non-null   object 
 13  code_available     2298 non-null   object 
 14  hf_repo_id         608 non-null    object 
dtypes: float64(3), object(12)
memory usage: 379.7+ KB


In [24]:
!aws s3 cp {s3_data_path} {s3_table_path}

copy: s3://sagemaker-us-east-1-907086662522/overviewmodels.csv to s3://sagemaker-us-east-1-907086662522/table6/overviewmodels.csv


In [25]:
!aws s3 ls {s3_table_path}

2026-02-23 01:05:48    6317510 overviewmodels.csv


### Create Athena External Table

Define the table schema and register the dataset with Athena using an external table pointing to the S3 table directory.

In [26]:
#sql table creation
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         model STRING,
         domain STRING,
         tasks STRING,
         organization STRING,
         release_date STRING,
         reference_url STRING,
         parameters FLOAT,
         training_flop FLOAT,
         training_hours FLOAT,
         training_hardware STRING,
         accessibility STRING,
         country STRING,
         org_type STRING,
         code_available STRING,
         hf_repo_id STRING
) ROW FORMAT DELIMITED 
  FIELDS TERMINATED BY ',' 
  LINES TERMINATED BY '\\n' 
LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name_csv, s3_table_path
)

In [27]:
pd.read_sql(statement, conn)

  pd.read_sql(statement, conn)


In [28]:
statement = "SHOW TABLES in {}".format(database_name)
df_show = pd.read_sql(statement, conn)
df_show.head(7)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,tab_name
0,aimodelpoll
1,amazon_reviews_parquet
2,amazon_reviews_tsv
3,lifearchitect
4,llmachievements
5,llmleader
6,llmpricing


In [29]:
statement = """SELECT * FROM {}.{} LIMIT 10""".format(
    database_name, table_name_csv
)
print(statement)
df = pd.read_sql(statement, conn)
df.head()

SELECT * FROM dsoaws.overviewmodel LIMIT 10


  df = pd.read_sql(statement, conn)


Unnamed: 0,model,domain,tasks,organization,release_date,reference_url,parameters,training_flop,training_hours,training_hardware,accessibility,country,org_type,code_available,hf_repo_id
0,"""Solar Open 100B",,,,,,,,,,,,,,
1,"""",Language,"""Language modeling/generation","Chat""",Upstage,"""Upstage AI",,,,,,,,,
2,"""",2025-12-31,Upstage's flagship 102B-parameter large langua...,https://huggingface.co/upstage/Solar-Open-100B,,Training cost,,102000000000.0,,of which 12B active,,,,,
3,"""",,,19700000000000,,1572.0,,,,,,,,,
4,"""",,,Confident,"""Solar Open is Upstage's flagship 102B-paramet...",trained entirely from scratch and released un...,,,,,,API access,Korea (Republic of),,


### Summary

This notebook created an Athena external table over the Overview AI Models dataset stored in Amazon S3.  
The table provides SQL access to structured model metadata that can be joined with other datasets.  
This supports downstream model comparison, profiling, and cost-aware routing workflows.