In [None]:
import getpass

# grab credentials from the notebook user
host = input("Host name")
username = input("User name")
password = getpass.getpass("Password")

In [49]:
from pystarburst import Session
from pystarburst import functions as F
from pystarburst.functions import *
from pystarburst.window import Window as W

# PyStarburst setup
session_properties = {
    "host":host,
    "port": 443,
    "http_scheme": "https",
    "auth": trino.auth.BasicAuthentication(username, password)
}
session = Session.builder.configs(session_properties).create()

# validate PyStarburst working
session.sql("select 'Yes' as Working").collect()

[Row(Working='Yes')]

In [50]:
from trino.dbapi import connect

# trino-python-client setup
conn = connect(
    host=host,
    port=443,
    http_scheme='https',
    auth=trino.auth.BasicAuthentication (username, password)
)
cur = conn.cursor()
cur.execute("select 'Yes' as dummy")

# validate trino-python-client working
print(cur.fetchall())

[['Yes']]


In [51]:
# LATER can pass in a catalog to work on
curr_cat = 'students'
# LATER can get a list of the schemas to loop through
curr_sch = 'mv2ice'

table_list_from_collect = session \
    .table(curr_cat + ".information_schema.tables") \
    .filter("table_schema = '" + curr_sch + "' AND table_type = 'BASE TABLE'") \
    .select("table_name") \
    .collect()

table_list = list()
for a_table in table_list_from_collect:
    table_list.append(curr_cat + "." + curr_sch + "." + a_table.table_name)

# debug line to see the tables to investigate
print(table_list)

['students.mv2ice.t_ice_orc', 'students.mv2ice.t_ice_parquet', 'students.mv2ice.t_hive_json']


In [65]:
# create some lists to separate the table types into
hive_tables_2rewrite = list()
hive_tables_2migrate = list()
iceberg_tables = list()
other_tables   = list()

# could NOT get the SHOW CREATE TABLE output via PyStarburst 
#  OR figure out how to get the table format type in other way
#  SO using trino-python-client

for a_table in table_list:
    cur.execute("SHOW CREATE TABLE " + a_table)
    cts = cur.fetchall()[0][0]
    
    # rm me
    print("\n" + cts)
    # rm me
    
    if "type = 'HIVE'" in cts:
        if "format = 'PARQUET'" in cts or "format = 'ORC'" in cts or "format = 'AVRO'" in cts:
            hive_tables_2migrate.append(a_table)
        else:
            hive_tables_2rewrite.append(a_table)
    elif "type = 'ICEBERG'" in cts:
        iceberg_tables.append(a_table)
    else:
        other_tables.append(a_table)
        
        
print("\n" + str(len(iceberg_tables)) + " Iceberg tables already exist -- no action will be taken on these...")
print(iceberg_tables)

print("\n" + str(len(other_tables)) + " non Iceberg or Hive tables exist -- no action will be taken on these " + \
      "(BUT COULD BE REWRITTEN)...")
print(other_tables)

print("\n" + str(len(hive_tables_2migrate)) + " Hive tables that are targeted to be MIGRATED (in-place) to Iceberg...")
print(hive_tables_2migrate)

print("\n" + str(len(hive_tables_2rewrite)) + " Hive tables are targeted to be REWRITTEN (ctas) to Iceberg...")
print(hive_tables_2rewrite)





CREATE TABLE students.mv2ice.t_ice_orc (
   nationkey bigint,
   name varchar,
   regionkey bigint,
   comment varchar
)
WITH (
   format = 'ORC',
   format_version = 2,
   location = 's3://edu-train-galaxy-students/students/mv2ice/t_ice_orc-2b2ed747bb28485685b3610295894e87',
   type = 'ICEBERG'
)

CREATE TABLE students.mv2ice.t_ice_parquet (
   nationkey bigint,
   name varchar,
   regionkey bigint,
   comment varchar
)
WITH (
   format = 'PARQUET',
   format_version = 2,
   location = 's3://edu-train-galaxy-students/students/mv2ice/t_ice_parquet-ecfe974580114948859ceafe1fc11f51',
   type = 'ICEBERG'
)

CREATE TABLE students.mv2ice.t_hive_json (
   nationkey bigint,
   name varchar(25),
   regionkey bigint,
   comment varchar(152)
)
WITH (
   format = 'JSON',
   type = 'HIVE'
)

2 Iceberg tables already exist -- no action will be taken on these...
['students.mv2ice.t_ice_orc', 'students.mv2ice.t_ice_parquet']

0 non Iceberg or Hive tables exist -- no action will be taken on these (BU