# Import / Config

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell runs, so edits to the project's modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
import yaml
from edurel.utils.duck_utils import *

# Absolute location of the edurel project on this machine.
BASE_DIR = "/home/basis/work/github/edurel"
# Database folder below BASE_DIR. The trailing slash is kept deliberately:
# other cells build paths as DB_DIR + "<database name>".
DB_DIR = BASE_DIR + "/databases/"



# FileDB

In [None]:
# Connect to the db-adw-oltp database in read-only mode (read_only=True).
# DB_DIR already ends with "/", so the path is joined with pathlib instead of
# an f-string with another "/" — this avoids "databases//db-adw-oltp".
con = duckdb_file_con(str(Path(DB_DIR) / "db-adw-oltp"), read_only=True)

In [None]:
# Close the connection currently bound to `con`.
con.close()

In [None]:
# Select-list spec for the Dim* tables: table name -> "* exclude (...)" string.
# Each entry lists the columns to drop; the comprehension renders them into the
# exclude clause, joined by ", ".
spec = {
    table: f"* exclude ({', '.join(columns)})"
    for table, columns in (
        (
            "DimCustomer",
            (
                "NameStyle",
                "SpanishEducation",
                "FrenchEducation",
                "SpanishOccupation",
                "FrenchOccupation",
            ),
        ),
        (
            "DimDate",
            (
                "SpanishDayNameOfWeek",
                "FrenchDayNameOfWeek",
                "SpanishMonthName",
                "FrenchMonthName",
            ),
        ),
        ("DimEmployee", ("NameStyle",)),
        (
            "DimProductCategory",
            ("SpanishProductCategoryName", "FrenchProductCategoryName"),
        ),
        (
            "DimProductSubcategory",
            ("SpanishProductSubcategoryName", "FrenchProductSubcategoryName"),
        ),
        (
            "DimProduct",
            (
                "SpanishProductName",
                "FrenchProductName",
                "LargePhoto",
                "FrenchDescription",
                "ChineseDescription",
                "ArabicDescription",
                "HebrewDescription",
                "ThaiDescription",
                "GermanDescription",
                "JapaneseDescription",
                "TurkishDescription",
            ),
        ),
        (
            "DimPromotion",
            (
                "SpanishPromotionType",
                "FrenchPromotionType",
                "SpanishPromotionCategory",
                "FrenchPromotionCategory",
            ),
        ),
    )
}

In [None]:
# Select-list spec for the OLTP tables: table name -> "* exclude (...)" string.
#
# Only filled-in entries are kept. The former "xxx" template rows repeated the
# same key five times (in a dict literal later duplicates silently overwrite
# earlier ones) and, like the "xxx" columns in the Customer entry, named
# placeholders rather than real tables/columns.
# NOTE(review): add further Customer columns here if more should be excluded.
#
# Template for a new entry:
#     "<Table>": "* exclude (<column>, <column>)",
spec = {
    "Address": "* exclude (SpatialLocation, rowguid)",
    "AddressType": "* exclude (rowguid)",
    "BusinessEntity": "* exclude (rowguid)",
    "BusinessEntityAddress": "* exclude (rowguid)",
    "BusinessEntityContact": "* exclude (rowguid)",
    "Customer": "* exclude (rowguid)",
    "EmailAddress": "* exclude (rowguid)",
    "Employee": "* exclude (SalariedFlag, CurrentFlag, rowguid)",
    "Password": "* exclude (PasswordHash, PasswordSalt, rowguid)",
    "Person": "* exclude (NameStyle, Demographics, rowguid)",
    "Product": "* exclude (MakeFlag, FinishedGoodsFlag, rowguid)",
    "ProductCategory": "* exclude (rowguid)",
    "ProductDescription": "* exclude (rowguid)",
    "ProductInventory": "* exclude (rowguid)",
    "ProductModel": "* exclude (rowguid)",
    "ProductSubcategory": "* exclude (rowguid)",
    "SalesOrderDetail": "* exclude (rowguid)",
    "SalesOrderHeader": "* exclude (rowguid)",
}

In [None]:

# Hand the currently assigned `spec` mapping and the connection to
# duckdb_df_print (presumably prints each listed table using its select
# list — confirm in duck_utils).
# NOTE(review): needs an open `con`; re-run the connection cell first if the
# close cell was executed.
duckdb_df_print(con, spec)

# MemDB

In [None]:
# Create the connection for db-hvs via duckdb_mem_con and print what
# duckdb_schema reports for it.
# The connection is not closed here (presumably because the following cells
# keep using `con`); closing happens in a separate cell.
con = duckdb_mem_con(f"{DB_DIR}db-hvs")
print(duckdb_schema(con))

In [None]:
# Close the connection currently bound to `con`.
# NOTE(review): the cells below still use `con`, so run this cell last.
con.close()

In [None]:
# duckdb_schema_print(con)

In [None]:
# Store the result of duckdb_schema_yaml and show it as the cell output
# (presumably a dict, judging by the variable name — confirm in duck_utils).
schema_dict = duckdb_schema_yaml(con)
schema_dict

In [None]:
# Fetch the column information for the connection and print it as one object.
d = duckdb_columns(con)
print(d)

In [None]:
# Same data as the previous cell, but printed as one "<key>: <value>" line per
# entry, which is easier to scan.
d = duckdb_columns(con)
for r in d:
    print(r, d[r], sep=": ")

In [None]:
# Print the schema through duckdb_schema_yaml_print (presumably the YAML
# rendering of what duckdb_schema_yaml returns — confirm in duck_utils).
duckdb_schema_yaml_print(con)

In [None]:
# Course overview: one row per combination of semester, module, module type,
# module component type, SWS, room, weekday and time block, together with the
# number of enrolled students. belegung is LEFT JOINed, so combinations without
# any enrolment are still listed with a count of 0.
# Sorted by semester label (descending), then weekday label, then start time.
# The SQL header comment had a mis-encoded umlaut ("Kurs√ºbersicht"); it is
# restored to "Kursübersicht". The query itself is unchanged.
sql = """
-- Kursübersicht mit Studierendenanzahl
SELECT
    sem.bez AS semester_bez,
    mod.bez AS modul_bez,
    ma.bez AS modulart_bez,
    mba.bez AS modulbtart_bez,
    mb.sws AS sws,
    r.raumnr AS raumnr,
    t.bez AS tag_bez,
    zb.uhrzeitvon AS uhrzeitvon,
    zb.uhrzeitbis AS uhrzeitbis,
    COUNT(b.studentin_prid) AS anzahl_studierende
FROM kursbt kb
    JOIN kurs k ON kb.kurs_kid = k.kid
    JOIN semester sem ON k.semester_sid = sem.sid
    JOIN modul mod ON k.modul_mid = mod.mid
    JOIN modulart ma ON mod.modulart_maid = ma.maid
    JOIN modulbt mb ON kb.modulbt_mbtid = mb.mbtid
    JOIN modulbtart mba ON mb.modulbtart_mbaid = mba.mbaid
    JOIN raum r ON kb.raum_rid = r.rid
    JOIN termin ter ON kb.termin_tid = ter.tid
    JOIN wochentermin wt ON ter.tid = wt.tid
    JOIN tag t ON wt.tag_tid = t.tid
    JOIN zeitblock zb ON wt.zeitblock_zbid = zb.zbid
    LEFT JOIN belegung b ON kb.kbtid = b.kursbt_kbtid
GROUP BY
    sem.bez,
    mod.bez,
    ma.bez,
    mba.bez,
    mb.sws,
    r.raumnr,
    t.bez,
    zb.uhrzeitvon,
    zb.uhrzeitbis
ORDER BY
    sem.bez DESC,
    t.bez,
    zb.uhrzeitvon;
"""
sql_print(con, sql)

In [None]:
# Person/role overview: one row per personenrolle entry with the person's name,
# the role label and the start/end dates of the assignment.
# studentin and lehrperson are LEFT JOINed on prid, so matrnr / steuernummer
# are NULL when the role has no matching row in the respective table.
# Sorted by person name, then by start date with the newest first.
sql = """
SELECT
    p.name AS name,
    pr.datumbeginn AS datumbeginn,
    pr.datumende AS datumende,
    r.bez AS rolle_bez,
    s.matrnr AS matrnr,
    l.steuernummer AS steuernummer
FROM personenrolle pr
    JOIN person p ON pr.person_pid = p.pid
    JOIN rolle r ON pr.rolle_rid = r.rid
    LEFT JOIN studentin s ON pr.prid = s.prid
    LEFT JOIN lehrperson l ON pr.prid = l.prid
ORDER BY
    p.name,
    pr.datumbeginn DESC;
"""
sql_print(con, sql)

# adw-oltp

In [None]:
# Explicit import: the cell calls sqlglot directly, and previously the name was
# only available if the star import from duck_utils happened to re-export it.
import sqlglot

# Read the T-SQL schema script of db-adw-oltp.
with (Path(DB_DIR) / "db-adw-oltp" / "schema1.sql").open("r") as f:
    sql = f.read()

# transpile() returns a list with one string per statement and no terminating
# semicolons. Joining with only a newline would fuse a multi-statement script
# into a single invalid statement, so the statements are joined with ";\n".
# Empty entries are skipped to avoid stray empty statements.
sql_duckdb = sqlglot.transpile(sql, read="tsql", write="duckdb")
sql_duckdb_str = ";\n".join(stmt for stmt in sql_duckdb if stmt)
print(sql_duckdb_str)


In [None]:

# Create the db-adw-oltp connection via duckdb_mem_con, run the transpiled DDL
# on it and print the resulting schema.
# try/finally guarantees the connection is closed even when the DDL or the
# schema dump raises, so a failed run does not leave an open connection behind.
con = duckdb_mem_con(DB_DIR + "db-adw-oltp")
try:
    con.execute(sql_duckdb_str)
    print(duckdb_schema(con))
finally:
    con.close()

# CSV

In [None]:
from pathlib import Path
import duckdb
from edurel.utils.duck_utils import *


In [None]:
# Working directories below db-adw-oltp/tmp. DB_DIR already ends with "/", so
# the paths are joined with pathlib instead of an f-string with another "/" —
# this avoids "databases//db-adw-oltp/...". They are converted back to str so
# db_file_op receives the same argument type as before.
csv_dir = str(Path(DB_DIR) / "db-adw-oltp" / "tmp" / "csv")
tsv_dir = str(Path(DB_DIR) / "db-adw-oltp" / "tmp" / "tsv")
parquet_dir = str(Path(DB_DIR) / "db-adw-oltp" / "tmp" / "parquet")

# Reader options passed to db_file_op: no header row, "+|" as field delimiter.
spec = "header=false, delim = '+|'"
# Process every "*.csv" file in csv_dir with output going to parquet_dir
# (presumably a CSV -> Parquet conversion — confirm in duck_utils).
db_file_op(csv_dir, "*", ".csv", spec, show=True, out_path=parquet_dir)