# Import / Config

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [43]:
import os
from pathlib import Path

import sqlglot

from edurel.utils.duck_utils import *

# Repository root. Set the EDUREL_BASE_DIR environment variable to run the
# notebook against another checkout; the default is the original location.
BASE_DIR = os.environ.get("EDUREL_BASE_DIR", "/home/basis/work/github/edurel")
# Root of the database folders. The trailing slash is deliberate: several cells
# build paths by plain concatenation (DB_DIR + "db-...").
DB_DIR = f"{BASE_DIR}/databases/"



# FileDB

In [41]:
# Open the db-adw-oltp database read-only.
# DB_DIR already ends with "/", so no extra separator is added here
# (the previous f"{DB_DIR}/db-adw-oltp" produced "databases//db-adw-oltp").
con = duckdb_file_con(f"{DB_DIR}db-adw-oltp", read_only=True)

In [40]:
# Close the connection opened with duckdb_file_con.
# NOTE(review): this cell sits above the `duckdb_df_print(con, spec)` cell, so a
# top-to-bottom run closes `con` before it is used — confirm the intended order.
con.close()

In [None]:
# Column selection for the Dim* tables. Keys look like table names and each
# value is a "* exclude (...)" column list that drops the listed columns
# (mostly Spanish/French variants and NameStyle).
# NOTE(review): presumably consumed by duckdb_df_print(con, spec) — confirm.
# A later cell rebinds `spec`, so only one of the two specs is active at a time.
spec = {
    "DimCustomer": (
        "* exclude (NameStyle, "
        "SpanishEducation, FrenchEducation, "
        "SpanishOccupation, FrenchOccupation)"
    ),
    "DimDate": (
        "* exclude ("
        "SpanishDayNameOfWeek, FrenchDayNameOfWeek, "
        "SpanishMonthName, FrenchMonthName)"
    ),
    "DimEmployee": "* exclude (NameStyle)",
    "DimProductCategory": (
        "* exclude ("
        "SpanishProductCategoryName, FrenchProductCategoryName)"
    ),
    "DimProductSubcategory": (
        "* exclude ("
        "SpanishProductSubcategoryName, FrenchProductSubcategoryName)"
    ),
    "DimProduct": (
        "* exclude (SpanishProductName, FrenchProductName, LargePhoto, "
        "FrenchDescription, ChineseDescription, ArabicDescription, "
        "HebrewDescription, ThaiDescription, GermanDescription, "
        "JapaneseDescription, TurkishDescription)"
    ),
    "DimPromotion": (
        "* exclude ("
        "SpanishPromotionType, FrenchPromotionType, "
        "SpanishPromotionCategory, FrenchPromotionCategory)"
    ),
}

In [None]:
# Column selection for the OLTP tables: table name -> "* exclude (...)" column
# list. Mostly drops rowguid plus a few flag/binary columns.
# NOTE(review): presumably consumed by duckdb_df_print(con, spec) — confirm.
#
# The template placeholders were removed: the five identical "xxx" keys
# collapsed into a single bogus entry, and the "xxx" names in Customer's
# exclude list are not real columns.
spec = {
    "Address": "* exclude (SpatialLocation, rowguid)",
    "AddressType": "* exclude (rowguid)",
    "BusinessEntity": "* exclude (rowguid)",
    "BusinessEntityAddress": "* exclude (rowguid)",
    "BusinessEntityContact": "* exclude (rowguid)",
    "Customer": "* exclude (rowguid)",
    "EmailAddress": "* exclude (rowguid)",
    "Employee": "* exclude (SalariedFlag, CurrentFlag, rowguid)",
    "Password": "* exclude (PasswordHash, PasswordSalt, rowguid)",
    "Person": "* exclude (NameStyle, Demographics, rowguid)",
    "Product": "* exclude (MakeFlag, FinishedGoodsFlag, rowguid)",
    "ProductCategory": "* exclude (rowguid)",
    "ProductDescription": "* exclude (rowguid)",
    "ProductInventory": "* exclude (rowguid)",
    "ProductModel": "* exclude (rowguid)",
    "ProductSubcategory": "* exclude (rowguid)",
    "SalesOrderDetail": "* exclude (rowguid)",
    "SalesOrderHeader": "* exclude (rowguid)",
    # TODO: add entries for the remaining tables (e.g. BillOfMaterials, which
    # shows up in the printed output) as "<Table>": "* exclude (<cols>)".
}

In [51]:

# Print a preview of the tables reachable through `con`. Judging by the saved
# output, each table is shown as one transposed row plus its row count.
# NOTE(review): how `spec` filters columns is decided inside duckdb_df_print;
# the saved output still lists SpatialLocation/rowguid for Address, so it may
# predate the current spec — confirm.
duckdb_df_print(con, spec)

Address
                                                            0
AddressID                                                   1
AddressLine1                                    1970 Napa Ct.
AddressLine2                                             None
City                                                  Bothell
StateProvinceID                                            79
PostalCode                                              98011
SpatialLocation  E6100000010CAE8BFC28BCE4474067A89189898A5EC0
rowguid                  9AADCB0D-36CF-483F-84D8-585C2D4EC6E9
ModifiedDate                              2018-12-03 00:00:00
Number of rows: 19614



AddressType
                                                  0
AddressTypeID                                     1
Name                                        Billing
rowguid        B84F78B1-4EFE-4A0E-8CB7-70E9F112F886
ModifiedDate                    2019-04-30 00:00:00
Number of rows: 6



BillOfMaterials
                                     0

# MemDB

In [None]:
# Sample query against the db-company_en database.
sql_tsql = """
select * from employee;
"""

# Load db-company_en through duckdb_mem_con (presumably an in-memory database —
# confirm), print its schema, then run the query. try/finally guarantees the
# connection is closed even if the schema dump or the query raises.
con = duckdb_mem_con(DB_DIR + "db-company_en")
try:
    print(duckdb_schema(con))
    sql_print(con, sql_tsql)
finally:
    con.close()

Table: EmpProj (EID INTEGER NOT NULL, PID INTEGER NOT NULL, NoOfHoursPerWeek INTEGER NOT NULL)
Table: Employee (EID INTEGER NOT NULL, OUID INTEGER NOT NULL, LastName VARCHAR NOT NULL, Hiredate DATE NOT NULL, Salary DECIMAL(9,2) NOT NULL, Bonus DECIMAL(9,2) NULL)
Table: OrgUnit (OUID INTEGER NOT NULL, Head INTEGER NULL, SuperUnit INTEGER NULL, Name VARCHAR NOT NULL)
Table: Project (PID INTEGER NOT NULL, Title VARCHAR NOT NULL, Budget DECIMAL(13,2) NULL)
Foreign Key: EmpProj(EID) -> Employee(EID)
Foreign Key: EmpProj(PID) -> Project(PID)
Foreign Key: Employee(OUID) -> OrgUnit(OUID)

┌───────┬───────┬──────────┬────────────┬──────────────┬──────────────┐
│  EID  │ OUID  │ LastName │  Hiredate  │    Salary    │    Bonus     │
│ int32 │ int32 │ varchar  │    date    │ decimal(9,2) │ decimal(9,2) │
├───────┼───────┼──────────┼────────────┼──────────────┼──────────────┤
│   101 │    11 │ Patil    │ 2000-05-01 │    180000.00 │         NULL │
│   102 │    12 │ Durmaz   │ 2005-07-01 │    120000.

# adw-oltp

In [None]:
# Read the T-SQL schema script for db-adw-oltp.
schema_path = Path(DB_DIR) / "db-adw-oltp" / "schema1.sql"
sql_tsql = schema_path.read_text()

# Translate T-SQL to DuckDB SQL. transpile() returns a list with one string per
# statement; the strings carry no trailing semicolon, and an empty statement
# yields an empty string.
sql_duckdb = sqlglot.transpile(sql_tsql, read="tsql", write="duckdb")

# Terminate every statement explicitly so the joined script is a valid
# multi-statement batch. Joining with "\n" alone ran the statements together.
# Empty entries are skipped so no stray ";" is emitted.
sql_duckdb_str = "\n".join(f"{stmt};" for stmt in sql_duckdb if stmt)
print(sql_duckdb_str)


In [None]:

# Execute the transpiled schema script on a connection from duckdb_mem_con and
# print the resulting schema. try/finally guarantees the connection is closed
# even if the script fails to execute.
con = duckdb_mem_con(DB_DIR + "db-adw-oltp")
try:
    con.execute(sql_duckdb_str)
    print(duckdb_schema(con))
finally:
    con.close()

# CSV

In [12]:
from pathlib import Path
import duckdb
from edurel.utils.duck_utils import *


In [None]:
# Working directories under db-adw-oltp/tmp. DB_DIR already ends with "/", so
# it is joined without an extra separator (previously "databases//db-adw-oltp/...").
adw_tmp_dir = f"{DB_DIR}db-adw-oltp/tmp"
csv_dir = f"{adw_tmp_dir}/csv"
tsv_dir = f"{adw_tmp_dir}/tsv"
parquet_dir = f"{adw_tmp_dir}/parquet"

# Reader options: no header row, fields delimited by the two-character
# sequence "+|".
spec = "header=false, delim = '+|'"
# Process every "*.csv" file in csv_dir with those options, with
# out_path=parquet_dir.
# NOTE(review): presumably this converts the CSVs to Parquet — confirm against
# db_file_op.
db_file_op(csv_dir, "*", ".csv", spec, show=True, out_path=parquet_dir)