In [16]:
import pandas as pd
import pyarrow as pa
from pyarrow import csv
from pyarrow import feather

In [17]:
# Load the whole DDL script into memory as one string.
with open("data/raw/tpcds.sql") as ddl_handle:
    sql_file = ddl_handle.read()

In [18]:
# Drop every line that starts with "--" (SQL comment lines such as the
# file header), keeping all remaining lines in their original order.
sql_file = "\n".join(
    row for row in sql_file.split("\n") if not row.startswith("--")
)

In [21]:
import re

def get_arrow_type(sql_type):
    """Map a SQL column type token to the pyarrow data type used for it.

    Parameters
    ----------
    sql_type : str
        The type as written in the DDL, e.g. ``"integer"``, ``"date"``,
        ``"varchar(16)"``, ``"char(2)"`` or ``"decimal(7,2)"``.

    Returns
    -------
    pyarrow.DataType
        * ``integer``                  -> ``pa.int32()``
        * ``date``                     -> ``pa.date32()``
        * ``time``                     -> ``pa.string()`` (see TODO below)
        * anything containing ``char`` -> ``pa.string()``
        * ``decimal(p,s)``             -> ``pa.decimal128(p, s)``

    Raises
    ------
    ValueError
        If the type is not one of the supported kinds, or if a ``decimal``
        type does not carry an integer precision and scale.
    """
    if sql_type == "integer":
        return pa.int32()
    if sql_type == "date":
        return pa.date32()
    if sql_type == "time":
        # TODO parsing as time not yet supported
        return pa.string()
    if "char" in sql_type:
        # Matches both varchar(n) and fixed-width char(n); both are stored
        # as plain strings rather than fixed-size binary.
        return pa.string()
    if "decimal" in sql_type:
        # Raw string: "\(" in a normal string literal is an invalid escape
        # sequence and triggers a warning on current Python versions.
        match = re.search(r"decimal\((.*),(.*)\)", sql_type)
        if match is None:
            raise ValueError(
                f"Malformed decimal type, expected decimal(p,s): {sql_type}"
            )
        try:
            precision = int(match.group(1))
            scale = int(match.group(2))
        except ValueError:
            raise ValueError(
                f"Malformed decimal type, expected decimal(p,s): {sql_type}"
            ) from None
        return pa.decimal128(precision, scale)
    raise ValueError(f"Not supported data type: {sql_type}")

In [34]:
# Convert every table declared in the DDL: build an Arrow schema from its
# column definitions, read the matching pipe-delimited .dat file with that
# schema, and write the result as a zstd-compressed Feather file.
# Statements are separated by blank lines; the final chunk is skipped.
for sql in sql_file.split("\n\n")[:-1]:
    lines = sql.split("\n")
    # The table name is the third whitespace-separated token of the first line.
    table_name = lines[0].split()[2]
    print(f"Processing {table_name}")

    # Column definitions are the lines after the first two and before the
    # last one; lines mentioning "primary key" are not columns.
    fields = []
    for line in lines[2:-1]:
        if "primary key" not in line:
            nullable = "not null" not in line
            name, dtype = line.strip().split()[:2]
            fields.append(pa.field(name, get_arrow_type(dtype), nullable=nullable))
    schema = pa.schema(fields)

    # One extra "dummy" column name is declared beyond the schema's columns
    # and then excluded again via include_columns.
    # NOTE(review): presumably this absorbs a trailing "|" on each row --
    # confirm against the .dat files.
    read_options = csv.ReadOptions(column_names=schema.names + ["dummy"], encoding="latin-1")
    parse_options = csv.ParseOptions(delimiter="|")
    convert_options = csv.ConvertOptions(column_types=schema, include_columns=schema.names)

    table = csv.read_csv(
        f"data/raw/{table_name}.dat",
        read_options=read_options,
        parse_options=parse_options,
        convert_options=convert_options,
    )
    feather.write_feather(table, f"data/{table_name}.feather", compression="zstd")


Processing dbgen_version
Processing customer_address
Processing customer_demographics
Processing date_dim
Processing warehouse
Processing ship_mode
Processing time_dim
Processing reason
Processing income_band
Processing item
Processing store
Processing call_center
Processing customer
Processing web_site
Processing store_returns
Processing household_demographics
Processing web_page
Processing promotion
Processing catalog_page
Processing inventory
Processing catalog_returns
Processing web_returns
Processing web_sales
Processing catalog_sales
Processing store_sales
