In [None]:
import polars as pl

from constants import (
    FE_SRC_FOLDER,
    ID_COL,
    NOT_USE_COL,
    RANK_COL,
    RAW_DATA,
    TOOL_COL,
    TOOL_COUNTS_DATA,
    TOOLS_COL_PREFIX,
    TOOLS_MAP,
    TOOLS_OTHER_COL,
    TOTAL_COUNT_COL,
    USE_COL,
)

In [None]:
# Display the installed polars version as the cell output.
pl.__version__

In [None]:
# Build a lazy query over the raw CSV file; rows are only read when the
# query is later fetched or collected.
df = pl.scan_csv(RAW_DATA)

In [None]:
# Other quick inspections (kept for reference):
# df.collect().shape
# type(df)
# df.fetch(5)
# Debug peek: run the lazy query on a limited number of rows (1 here).
df.fetch(1)
# df.columns

In [None]:
# Lazy query: keep the ID column plus every column whose name matches the
# tools prefix, then drop the TOOLS_OTHER_COL column from the selection.
tools_pattern = f"^{TOOLS_COL_PREFIX}.*$"
selected_cols = [pl.col(ID_COL), pl.col(tools_pattern)]
q_tools = df.select(selected_cols).drop(TOOLS_OTHER_COL)

In [None]:
# tools_cols = [
#     col
#     for col in df.columns
#     if col.startswith(TOOLS_COL_PREFIX) and col != f"{TOOLS_COL_PREFIX}Other__"
# ]
# tools_cols = [
#     pl.col(col)
#     for col in df.columns
#     if (col.startswith(TOOLS_COL_PREFIX) and col != f"{TOOLS_COL_PREFIX}Other__")
#     or col == "chronID"
# ]
# tools_cols

# q_tools = df.select(tools_cols)

In [None]:
# Print the query plan of q_tools as written (before optimization).
print(q_tools.describe_plan())

In [None]:
# Print the query plan of q_tools after the optimizer has been applied.
print(q_tools.describe_optimized_plan())

In [None]:
# Other collect settings (kept for reference):
# df_tools = q_tools.collect(no_optimization=False)
# df_tools = q_tools.collect(no_optimization=True)

# More info:
# - https://github.com/pola-rs/polars/issues/1659#issuecomment-956499225

# df_tools = q_tools.collect(projection_pushdown=True)
# Materialize the tools query with projection pushdown switched off.
# NOTE(review): presumably a workaround for the issue linked above — confirm
# whether it is still required with the installed polars version.
df_tools = q_tools.collect(projection_pushdown=False)

In [None]:
# type(df_tools)
# (rows, columns) of the materialized tools frame.
df_tools.shape

In [None]:
# Number of distinct values in the ID column; a value lower than the row
# count would indicate repeated IDs.
df_tools[ID_COL].n_unique()

In [None]:
# First rows of the tools frame.
df_tools.head()

In [None]:
# Last rows of the tools frame.
df_tools.tail()

In [None]:
# Value frequencies of the column whose name is exactly TOOLS_COL_PREFIX
# (i.e. the prefix with no tool name appended).
df_tools[TOOLS_COL_PREFIX].value_counts()

In [None]:
# Value frequencies of the ArcGIS tool column.
df_tools[f"{TOOLS_COL_PREFIX}ArcGIS"].value_counts()

In [None]:
# Value frequencies of the D3 tool column.
df_tools[f"{TOOLS_COL_PREFIX}D3"].value_counts()

In [None]:
# Number of null values in each column of the tools frame.
df_tools.null_count()

In [None]:
# Boolean expression: True where a cell is an empty string. The ID column and
# the column named exactly TOOLS_COL_PREFIX are excluded from the check.
# NOTE(review): null cells presumably evaluate to null here and would then be
# skipped by both sums below — confirm against the null counts.
excluded_cols = [ID_COL, TOOLS_COL_PREFIX]
predicate = pl.all().exclude(excluded_cols).str.contains("^$")

# Alternative kept for reference: both counts in a single query.
# q_count_tools = df_tools.lazy().select(
#     [
#         predicate.sum().suffix("_notuse"),
#         predicate.is_not().sum().suffix("_use"),
#     ]
# )

# q_count_tools.collect().transpose(
#     include_header=True, header_name="tool", column_names=["count"]
# )

tools_lazy = df_tools.lazy()

# Per column: how many cells are NOT an empty string.
q_use_tools = tools_lazy.select(predicate.is_not().sum())
# q_use_tools.collect()

# Per column: how many cells are an empty string.
q_not_use_tools = tools_lazy.select(predicate.sum())
# q_not_use_tools.collect()

In [None]:
def _collect_as_tool_rows(query, count_col):
    """Collect ``query`` and transpose its result.

    The original column names go into a TOOL_COL column and the transposed
    values into a single column named ``count_col``.
    """
    collected = query.collect()
    return collected.transpose(
        include_header=True,
        header_name=TOOL_COL,
        column_names=[count_col],
    )


df_use_tools = _collect_as_tool_rows(q_use_tools, USE_COL)
df_not_use_tools = _collect_as_tool_rows(q_not_use_tools, NOT_USE_COL)

# df_use_tools.head()
# df_not_use_tools.head()

In [None]:
# ?pl.Expr.rank

In [None]:
# Combine the "use" and "not use" counts into a single table keyed by tool.
df_count_tools = df_use_tools.join(df_not_use_tools, on=TOOL_COL)

# Tool name with the leading TOOLS_COL_PREFIX characters cut off.
tool_name_expr = pl.col(TOOL_COL).str.slice(start=len(TOOLS_COL_PREFIX))
# Sum of both counts for each tool.
total_count_expr = (pl.col(USE_COL) + pl.col(NOT_USE_COL)).alias(TOTAL_COUNT_COL)
# Ordinal rank on the "use" count, computed with reverse=True.
use_rank_expr = pl.col(USE_COL).rank(method="ordinal", reverse=True).alias(RANK_COL)

df_counts_tools = df_count_tools.with_columns(
    [tool_name_expr, total_count_expr, use_rank_expr]
)

In [None]:
# Preview the first rows of the combined counts table.
df_counts_tools.head()

In [None]:
# How often each rank value occurs, sorted by occurrence count (ascending).
# NOTE(review): if the goal is to spot duplicated ranks, the ascending sort
# followed by head() shows the smallest counts first — confirm intent.
df_counts_tools[RANK_COL].value_counts().sort(by="counts").head()

In [None]:
# Tool names (prefix already stripped) as a plain Python list.
df_counts_tools[TOOL_COL].to_list()

In [None]:
# Adapted from: https://pola-rs.github.io/polars-book/user-guide/howcani/apply/udfs.html#apply-custom-functions
def rename_tools(tool: str) -> str:
    """Return the replacement name for ``tool`` from TOOLS_MAP.

    Names without an entry in TOOLS_MAP are returned unchanged.
    """
    if tool in TOOLS_MAP:
        return TOOLS_MAP[tool]
    return tool


# Apply the renaming element-wise to the tool column.
renamed_tool_expr = pl.col(TOOL_COL).apply(rename_tools)
df_counts_tools = df_counts_tools.with_column(renamed_tool_expr)

In [None]:
# First rows of the counts table after the tool names were remapped.
df_counts_tools.head()

In [None]:
# Last rows of the counts table after the tool names were remapped.
df_counts_tools.tail()

In [None]:
# Distribution of the total (use + not-use) count across tools.
# NOTE(review): presumably every tool should share the same total — confirm.
df_counts_tools[TOTAL_COUNT_COL].value_counts()

In [None]:
# Export the counts table as CSV; the target path is TOOL_COUNTS_DATA with a
# ".csv" suffix appended.
df_counts_tools.to_csv(f"{TOOL_COUNTS_DATA}.csv")

In [None]:
# Show the resolved form of FE_SRC_FOLDER, the folder the JSON export is
# written into.
FE_SRC_FOLDER.resolve()

In [None]:
# print(df_counts_tools.to_json(pretty=True))

# indent = 0
indent = 2

# Export the counts table as a JSON array of records into FE_SRC_FOLDER,
# going through pandas for the serialization.
json_path = FE_SRC_FOLDER / f"{TOOL_COUNTS_DATA}.json"
counts_pandas = df_counts_tools.to_pandas()
counts_pandas.to_json(json_path, orient="records", force_ascii=False, indent=indent)

---