In [None]:
from __future__ import annotations

import datetime as dt
from enum import StrEnum

import numpy as np
import polars as pl
from config import get_chat_model
from langchain.agents import create_agent
from langchain.messages import HumanMessage
from langchain.tools import BaseTool, tool
from pydantic import BaseModel, Field
from sklearn import datasets

from dfkit.context import DataFrameContext
from dfkit.models import DataFrameReference

In [None]:
# Load the diabetes dataset with scaled=False, as a feature matrix plus a target vector.
data, target = datasets.load_diabetes(return_X_y=True, scaled=False)
df = pl.DataFrame(
    data=data,
    schema=["age", "sex", "bmi", "bp", "s1", "s2", "s3", "s4", "s5", "s6"],
)

df = df.with_columns(
    # Recode the numeric sex column into labels with a native Polars expression rather than a
    # per-row Python lambda (map_elements), so the work stays inside the Polars engine.
    # The when/then chain takes its output name from the `then` literal, hence the explicit alias.
    # NOTE(review): the 1 -> "male" coding is carried over unchanged; confirm against the dataset docs.
    pl.when(pl.col("sex") == 1).then(pl.lit("male")).otherwise(pl.lit("female")).alias("sex"),
    # Attach the regression target as an extra column.
    pl.Series(target).alias("disease_progression"),
)

df

In [None]:
# Wrap the frame in a DataFrameReference together with human-readable metadata:
# a dataset name, a free-text description, and one description per column.
dfr = DataFrameReference.from_dataframe(
    dataframe=df,
    name="Diabetes Progression Dataset",
    # Spelled out as explicit fragments so the exact whitespace of the text (leading newline,
    # four-space indents, and the trailing space after "disease") is visible in the source.
    description=(
        "\n"
        "    Ten baseline variables, age, sex, body mass index, average blood pressure,\n"
        "    and six blood serum measurements were obtained for each diabetes patient,\n"
        "    as well as the response of interest, a quantitative measure of disease \n"
        "    progression one year after baseline.\n"
        "    "
    ),
    column_descriptions={
        # Demographics and basic measurements
        "age": "Age of the patient in years.",
        "sex": "Sex of the patient",
        "bmi": "Body mass index.",
        "bp": "Average blood pressure.",
        # Blood serum measurements
        "s1": "TC, total serum cholesterol.",
        "s2": "LDL, low-density lipoproteins.",
        "s3": "HDL, high-density lipoproteins.",
        "s4": "TCH, total cholesterol / HDL.",
        "s5": "LTG, possibly log of serum triglycerides level.",
        "s6": "GLU, blood sugar level.",
        # Response variable
        "disease_progression": "A quantitative measure of disease progression one year after baseline.",
    },
)
dfr

In [None]:
def to_markdown_table(df: pl.DataFrame, columns: list[str] | None = None, num_rows: int = 10) -> str:
    """Render a Polars DataFrame as a markdown-formatted table string.

    Args:
        df (pl.DataFrame): The frame to render.
        columns (list[str] | None): Names of the columns to keep in the rendered table.
            When None, the frame is rendered unchanged and no explicit column display
            limit is set, so all columns are included but not necessarily displayed.
            Defaults to None.
        num_rows (int): Row limit for the rendered table (passed as Polars' ``tbl_rows``
            setting). Defaults to 10.

    Returns:
        str: The markdown table text, without the dtype row and without the shape header.

    Raises:
        ValueError: If ``columns`` names a column that is not present in ``df``.
    """
    # When a column subset is requested, the column display limit is sized to fit it exactly.
    column_limit = None if columns is None else len(columns)

    frame = df
    if columns is not None:
        # Fail early on unknown names before narrowing the frame to the requested subset.
        unknown = set(columns) - set(df.columns)
        if unknown:
            raise ValueError(f"Columns {unknown} not found in DataFrame.")
        frame = df.select(columns)

    render_options = {
        "tbl_formatting": "MARKDOWN",
        "tbl_hide_column_data_types": True,
        "tbl_hide_column_names": False,
        "tbl_hide_dataframe_shape": True,
        "tbl_rows": num_rows,
        "tbl_cols": column_limit,
    }
    # The display settings only apply inside this context; str() must be called within it.
    with pl.Config(**render_options):
        return str(frame)

In [None]:
# Preview the dataset as a markdown table with a 25-row limit. print() is used instead of a bare
# expression because the helper returns a str, whose repr would show the newlines escaped.
print(to_markdown_table(df, num_rows=25))