Demonstrates XGBoost regression on the Iris dataset using DuckDB in JupyterLite + Pyodide.

The DuckDB usage in this example is minimal: it is only used to load the CSV. Imagine, though, that you had a more complex query and dataset.

Note: In a regular Jupyter environment, you'd swap `jupylite_duckdb` for `duckdb`.

In [None]:
# Install the packages this notebook pip-installs into the running kernel.
%pip install pandas 
%pip install jupylite-duckdb
%pip install plotly
# NOTE(review): the version specifier below is unquoted. If %pip hands this line to a
# system shell (as a regular IPython kernel presumably does), `>=4.2.0` would be parsed
# as an output redirection instead of a version constraint -- confirm, and quote the
# requirement if so.
%pip install nbformat>=4.2.0

In [None]:
# This is the only JupyterLite / Pyodide specific block.
import jupylite_duckdb as jd

# Connect (create an in-memory duckdb instance) 
conn = await jd.connect()

# Get the duckdb version
r1 = await jd.query("pragma version", conn)
display(r1)

# Load the Iris dataset
r4 = await jd.query("select * from read_csv_auto('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv')", conn)
display(r4.describe())

In [None]:
# Plot the raw input data: sepal length against petal length, coloured by species.
import plotly.express as px

input_scatter = px.scatter(
    r4,
    x="sepal_length",
    y="petal_length",
    color="species",
)
input_scatter  # last expression of the cell, so the notebook renders the figure


In [None]:
# One-hot encode the categorical "species" column; every other column passes through.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
import pandas as pd

transformer = make_column_transformer(
    # drop="first" omits the indicator column of the first category.
    (OneHotEncoder(drop="first"), ["species"]),
    remainder="passthrough",
    # Always return a dense array so the result can be wrapped in a DataFrame below,
    # rather than relying on the density heuristic to pick dense output.
    sparse_threshold=0,
)  # type: ignore

transformed = transformer.fit_transform(r4)
r4_encoded = pd.DataFrame(
    transformed, columns=transformer.get_feature_names_out()
)

# The transformer prefixes passthrough columns with "remainder__". Strip only that
# leading prefix (str.replace would also remove any later occurrence in the name),
# so the original column names such as "sepal_length" are restored.
passthrough_prefix = "remainder__"
r4_encoded = r4_encoded.rename(
    columns={
        col: col[len(passthrough_prefix):]
        for col in r4_encoded
        if col.startswith(passthrough_prefix)
    }
)

In [None]:
# Train / test split: hold out 20% of the rows for evaluation.
from sklearn.model_selection import train_test_split

# A fixed seed makes the split, and the metrics and plots derived from it,
# reproducible from one notebook run to the next.
train, _test = train_test_split(r4_encoded, test_size=0.2, random_state=42)

In [None]:
# Separate the regression target from the feature columns.
y_col = "sepal_length"
x_cols = train.columns.drop(y_col).tolist()  # every column except the target

train_X = train[x_cols]
train_y = train[[y_col]]  # double brackets keep the target as a one-column DataFrame

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error  # used further down when scoring the model

# Create an XGBoost regressor that minimises squared error.
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror')

# Fit the model on the training data.
# Training-set predictions are computed in the following "Predict over Train" cell,
# so the identical predict call is not repeated here.
xgb_reg.fit(train_X, train_y)

In [None]:
# Score the training rows and keep the predictions next to the actual values.
train_preds = xgb_reg.predict(train_X)
train = train.assign(prediction=train_preds)

In [None]:
# Score the held-out test rows with the fitted model.
test = _test  # same object as _test, so the new column also appears on _test
test_X, test_y = test[x_cols], test[[y_col]]

test_preds = xgb_reg.predict(test_X)
test["prediction"] = test_preds

In [None]:
# Report the mean squared error twice: first on the rows the model was fitted on,
# then on the held-out test rows.
for label, actual, predicted in (
    ("Training MSE:", train_y, train_preds),
    ("Test MSE:", test_y, test_preds),
):
    mse = mean_squared_error(actual, predicted)
    print(label, mse)

In [None]:
# Compare how well the predictions match the actual target on train vs. test rows.

import plotly.graph_objs as go

# Stack both splits into one frame, tagging each row with the split it came from.
combined_data = pd.concat(
    [train.assign(dataset="train"), test.assign(dataset="test")],
    ignore_index=True,
)

# One marker trace per split: actual target on the x axis, prediction on the y axis.
traces = [
    go.Scatter(
        x=frame[y_col],
        y=frame["prediction"],
        mode="markers",
        name=label,
    )
    for label, frame in (("Train", train), ("Test", test))
]

layout = go.Layout(
    title="Accuracy against Train vs Test",
    xaxis={"title": y_col},
    yaxis={"title": "Prediction"},
)

fig = go.Figure(data=traces, layout=layout)
fig.show()
