In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# Load the "diamonds" example dataset through seaborn and preview the first rows.
df = sns.load_dataset(name="diamonds")
df.head(n=5)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Print the distinct values found in the cut, color and clarity columns.
for column in ("cut", "color", "clarity"):
    print(df[column].unique())

In [None]:
# Summary statistics for the numeric columns, covering all three
# dimension columns (x, y and z) exactly once each.
df[["carat", "depth", "table", "price", "x", "y", "z"]].describe()

In [None]:
# Describe the cut, color and clarity columns.
grade_columns = ["cut", "color", "clarity"]
df.loc[:, grade_columns].describe()

In [None]:
# Pair plot of a random subset of at most 1000 rows, coloured by cut.
# Rows are drawn directly with a fixed seed (rather than shuffling the whole
# frame and slicing) so the figure is the same after a kernel restart.
pairplot_sample = (
    df.drop(columns=["color", "clarity"])
    .sample(n=min(1000, len(df)), random_state=1)
    .reset_index(drop=True)
)
sns.pairplot(pairplot_sample, hue="cut")

In [None]:
from sklearn.model_selection import train_test_split

# Split into train and test sets: price is the target, every other column is
# a feature, and 30% of the rows are held out with a fixed seed.
features = df.drop(columns=["price"])
target = df["price"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=1,
)

In [None]:
import os
from dotenv import load_dotenv
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
import mlflow

# Load environment variables via python-dotenv, then point MLflow at the
# tracking server named by MLFLOW_TRACKING_URI (a KeyError is raised if the
# variable is not set).
load_dotenv()
tracking_uri = os.environ["MLFLOW_TRACKING_URI"]
mlflow.set_tracking_uri(tracking_uri)

# Enable MLflow automatic logging for the supported libraries.
mlflow.autolog()

# Explicit level ordering for each encoded column. The dict keeps every column
# name next to its ordering, so the encoder's category lists and the column
# list given to the ColumnTransformer cannot drift apart.
category_orders = {
    "cut": ["Fair", "Good", "Very Good", "Premium", "Ideal"],
    "color": ["J", "I", "H", "G", "F", "E", "D"],
    "clarity": ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"],
}

ord_encoder = OrdinalEncoder(categories=list(category_orders.values()))

# Encode the three columns above; all remaining columns pass through untouched.
preprocessor = ColumnTransformer(
    transformers=[("ord", ord_encoder, list(category_orders))],
    remainder="passthrough",
)

# Only the ElasticNet base estimator is active; the random-forest one below is
# currently disabled.
base_estimators = [
    ("elastic_net", ElasticNet(alpha=0.05, random_state=1)),
    #("random_forest", RandomForestRegressor(n_estimators=10, random_state=1)),
]
regressor = StackingRegressor(base_estimators)

# Full model: preprocessing followed by the stacked regressor.
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", regressor),
    ]
)

from sklearn import set_config
# Ask scikit-learn for diagram rendering, then show the assembled pipeline as
# the cell's output.
set_config(display="diagram")
pipe

In [None]:
# Switch estimator rendering back to plain text. Passing display=None leaves
# the current setting unchanged, so the target value must be given explicitly.
set_config(display="text")

In [None]:
# Fit the pipeline inside a named MLflow run. mlflow.autolog() was enabled in
# an earlier cell, so the fit is presumably logged to this run -- confirm in
# the tracking UI. The run handle is kept in `run`.
with mlflow.start_run(run_name='example') as run:
    pipe.fit(X_train, y_train)

# Predict on the held-out rows; this happens after the run context has exited.
pred = pipe.predict(X_test)

In [None]:
import matplotlib.pyplot as plt

# Scatter of predictions (x axis) against the true test targets (y axis).
fig, ax = plt.subplots()
ax.scatter(pred, y_test)
ax.set_xlabel("pred")
ax.set_ylabel("y_test")

plt.show()

In [None]:
# Register the model logged by the training run. The id is taken from the
# handle bound by `with mlflow.start_run(...) as run`, which pins the exact run;
# querying for the newest run instead could return a run created by another
# process, and fails with IndexError when the search comes back empty.
run_id = run.info.run_id
mlflow.register_model(model_uri=f"runs:/{run_id}/model", name="diamonds_price_pred")

In [None]:
# Load the registered model through the "latest" model URI and spot-check it
# on one test row.
model = mlflow.sklearn.load_model("models:/diamonds_price_pred/latest")

idx = 100
row_slice = slice(idx, idx + 1)  # single-row slice, keeps the input 2-D
pred = model.predict(X_test[row_slice])
ans = y_test[row_slice].values
print(f"pred: {pred}, ans: {ans}")