In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
import onnxruntime as rt
import sys
from pathlib import Path

# Put the directory two levels above the current working directory on the
# import path. This must run before the three imports below.
# NOTE(review): presumably that directory is the project root that holds
# `utils`, `models` and `text2features`; it depends on where the kernel was
# started — confirm.
sys.path.append(str(Path.cwd().parent.parent))
from utils import DATA_DIR
from models.encoder.common import SEED
from text2features import FeatureService

# Small training split. Its "text" column is the input that gets featurized
# and its "label" column is the target passed to the `fit` calls below.
sm_train = pd.read_parquet(DATA_DIR / "datasets" / "small" / "train.parquet")
sm_y = sm_train["label"].values

# Large combined dataset: currently disabled, nothing in this part of the
# notebook reads `lg_train`.
# lg_train = pd.read_parquet(DATA_DIR / "datasets" / "large" / "combined.parquet")


def export_sklearn_to_onnx(model, model_name: str, X: np.ndarray, y: np.ndarray):
    """Convert a fitted scikit-learn model to ONNX, save it, and check parity.

    The model is converted with a single float input named ``float_input`` of
    shape ``[None, X.shape[1]]`` and written to
    ``DATA_DIR / "models" / f"{model_name}.onnx"``. The saved file is then
    loaded with onnxruntime, its first output on ``X`` (cast to float32) is
    compared with ``model.predict`` on the same data, and a warning with the
    maximum absolute difference is printed if any element differs by 1e-5 or
    more.

    Args:
        model: Fitted estimator or pipeline; it must expose ``predict`` and be
            convertible by ``convert_sklearn``.
        model_name: File stem of the exported ``.onnx`` file.
        X: 2-D feature matrix. Its column count fixes the ONNX input width and
            its rows are the samples used for the parity check.
        y: Unused by this function; kept so existing call sites keep working.

    Returns:
        None. The results are the file on disk and the printed messages.
    """
    initial_type = [("float_input", FloatTensorType([None, X.shape[1]]))]
    onx = convert_sklearn(model, initial_types=initial_type)

    onnx_path = DATA_DIR / "models" / f"{model_name}.onnx"
    # The output directory may not exist yet (e.g. on a fresh checkout);
    # create it so the export does not die with FileNotFoundError after the
    # model has already been trained and converted.
    Path(onnx_path).parent.mkdir(parents=True, exist_ok=True)
    with open(onnx_path, "wb") as f:
        f.write(onx.SerializeToString())

    print(f"ONNX model saved to: {onnx_path}")

    # Reload from disk so the check exercises the artifact that was written,
    # not the in-memory conversion result.
    sess = rt.InferenceSession(str(onnx_path))
    input_name = sess.get_inputs()[0].name
    label_name = sess.get_outputs()[0].name

    # The ONNX graph was declared with a float tensor input, so both runtimes
    # are fed the same float32 data.
    sample_X = X.astype(np.float32)

    pred_onx = sess.run([label_name], {input_name: sample_X})[0]
    pred_sklearn = model.predict(sample_X)

    # Flatten the ONNX output so it lines up element-wise with the sklearn
    # predictions before taking the absolute difference.
    diff = np.abs(pred_sklearn - pred_onx.flatten())
    if not np.all(diff < 1e-5):
        print("Warning: Insufficient accuracy between sklearn and ONNX predictions.")
        print(f"Max difference: {np.max(diff)}")

In [None]:
from text2features_paths import (
    FEATURE_PIPELINE_RESOURCES,
)

# Build the feature service from the imported pipeline resources.
feature_service = FeatureService(
    feature_pipeline_resources=FEATURE_PIPELINE_RESOURCES,
)

# Features for every text in the small training split, requested under the
# "sentence-transformers/all-MiniLM-L6-v2" name. The result is the feature
# matrix the sklearn models below are fitted on.
# NOTE(review): presumably one row of MiniLM sentence embeddings per text —
# confirm against FeatureService.get_features.
sm_X_minilm = feature_service.get_features(
    sm_train["text"].tolist(), "sentence-transformers/all-MiniLM-L6-v2"
)

Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 32988.91it/s]


In [None]:
# Random-forest hyperparameters for the MiniLM features.
# NOTE(review): the values look like the result of a tuning run — record
# where they came from.
params = dict(
    n_estimators=129,
    max_depth=28,
    min_samples_split=4,
    min_samples_leaf=3,
    max_features="sqrt",
    bootstrap=False,
)

# Seeded for reproducibility; n_jobs=-1 uses every available core.
rf_model = RandomForestRegressor(
    random_state=SEED,
    n_jobs=-1,
    **params,
)
rf_model.fit(sm_X_minilm, sm_y)

# Export the fitted forest and compare its ONNX predictions on the training features.
export_sklearn_to_onnx(rf_model, "minilm_randomforest", sm_X_minilm, sm_y)

NameError: name 'export_sklearn_to_onnx' is not defined

In [4]:
from sklearn.svm import SVR

# SVR hyperparameters for the MiniLM features.
# NOTE(review): the values look like the result of a tuning run — record
# where they came from.
svm_params = dict(
    kernel="rbf",
    C=1.9608689708007712,
    epsilon=0.0009522163932366842,
)

# Min-max scale the features, then fit the SVR; exporting the whole pipeline
# keeps the scaling step inside the ONNX graph.
svm_pipeline = make_pipeline(
    MinMaxScaler(),
    SVR(**svm_params),
)
svm_pipeline.fit(sm_X_minilm, sm_y)

# Export the fitted pipeline and compare its ONNX predictions on the training features.
export_sklearn_to_onnx(svm_pipeline, "minilm_svm", sm_X_minilm, sm_y)

ONNX model saved to: /home/terra/Projects/vis-desc/modules/lab/data/models/minilm_svm.onnx
Max difference: 1.424463998622727e-05


In [None]:
from models.encoder.common import CachedOptimizationContext
from models.encoder.modernbert_finetune_nn import export_to_onnx, train_final_model

# Context object that supplies the tokenizer and the small training set
# passed to the final training run below.
optimization_context = CachedOptimizationContext()
# TODO: fill in the tuned hyperparameters. Until then an empty dict is what
# train_final_model receives.
best_params = {}  # TODO
final_model = train_final_model(
    best_params, optimization_context.tokenizer, optimization_context.sm_train
)

# NOTE(review): this is a bare relative file name, whereas the sklearn exports
# in this notebook are written under DATA_DIR / "models". Unless
# export_to_onnx resolves the path itself, the file lands wherever the kernel
# was started — confirm that is intended.
onnx_path = "modernbert_finetune_nn.onnx"
export_to_onnx(final_model, onnx_path)