https://scikit-learn.org/1.1/modules/svm.html#regression

## Traning and Logging

In [1]:
from sklearn.datasets import load_diabetes
X, y = load_diabetes(return_X_y=True)
(X.shape, y.shape)

((442, 10), (442,))

In [2]:
(X[0], y[0])

(array([ 0.03807591,  0.05068012,  0.06169621,  0.02187239, -0.0442235 ,
        -0.03482076, -0.04340085, -0.00259226,  0.01990749, -0.01764613]),
 151.0)

In [3]:
(X[1], y[1])

(array([-0.00188202, -0.04464164, -0.05147406, -0.02632753, -0.00844872,
        -0.01916334,  0.07441156, -0.03949338, -0.06833155, -0.09220405]),
 75.0)

In [4]:
import getpass

import mlflow
from liga.sklearn.mlflow import log_model
from sklearn import svm


mlflow_tracking_uri = "sqlite:///mlruns.db"
mlflow.set_tracking_uri(mlflow_tracking_uri)

# train a model
with mlflow.start_run() as run:
    ####
    # Part 1: Train the model and register it on MLflow
    ####
    
    model_svr = svm.SVR(epsilon=0.3).fit(X, y)
    model_nusvr = svm.NuSVR().fit(X, y)
    model_l_svr = svm.LinearSVR().fit(X, y)
    
    svr_name = f"{getpass.getuser()}_svr"
    nusvr_name = f"{getpass.getuser()}_nusvr"
    l_svr_name = f"{getpass.getuser()}_linear_svr"
    
    log_model(model_svr, registered_model_name=svr_name)
    log_model(model_nusvr, registered_model_name=nusvr_name)
    log_model(model_l_svr, registered_model_name=l_svr_name)


Registered model 'da_svr' already exists. Creating a new version of this model...
2023/03/29 13:13:56 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: da_svr, version 2
Created version '2' of model 'da_svr'.
Registered model 'da_nusvr' already exists. Creating a new version of this model...
2023/03/29 13:13:58 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: da_nusvr, version 2
Created version '2' of model 'da_nusvr'.
Registered model 'da_linear_svr' already exists. Creating a new version of this model...
2023/03/29 13:14:00 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: da_linear_svr, version 2
Created version '2' of model 'da_linear_svr'.


## Apply the model on the large scale dataset

In [5]:
from example import spark
from liga.mlflow import CONF_MLFLOW_TRACKING_URI
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")
spark.conf.set(CONF_MLFLOW_TRACKING_URI, mlflow_tracking_uri)
spark.sql(f"""
CREATE OR REPLACE MODEL svr LOCATION 'mlflow:///{svr_name}';
"""
)

spark.sql(f"""
CREATE OR REPLACE MODEL nusvr LOCATION 'mlflow:///{nusvr_name}';
"""
)

spark.sql(f"""
CREATE OR REPLACE MODEL l_svr LOCATION 'mlflow:///{l_svr_name}';
"""
)

spark.sql("show models").show(10, vertical=False, truncate=False)

2023-03-29 13:14:00,112 INFO Rikai (__init__.py:121): setting spark.sql.extensions to net.xmacs.liga.spark.RikaiSparkSessionExtensions
2023-03-29 13:14:00,113 INFO Rikai (__init__.py:121): setting spark.driver.extraJavaOptions to -Dio.netty.tryReflectionSetAccessible=true
2023-03-29 13:14:00,114 INFO Rikai (__init__.py:121): setting spark.executor.extraJavaOptions to -Dio.netty.tryReflectionSetAccessible=true
2023-03-29 13:14:00,115 INFO Rikai (__init__.py:121): setting spark.jars to https://github.com/komprenilo/liga/releases/download/v0.3.0/liga-spark321-assembly_2.12-0.3.0.jar
23/03/29 13:14:01 WARN Utils: Your hostname, tubi resolves to a loopback address: 127.0.1.1; using 192.168.31.32 instead (on interface wlp0s20f3)
23/03/29 13:14:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For Spar

+-----+------+-----------------------+-------+
|name |plugin|uri                    |options|
+-----+------+-----------------------+-------+
|svr  |      |mlflow:///da_svr       |       |
|nusvr|      |mlflow:///da_nusvr     |       |
|l_svr|      |mlflow:///da_linear_svr|       |
+-----+------+-----------------------+-------+



In [6]:
result = spark.sql(f"""
select
  ML_PREDICT(svr, array(0.03807591,  0.05068012,  0.06169621,  0.02187239, -0.0442235 ,
        -0.03482076, -0.04340085, -0.00259226,  0.01990749, -0.01764613)) as svr,
  ML_PREDICT(nusvr, array(0.03807591,  0.05068012,  0.06169621,  0.02187239, -0.0442235 ,
        -0.03482076, -0.04340085, -0.00259226,  0.01990749, -0.01764613)) as nusvr,
  ML_PREDICT(l_svr, array(0.03807591,  0.05068012,  0.06169621,  0.02187239, -0.0442235 ,
        -0.03482076, -0.04340085, -0.00259226,  0.01990749, -0.01764613)) as l_svr
        
"""
)

result.printSchema()
result.toPandas()

root
 |-- svr: float (nullable = true)
 |-- nusvr: float (nullable = true)
 |-- l_svr: float (nullable = true)



                                                                                

Unnamed: 0,svr,nusvr,l_svr
0,110.177177,110.177177,110.177177


In [7]:
spark.sql(f"""
select
  ML_PREDICT(svr, array(-0.00188202, -0.04464164, -0.05147406, -0.02632753, -0.00844872,
        -0.01916334,  0.07441156, -0.03949338, -0.06833155, -0.09220405)) as svr
"""
).toPandas()

Unnamed: 0,svr
0,106.438087


In [8]:

spark.sql("""
select  ML_PREDICT(nusvr, array(-0.00188202, -0.04464164, -0.05147406, -0.02632753, -0.00844872,
        -0.01916334,  0.07441156, -0.03949338, -0.06833155, -0.09220405)) as nusvr
""").toPandas()


                                                                                

Unnamed: 0,nusvr
0,106.438087


In [9]:
spark.sql("""
select ML_PREDICT(l_svr, array(-0.00188202, -0.04464164, -0.05147406, -0.02632753, -0.00844872,
        -0.01916334,  0.07441156, -0.03949338, -0.06833155, -0.09220405)) as l_svr
""").toPandas()

                                                                                

Unnamed: 0,l_svr
0,106.438087
