## Load model

In [10]:
import mlflow.sklearn
import numpy as np
from pyspark.sql import SparkSession

from mlflow.tracking import MlflowClient
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
from search_run.ranking.ranking import Ranking
import pyspark.sql.functions as F

ranking = Ranking()

spark = SparkSession.builder.getOrCreate()
client = MlflowClient()

# get_experiment_by_name returns None for an unknown name; fail with a clear
# message instead of an AttributeError on `.experiment_id` below.
experiment = client.get_experiment_by_name('Default')
if experiment is None:
    raise RuntimeError("MLflow experiment 'Default' was not found")

# `list_run_infos` was removed in MLflow 2.x; `search_runs` is the supported
# replacement. The ordering is spelled out so that runs[0] is the most
# recently started run. `runs` is kept as a list of RunInfo objects.
runs = [
    run.info
    for run in client.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=["attributes.start_time DESC"],
    )
]
if not runs:
    raise RuntimeError("MLflow experiment 'Default' has no runs to load a model from")

# Load the sklearn model logged under the "model" artifact path of that run.
run_path = f"runs:/{runs[0].run_id}/model/"
model = mlflow.sklearn.load_model(run_path)


# Bare reference to the loaded model.
model


# Extra columns added to the result tables, on top of the model features
# and the prediction. Empty by default.
show_columns = []

# To also print the key of each entry, use this instead:
# show_columns = ['key']

## Predict ranking

In [3]:


def evaluate_model(position, key_lenght):
    """Return the model's prediction for one (position, key_lenght) pair.

    Written to be wrapped as a row-at-a-time Spark UDF, so it receives
    plain Python scalars and relies on the module-level ``model``.

    Args:
        position: value of the ``position`` feature, or None for a null cell.
        key_lenght: value of the ``key_lenght`` feature, or None for a null cell.

    Returns:
        The first element of ``model.predict`` as a Python float, or None
        when either feature is None.
    """
    # Spark hands SQL NULLs to Python UDFs as None; return None (NULL)
    # rather than letting model.predict fail the whole job.
    if position is None or key_lenght is None:
        return None

    # predict expects a 2-D array: one row holding the two features.
    features = np.array([[position, key_lenght]])
    return float(model.predict(features)[0])

# Load the entries and add the model prediction as a new float column.
entries_df = ranking.load_entries_df(spark)
model_udf = udf(evaluate_model, FloatType())

predicted_column = model_udf(F.col('position'), F.col('key_lenght'))
result_df = entries_df.withColumn("predicted_key_lenght", predicted_column)

# Uncomment to preview the scored rows:
# result_df.show()

## Analyse results - easiest to find

In [4]:

# Features (plus any extra columns) with the prediction, smallest first.
(
    result_df
    .select(*ranking.model_info.features, *show_columns, 'predicted_key_lenght')
    .orderBy(F.col('predicted_key_lenght').asc())
    .show()
)


+--------+----------+--------------------+
|position|key_lenght|predicted_key_lenght|
+--------+----------+--------------------+
|      19|        34|            2.054619|
|       2|        17|           2.6820834|
|       6|        17|           2.6820834|
|      28|        27|            2.781002|
|      23|        35|           2.9229524|
|       4|        36|           3.1142976|
|     149|        27|             3.55375|
|       1|        33|           3.6798215|
|       5|        10|           3.8520834|
|     162|        41|            3.928721|
|       3|        42|           3.9402144|
|      29|        24|            4.049433|
|       7|        18|           4.1055355|
|     146|        25|             4.25675|
|      25|        22|            4.310016|
|      26|        22|            4.310016|
|     147|        29|            4.344627|
|      14|        43|            4.542345|
|      30|        22|            4.597695|
|     148|        18|           4.7117777|
+--------+----------+--------------------+



## Analyse results - hardest to find

In [None]:

# Features (plus any extra columns) with the prediction, largest first.
(
    result_df
    .select(*ranking.model_info.features, *show_columns, 'predicted_key_lenght')
    .orderBy(F.col('predicted_key_lenght').desc())
    .show()
)


## Save output to be consumed


In [6]:
import shutil

# Location of the CSV export.
output_path = '/data/search_run/predict_input_lenght/latest'
descending_prediction = F.col('predicted_key_lenght').desc()

output = result_df.select('key', 'predicted_key_lenght').orderBy(descending_prediction)

# Drop the previous export. On a first run the directory does not exist yet,
# which is fine; any other failure (e.g. permissions) still surfaces.
try:
    shutil.rmtree(output_path)
except FileNotFoundError:
    pass

# repartition(1) shuffles the rows into a single partition and does not
# guarantee the order produced by orderBy above, so sort again inside that
# partition to keep the written file ordered by prediction, largest first.
(
    output
    .repartition(1)
    .sortWithinPartitions(descending_prediction)
    .write
    .csv(output_path, header=True)
)
print("Finished")

Finished


## Visualize output

In [38]:
#!cat /data/search_run/predict_input_lenght/latest/**

