In [1]:
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from search_run.ranking.ranking import Ranking
from pyspark.sql.session import SparkSession

spark = SparkSession.builder.getOrCreate()
ranking = Ranking()


## Entries

In [2]:
entries_df = ranking.load_entries_df(spark)
entries_df.select('position', 'key_lenght').show()

+--------+----------+
|position|key_lenght|
+--------+----------+
|       1|        33|
|       2|        17|
|       3|        42|
|       4|        36|
|       5|        10|
|       6|        17|
|       7|        18|
|       8|        14|
|       9|        22|
|      10|        20|
|      11|        32|
|      12|        12|
|      13|        27|
|      14|        43|
|      15|        46|
|      16|        24|
|      17|        26|
|      18|        23|
|      19|        34|
|      20|        47|
+--------+----------+
only showing top 20 rows



## Commands performed

In [3]:
performed_df = ranking.load_commands_performed_dataframe(spark)
performed_df.select('input_lenght', 'generated_date').show()

+------------+-------------------+
|input_lenght|     generated_date|
+------------+-------------------+
|           9|2021-09-05 10:53:32|
|           9|2021-09-05 10:53:14|
|          23|2021-09-05 10:49:08|
|           8|2021-09-05 10:40:26|
|           8|2021-09-05 10:36:08|
|           3|2021-09-05 10:33:43|
|           4|2021-09-05 10:32:44|
|           5|2021-09-05 10:31:57|
|           0|2021-09-05 10:31:38|
|           0|2021-09-05 10:31:02|
|           0|2021-09-05 10:30:20|
|           0|2021-09-05 10:30:03|
|           0|2021-09-05 10:29:49|
|           3|2021-09-05 10:28:49|
|           3|2021-09-05 10:17:14|
|           0|2021-09-05 10:16:25|
|          14|2021-09-05 10:08:12|
|          16|2021-09-05 10:00:53|
|          19|2021-09-05 09:58:18|
|          15|2021-09-05 09:57:42|
+------------+-------------------+
only showing top 20 rows



## Join spark datasets

In [4]:

df = performed_df.join(entries_df, on='key', how='left')
df = df.filter('position is not null')

df.select(*ranking.model_info.features, ranking.model_info.label).show()

+--------+----------+------------+
|position|key_lenght|input_lenght|
+--------+----------+------------+
|     519|        41|          11|
|    4022|        19|         134|
|    4022|        19|         134|
|     718|        27|           9|
|     718|        27|          10|
|     718|        27|          27|
|     718|        27|          27|
|     718|        27|          27|
|     718|        27|          27|
|     718|        27|          27|
|     718|        27|          27|
|     718|        27|          27|
|     718|        27|          27|
|     718|        27|          27|
|     718|        27|          27|
|     718|        27|          27|
|     718|        27|          27|
|     718|        27|          27|
|     718|        27|          27|
|     718|        27|          27|
+--------+----------+------------+
only showing top 20 rows



## Final numpy dataset + train test split

In [5]:
import numpy as np
X = np.array(df.select(*ranking.model_info.features).collect())
Y = np.array(df.select(ranking.model_info.label).collect())


X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=12)

## Train model

In [7]:

with mlflow.start_run():
    max_depth = 10
    regr = RandomForestRegressor(max_depth=max_depth, random_state=0)
    model = regr.fit(X_train, y_train)
    mlflow.sklearn.log_model(model, artifact_path="model")


    ## Validate Model on test dataset
    y_test
    results = model.predict(X_test)
    mse = mean_squared_error(y_test, results)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("max_depth", max_depth)


  model = regr.fit(X_train, y_train)
