In [1]:
import mlflow
import numpy as np
from pyspark.sql.session import SparkSession
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from search_run.ranking.ranking import Ranking

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
# Project helper used by the cells below to load the dataframes and to read
# `model_info` (the feature and label column names).
ranking = Ranking()


## Entries

In [2]:
# Load the entries dataframe through the Ranking helper, then preview the two
# columns of interest. 'key_lenght' (sic) is the column's actual name.
entries_df = ranking.load_entries_df(spark)
entries_preview = entries_df.select("position", "key_lenght")
entries_preview.show()

+--------+----------+
|position|key_lenght|
+--------+----------+
|       1|        25|
|       2|        18|
|       3|        21|
|       4|        14|
|       5|        18|
|       6|        24|
|       7|        15|
|       8|        54|
|       9|        45|
|      10|        23|
|      11|        27|
|      12|        17|
|      13|        32|
|      14|        19|
|      15|        31|
|      16|        53|
|      17|        45|
|      18|        31|
|      19|        36|
|      20|        17|
+--------+----------+
only showing top 20 rows



## Commands performed

In [3]:
# Load the history of performed commands and preview the typed-input length
# next to the time each record was generated.
# 'input_lenght' (sic) is the column's actual name.
performed_df = ranking.load_commands_performed_dataframe(spark)
performed_preview = performed_df.select("input_lenght", "generated_date")
performed_preview.show()

+------------+-------------------+
|input_lenght|     generated_date|
+------------+-------------------+
|          19|2021-08-31 12:44:44|
|           3|2021-08-31 12:41:03|
|           2|2021-08-31 12:40:54|
|           5|2021-08-31 12:39:12|
|          23|2021-08-31 12:38:28|
|          12|2021-08-31 12:24:32|
|           6|2021-08-31 12:16:34|
|          15|2021-08-31 12:12:30|
|          15|2021-08-31 12:11:20|
|           7|2021-08-31 12:07:35|
|          12|2021-08-31 11:34:18|
|           4|2021-08-31 11:31:24|
|          12|2021-08-31 11:14:52|
|           7|2021-08-31 11:12:55|
|          11|2021-08-31 11:08:53|
|          10|2021-08-31 10:56:55|
|          24|2021-08-31 10:55:06|
|           5|2021-08-31 10:53:13|
|          19|2021-08-31 10:47:41|
|          13|2021-08-31 10:43:50|
+------------+-------------------+
only showing top 20 rows



## Join Spark datasets

In [4]:

# Left-join every performed command to its entry on `key`, then drop the rows
# that end up without a position (presumably commands with no matching entry).
df = (
    performed_df
    .join(entries_df, on="key", how="left")
    .filter("position is not null")
)

# Preview the columns the model uses: the feature columns followed by the label.
model_columns = [*ranking.model_info.features, ranking.model_info.label]
df.select(*model_columns).show()

+--------+----------+------------+
|position|key_lenght|input_lenght|
+--------+----------+------------+
|    3905|        19|         134|
|    3905|        19|         134|
|     664|        27|          10|
|     664|        27|          27|
|     664|        27|          27|
|     664|        27|          27|
|     664|        27|          27|
|     664|        27|          27|
|     664|        27|          27|
|     664|        27|          27|
|     664|        27|          27|
|     664|        27|          27|
|     664|        27|          27|
|     664|        27|          27|
|     664|        27|          27|
|     664|        27|          27|
|     664|        27|          27|
|     664|        27|          27|
|     664|        27|          27|
|     664|        27|          27|
+--------+----------+------------+
only showing top 20 rows



## Final NumPy dataset + train/test split

In [5]:
# Collect features and label in ONE Spark action. `df` is an unsorted join
# result, and Spark does not guarantee the same row order across separate
# actions, so collecting X and Y independently could pair a feature row with
# another row's label. A single collect also avoids computing the join twice.
feature_names = list(ranking.model_info.features)
rows = df.select(*feature_names, ranking.model_info.label).collect()

# The label is the last selected column; everything before it is a feature.
# Each Row is a tuple, so slicing keeps X as (n, n_features) and Y as an
# (n, 1) column -- the same shapes the two separate collects produced.
X = np.array([row[:-1] for row in rows])
Y = np.array([row[-1:] for row in rows])

# Hold out 25% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=12)

## Train model

In [8]:

# Train, log and validate the model inside a single MLflow run so the model
# artifact and its test metric are recorded together.
with mlflow.start_run():
    # Shallow forest with a fixed seed so repeated runs give the same model.
    regr = RandomForestRegressor(max_depth=2, random_state=0)
    # y_train is built from single-column rows, i.e. an (n, 1) column, while
    # scikit-learn expects a 1-D target; flatten it to avoid the
    # DataConversionWarning. ravel() is a no-op if it is already 1-D.
    model = regr.fit(X_train, y_train.ravel())
    mlflow.sklearn.log_model(model, artifact_path="model")

    # Validate the model on the held-out test split and record the error.
    results = model.predict(X_test)
    mse = mean_squared_error(y_test, results)
    mlflow.log_metric("mse", mse)
