In [2]:
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from search_run.ranking.ranking import Ranking
from pyspark.sql.session import SparkSession

spark = SparkSession.builder.getOrCreate()
ranking = Ranking()


## Entries

In [3]:
entries_df = ranking.load_entries_df(spark)
entries_df.select('position', 'key_lenght').show()

+--------+----------+
|position|key_lenght|
+--------+----------+
|       1|        42|
|       2|        31|
|       3|        31|
|       4|        24|
|       5|        47|
|       6|        43|
|       7|        15|
|       8|        43|
|       9|        22|
|      10|        27|
|      11|        36|
|      12|        23|
|      13|        24|
|      14|         6|
|      15|        54|
|      16|        14|
|      17|        72|
|      18|        30|
|      19|        32|
|      20|        22|
+--------+----------+
only showing top 20 rows



## Commands performed

In [4]:
performed_df = ranking.load_commands_performed_dataframe(spark)
performed_df.select('input_lenght', 'generated_date').show()

+------------+-------------------+
|input_lenght|     generated_date|
+------------+-------------------+
|           0|2021-09-18 08:17:45|
|           0|2021-09-18 08:17:05|
|           0|2021-09-18 08:17:02|
|           0|2021-09-18 08:15:24|
|           0|2021-09-18 08:15:00|
|           0|2021-09-18 08:14:40|
|           0|2021-09-18 08:13:59|
|           0|2021-09-18 08:13:20|
|           0|2021-09-18 08:12:42|
|           0|2021-09-18 08:11:07|
|           0|2021-09-18 08:10:45|
|           0|2021-09-18 08:09:23|
|           0|2021-09-18 08:08:57|
|           0|2021-09-18 08:08:33|
|           0|2021-09-18 08:08:08|
|           0|2021-09-18 08:07:40|
|           0|2021-09-18 08:07:32|
|           0|2021-09-18 08:07:27|
|           3|2021-09-18 07:35:41|
|          12|2021-09-18 07:30:51|
+------------+-------------------+
only showing top 20 rows



## Join spark datasets

In [5]:

df = performed_df.join(entries_df, on='key', how='left')
df = df.filter('position is not null')

df.select(*ranking.model_info.features, ranking.model_info.label).show()

+--------+----------+------------+
|position|key_lenght|input_lenght|
+--------+----------+------------+
|    2616|        12|          10|
|    2616|        12|           9|
|     689|        41|          11|
|    2687|        33|          15|
|    4176|        19|         134|
|    4176|        19|         134|
|     888|        27|           9|
|     888|        27|          10|
|     888|        27|          27|
|     888|        27|          27|
|     888|        27|          27|
|     888|        27|          27|
|     888|        27|          27|
|     888|        27|          27|
|     888|        27|          27|
|     888|        27|          27|
|     888|        27|          27|
|     888|        27|          27|
|     888|        27|          27|
|     888|        27|          27|
+--------+----------+------------+
only showing top 20 rows



## Final numpy dataset + train test split

In [6]:
import numpy as np
X = np.array(df.select(*ranking.model_info.features).collect())
Y = np.array(df.select(ranking.model_info.label).collect())


X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=12)

## Train model

In [7]:

with mlflow.start_run():
    max_depth = 10
    regr = RandomForestRegressor(max_depth=max_depth, random_state=0)
    model = regr.fit(X_train, y_train)
    mlflow.sklearn.log_model(model, artifact_path="model")


    ## Validate Model on test dataset
    y_test
    results = model.predict(X_test)
    mse = mean_squared_error(y_test, results)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("max_depth", max_depth)


  model = regr.fit(X_train, y_train)
