In [None]:
import mlflow
import mlflow.sklearn
# import mlflow.lightgbm
import pandas as pd
import matplotlib.pyplot as plt

from numpy import savetxt

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from pyspark.sql import SparkSession
import os

In [None]:
# Turn on MLflow autologging for scikit-learn before any estimator is fitted.
mlflow.sklearn.autolog()
# LightGBM autologging is currently disabled (its import is commented out as well):
# mlflow.lightgbm.autolog()

In [None]:
# Obtain (or create) the Spark session used by this job.
spark = (
    SparkSession.builder
    .appName("ModelTrainingJob")
    .getOrCreate()
)

# Location of the processed diabetes data, stored in Delta format.
# Alternative S3 location: "s3a://your-bucket-name/data/processed/diabetes"
processed_delta_path = "/Volumes/databricks_test_ws/main/test-volume/data/processed/diabetes"
spark_df = (
    spark.read
    .format("delta")
    .load(processed_delta_path)
)

# Pull the data into a pandas DataFrame so scikit-learn can consume it.
pdf = spark_df.toPandas()

# Separate the label column from the feature columns.
y = pdf["target"]
X = pdf.drop(columns=["target"])

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Directory that receives the prediction CSV and the residual plot.
output_path = "/Volumes/databricks_test_ws/main/test-volume/results"
os.makedirs(output_path, exist_ok=True)

# Train the random forest inside a single MLflow run so that the parameters,
# metric, model and artifacts logged below are all attached to that run.
with mlflow.start_run():
    # Model hyperparameters.
    n_estimators = 100
    max_depth = 6
    max_features = 3

    # Define and fit the model; random_state is fixed for reproducibility.
    rf_model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        random_state=42,
    )
    rf_model.fit(X_train, y_train)

    # Predict on the held-out split and score the predictions.
    predictions = rf_model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)

    # Log the hyperparameters under the custom names used by this notebook.
    mlflow.log_param("num_trees", n_estimators)
    mlflow.log_param("maxdepth", max_depth)
    mlflow.log_param("max_feat", max_features)

    # Log from the variable rather than a literal: a hard-coded value goes
    # stale as soon as n_estimators is tuned, and MLflow rejects re-logging a
    # parameter key with a different value (which happens when sklearn
    # autologging has already recorded the real n_estimators).
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_metric("mse", mse)

    # Store the fitted model with the run.
    mlflow.sklearn.log_model(rf_model, "random-forest-model")

    # Save the predicted values as CSV and attach the file to the run.
    predictions_path = f"{output_path}/predictions.csv"
    savetxt(predictions_path, predictions, delimiter=',')
    mlflow.log_artifact(predictions_path)

    print(f"Model saved with MSE: {mse}")

    # Residuals are computed here as predicted minus observed.
    residuals = pd.DataFrame(data=predictions - y_test)

    # Scatter plot of the residuals in test-set order. The single-column frame
    # is flattened explicitly so scatter receives a 1-D array.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(range(len(residuals)), residuals.to_numpy().ravel(), color='b')
    ax.set_xlabel("Observation", fontsize=14)
    ax.set_ylabel("Residual", fontsize=14)
    ax.set_title("Residuals", fontsize=16)
    ax.grid(True)
    fig.tight_layout()

    # Save the plot and attach it to the run.
    residuals_plot_path = f"{output_path}/residuals_plot.png"
    fig.savefig(residuals_plot_path, dpi=300)
    mlflow.log_artifact(residuals_plot_path)

In [None]:
# Register the trained model in the MLflow Model Registry.
# The run ID is resolved from the most recent MLflow run instead of the
# literal "<run_id>" placeholder, which never resolves to a real run.
# NOTE(review): if another run is started between training and this cell,
# that later run is the one picked up here — confirm this ordering is intended.
last_run = mlflow.last_active_run()
if last_run is None:
    raise RuntimeError(
        "No MLflow run found; run the training cell before registering the model."
    )
model_uri = f"runs:/{last_run.info.run_id}/random-forest-model"
mlflow.register_model(model_uri, "random-forest-model")