## Required libraries

In [0]:
import mlflow
import mlflow.sklearn

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score


## Data load (Spark → Pandas)

In [0]:
# Read the feature table, keep only the three modelling columns, drop rows
# with missing values while still in Spark, then convert to a pandas DataFrame.
movies_sdf = spark.table("default.ml_features_movies")
movies_sdf = movies_sdf.select("budget_log", "imdb_rating_clean", "revenue_log")
df = movies_sdf.dropna().toPandas()

# Preview the first rows as the cell output.
df.head()


Unnamed: 0,budget_log,imdb_rating_clean,revenue_log
0,1.871802,8.0,2.564949
1,5.993961,8.0,7.601402
2,1.029619,1.9,1.410987
3,5.225747,9.0,6.914731
4,5.181784,7.8,6.572842


## Features (X) & Target (y)

In [0]:
# Two predictor columns form the design matrix; one column is the target.
feature_names = ["budget_log", "imdb_rating_clean"]
target_name = "revenue_log"

X = df[feature_names]
y = df[target_name]


## Train–Test split

In [0]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


## Model training + MLflow logging

In [0]:
# Train a linear regression inside a single MLflow run, recording the run
# parameters, the hold-out R2 metric and the fitted model artifact.
with mlflow.start_run(run_name="linear_regression_v1"):

    mlflow.log_param("model_type", "LinearRegression")
    # Derive the feature list from the training frame so the logged value
    # cannot drift from the columns the model is actually fitted on.
    mlflow.log_param("features", ", ".join(X_train.columns))
    # NOTE(review): must be kept in sync with the test_size passed to
    # train_test_split in the split cell.
    mlflow.log_param("test_size", 0.2)

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score on the held-out rows only.
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    mlflow.log_metric("r2_score", r2)

    # Store the input/output schema with the model so it is not logged
    # without a signature; the existing test predictions are reused to
    # infer the output schema.
    signature = mlflow.models.infer_signature(X_test, y_pred)
    mlflow.sklearn.log_model(model, "model", signature=signature)

print("R2 Score:", r2)




R2 Score: 0.8387362445010138
