## Model Comparison & Feature Engineering

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE: X and y are assigned in the data-loading cell, which appears after this
# one in the notebook -- that cell must have been run before this one.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each partition.
print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (26, 2)
Test size: (7, 2)


In [0]:
# Column names used for the predictors and the target.
feature_cols = ["budget_log", "imdb_rating_clean"]
target_col = "revenue_log"

# Read only the modelling columns from the feature table, discard rows with
# a missing value in any of them, and convert the result to pandas.
df = (
    spark.table("default.ml_features_movies")
    .select(*feature_cols, target_col)
    .dropna()
    .toPandas()
)

# Predictor matrix and target vector for the scikit-learn models.
X = df[feature_cols]
y = df[target_col]


## Train 3 different models

## Linear Regression
## Decision Tree
## Random Forest

In [0]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import mlflow
import mlflow.sklearn


In [0]:
# Candidate regressors; each key doubles as the MLflow run name.
models = dict(
    LinearRegression=LinearRegression(),
    DecisionTree=DecisionTreeRegressor(max_depth=5, random_state=42),
    RandomForest=RandomForestRegressor(n_estimators=50, random_state=42),
)

for name in models:
    model = models[name]

    # One MLflow run per candidate so the logged metrics can be compared.
    with mlflow.start_run(run_name=name):
        mlflow.log_param("model_type", name)

        # Train on the training split, then score on the held-out split.
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        r2 = r2_score(y_true=y_test, y_pred=preds)

        # Record the score and the fitted estimator on the active run.
        mlflow.log_metric("r2_score", r2)
        mlflow.sklearn.log_model(model, "model")

        print(f"{name} → R2 Score: {r2:.4f}")




LinearRegression → R2 Score: 0.8387




DecisionTree → R2 Score: 0.9106




RandomForest → R2 Score: 0.9093


In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression as SparkLR

# Columns the Spark pipeline actually consumes. Keeping them in one place
# guarantees the assembler, the regressor and the null filter stay in sync.
spark_feature_cols = ["budget", "imdb_rating_clean"]
spark_label_col = "revenue"

# Stage 1: pack the predictor columns into a single vector column.
assembler = VectorAssembler(
    inputCols=spark_feature_cols,
    outputCol="features"
)

# Stage 2: linear regression on the assembled vector.
lr = SparkLR(
    featuresCol="features",
    labelCol=spark_label_col
)

pipeline = Pipeline(stages=[assembler, lr])

# Drop a row only when one of the modelling columns is null. A bare dropna()
# discards every row that has a null in *any* column of the table, including
# columns this model never reads, which can shrink the training data for no
# reason. All other columns are kept on spark_df untouched.
spark_df = spark.table("default.silver_movies").dropna(
    subset=spark_feature_cols + [spark_label_col]
)

# Seeded 80/20 split; only the training part is used in this cell.
train, test = spark_df.randomSplit([0.8, 0.2], seed=42)

pipeline_model = pipeline.fit(train)
