In [34]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import (LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor)
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Additional Regression Analysis

In [35]:
# Load the cleaned TMDB data, discard the title column, and one-hot encode
# genres (first dummy level dropped).
df = (
    pd.read_csv("Cleaned TMDB Dataset.csv")
    .drop(columns=['title'])
    .pipe(pd.get_dummies, columns=['genres'], drop_first=True)
)

# Target is revenue; every remaining column is used as a predictor.
y = df['revenue']
X = df.drop(columns=['revenue'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaler = scaler.transform(X_train)
X_test_scaler = scaler.transform(X_test)

def performance(y_true, y_pred, model_name="Model"):
    """Print MAE, MSE, RMSE and R² for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    model_name : str, default "Model"
        Label printed as a header above the metrics so that the output of
        different models can be told apart.

    Returns
    -------
    None
        The metrics are written to stdout only.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    # `model_name` used to be accepted but silently ignored, leaving each
    # printed block of metrics unlabeled; print it as a header line.
    print(f"{model_name}")
    print(f"MAE  = {mae:.2f}")
    print(f"MSE  = {mse:.2f}")
    print(f"RMSE = {rmse:.2f}")
    print(f"R²   = {r2:.5f}")

## Robust Regression

In [36]:
# Fit a Huber regressor (default hyper-parameters) on the scaled training
# features and score it on the held-out split.
robust_model = HuberRegressor().fit(X_train_scaler, y_train)
y_pred = robust_model.predict(X_test_scaler)
performance(y_true=y_test, y_pred=y_pred, model_name="Robust Regression")

MAE  = 44374739.96
MSE  = 16216612101825666.00
RMSE = 127344462.39
R²   = 0.40050


## Lasso Regression

In [37]:
# Cross-validated Lasso: 5-fold CV over 100 log-spaced penalties in [1e-2, 1e3].
lasso_cv = LassoCV(
    alphas=np.logspace(-2, 3, 100),
    cv=5,
    max_iter=10000,
).fit(X_train_scaler, y_train)
y_pred = lasso_cv.predict(X_test_scaler)
performance(y_true=y_test, y_pred=y_pred, model_name="LassoCV")

MAE  = 42107769.96
MSE  = 8328182681729332.00
RMSE = 91258877.28
R²   = 0.69212


## Ridge Regression

In [38]:
# Cross-validated Ridge: 5-fold CV over 100 log-spaced penalties in [1e-2, 1e3].
ridge_cv = RidgeCV(
    alphas=np.logspace(-2, 3, 100),
    cv=5,
).fit(X_train_scaler, y_train)
y_pred = ridge_cv.predict(X_test_scaler)
performance(y_true=y_test, y_pred=y_pred, model_name="RidgeCV")

MAE  = 41840861.00
MSE  = 8327797246062684.00
RMSE = 91256765.48
R²   = 0.69214


## Elastic Net Regression Model

In [39]:
# Elastic Net: 5-fold CV searches jointly over the penalty strength (alpha)
# and the L1/L2 mix (l1_ratio); the grid includes l1_ratio=1.0.
elastic_net_cv = ElasticNetCV(
    alphas=np.logspace(-2, 3, 100),
    l1_ratio=[0.1, 0.5, 0.7, 0.9, 1.0],
    cv=5,
    max_iter=10000,
).fit(X_train_scaler, y_train)
y_pred = elastic_net_cv.predict(X_test_scaler)
performance(y_true=y_test, y_pred=y_pred, model_name="Elastic Net CV")

MAE  = 41945075.77
MSE  = 8327558004406748.00
RMSE = 91255454.66
R²   = 0.69215


## Compare Results

In [40]:
def return_performance(y_true, y_pred, model_name):
    """Collect the regression metrics for one model into a single record.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    model_name : str
        Label stored under the "Model" key.

    Returns
    -------
    dict
        Keys "Model", "MAE", "MSE", "RMSE" and "R²"; the metric values are
        left unrounded.
    """
    # MSE is computed once and reused for its square root.
    squared_error = mean_squared_error(y_true, y_pred)
    return {
        "Model": model_name,
        "MAE": mean_absolute_error(y_true, y_pred),
        "MSE": squared_error,
        "RMSE": np.sqrt(squared_error),
        "R²": r2_score(y_true, y_pred),
    }

# Score every fitted model on the same held-out split, in display order.
results = [
    return_performance(y_test, fitted.predict(X_test_scaler), label)
    for label, fitted in (
        ("Robust Regression", robust_model),
        ("Ridge Regression", ridge_cv),
        ("Lasso Regression", lasso_cv),
        ("Elastic Net Regression", elastic_net_cv),
    )
]

results_df = pd.DataFrame(results)
# Error metrics become fixed-precision strings for display; R² stays numeric
# and is rounded to five decimals.
results_df["MAE"] = results_df["MAE"].map("{:.2f}".format)
results_df["MSE"] = results_df["MSE"].map("{:.0f}".format)
results_df["RMSE"] = results_df["RMSE"].map("{:.2f}".format)
results_df["R²"] = results_df["R²"].round(5)
results_df = results_df[["Model", "MAE", "MSE", "RMSE", "R²"]]
display(results_df)

Unnamed: 0,Model,MAE,MSE,RMSE,R²
0,Robust Regression,44374739.96,16216612101825666,127344462.39,0.4005
1,Ridge Regression,41840861.0,8327797246062684,91256765.48,0.69214
2,Lasso Regression,42107769.96,8328182681729332,91258877.28,0.69212
3,Elastic Net Regression,41945075.77,8327558004406748,91255454.66,0.69215
