In [44]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler  
from custom_metrics import f_beta_regression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold

In [45]:
df = pd.read_csv('cleaned_movies.csv')

In [46]:
X = df.drop(columns=['genres', 'original_language', 'popularity_class', 'popularity']).copy()
y = df['popularity']


print(X.dtypes)

budget              int64
runtime           float64
vote_average      float64
vote_count          int64
revenue             int64
release_year        int64
genres_score      float64
language_score    float64
dtype: object


SVR

In [47]:
from sklearn.svm import SVR

# we use rbf kernel, bc we do not have linear data
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])
    
# 5-fold cross-validation (for R^2 score)
scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')

print("R² score for each fold:", scores)
print("Mean R² score:", scores.mean())


R² score for each fold: [-0.23557032  0.68975404  0.71698825  0.66255143  0.20916165]
Mean R² score: 0.4085770095922269


In [48]:
# SVR with other metrics - MAE and RMSE
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

kf = KFold(n_splits=5, shuffle=True, random_state=42)

mae_scores = []
rmse_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # MAE
    mae = mean_absolute_error(y_test, y_pred)
    mae_scores.append(mae)

    # RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)

print("\nMean Absolute Error (per fold):", mae_scores)
print("Mean MAE:", np.mean(mae_scores))

print("\nRoot Mean Squared Error (per fold):", rmse_scores)
print("Mean RMSE:", np.mean(rmse_scores))



Mean Absolute Error (per fold): [8.328556375975795, 7.354506016916082, 7.773044329294107, 7.006787536671898, 7.549327181844288]
Mean MAE: 7.602444288140434

Root Mean Squared Error (per fold): [np.float64(29.340520756947875), np.float64(18.27946231502747), np.float64(34.40533326287847), np.float64(17.87459517249698), np.float64(24.88627649962935)]
Mean RMSE: 24.957237601396027


In [49]:
# SVR with custom metric - F1 score for regression
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

kf = KFold(n_splits=5, shuffle=True, random_state=42)

f1_reg_scores = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    f1_reg = f_beta_regression(y_test.values, y_pred, beta=1.0, threshold=10)
    f1_reg_scores.append(f1_reg)

print("F1 adapted for regression per fold:", f1_reg_scores)
print("Mean F1 adapted for regression:", np.mean(f1_reg_scores))

F1 adapted for regression per fold: [np.float64(0.5917076548142687), np.float64(0.592436313219766), np.float64(0.6121203155753612), np.float64(0.5937048135561629), np.float64(0.6114060013989775)]
Mean F1 adapted for regression: 0.6002750197129073


KNN

In [50]:
from sklearn.neighbors import KNeighborsRegressor

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=5))
])

r2_scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')
print("R² score per fold:", r2_scores)
print("Mean R² score:", np.mean(r2_scores))



R² score per fold: [0.35060872 0.76855449 0.79609064 0.76641059 0.63277432]
Mean R² score: 0.6628877518338261


In [51]:
# KNN with other metrics - MAE and RMSE
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=5))
])

kf = KFold(n_splits=5, shuffle=True, random_state=42)

mae_scores = []
rmse_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mae_scores.append(mae)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)


print("\nMean Absolute Error (per fold):", mae_scores)
print("Mean MAE:", np.mean(mae_scores))

print("\nRoot Mean Squared Error (per fold):", rmse_scores)
print("Mean RMSE:", np.mean(rmse_scores))


Mean Absolute Error (per fold): [7.685210776875, 6.344900424583333, 7.4304743547916665, 6.2416500260416665, 6.577483014166666]
Mean MAE: 6.855943719291666

Root Mean Squared Error (per fold): [np.float64(20.707623693285107), np.float64(12.574564345362754), np.float64(28.93072761401086), np.float64(12.164305158873695), np.float64(18.31534332710816)]
Mean RMSE: 18.53851282772812


In [52]:
# KNN with custom metric - F1 score for regression
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=5))
])

kf = KFold(n_splits=5, shuffle=True, random_state=42)

f1_reg_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    f1_reg = f_beta_regression(y_test.values, y_pred, beta=1.0, threshold=10)
    f1_reg_scores.append(f1_reg)

print("\nF1 adapted for regression (per fold):", f1_reg_scores)
print("Mean F1 adapted for regression:", np.mean(f1_reg_scores))


F1 adapted for regression (per fold): [np.float64(0.5980467726473441), np.float64(0.6566291208576934), np.float64(0.5602757440906866), np.float64(0.6430661615959127), np.float64(0.6114262628605103)]
Mean F1 adapted for regression: 0.6138888124104295


DECISION TREE

In [53]:
from sklearn.tree import DecisionTreeRegressor

pipeline = Pipeline([
    ('tree', DecisionTreeRegressor(random_state=42))
])

r2_scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')
print("R² score per fold:", r2_scores)
print("Mean R² score:", np.mean(r2_scores))


R² score per fold: [0.33895874 0.7113546  0.76787048 0.72709615 0.74166605]
Mean R² score: 0.6573892044470153


In [54]:
# Decision Tree with other metrics - MAE and RMSE
pipeline = Pipeline([
    ('tree', DecisionTreeRegressor(random_state=42))
])

kf = KFold(n_splits=5, shuffle=True, random_state=42)

mae_scores = []
rmse_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mae_scores.append(mae)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)

print("\nMean Absolute Error (per fold):", mae_scores)
print("Mean MAE:", np.mean(mae_scores))

print("\nRoot Mean Squared Error (per fold):", rmse_scores)
print("Mean RMSE:", np.mean(rmse_scores))


Mean Absolute Error (per fold): [8.422113748958333, 6.711666844270834, 7.289402811458334, 7.2628432385416675, 7.114660371875002]
Mean MAE: 7.3601374030208335

Root Mean Squared Error (per fold): [np.float64(32.236514167339934), np.float64(15.725577907056614), np.float64(26.588897433526725), np.float64(31.809472287147177), np.float64(25.309808584165552)]
Mean RMSE: 26.3340540758472


In [55]:
# Decision Tree with custom metric - F1 score for regression
pipeline = Pipeline([
    ('tree', DecisionTreeRegressor(random_state=42))
])

f1_reg_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    f1_reg = f_beta_regression(y_test.values, y_pred, beta=1.0, threshold=10)
    f1_reg_scores.append(f1_reg)

print("\nF1 adapted for regression (per fold):", f1_reg_scores)
print("Mean F1 adapted for regression:", np.mean(f1_reg_scores))


F1 adapted for regression (per fold): [np.float64(0.5485913729506213), np.float64(0.5834330449305349), np.float64(0.5738133617767052), np.float64(0.6088375841634754), np.float64(0.5861567105323495)]
Mean F1 adapted for regression: 0.5801664148707373


MULTILAYER PERCEPTRON (MLP)

In [56]:
from sklearn.neural_network import MLPRegressor

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, early_stopping=True, random_state=42))
])

r2_scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')
print("R² score per fold:", r2_scores)
print("Mean R² score:", np.mean(r2_scores))

R² score per fold: [0.45244089 0.76851603 0.79830366 0.81485726 0.6965166 ]
Mean R² score: 0.7061268855505594


In [57]:
# MLP with other metrics - MAE and RMSE
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, early_stopping=True, random_state=42))
])

kf = KFold(n_splits=5, shuffle=True, random_state=42)

mae_scores = []
rmse_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mae_scores.append(mae)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)

print("\nMean Absolute Error (per fold):", mae_scores)
print("Mean MAE:", np.mean(mae_scores))

print("\nRoot Mean Squared Error (per fold):", rmse_scores)
print("Mean RMSE:", np.mean(rmse_scores))



Mean Absolute Error (per fold): [7.222472267504016, 5.738928581482529, 5.411889637804772, 5.88629985362974, 5.319873823066804]
Mean MAE: 5.915892832697573

Root Mean Squared Error (per fold): [np.float64(20.415992926531896), np.float64(13.05584884588473), np.float64(25.151819341895997), np.float64(12.441966634600144), np.float64(14.964968050052928)]
Mean RMSE: 17.20611915979314


In [58]:
# MLP with custom metric - F1 score for regression
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, early_stopping=True, random_state=42))
])

f1_reg_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    f1_reg = f_beta_regression(y_test.values, y_pred, beta=1.0, threshold=10)
    f1_reg_scores.append(f1_reg)

print("\nF1 adapted for regression (per fold):", f1_reg_scores)
print("Mean F1 adapted for regression:", np.mean(f1_reg_scores))


F1 adapted for regression (per fold): [np.float64(0.5908760431529472), np.float64(0.6658107667255179), np.float64(0.6693766921445705), np.float64(0.6484482664962141), np.float64(0.6454128233529002)]
Mean F1 adapted for regression: 0.6439849183744298
