In [38]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler  
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score,accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold

In [39]:
df = pd.read_csv('cleaned_movies.csv')

In [40]:
X = df.drop(columns=['genres', 'original_language', 'popularity_class', 'popularity']).copy()
y = df['popularity']


print(X.dtypes)

budget            int64
runtime         float64
vote_average    float64
vote_count        int64
revenue           int64
release_year      int64
genres_score    float64
dtype: object


In [41]:
df_all_results = pd.DataFrame()

In [42]:
# data for discretization of results

bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, y.max()]
labels = list(range(10))

# SVR

In [43]:
from sklearn.svm import SVR

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

kf = KFold(n_splits=5, shuffle=True, random_state=42)

mae_scores = []
rmse_scores = []
r2_scores = []

acc_scores = []
prec_scores = []
rec_scores = []
f1_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # MAE
    mae = mean_absolute_error(y_test, y_pred)
    mae_scores.append(mae)

    # RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)
    
    # R2
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    
    
    y_pred_disc = pd.cut(y_pred, bins=bins, labels=labels, include_lowest=True)
    y_test_disc = pd.cut(y_test, bins=bins, labels=labels, include_lowest=True)
    
    mask = (~pd.isna(y_pred_disc)) & (~pd.isna(y_test_disc))
    y_pred_disc = y_pred_disc[mask]
    y_test_disc = y_test_disc[mask]

    acc = accuracy_score(y_test_disc, y_pred_disc)
    prec = precision_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    rec = recall_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    f1 = f1_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    
    acc_scores.append(acc)
    prec_scores.append(prec)
    rec_scores.append(rec)
    f1_scores.append(f1)


print("\nMean Absolute Error (per fold):", mae_scores)
print("Mean MAE:", np.mean(mae_scores))

print("\nRoot Mean Squared Error (per fold):", rmse_scores)
print("Mean RMSE:", np.mean(rmse_scores))

print("\nR2 - Coefficient of Determination (per fold):", r2_scores)
print("Mean R2:", np.mean(r2_scores))

print("\n--- Summary after discretization---")
print(f"Mean Accuracy: {np.mean(acc_scores):.4f}")
print(f"Mean Precision: {np.mean(prec_scores):.4f}")
print(f"Mean Recall: {np.mean(rec_scores):.4f}")
print(f"Mean F1-score: {np.mean(f1_scores):.4f}")



Mean Absolute Error (per fold): [8.154750199044871, 7.243405171130566, 7.667763959612371, 6.816424033017546, 7.514772968074236]
Mean MAE: 7.479423266175918

Root Mean Squared Error (per fold): [np.float64(29.013212425233103), np.float64(18.011617336263093), np.float64(34.193871518736024), np.float64(17.627722679103854), np.float64(24.637311973589224)]
Mean RMSE: 24.696747186585057

R2 - Coefficient of Determination (per fold): [0.3011643009537627, 0.5369426969027662, 0.23095035536891606, 0.53535485752024, 0.3720123190651886]
Mean R2: 0.3952849059621747

--- Summary after discretization---
Mean Accuracy: 0.6261
Mean Precision: 0.2991
Mean Recall: 0.3212
Mean F1-score: 0.3030


In [44]:
# saving results to df
df_results = pd.DataFrame({
    'MAE': mae_scores,
    'RMSE': rmse_scores,
    'R2': r2_scores
})
df_results.to_csv("3a_regression_no_resample_SVR.csv", index=False)

In [45]:
# saving results to df (for comparison with classification)
df_results = pd.DataFrame({
    'acc': acc_scores,
    'prec': prec_scores,
    'rec': rec_scores,
    'f1': f1_scores
})
df_results.to_csv("3a_classification_no_resample_SVR.csv", index=False)

# KNN

In [46]:
from sklearn.neighbors import KNeighborsRegressor

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=5))
])

kf = KFold(n_splits=5, shuffle=True, random_state=42)

mae_scores = []
rmse_scores = []
r2_scores = []

acc_scores = []
prec_scores = []
rec_scores = []
f1_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mae_scores.append(mae)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)
    
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    
    y_pred_disc = pd.cut(y_pred, bins=bins, labels=labels, include_lowest=True)
    y_test_disc = pd.cut(y_test, bins=bins, labels=labels, include_lowest=True)
    
    mask = (~pd.isna(y_pred_disc)) & (~pd.isna(y_test_disc))
    y_pred_disc = y_pred_disc[mask]
    y_test_disc = y_test_disc[mask]

    acc = accuracy_score(y_test_disc, y_pred_disc)
    prec = precision_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    rec = recall_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    f1 = f1_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    
    acc_scores.append(acc)
    prec_scores.append(prec)
    rec_scores.append(rec)
    f1_scores.append(f1)



print("\nMean Absolute Error (per fold):", mae_scores)
print("Mean MAE:", np.mean(mae_scores))

print("\nRoot Mean Squared Error (per fold):", rmse_scores)
print("Mean RMSE:", np.mean(rmse_scores))

print("\nR2 - Coefficient of Determination (per fold):", r2_scores)
print("Mean R2:", np.mean(r2_scores))

print("\n--- Summary after discretization---")
print(f"Mean Accuracy: {np.mean(acc_scores):.4f}")
print(f"Mean Precision: {np.mean(prec_scores):.4f}")
print(f"Mean Recall: {np.mean(rec_scores):.4f}")
print(f"Mean F1-score: {np.mean(f1_scores):.4f}")


Mean Absolute Error (per fold): [7.662202478541667, 6.2844298768749995, 7.332017008125001, 6.1111715135416675, 6.528933535833334]
Mean MAE: 6.783750882583334

Root Mean Squared Error (per fold): [np.float64(20.692759813765775), np.float64(12.493182395201714), np.float64(28.896761304110278), np.float64(12.082789819537052), np.float64(18.29223241102172)]
Mean RMSE: 18.491545148727305

R2 - Coefficient of Determination (per fold): [0.6445158183327899, 0.777220371260513, 0.45076757862236383, 0.781695129988779, 0.6538232471131935]
Mean R2: 0.6616044290635278

--- Summary after discretization---
Mean Accuracy: 0.6092
Mean Precision: 0.4261
Mean Recall: 0.3932
Mean F1-score: 0.4032


In [47]:
# saving resoults to df
df_results = pd.DataFrame({
    'MAE': mae_scores,
    'RMSE': rmse_scores,
    'R2': r2_scores
})
df_results.to_csv("3a_regression_no_resample_KNN.csv", index=False)

In [48]:
# saving results to df (for comparison with classification)
df_results = pd.DataFrame({
    'acc': acc_scores,
    'prec': prec_scores,
    'rec': rec_scores,
    'f1': f1_scores
})
df_results.to_csv("3a_classification_no_resample_KNN.csv", index=False)

# DECISION TREE

In [49]:
from sklearn.tree import DecisionTreeRegressor

pipeline = Pipeline([
    ('tree', DecisionTreeRegressor(random_state=42))
])

kf = KFold(n_splits=5, shuffle=True, random_state=42)

mae_scores = []
rmse_scores = []
r2_scores = []

acc_scores = []
prec_scores = []
rec_scores = []
f1_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mae_scores.append(mae)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)

    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    
    y_pred_disc = pd.cut(y_pred, bins=bins, labels=labels, include_lowest=True)
    y_test_disc = pd.cut(y_test, bins=bins, labels=labels, include_lowest=True)
    
    mask = (~pd.isna(y_pred_disc)) & (~pd.isna(y_test_disc))
    y_pred_disc = y_pred_disc[mask]
    y_test_disc = y_test_disc[mask]

    acc = accuracy_score(y_test_disc, y_pred_disc)
    prec = precision_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    rec = recall_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    f1 = f1_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    
    acc_scores.append(acc)
    prec_scores.append(prec)
    rec_scores.append(rec)
    f1_scores.append(f1)


print("\nMean Absolute Error (per fold):", mae_scores)
print("Mean MAE:", np.mean(mae_scores))

print("\nRoot Mean Squared Error (per fold):", rmse_scores)
print("Mean RMSE:", np.mean(rmse_scores))

print("\nR2 - Coefficient of Determination (per fold):", r2_scores)
print("Mean R2:", np.mean(r2_scores))

print("\n--- Summary after discretization---")
print(f"Mean Accuracy: {np.mean(acc_scores):.4f}")
print(f"Mean Precision: {np.mean(prec_scores):.4f}")
print(f"Mean Recall: {np.mean(rec_scores):.4f}")
print(f"Mean F1-score: {np.mean(f1_scores):.4f}")


Mean Absolute Error (per fold): [7.671989844791666, 6.700015874479167, 7.0359288979166665, 6.501938765625, 7.579194231250001]
Mean MAE: 7.097813522812499

Root Mean Squared Error (per fold): [np.float64(23.73738417453535), np.float64(15.770834756542909), np.float64(25.627264368222114), np.float64(16.440500481059434), np.float64(31.222975928481976)]
Mean RMSE: 22.559791941768356

R2 - Coefficient of Determination (per fold): [0.5322119061966866, 0.6449915605819778, 0.5680213102297483, 0.5958346752736858, -0.008586305674304295]
Mean R2: 0.46649462932155883

--- Summary after discretization---
Mean Accuracy: 0.6512
Mean Precision: 0.3927
Mean Recall: 0.3904
Mean F1-score: 0.3895


In [50]:
# saving resoults to df
df_results = pd.DataFrame({
    'MAE': mae_scores,
    'RMSE': rmse_scores,
    'R2': r2_scores
})
df_results.to_csv("3a_regression_no_resample_DT.csv", index=False)

In [51]:
# saving results to df (for comparison with classification)
df_results = pd.DataFrame({
    'acc': acc_scores,
    'prec': prec_scores,
    'rec': rec_scores,
    'f1': f1_scores
})
df_results.to_csv("3a_classification_no_resample_DT.csv", index=False)

# MULTILAYER PERCEPTRON (MLP)

In [52]:
from sklearn.neural_network import MLPRegressor

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, early_stopping=True, random_state=42))
])

kf = KFold(n_splits=5, shuffle=True, random_state=42)

mae_scores = []
rmse_scores = []
r2_scores = []

acc_scores = []
prec_scores = []
rec_scores = []
f1_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mae_scores.append(mae)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)
    
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    
    y_pred_disc = pd.cut(y_pred, bins=bins, labels=labels, include_lowest=True)
    y_test_disc = pd.cut(y_test, bins=bins, labels=labels, include_lowest=True)
    
    mask = (~pd.isna(y_pred_disc)) & (~pd.isna(y_test_disc))
    y_pred_disc = y_pred_disc[mask]
    y_test_disc = y_test_disc[mask]

    acc = accuracy_score(y_test_disc, y_pred_disc)
    prec = precision_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    rec = recall_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    f1 = f1_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    
    acc_scores.append(acc)
    prec_scores.append(prec)
    rec_scores.append(rec)
    f1_scores.append(f1)


print("\nMean Absolute Error (per fold):", mae_scores)
print("Mean MAE:", np.mean(mae_scores))

print("\nRoot Mean Squared Error (per fold):", rmse_scores)
print("Mean RMSE:", np.mean(rmse_scores))

print("\nR2 - Coefficient of Determination (per fold):", r2_scores)
print("Mean R2:", np.mean(r2_scores))

print("\n--- Summary after discretization---")
print(f"Mean Accuracy: {np.mean(acc_scores):.4f}")
print(f"Mean Precision: {np.mean(prec_scores):.4f}")
print(f"Mean Recall: {np.mean(rec_scores):.4f}")
print(f"Mean F1-score: {np.mean(f1_scores):.4f}")


Mean Absolute Error (per fold): [6.8593376411122255, 5.46854984869584, 5.6613948223681145, 5.907352611817044, 5.561251766625787]
Mean MAE: 5.891577338123802

Root Mean Squared Error (per fold): [np.float64(22.809452561336304), np.float64(12.28277856869783), np.float64(25.723111090214392), np.float64(12.562142182717448), np.float64(15.202357049654006)]
Mean RMSE: 17.715968290523996

R2 - Coefficient of Determination (per fold): [0.5680701973829114, 0.7846610654705007, 0.5647840417961101, 0.7640302179510564, 0.7608962786732084]
Mean R2: 0.6884883602547573

--- Summary after discretization---
Mean Accuracy: 0.6798
Mean Precision: 0.4451
Mean Recall: 0.4390
Mean F1-score: 0.4355


In [53]:
# saving resoults to df
df_results = pd.DataFrame({
    'MAE': mae_scores,
    'RMSE': rmse_scores,
    'R2': r2_scores
})
df_results.to_csv("3a_regression_no_resample_MLP.csv", index=False)

In [54]:
# saving results to df (for comparison with classification)
df_results = pd.DataFrame({
    'acc': acc_scores,
    'prec': prec_scores,
    'rec': rec_scores,
    'f1': f1_scores
})
df_results.to_csv("3a_classification_no_resample_MLP.csv", index=False)