In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler  
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score,accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold

In [2]:
df = pd.read_csv('cleaned_movies.csv')

In [3]:
# Predictors: every column of `df` except the four listed ones (the target
# itself plus three columns that are excluded from the feature set).
X = df.drop(
    ['genres', 'original_language', 'popularity_class', 'popularity'],
    axis='columns',
).copy()

# Regression target.
y = df.loc[:, 'popularity']

# Show the dtype of every remaining predictor column.
print(X.dtypes)

budget            int64
runtime         float64
vote_average    float64
vote_count        int64
revenue           int64
release_year      int64
genres_score    float64
dtype: object


In [4]:
# Empty frame, presumably meant to collect results across models.
# NOTE(review): not referenced by any of the cells visible here - confirm it is still needed.
df_all_results = pd.DataFrame(data=None)

In [5]:
# Bin edges and class labels used to discretize popularity values (true and
# predicted) so that regression output can be scored with classification
# metrics.
#
# Classes are 10 units wide: class 0 is (-inf, 10], class k is (10k, 10k + 10]
# and class 9 is (90, +inf). The outer edges are open-ended on purpose:
#   * a prediction below 0 or above the largest observed popularity still gets
#     a class instead of NaN from pd.cut;
#   * the edges stay strictly increasing whatever the data looks like (a
#     data-dependent last edge <= 90 would make pd.cut raise).
bins = [-np.inf, 10, 20, 30, 40, 50, 60, 70, 80, 90, np.inf]

# One integer label per interval, derived from `bins` so the two cannot drift apart.
labels = list(range(len(bins) - 1))

# SVR

In [6]:
from sklearn.svm import SVR

# 10-fold cross-validated evaluation of an SVR regressor on the target `y`.
# The scaler is part of the pipeline, so it is re-fit on the training part of
# every fold and never sees the held-out rows.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold regression metrics.
mae_scores = []
rmse_scores = []
r2_scores = []

# Per-fold classification metrics, computed after discretizing with `bins`.
acc_scores = []
prec_scores = []
rec_scores = []
f1_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # MAE
    mae = mean_absolute_error(y_test, y_pred)
    mae_scores.append(mae)

    # RMSE (cast to a plain float so the printed list has no np.float64(...) wrappers)
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    rmse_scores.append(rmse)

    # R2
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

    # Discretize true and predicted values into the shared classes.
    # Values are clipped to the outer bin edges first: a value outside
    # [bins[0], bins[-1]] would otherwise become NaN in pd.cut, and dropping
    # those rows would score each model on a different, self-selected subset
    # of the fold. After clipping, every test row is counted, with
    # out-of-range values falling into the first / last class.
    y_pred_disc = pd.cut(
        np.clip(y_pred, bins[0], bins[-1]),
        bins=bins, labels=labels, include_lowest=True,
    )
    y_test_disc = pd.cut(
        y_test.clip(lower=bins[0], upper=bins[-1]),
        bins=bins, labels=labels, include_lowest=True,
    )

    acc = accuracy_score(y_test_disc, y_pred_disc)
    prec = precision_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    rec = recall_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    f1 = f1_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)

    acc_scores.append(acc)
    prec_scores.append(prec)
    rec_scores.append(rec)
    f1_scores.append(f1)


print("\nMean Absolute Error (per fold):", mae_scores)
print("Mean MAE:", np.mean(mae_scores))

print("\nRoot Mean Squared Error (per fold):", rmse_scores)
print("Mean RMSE:", np.mean(rmse_scores))

print("\nR2 - Coefficient of Determination (per fold):", r2_scores)
print("Mean R2:", np.mean(r2_scores))

print("\n--- Summary after discretization---")
print(f"Mean Accuracy: {np.mean(acc_scores):.4f}")
print(f"Mean Precision: {np.mean(prec_scores):.4f}")
print(f"Mean Recall: {np.mean(rec_scores):.4f}")
print(f"Mean F1-score: {np.mean(f1_scores):.4f}")



Mean Absolute Error (per fold): [7.41312994650845, 8.708325767301574, 5.710517754433817, 8.581828416052577, 8.658994436064292, 6.402859886850007, 6.012738580246524, 7.261751357302816, 6.881389692582141, 7.857072922783762]
Mean MAE: 7.348860876012596

Root Mean Squared Error (per fold): [np.float64(19.5837638042677), np.float64(35.795542299228345), np.float64(12.303923171564739), np.float64(21.97960549075109), np.float64(41.43124672246635), np.float64(24.706199902948754), np.float64(11.284902520840058), np.float64(21.86735100483833), np.float64(24.24518249022419), np.float64(24.775871099062933)]
Mean RMSE: 23.797358850619247

R2 - Coefficient of Determination (per fold): [0.4931460859659469, 0.22453250515649448, 0.7016031833104159, 0.4563236907935959, 0.17190443768798547, 0.36771562248878786, 0.7409097723366802, 0.4347352443068989, 0.3712693892023966, 0.3843004407511994]
Mean R2: 0.43464403720004013

--- Summary after discretization---
Mean Accuracy: 0.6331
Mean Precision: 0.3066
Mean 

In [7]:
# Write the per-fold SVR regression metrics to CSV (one row per CV fold).
df_results = pd.DataFrame(
    data={'MAE': mae_scores, 'RMSE': rmse_scores, 'R2': r2_scores}
)
df_results.to_csv(path_or_buf="3a_regression_no_resample_SVR.csv", index=False)

In [8]:
# Write the per-fold SVR metrics obtained after discretization to CSV, so they
# can be compared with the classification experiments.
df_results = pd.DataFrame(
    data={'acc': acc_scores, 'prec': prec_scores, 'rec': rec_scores, 'f1': f1_scores}
)
df_results.to_csv(path_or_buf="3a_classification_no_resample_SVR.csv", index=False)

# KNN

In [9]:
from sklearn.neighbors import KNeighborsRegressor

# 10-fold cross-validated evaluation of a 5-nearest-neighbours regressor on the
# target `y`. The scaler is part of the pipeline, so it is re-fit on the
# training part of every fold and never sees the held-out rows.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=5))
])

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold regression metrics.
mae_scores = []
rmse_scores = []
r2_scores = []

# Per-fold classification metrics, computed after discretizing with `bins`.
acc_scores = []
prec_scores = []
rec_scores = []
f1_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mae_scores.append(mae)

    # Cast to a plain float so the printed list has no np.float64(...) wrappers.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    rmse_scores.append(rmse)

    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

    # Discretize true and predicted values into the shared classes.
    # Values are clipped to the outer bin edges first: a value outside
    # [bins[0], bins[-1]] would otherwise become NaN in pd.cut, and dropping
    # those rows would score each model on a different, self-selected subset
    # of the fold. After clipping, every test row is counted, with
    # out-of-range values falling into the first / last class.
    y_pred_disc = pd.cut(
        np.clip(y_pred, bins[0], bins[-1]),
        bins=bins, labels=labels, include_lowest=True,
    )
    y_test_disc = pd.cut(
        y_test.clip(lower=bins[0], upper=bins[-1]),
        bins=bins, labels=labels, include_lowest=True,
    )

    acc = accuracy_score(y_test_disc, y_pred_disc)
    prec = precision_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    rec = recall_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    f1 = f1_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)

    acc_scores.append(acc)
    prec_scores.append(prec)
    rec_scores.append(rec)
    f1_scores.append(f1)



print("\nMean Absolute Error (per fold):", mae_scores)
print("Mean MAE:", np.mean(mae_scores))

print("\nRoot Mean Squared Error (per fold):", rmse_scores)
print("Mean RMSE:", np.mean(rmse_scores))

print("\nR2 - Coefficient of Determination (per fold):", r2_scores)
print("Mean R2:", np.mean(r2_scores))

print("\n--- Summary after discretization---")
print(f"Mean Accuracy: {np.mean(acc_scores):.4f}")
print(f"Mean Precision: {np.mean(prec_scores):.4f}")
print(f"Mean Recall: {np.mean(rec_scores):.4f}")
print(f"Mean F1-score: {np.mean(f1_scores):.4f}")


Mean Absolute Error (per fold): [7.480623522083333, 8.0344884925, 5.452123916666666, 6.992389756666667, 8.006981311666667, 6.3714481720833325, 5.8238093666666675, 6.3829650566666665, 6.0944557420833325, 6.86932173625]
Mean MAE: 6.750860707333333

Root Mean Squared Error (per fold): [np.float64(17.452729080469144), np.float64(25.021066267582626), np.float64(10.10088048205653), np.float64(14.13294886499602), np.float64(34.962820694272736), np.float64(20.765249888354184), np.float64(8.460177649323844), np.float64(14.896544911574527), np.float64(15.57328558185318), np.float64(16.719209866197087)]
Mean RMSE: 17.808491328667987

R2 - Coefficient of Determination (per fold): [0.5974524629785032, 0.6211063456296779, 0.7988938141852971, 0.7752155844198929, 0.41029151158815325, 0.5533422511127588, 0.8543822307195041, 0.7376804513536654, 0.7405974828523392, 0.7196229412881066]
Mean R2: 0.6808585076127898

--- Summary after discretization---
Mean Accuracy: 0.6173
Mean Precision: 0.4291
Mean Recal

In [10]:
# Write the per-fold KNN regression metrics to CSV (one row per CV fold).
df_results = pd.DataFrame(
    data={'MAE': mae_scores, 'RMSE': rmse_scores, 'R2': r2_scores}
)
df_results.to_csv(path_or_buf="3a_regression_no_resample_KNN.csv", index=False)

In [11]:
# Write the per-fold KNN metrics obtained after discretization to CSV, so they
# can be compared with the classification experiments.
df_results = pd.DataFrame(
    data={'acc': acc_scores, 'prec': prec_scores, 'rec': rec_scores, 'f1': f1_scores}
)
df_results.to_csv(path_or_buf="3a_classification_no_resample_KNN.csv", index=False)

# DECISION TREE

In [12]:
from sklearn.tree import DecisionTreeRegressor

# 10-fold cross-validated evaluation of a decision-tree regressor on the target
# `y`. Unlike the other model cells, this pipeline has no scaling step.
pipeline = Pipeline([
    ('tree', DecisionTreeRegressor(random_state=42))
])

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold regression metrics.
mae_scores = []
rmse_scores = []
r2_scores = []

# Per-fold classification metrics, computed after discretizing with `bins`.
acc_scores = []
prec_scores = []
rec_scores = []
f1_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mae_scores.append(mae)

    # Cast to a plain float so the printed list has no np.float64(...) wrappers.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    rmse_scores.append(rmse)

    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

    # Discretize true and predicted values into the shared classes.
    # Values are clipped to the outer bin edges first: a value outside
    # [bins[0], bins[-1]] would otherwise become NaN in pd.cut, and dropping
    # those rows would score each model on a different, self-selected subset
    # of the fold. After clipping, every test row is counted, with
    # out-of-range values falling into the first / last class.
    y_pred_disc = pd.cut(
        np.clip(y_pred, bins[0], bins[-1]),
        bins=bins, labels=labels, include_lowest=True,
    )
    y_test_disc = pd.cut(
        y_test.clip(lower=bins[0], upper=bins[-1]),
        bins=bins, labels=labels, include_lowest=True,
    )

    acc = accuracy_score(y_test_disc, y_pred_disc)
    prec = precision_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    rec = recall_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    f1 = f1_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)

    acc_scores.append(acc)
    prec_scores.append(prec)
    rec_scores.append(rec)
    f1_scores.append(f1)


print("\nMean Absolute Error (per fold):", mae_scores)
print("Mean MAE:", np.mean(mae_scores))

print("\nRoot Mean Squared Error (per fold):", rmse_scores)
print("Mean RMSE:", np.mean(rmse_scores))

print("\nR2 - Coefficient of Determination (per fold):", r2_scores)
print("Mean R2:", np.mean(r2_scores))

print("\n--- Summary after discretization---")
print(f"Mean Accuracy: {np.mean(acc_scores):.4f}")
print(f"Mean Precision: {np.mean(prec_scores):.4f}")
print(f"Mean Recall: {np.mean(rec_scores):.4f}")
print(f"Mean F1-score: {np.mean(f1_scores):.4f}")


Mean Absolute Error (per fold): [8.771810583333336, 7.416432070833334, 6.051692344791666, 7.3774735229166675, 8.054223479166666, 5.993016248958332, 5.565093070833333, 6.6179541166666676, 6.261151345833334, 7.2668514520833325]
Mean MAE: 6.937569823541668

Root Mean Squared Error (per fold): [np.float64(37.88254440628121), np.float64(19.420977376041748), np.float64(13.001014038134183), np.float64(19.00353787316977), np.float64(35.141453233496065), np.float64(17.338832813551523), np.float64(12.500377740186332), np.float64(18.327227619347642), np.float64(21.275558363587827), np.float64(34.25752203988901)]
Mean RMSE: 22.81490455036853

R2 - Coefficient of Determination (per fold): [-0.8965694970010998, 0.7717305090685407, 0.6668334290669644, 0.5935851724264991, 0.40425022385602305, 0.6885844695198923, 0.6820918349443226, 0.6029427748131253, 0.5158548072828625, -0.1771257729237321]
Mean R2: 0.3852177951053398

--- Summary after discretization---
Mean Accuracy: 0.6567
Mean Precision: 0.4046


In [13]:
# Write the per-fold decision-tree regression metrics to CSV (one row per CV fold).
df_results = pd.DataFrame(
    data={'MAE': mae_scores, 'RMSE': rmse_scores, 'R2': r2_scores}
)
df_results.to_csv(path_or_buf="3a_regression_no_resample_DT.csv", index=False)

In [14]:
# Write the per-fold decision-tree metrics obtained after discretization to
# CSV, so they can be compared with the classification experiments.
df_results = pd.DataFrame(
    data={'acc': acc_scores, 'prec': prec_scores, 'rec': rec_scores, 'f1': f1_scores}
)
df_results.to_csv(path_or_buf="3a_classification_no_resample_DT.csv", index=False)

# MULTILAYER PERCEPTRON (MLP)

In [15]:
from sklearn.neural_network import MLPRegressor

# 10-fold cross-validated evaluation of an MLP regressor (hidden layers of 100
# and 50 units, early stopping enabled) on the target `y`. The scaler is part
# of the pipeline, so it is re-fit on the training part of every fold and never
# sees the held-out rows.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, early_stopping=True, random_state=42))
])

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold regression metrics.
mae_scores = []
rmse_scores = []
r2_scores = []

# Per-fold classification metrics, computed after discretizing with `bins`.
acc_scores = []
prec_scores = []
rec_scores = []
f1_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mae_scores.append(mae)

    # Cast to a plain float so the printed list has no np.float64(...) wrappers.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    rmse_scores.append(rmse)

    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

    # Discretize true and predicted values into the shared classes.
    # Values are clipped to the outer bin edges first: a value outside
    # [bins[0], bins[-1]] would otherwise become NaN in pd.cut, and dropping
    # those rows would score each model on a different, self-selected subset
    # of the fold. After clipping, every test row is counted, with
    # out-of-range values falling into the first / last class.
    y_pred_disc = pd.cut(
        np.clip(y_pred, bins[0], bins[-1]),
        bins=bins, labels=labels, include_lowest=True,
    )
    y_test_disc = pd.cut(
        y_test.clip(lower=bins[0], upper=bins[-1]),
        bins=bins, labels=labels, include_lowest=True,
    )

    acc = accuracy_score(y_test_disc, y_pred_disc)
    prec = precision_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    rec = recall_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)
    f1 = f1_score(y_test_disc, y_pred_disc, average='macro', zero_division=0)

    acc_scores.append(acc)
    prec_scores.append(prec)
    rec_scores.append(rec)
    f1_scores.append(f1)


print("\nMean Absolute Error (per fold):", mae_scores)
print("Mean MAE:", np.mean(mae_scores))

print("\nRoot Mean Squared Error (per fold):", rmse_scores)
print("Mean RMSE:", np.mean(rmse_scores))

print("\nR2 - Coefficient of Determination (per fold):", r2_scores)
print("Mean R2:", np.mean(r2_scores))

print("\n--- Summary after discretization---")
print(f"Mean Accuracy: {np.mean(acc_scores):.4f}")
print(f"Mean Precision: {np.mean(prec_scores):.4f}")
print(f"Mean Recall: {np.mean(rec_scores):.4f}")
print(f"Mean F1-score: {np.mean(f1_scores):.4f}")


Mean Absolute Error (per fold): [5.966967173844387, 6.788381353645287, 4.9385935763260385, 6.1964510001127815, 6.2031959062906195, 5.385788728477991, 4.605209214957548, 5.87495689303194, 5.394777371763027, 5.905588979441993]
Mean MAE: 5.725991019789161

Root Mean Squared Error (per fold): [np.float64(15.062406939451163), np.float64(25.20373693944538), np.float64(9.907257980234355), np.float64(12.697130223359062), np.float64(31.871283948891563), np.float64(17.6909137172008), np.float64(7.723167721454785), np.float64(14.712262652059277), np.float64(15.917267985089358), np.float64(14.424291785825469)]
Mean RMSE: 16.52097198930112

R2 - Coefficient of Determination (per fold): [0.7001671362031765, 0.6155537918262997, 0.8065298767150998, 0.8185688846463138, 0.5099689802466658, 0.675808907508582, 0.8786481667988071, 0.744130515024903, 0.7290115712131308, 0.7913107995818068]
Mean R2: 0.7269698629764786

--- Summary after discretization---
Mean Accuracy: 0.6885
Mean Precision: 0.4418
Mean Rec

In [16]:
# Write the per-fold MLP regression metrics to CSV (one row per CV fold).
df_results = pd.DataFrame(
    data={'MAE': mae_scores, 'RMSE': rmse_scores, 'R2': r2_scores}
)
df_results.to_csv(path_or_buf="3a_regression_no_resample_MLP.csv", index=False)

In [17]:
# Write the per-fold MLP metrics obtained after discretization to CSV, so they
# can be compared with the classification experiments.
df_results = pd.DataFrame(
    data={'acc': acc_scores, 'prec': prec_scores, 'rec': rec_scores, 'f1': f1_scores}
)
df_results.to_csv(path_or_buf="3a_classification_no_resample_MLP.csv", index=False)