In [12]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold

In [13]:
# Load the movie table from 'cleaned_movies.csv'; the relative path is resolved
# against the kernel's current working directory.
df = pd.read_csv('cleaned_movies.csv')

In [14]:
# Columns kept out of the feature matrix: the target itself ('popularity_class'),
# 'popularity', 'genres' and 'original_language'.
# NOTE(review): 'popularity' is presumably the value the class label is derived
# from (dropping it would then avoid target leakage) -- confirm against the
# cleaning notebook.
excluded_columns = ['genres', 'original_language', 'popularity_class', 'popularity']

# .copy() detaches X from df so later changes to X cannot touch the loaded table.
X = df.drop(columns=excluded_columns).copy()
y = df['popularity_class']

# Show the dtype of every remaining feature column.
print(X.dtypes)


budget            int64
runtime         float64
vote_average    float64
vote_count        int64
revenue           int64
release_year      int64
genres_score    float64
dtype: object


# SVM

In [15]:
from sklearn.svm import SVC

# Per-fold score histories for the SVC run (one value appended per fold).
accuracies, precisions, recalls, f1s = [], [], [], []

# Standardise the features, then fit an SVC with its default hyperparameters.
# The scaler is a pipeline step, so it is re-fitted on every training split.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svc', SVC()),
])

# Shuffled 10-way split; the fixed seed keeps the fold assignment repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# 10-fold cross-validation
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Pipeline.fit returns the fitted pipeline, so the prediction can be chained.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # Macro-averaged scores; zero_division=0 scores an undefined ratio as 0.
    prec, rec, f1 = (
        scorer(y_test, y_pred, average='macro', zero_division=0)
        for scorer in (precision_score, recall_score, f1_score)
    )

    # Record this fold's scores in the matching history list.
    for metric_log, fold_score in zip((accuracies, precisions, recalls, f1s),
                                      (acc, prec, rec, f1)):
        metric_log.append(fold_score)

    print(f"Fold {fold+1} - Acc: {acc:.4f} | Prec: {prec:.4f} | Rec: {rec:.4f} | F1: {f1:.4f}")


# Averages over the 10 folds.
print("\n--- Summary ---")
print(f"Mean Accuracy: {np.mean(accuracies):.4f}")
print(f"Mean Precision: {np.mean(precisions):.4f}")
print(f"Mean Recall: {np.mean(recalls):.4f}")
print(f"Mean F1-score: {np.mean(f1s):.4f}")


Fold 1 - Acc: 0.6833 | Prec: 0.4685 | Rec: 0.4256 | F1: 0.4319
Fold 2 - Acc: 0.6708 | Prec: 0.3853 | Rec: 0.3929 | F1: 0.3875
Fold 3 - Acc: 0.6875 | Prec: 0.3751 | Rec: 0.3836 | F1: 0.3627
Fold 4 - Acc: 0.6312 | Prec: 0.3446 | Rec: 0.3354 | F1: 0.3313
Fold 5 - Acc: 0.6813 | Prec: 0.4243 | Rec: 0.4010 | F1: 0.3914
Fold 6 - Acc: 0.6813 | Prec: 0.4260 | Rec: 0.4277 | F1: 0.4184
Fold 7 - Acc: 0.6854 | Prec: 0.3680 | Rec: 0.4152 | F1: 0.3650
Fold 8 - Acc: 0.6792 | Prec: 0.3698 | Rec: 0.3909 | F1: 0.3741
Fold 9 - Acc: 0.6937 | Prec: 0.3572 | Rec: 0.3815 | F1: 0.3633
Fold 10 - Acc: 0.6937 | Prec: 0.3819 | Rec: 0.3753 | F1: 0.3763

--- Summary ---
Mean Accuracy: 0.6787
Mean Precision: 0.3901
Mean Recall: 0.3929
Mean F1-score: 0.3802


In [16]:
# Save the per-fold SVC scores (one row per fold) to CSV, without the index column.
df_results = pd.DataFrame(data={
    'acc': accuracies,
    'prec': precisions,
    'rec': recalls,
    'f1': f1s,
})
df_results.to_csv(path_or_buf="2a_classification_no_resample_SVC.csv", index=False)

# KNN

In [17]:
from sklearn.neighbors import KNeighborsClassifier

# Per-fold score histories for the KNN run (one value appended per fold).
accuracies, precisions, recalls, f1s = [], [], [], []

# Standardise the features, then classify with the 5 nearest neighbours.
# The scaler is a pipeline step, so it is re-fitted on every training split.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])

# Shuffled 10-way split; the fixed seed keeps the fold assignment repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# 10-fold cross-validation
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Pipeline.fit returns the fitted pipeline, so the prediction can be chained.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # Macro-averaged scores; zero_division=0 scores an undefined ratio as 0.
    prec, rec, f1 = (
        scorer(y_test, y_pred, average='macro', zero_division=0)
        for scorer in (precision_score, recall_score, f1_score)
    )

    # Record this fold's scores in the matching history list.
    for metric_log, fold_score in zip((accuracies, precisions, recalls, f1s),
                                      (acc, prec, rec, f1)):
        metric_log.append(fold_score)

    print(f"Fold {fold+1} - Acc: {acc:.4f} | Prec: {prec:.4f} | Rec: {rec:.4f} | F1: {f1:.4f}")


# Averages over the 10 folds.
print("\n--- Summary ---")
print(f"Mean Accuracy: {np.mean(accuracies):.4f}")
print(f"Mean Precision: {np.mean(precisions):.4f}")
print(f"Mean Recall: {np.mean(recalls):.4f}")
print(f"Mean F1-score: {np.mean(f1s):.4f}")

Fold 1 - Acc: 0.5750 | Prec: 0.3947 | Rec: 0.3450 | F1: 0.3529
Fold 2 - Acc: 0.5687 | Prec: 0.3462 | Rec: 0.3151 | F1: 0.3266
Fold 3 - Acc: 0.6229 | Prec: 0.3950 | Rec: 0.3451 | F1: 0.3552
Fold 4 - Acc: 0.5813 | Prec: 0.3272 | Rec: 0.3202 | F1: 0.3221
Fold 5 - Acc: 0.5563 | Prec: 0.3064 | Rec: 0.2848 | F1: 0.2926
Fold 6 - Acc: 0.5813 | Prec: 0.4530 | Rec: 0.3355 | F1: 0.3692
Fold 7 - Acc: 0.5625 | Prec: 0.3166 | Rec: 0.3040 | F1: 0.3035
Fold 8 - Acc: 0.5854 | Prec: 0.3372 | Rec: 0.3141 | F1: 0.3176
Fold 9 - Acc: 0.6000 | Prec: 0.3267 | Rec: 0.3105 | F1: 0.3146
Fold 10 - Acc: 0.6042 | Prec: 0.3713 | Rec: 0.3262 | F1: 0.3411

--- Summary ---
Mean Accuracy: 0.5837
Mean Precision: 0.3574
Mean Recall: 0.3201
Mean F1-score: 0.3295


In [18]:
# Save the per-fold KNN scores (one row per fold) to CSV, without the index column.
df_results = pd.DataFrame(data={
    'acc': accuracies,
    'prec': precisions,
    'rec': recalls,
    'f1': f1s,
})
df_results.to_csv(path_or_buf="2a_classification_no_resample_KNN.csv", index=False)

# DECISION TREE

In [19]:
from sklearn.tree import DecisionTreeClassifier

# Per-fold score histories for the decision-tree run (one value appended per fold).
accuracies = []
precisions = []
recalls = []
f1s = []

# Single-step pipeline: the tree is fitted on the unscaled features (no
# StandardScaler step here, unlike the SVM and KNN cells). random_state pins
# the tree's internal randomness so repeated runs build the same tree.
pipeline = Pipeline([
    ('tree', DecisionTreeClassifier(random_state=42))
])

# Shuffled 10-way split; the fixed seed keeps the fold assignment repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)


# 10-fold cross-validation (n_splits=10).
# y is passed to split() to match the SVM and KNN cells. KFold ignores it, so
# the folds are unchanged, but every cell now uses the same call and keeps
# working if the splitter is replaced by one that needs the labels.
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Refit from scratch on this fold's training rows, then score the held-out rows.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Macro averaging weights every class equally; zero_division=0 scores an
    # undefined ratio as 0.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
    rec = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1s.append(f1)

    print(f"Fold {fold+1} - Acc: {acc:.4f} | Prec: {prec:.4f} | Rec: {rec:.4f} | F1: {f1:.4f}")


# Averages over the 10 folds.
print("\n--- Summary ---")
print(f"Mean Accuracy: {np.mean(accuracies):.4f}")
print(f"Mean Precision: {np.mean(precisions):.4f}")
print(f"Mean Recall: {np.mean(recalls):.4f}")
print(f"Mean F1-score: {np.mean(f1s):.4f}")


Fold 1 - Acc: 0.6271 | Prec: 0.4005 | Rec: 0.3927 | F1: 0.3945
Fold 2 - Acc: 0.6854 | Prec: 0.4541 | Rec: 0.4537 | F1: 0.4529
Fold 3 - Acc: 0.6542 | Prec: 0.4262 | Rec: 0.4478 | F1: 0.4282
Fold 4 - Acc: 0.6438 | Prec: 0.4184 | Rec: 0.3968 | F1: 0.4024
Fold 5 - Acc: 0.6292 | Prec: 0.4120 | Rec: 0.3690 | F1: 0.3856
Fold 6 - Acc: 0.6708 | Prec: 0.3768 | Rec: 0.3674 | F1: 0.3713
Fold 7 - Acc: 0.6813 | Prec: 0.4068 | Rec: 0.4152 | F1: 0.3989
Fold 8 - Acc: 0.6438 | Prec: 0.3808 | Rec: 0.3770 | F1: 0.3751
Fold 9 - Acc: 0.6625 | Prec: 0.3960 | Rec: 0.3936 | F1: 0.3926
Fold 10 - Acc: 0.6583 | Prec: 0.4261 | Rec: 0.3925 | F1: 0.4000

--- Summary ---
Mean Accuracy: 0.6556
Mean Precision: 0.4098
Mean Recall: 0.4006
Mean F1-score: 0.4001


In [20]:
# Save the per-fold decision-tree scores (one row per fold) to CSV, without the index column.
df_results = pd.DataFrame(data={
    'acc': accuracies,
    'prec': precisions,
    'rec': recalls,
    'f1': f1s,
})
df_results.to_csv(path_or_buf="2a_classification_no_resample_DT.csv", index=False)

# MULTILAYER PERCEPTRON (MLP)

In [21]:
from sklearn.neural_network import MLPClassifier

# Per-fold score histories for the MLP run (one value appended per fold).
accuracies = []
precisions = []
recalls = []
f1s = []

mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),     # two hidden layers: 100 and 50 neurons
    activation='relu',                # ReLU activation in the hidden layers
    solver='adam',                    # Adam optimiser
    learning_rate_init=0.001,         # initial step size (same as the scikit-learn default)
    max_iter=1000,                    # upper bound on training epochs
    early_stopping=True,              # hold out part of each training split and stop
                                      # once the validation score stops improving
    random_state=42                   # fixes weight init and the validation hold-out
)

# Standardise the features before the network. The scaler is a pipeline step,
# so it is re-fitted on every training split.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', mlp)
])


# Shuffled 10-way split; the fixed seed keeps the fold assignment repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)


# 10-fold cross-validation (n_splits=10).
# y is passed to split() to match the SVM and KNN cells. KFold ignores it, so
# the folds are unchanged, but every cell now uses the same call and keeps
# working if the splitter is replaced by one that needs the labels.
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Refit from scratch on this fold's training rows, then score the held-out rows.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Macro averaging weights every class equally; zero_division=0 scores an
    # undefined ratio as 0.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
    rec = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1s.append(f1)

    print(f"Fold {fold+1} - Acc: {acc:.4f} | Prec: {prec:.4f} | Rec: {rec:.4f} | F1: {f1:.4f}")


# Averages over the 10 folds.
print("\n--- Summary ---")
print(f"Mean Accuracy: {np.mean(accuracies):.4f}")
print(f"Mean Precision: {np.mean(precisions):.4f}")
print(f"Mean Recall: {np.mean(recalls):.4f}")
print(f"Mean F1-score: {np.mean(f1s):.4f}")


Fold 1 - Acc: 0.7000 | Prec: 0.3995 | Rec: 0.4271 | F1: 0.4074
Fold 2 - Acc: 0.7333 | Prec: 0.3970 | Rec: 0.4455 | F1: 0.4156
Fold 3 - Acc: 0.6958 | Prec: 0.3409 | Rec: 0.3793 | F1: 0.3534
Fold 4 - Acc: 0.7125 | Prec: 0.3824 | Rec: 0.4072 | F1: 0.3913
Fold 5 - Acc: 0.6875 | Prec: 0.4104 | Rec: 0.3910 | F1: 0.3831
Fold 6 - Acc: 0.7125 | Prec: 0.4612 | Rec: 0.4748 | F1: 0.4608
Fold 7 - Acc: 0.6917 | Prec: 0.3982 | Rec: 0.4303 | F1: 0.3932
Fold 8 - Acc: 0.7250 | Prec: 0.4034 | Rec: 0.4198 | F1: 0.3896
Fold 9 - Acc: 0.7312 | Prec: 0.4371 | Rec: 0.4505 | F1: 0.4377
Fold 10 - Acc: 0.7354 | Prec: 0.4554 | Rec: 0.4458 | F1: 0.4420

--- Summary ---
Mean Accuracy: 0.7125
Mean Precision: 0.4085
Mean Recall: 0.4271
Mean F1-score: 0.4074


In [22]:
# Save the per-fold MLP scores (one row per fold) to CSV, without the index column.
df_results = pd.DataFrame(data={
    'acc': accuracies,
    'prec': precisions,
    'rec': recalls,
    'f1': f1s,
})
df_results.to_csv(path_or_buf="2a_classification_no_resample_MLP.csv", index=False)