In [98]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold

In [99]:
# Load the pre-cleaned movie table; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='cleaned_movies.csv')

In [100]:
# Target: the class label that every model below is trained to predict.
y = df['popularity_class']

# Features: everything except the target itself and three other columns.
# NOTE(review): 'popularity' is presumably the continuous value the class was
# derived from, and 'genres' / 'original_language' presumably the raw
# categoricals behind genres_score / language_score -- confirm against the
# cleaning notebook.
X = df.drop(
    labels=['genres', 'original_language', 'popularity_class', 'popularity'],
    axis='columns',
).copy()

# Show the dtype of each remaining feature column.
print(X.dtypes)


budget              int64
runtime           float64
vote_average      float64
vote_count          int64
revenue             int64
release_year        int64
genres_score      float64
language_score    float64
dtype: object


# SVM

In [101]:
from sklearn.svm import SVC

# Scaling lives inside the pipeline, so the scaler is re-fit on the training
# part of every fold and never sees that fold's held-out rows.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svc', SVC()),  # default hyper-parameters
])

# Shuffled 5-fold split with a fixed seed so the folds are repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Options shared by the three macro-averaged metrics.
macro_opts = {'average': 'macro', 'zero_division': 0}

# Per-fold scores, one entry appended per fold.
accuracies, precisions, recalls, f1s = [], [], [], []

# 5-fold cross-validation
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # fit() returns the pipeline itself, so predict() can be chained.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, **macro_opts)
    rec = recall_score(y_test, y_pred, **macro_opts)
    f1 = f1_score(y_test, y_pred, **macro_opts)

    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1s.append(f1)

    print(f"Fold {fold+1} - Acc: {acc:.4f} | Prec: {prec:.4f} | Rec: {rec:.4f} | F1: {f1:.4f}")


# Average each metric over the five folds.
print("\n--- Summary ---")
print(f"Mean Accuracy: {np.mean(accuracies):.4f}")
print(f"Mean Precision: {np.mean(precisions):.4f}")
print(f"Mean Recall: {np.mean(recalls):.4f}")
print(f"Mean F1-score: {np.mean(f1s):.4f}")


Fold 1 - Acc: 0.9021 | Prec: 0.8548 | Rec: 0.7709 | F1: 0.8024
Fold 2 - Acc: 0.8927 | Prec: 0.8141 | Rec: 0.7281 | F1: 0.7583
Fold 3 - Acc: 0.9219 | Prec: 0.8450 | Rec: 0.8289 | F1: 0.8366
Fold 4 - Acc: 0.9042 | Prec: 0.8116 | Rec: 0.7467 | F1: 0.7728
Fold 5 - Acc: 0.9167 | Prec: 0.9188 | Rec: 0.7940 | F1: 0.8399

--- Summary ---
Mean Accuracy: 0.9075
Mean Precision: 0.8489
Mean Recall: 0.7737
Mean F1-score: 0.8020


# KNN

In [102]:
from sklearn.neighbors import KNeighborsClassifier

# Standardise the features before the 5-nearest-neighbour vote; the scaler is
# a pipeline step, so it is re-fit on each training fold.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])

# Shuffled 5-fold split with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold scores, filled in by the loop below.
accuracies, precisions, recalls, f1s = [], [], [], []

# 5-fold cross-validation
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Accuracy plus macro-averaged precision / recall / F1 on the held-out fold.
    acc, prec, rec, f1 = (
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred, average='macro', zero_division=0),
        recall_score(y_test, y_pred, average='macro', zero_division=0),
        f1_score(y_test, y_pred, average='macro', zero_division=0),
    )

    for bucket, value in (
        (accuracies, acc),
        (precisions, prec),
        (recalls, rec),
        (f1s, f1),
    ):
        bucket.append(value)

    print(f"Fold {fold+1} - Acc: {acc:.4f} | Prec: {prec:.4f} | Rec: {rec:.4f} | F1: {f1:.4f}")


# Average each metric over the five folds.
print("\n--- Summary ---")
print(f"Mean Accuracy: {np.mean(accuracies):.4f}")
print(f"Mean Precision: {np.mean(precisions):.4f}")
print(f"Mean Recall: {np.mean(recalls):.4f}")
print(f"Mean F1-score: {np.mean(f1s):.4f}")

Fold 1 - Acc: 0.8604 | Prec: 0.7965 | Rec: 0.7347 | F1: 0.7604
Fold 2 - Acc: 0.8667 | Prec: 0.8123 | Rec: 0.7239 | F1: 0.7561
Fold 3 - Acc: 0.8802 | Prec: 0.8507 | Rec: 0.7546 | F1: 0.7931
Fold 4 - Acc: 0.8583 | Prec: 0.8183 | Rec: 0.6991 | F1: 0.7399
Fold 5 - Acc: 0.8823 | Prec: 0.8654 | Rec: 0.7619 | F1: 0.8028

--- Summary ---
Mean Accuracy: 0.8696
Mean Precision: 0.8286
Mean Recall: 0.7348
Mean F1-score: 0.7704


# DECISION TREE

In [103]:
from sklearn.tree import DecisionTreeClassifier

# Per-fold scores, one entry appended per fold.
accuracies = []
precisions = []
recalls = []
f1s = []

# No scaler step here: tree splits are threshold comparisons on one feature at
# a time, so standardising the features would not change the fitted tree. The
# single-step Pipeline is kept so this cell has the same shape as the other
# model cells. random_state pins the tree's tie-breaking for repeatability.
pipeline = Pipeline([
    ('tree', DecisionTreeClassifier(random_state=42))
])

# Same splitter settings (and seed) as the other model cells.
kf = KFold(n_splits=5, shuffle=True, random_state=42)


# 5-fold cross-validation
# y is passed to split() to match the SVM and KNN cells. KFold ignores it, so
# the folds are unchanged, but the call stays valid if the splitter is ever
# swapped for StratifiedKFold, whose split() requires y.
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Accuracy plus macro-averaged precision / recall / F1; zero_division=0
    # scores a class with no predictions as 0 instead of emitting a warning.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
    rec = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1s.append(f1)

    print(f"Fold {fold+1} - Acc: {acc:.4f} | Prec: {prec:.4f} | Rec: {rec:.4f} | F1: {f1:.4f}")


# Average each metric over the five folds.
print("\n--- Summary ---")
print(f"Mean Accuracy: {np.mean(accuracies):.4f}")
print(f"Mean Precision: {np.mean(precisions):.4f}")
print(f"Mean Recall: {np.mean(recalls):.4f}")
print(f"Mean F1-score: {np.mean(f1s):.4f}")


Fold 1 - Acc: 0.8865 | Prec: 0.7916 | Rec: 0.7916 | F1: 0.7916
Fold 2 - Acc: 0.8917 | Prec: 0.7944 | Rec: 0.8027 | F1: 0.7984
Fold 3 - Acc: 0.8854 | Prec: 0.7602 | Rec: 0.8417 | F1: 0.7910
Fold 4 - Acc: 0.8802 | Prec: 0.7566 | Rec: 0.7374 | F1: 0.7464
Fold 5 - Acc: 0.8823 | Prec: 0.7902 | Rec: 0.7812 | F1: 0.7854

--- Summary ---
Mean Accuracy: 0.8852
Mean Precision: 0.7786
Mean Recall: 0.7909
Mean F1-score: 0.7826


# MULTILAYER PERCEPTRON (MLP)

In [104]:
from sklearn.neural_network import MLPClassifier

# Per-fold scores, one entry appended per fold.
accuracies = []
precisions = []
recalls = []
f1s = []

mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),     # two hidden layers: 100 and 50 neurons
    activation='relu',                # scikit-learn's default activation
    solver='adam',                    # scikit-learn's default optimizer
    learning_rate_init=0.001,         # scikit-learn's default initial step size (not a lowered value)
    max_iter=1000,                    # upper bound on training epochs (library default is 200)
    early_stopping=True,              # holds out part of each training fold and stops when its score stalls
    random_state=42                   # fixes weight init and the early-stopping hold-out split
)

# The scaler is a pipeline step, so it is re-fit on each training fold and
# never sees that fold's held-out rows.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', mlp)
])


# Same splitter settings (and seed) as the other model cells.
kf = KFold(n_splits=5, shuffle=True, random_state=42)


# 5-fold cross-validation
# y is passed to split() to match the SVM and KNN cells. KFold ignores it, so
# the folds are unchanged, but the call stays valid if the splitter is ever
# swapped for StratifiedKFold, whose split() requires y.
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Accuracy plus macro-averaged precision / recall / F1; zero_division=0
    # scores a class with no predictions as 0 instead of emitting a warning.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
    rec = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1s.append(f1)

    print(f"Fold {fold+1} - Acc: {acc:.4f} | Prec: {prec:.4f} | Rec: {rec:.4f} | F1: {f1:.4f}")


# Average each metric over the five folds.
print("\n--- Summary ---")
print(f"Mean Accuracy: {np.mean(accuracies):.4f}")
print(f"Mean Precision: {np.mean(precisions):.4f}")
print(f"Mean Recall: {np.mean(recalls):.4f}")
print(f"Mean F1-score: {np.mean(f1s):.4f}")


Fold 1 - Acc: 0.9156 | Prec: 0.8692 | Rec: 0.8241 | F1: 0.8438
Fold 2 - Acc: 0.8938 | Prec: 0.8094 | Rec: 0.7561 | F1: 0.7772
Fold 3 - Acc: 0.9240 | Prec: 0.8258 | Rec: 0.8060 | F1: 0.8154
Fold 4 - Acc: 0.9094 | Prec: 0.8026 | Rec: 0.8018 | F1: 0.8022
Fold 5 - Acc: 0.9208 | Prec: 0.9356 | Rec: 0.8131 | F1: 0.8569

--- Summary ---
Mean Accuracy: 0.9127
Mean Precision: 0.8485
Mean Recall: 0.8002
Mean F1-score: 0.8191
