In [19]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold

In [20]:
# Load the dataset that every model cell below draws its features and target from.
# The path is relative, so the CSV must sit in the kernel's working directory.
df = pd.read_csv('cleaned_movies.csv')

In [21]:
# Columns kept out of the feature matrix: the target itself plus three others.
# NOTE(review): 'popularity' is presumably the raw value 'popularity_class' was
# derived from (dropping it would avoid target leakage) - confirm.
excluded_columns = ['genres', 'original_language', 'popularity_class', 'popularity']

# Independent copy of the remaining columns, so later edits to X cannot touch df.
X = df.drop(columns=excluded_columns).copy()

# Classification target.
y = df['popularity_class']

# Show the dtype of each remaining feature column.
print(X.dtypes)


budget            int64
runtime         float64
vote_average    float64
vote_count        int64
revenue           int64
release_year      int64
genres_score    float64
dtype: object


# SVM

In [22]:
from sklearn.svm import SVC

# Per-fold scores for the SVM; the cell that follows writes these four lists to CSV.
accuracies, precisions, recalls, f1s = [], [], [], []

# Standardise the features, then fit a support-vector classifier with its
# default hyper-parameters.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svc', SVC()),
])

# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Keyword arguments shared by the three macro-averaged metrics.
# zero_division=0 scores a class with no predicted/true samples as 0 instead of warning.
macro_kwargs = {'average': 'macro', 'zero_division': 0}

# 5-fold cross-validation: refit the pipeline on each training split and
# score it on the held-out split.
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # fit() returns the fitted pipeline, so prediction can be chained onto it.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, **macro_kwargs)
    rec = recall_score(y_test, y_pred, **macro_kwargs)
    f1 = f1_score(y_test, y_pred, **macro_kwargs)

    # Record this fold's scores in the matching result list.
    for scores, value in zip((accuracies, precisions, recalls, f1s), (acc, prec, rec, f1)):
        scores.append(value)

    print(f"Fold {fold+1} - Acc: {acc:.4f} | Prec: {prec:.4f} | Rec: {rec:.4f} | F1: {f1:.4f}")


# Average of each metric over the five folds.
print("\n--- Summary ---")
print(f"Mean Accuracy: {np.mean(accuracies):.4f}")
print(f"Mean Precision: {np.mean(precisions):.4f}")
print(f"Mean Recall: {np.mean(recalls):.4f}")
print(f"Mean F1-score: {np.mean(f1s):.4f}")


Fold 1 - Acc: 0.6760 | Prec: 0.4169 | Rec: 0.4089 | F1: 0.4066
Fold 2 - Acc: 0.6604 | Prec: 0.3624 | Rec: 0.3613 | F1: 0.3499
Fold 3 - Acc: 0.6719 | Prec: 0.3630 | Rec: 0.3798 | F1: 0.3614
Fold 4 - Acc: 0.6781 | Prec: 0.3802 | Rec: 0.3984 | F1: 0.3771
Fold 5 - Acc: 0.6927 | Prec: 0.3776 | Rec: 0.3884 | F1: 0.3818

--- Summary ---
Mean Accuracy: 0.6758
Mean Precision: 0.3800
Mean Recall: 0.3874
Mean F1-score: 0.3753


In [23]:
# Save the per-fold results to CSV: one row per fold, one column per metric.
# The four lists hold whatever the most recently executed model cell left in
# them, so run this cell directly after the SVM cell.
fold_scores = {
    'acc': accuracies,
    'prec': precisions,
    'rec': recalls,
    'f1': f1s,
}
df_results = pd.DataFrame(fold_scores)
df_results.to_csv("2a_classification_no_resample_SVC.csv", index=False)

# KNN

In [24]:
from sklearn.neighbors import KNeighborsClassifier

# Fresh per-fold score lists for KNN (these names are reused by every model cell).
accuracies, precisions, recalls, f1s = [], [], [], []

# Feature scaling followed by a 5-nearest-neighbours classifier.
scaler_step = ('scaler', StandardScaler())
knn_step = ('knn', KNeighborsClassifier(n_neighbors=5))
pipeline = Pipeline([scaler_step, knn_step])

# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 5-fold cross-validation
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
    # Slice features and target with the positional indices of this fold.
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # The three macro-averaged metrics share one call signature; a class with
    # no samples to divide by is scored as 0 (zero_division=0).
    prec, rec, f1 = (
        metric(y_test, y_pred, average='macro', zero_division=0)
        for metric in (precision_score, recall_score, f1_score)
    )

    accuracies += [acc]
    precisions += [prec]
    recalls += [rec]
    f1s += [f1]

    print(f"Fold {fold+1} - Acc: {acc:.4f} | Prec: {prec:.4f} | Rec: {rec:.4f} | F1: {f1:.4f}")


# Average of each metric over the five folds.
print("\n--- Summary ---")
print(f"Mean Accuracy: {np.mean(accuracies):.4f}")
print(f"Mean Precision: {np.mean(precisions):.4f}")
print(f"Mean Recall: {np.mean(recalls):.4f}")
print(f"Mean F1-score: {np.mean(f1s):.4f}")

Fold 1 - Acc: 0.5594 | Prec: 0.3486 | Rec: 0.3228 | F1: 0.3320
Fold 2 - Acc: 0.5938 | Prec: 0.3510 | Rec: 0.3295 | F1: 0.3349
Fold 3 - Acc: 0.5615 | Prec: 0.3712 | Rec: 0.3089 | F1: 0.3297
Fold 4 - Acc: 0.5687 | Prec: 0.3000 | Rec: 0.2929 | F1: 0.2899
Fold 5 - Acc: 0.6052 | Prec: 0.3457 | Rec: 0.3072 | F1: 0.3217

--- Summary ---
Mean Accuracy: 0.5777
Mean Precision: 0.3433
Mean Recall: 0.3123
Mean F1-score: 0.3217


In [25]:
# Save the KNN per-fold results to CSV (a row per fold, a column per metric).
# The lists are shared with the other model cells, so this must run right
# after the KNN cell to capture the KNN scores.
knn_scores = {'acc': accuracies, 'prec': precisions, 'rec': recalls, 'f1': f1s}
df_results = pd.DataFrame(data=knn_scores)
df_results.to_csv("2a_classification_no_resample_KNN.csv", index=False)

# DECISION TREE

In [26]:
from sklearn.tree import DecisionTreeClassifier

# Per-fold scores for the decision tree; the cell that follows writes them to CSV.
accuracies = []
precisions = []
recalls = []
f1s = []

# Single-step pipeline: the tree is fitted on the unscaled features
# (no StandardScaler step here, unlike the SVM and KNN pipelines).
pipeline = Pipeline([
    ('tree', DecisionTreeClassifier(random_state=42))
])

# Same splitter settings as the other model cells, so every model is scored
# on identical folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)


# 5-fold cross-validation
# y is passed to split() to match the SVM and KNN cells. KFold does not use it
# to build the indices, so the folds are unchanged, but a splitter that needs
# the labels (e.g. a stratified one) can then be swapped in without editing
# this loop.
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Refit from scratch on this fold's training split, then score the held-out split.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # Macro average: unweighted mean over classes; zero_division=0 scores an
    # undefined per-class value as 0 instead of emitting a warning.
    prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
    rec = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1s.append(f1)

    print(f"Fold {fold+1} - Acc: {acc:.4f} | Prec: {prec:.4f} | Rec: {rec:.4f} | F1: {f1:.4f}")


# Average of each metric over the five folds.
print("\n--- Summary ---")
print(f"Mean Accuracy: {np.mean(accuracies):.4f}")
print(f"Mean Precision: {np.mean(precisions):.4f}")
print(f"Mean Recall: {np.mean(recalls):.4f}")
print(f"Mean F1-score: {np.mean(f1s):.4f}")


Fold 1 - Acc: 0.6542 | Prec: 0.4626 | Rec: 0.4512 | F1: 0.4512
Fold 2 - Acc: 0.6615 | Prec: 0.4137 | Rec: 0.3956 | F1: 0.3977
Fold 3 - Acc: 0.6531 | Prec: 0.4104 | Rec: 0.3785 | F1: 0.3904
Fold 4 - Acc: 0.6687 | Prec: 0.4169 | Rec: 0.4238 | F1: 0.4161
Fold 5 - Acc: 0.6687 | Prec: 0.3964 | Rec: 0.3706 | F1: 0.3804

--- Summary ---
Mean Accuracy: 0.6613
Mean Precision: 0.4200
Mean Recall: 0.4039
Mean F1-score: 0.4072


In [27]:
# Save the decision-tree per-fold results to CSV, without the index column.
# The score lists are overwritten by each model cell, so run this immediately
# after the decision-tree cell.
metric_columns = {
    'acc': accuracies,
    'prec': precisions,
    'rec': recalls,
    'f1': f1s,
}
df_results = pd.DataFrame(metric_columns)
df_results.to_csv(path_or_buf="2a_classification_no_resample_DT.csv", index=False)

# MULTILAYER PERCEPTRON (MLP)

In [28]:
from sklearn.neural_network import MLPClassifier

# Per-fold scores for the MLP; the cell that follows writes them to CSV.
accuracies = []
precisions = []
recalls = []
f1s = []

mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),     # two hidden layers: 100 and 50 neurons
    activation='relu',                # rectified linear unit in the hidden layers
    solver='adam',                    # stochastic gradient-based optimiser
    learning_rate_init=0.001,         # initial step size (this is scikit-learn's default value)
    max_iter=1000,                    # upper bound on training epochs
    early_stopping=True,              # hold out part of each training split; stop when its score stalls
    random_state=42                   # fixed seed for weight init and the early-stopping hold-out
)

# Scale the features before the network sees them.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', mlp)
])


# Same splitter settings as the other model cells, so every model is scored
# on identical folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)


# 5-fold cross-validation
# y is passed to split() to match the SVM and KNN cells. KFold does not use it
# to build the indices, so the folds are unchanged, but a splitter that needs
# the labels (e.g. a stratified one) can then be swapped in without editing
# this loop.
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Refit scaler and network on this fold's training split only, then
    # predict the held-out split.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # Macro average: unweighted mean over classes; zero_division=0 scores an
    # undefined per-class value as 0 instead of emitting a warning.
    prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
    rec = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1s.append(f1)

    print(f"Fold {fold+1} - Acc: {acc:.4f} | Prec: {prec:.4f} | Rec: {rec:.4f} | F1: {f1:.4f}")


# Average of each metric over the five folds.
print("\n--- Summary ---")
print(f"Mean Accuracy: {np.mean(accuracies):.4f}")
print(f"Mean Precision: {np.mean(precisions):.4f}")
print(f"Mean Recall: {np.mean(recalls):.4f}")
print(f"Mean F1-score: {np.mean(f1s):.4f}")


Fold 1 - Acc: 0.7104 | Prec: 0.4396 | Rec: 0.4380 | F1: 0.4236
Fold 2 - Acc: 0.6990 | Prec: 0.3756 | Rec: 0.3907 | F1: 0.3763
Fold 3 - Acc: 0.7063 | Prec: 0.3993 | Rec: 0.4121 | F1: 0.3981
Fold 4 - Acc: 0.7135 | Prec: 0.4046 | Rec: 0.4281 | F1: 0.3939
Fold 5 - Acc: 0.7312 | Prec: 0.4416 | Rec: 0.4214 | F1: 0.4206

--- Summary ---
Mean Accuracy: 0.7121
Mean Precision: 0.4122
Mean Recall: 0.4181
Mean F1-score: 0.4025


In [29]:
# Save the MLP per-fold results to CSV: one row per fold, one column per metric.
# The score lists are shared across model cells, so run this right after the
# MLP cell.
mlp_scores = {
    'acc': accuracies,
    'prec': precisions,
    'rec': recalls,
    'f1': f1s,
}
df_results = pd.DataFrame(mlp_scores)
df_results.to_csv("2a_classification_no_resample_MLP.csv", index=False)