In [18]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [19]:
df = pd.read_csv('cleaned_movies.csv')

In [20]:
X = df.drop(columns=['genres', 'original_language', 'popularity_class', 'popularity']).copy()
y = df['popularity_class']

print(X.dtypes)

budget              int64
runtime           float64
vote_average      float64
vote_count          int64
revenue             int64
release_year        int64
genres_score      float64
language_score    float64
dtype: object


SVM

In [21]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
from sklearn.svm import SVC

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC())
])

# 5-fold cross-validation (for accuracy score)
scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print("Accuracy score for each fold:", scores)
print("Mean accuracy score:", scores.mean())

y_pred = cross_val_predict(pipeline, X, y, cv=5)

print("Classification Report (Precision, Recall, F1):\n")
print(classification_report(y, y_pred))


Accuracy score for each fold: [0.51770833 0.8875     0.91354167 0.88854167 0.821875  ]
Mean accuracy score: 0.8058333333333334
Classification Report (Precision, Recall, F1):

              precision    recall  f1-score   support

           0       0.89      0.87      0.88      3085
           1       0.70      0.70      0.70      1553
           2       0.35      0.50      0.41       162

    accuracy                           0.81      4800
   macro avg       0.65      0.69      0.67      4800
weighted avg       0.81      0.81      0.81      4800


KNN

In [22]:
from sklearn.neighbors import KNeighborsClassifier

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5))
])

scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print("Accuracy score for each fold:", scores)
print("Mean accuracy score:", scores.mean())

y_pred = cross_val_predict(pipeline, X, y, cv=5)

print("Classification Report (Precision, Recall, F1):\n")
print(classification_report(y, y_pred))

Accuracy score for each fold: [0.471875   0.79166667 0.85       0.83854167 0.78229167]
Mean accuracy score: 0.746875
Classification Report (Precision, Recall, F1):

              precision    recall  f1-score   support

           0       0.84      0.84      0.84      3085
           1       0.62      0.59      0.60      1553
           2       0.31      0.46      0.37       162

    accuracy                           0.75      4800
   macro avg       0.59      0.63      0.60      4800
weighted avg       0.75      0.75      0.75      4800


DECISION TREE

In [23]:
from sklearn.tree import DecisionTreeClassifier

pipeline = Pipeline([
    ('tree', DecisionTreeClassifier(random_state=42))
])

# 5-fold cross-validation
scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print("Accuracy score for each fold:", scores)
print("Mean accuracy score:", scores.mean())

y_pred = cross_val_predict(pipeline, X, y, cv=5)

print("Classification Report (Precision, Recall, F1):\n")
print(classification_report(y, y_pred))


Accuracy score for each fold: [0.48020833 0.82395833 0.87916667 0.81979167 0.77083333]
Mean accuracy score: 0.7547916666666667
Classification Report (Precision, Recall, F1):

              precision    recall  f1-score   support

           0       0.86      0.85      0.85      3085
           1       0.63      0.60      0.62      1553
           2       0.28      0.49      0.36       162

    accuracy                           0.75      4800
   macro avg       0.59      0.65      0.61      4800
weighted avg       0.77      0.75      0.76      4800


MULTILAYER PERCEPTRON (MLP)

In [24]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),     # two hidden layers: 100 and 50 neurons
    activation='relu',                # a commonly used activation function
    solver='adam',                    # modern optimizer, better than 'sgd' in most cases
    learning_rate_init=0.001,         # lower learning rate improves stability
    max_iter=1000,                    # more iterations to allow convergence
    early_stopping=True,              # stops training if validation score doesn't improve
    random_state=42                   # ensures reproducibility
)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', mlp)
])

# 5-fold cross-validation
scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print("Accuracy score for each fold:", scores)
print("Mean accuracy score:", scores.mean())

y_pred = cross_val_predict(pipeline, X, y, cv=5)

print("\nClassification Report (Precision, Recall, F1-score):")
print(classification_report(y, y_pred))


Accuracy score for each fold: [0.56666667 0.88020833 0.91875    0.91041667 0.865625  ]
Mean accuracy score: 0.8283333333333331

Classification Report (Precision, Recall, F1-score):
              precision    recall  f1-score   support

           0       0.92      0.88      0.90      3085
           1       0.73      0.76      0.74      1553
           2       0.37      0.54      0.44       162

    accuracy                           0.83      4800
   macro avg       0.67      0.73      0.69      4800
weighted avg       0.84      0.83      0.83      4800
