In [25]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler  
from custom_metrics import f_beta_regression

In [26]:
df = pd.read_csv('cleaned_movies.csv')

SVM

In [27]:
X = df.drop(columns=['genres', 'original_language', 'popularity_class', 'popularity']).copy()
y = df['popularity']


print(X.dtypes)

budget              int64
runtime           float64
vote_average      float64
vote_count          int64
revenue             int64
release_year        int64
genres_score      float64
language_score    float64
dtype: object


In [28]:
from sklearn.svm import SVR # if the import is needed only in one cell should we put it here or on the top? 

# we use rbf kernel, bc we do not have linear data
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])
    
# 5-fold cross-validation (for R^2 score)
scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')

print("R² score for each fold:", scores)
print("Mean R² score:", scores.mean())

# we have to adjust metrics

R² score for each fold: [-0.23557032  0.68975404  0.71698825  0.66255143  0.20916165]
Mean R² score: 0.4085770095922269


In [29]:
# svm with custom metric - F1 score for regression
from sklearn.model_selection import KFold
import numpy as np

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

kf = KFold(n_splits=5, shuffle=True, random_state=42)

f1_scores = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    f1 = f_beta_regression(y_test.values, y_pred, beta=1.0, threshold=10)
    f1_scores.append(f1)

print("F1 adapted for regression per fold:", f1_scores)
print("Mean F1 adapted for regression:", np.mean(f1_scores))

F1 adapted for regression per fold: [np.float64(0.5917076548142687), np.float64(0.592436313219766), np.float64(0.6121203155753612), np.float64(0.5937048135561629), np.float64(0.6114060013989775)]
Mean F1 adapted for regression: 0.6002750197129073


KNN

In [30]:
from sklearn.neighbors import KNeighborsRegressor

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=5))
])

scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')
print("R² score for each fold:", scores)
print("Mean R² score:", scores.mean())
# we have to adjust metrics

R² score for each fold: [0.35060872 0.76855449 0.79609064 0.76641059 0.63277432]
Mean R² score: 0.6628877518338261
