In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, classification_report, make_scorer
from sklearn.svm import SVC
from surprise import Dataset, Reader, SVD
from train_valid_test_loader import load_train_valid_test_datasets

# Load user and movie information
user_info = pd.read_csv("../data_movie_lens_100k/user_info.csv")
movie_info = pd.read_csv("../data_movie_lens_100k/movie_info.csv")

# Drop unnecessary columns
user_info = user_info.drop(columns=['orig_user_id'])
movie_info = movie_info.drop(columns=['orig_item_id'])

# Ensure proper column naming
user_info.columns = ['user_id', 'age', 'is_male']
movie_info.columns = ['item_id', 'title', 'release_year']

# Load datasets
train_tuple, valid_tuple, test_tuple, n_users, n_items = load_train_valid_test_datasets()

# Convert tuples to a `surprise` dataset
def tuple_to_surprise_dataset(tupl):
    ratings_dict = {
        "userID": tupl[0],
        "itemID": tupl[1],
        "rating": tupl[2],
    }
    df = pd.DataFrame(ratings_dict)
    reader = Reader(rating_scale=(1, 5))
    dataset = Dataset.load_from_df(df[["userID", "itemID", "rating"]], reader)
    return dataset

# Movie Title Feature Extraction using TF-IDF
titles = movie_info['title'].fillna('')
tfidf = TfidfVectorizer(max_features=100)
title_vectors = tfidf.fit_transform(titles).toarray()
title_dict = {item: title_vectors[i] for i, item in enumerate(movie_info['item_id'])}

# Normalize or bucket movie years
movie_info['year_normalized'] = (movie_info['release_year'] - movie_info['release_year'].min()) / (movie_info['release_year'].max() - movie_info['release_year'].min())
year_dict = {item: movie_info.loc[movie_info['item_id'] == item, 'year_normalized'].values[0] for item in movie_info['item_id']}

# Function to create feature vectors
def create_features(user_id, item_id, user_factors, item_factors, n_factors):
    zero_vector = np.zeros(n_factors)

    user_emb = user_factors.get(user_id, zero_vector)
    item_emb = item_factors.get(item_id, zero_vector)

    # Add demographic features
    user_info_row = user_info[user_info['user_id'] == user_id]
    if not user_info_row.empty:
        is_male = user_info_row.iloc[0]['is_male']
        age = user_info_row.iloc[0]['age']
    else:
        is_male = 0
        age = 0

    # Add movie title and year features
    title_emb = title_dict.get(item_id, np.zeros(100))
    year_emb = year_dict.get(item_id, 0)

    return np.concatenate((user_emb, item_emb, [is_male, age], title_emb, [year_emb]))

# Prepare the final DataFrame for training
def prepare_features_and_labels(tupl, user_factors, item_factors, n_factors, threshold=4.5):
    ratings_df = pd.DataFrame({
        'userID': tupl[0],
        'itemID': tupl[1],
        'rating': tupl[2]
    })
    ratings_df['features'] = ratings_df.apply(lambda row: create_features(row['userID'], row['itemID'], user_factors, item_factors, n_factors), axis=1)
    ratings_df['label'] = (ratings_df['rating'] >= threshold).astype(int)
    X = np.stack(ratings_df['features'])
    y = ratings_df['label']
    return X, y

# Function to train and extract features using SVD
def train_svd_and_prepare_features(n_factors):
    # Train SVD model using the training set
    trainset = tuple_to_surprise_dataset(train_tuple).build_full_trainset()
    algo = SVD(n_factors=n_factors)
    algo.fit(trainset)

    # Extract user and item factors
    user_factors = {trainset.to_raw_uid(uid): algo.pu[uid] for uid in range(trainset.n_users)}
    item_factors = {trainset.to_raw_iid(iid): algo.qi[iid] for iid in range(trainset.n_items)}

    # Create features and labels
    X_train, y_train = prepare_features_and_labels(train_tuple, user_factors, item_factors, n_factors)
    X_valid, y_valid = prepare_features_and_labels(valid_tuple, user_factors, item_factors, n_factors)
    X_test, y_test = prepare_features_and_labels(test_tuple, user_factors, item_factors, n_factors)

    # Combine training and validation for model selection
    X_combined = np.vstack((X_train, X_valid))
    y_combined = np.hstack((y_train, y_valid))

    return X_combined, y_combined, X_test, y_test

# Function to tune SVM with RBF kernel using GridSearchCV
def tune_and_evaluate_svm(X_combined, y_combined, X_test, y_test):
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.01, 0.001]
    }

    svm = SVC(kernel='rbf', probability=True, random_state=42)
    grid_search = GridSearchCV(svm, param_grid, scoring='roc_auc', cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), n_jobs=-1, verbose=10)
    grid_search.fit(X_combined, y_combined)

    best_svm = grid_search.best_estimator_
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation AUC Score: {grid_search.best_score_:.4f}")

    # Test the best model
    y_pred_proba = best_svm.predict_proba(X_test)[:, 1]
    y_pred = best_svm.predict(X_test)

    print(f"\nTest Set AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# Evaluate models for different `n_factors`
for n_factors in [2, 10, 50]:
    print(f"\nEvaluating SVM models with n_factors = {n_factors}\n")
    X_combined, y_combined, X_test, y_test = train_svd_and_prepare_features(n_factors)
    tune_and_evaluate_svm(X_combined, y_combined, X_test, y_test)



Evaluating SVM models with n_factors = 2

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5; 1/16] START C=0.1, gamma=scale.........................................
[CV 3/5; 3/16] START C=0.1, gamma=0.01..........................................
[CV 2/5; 1/16] START C=0.1, gamma=scale.........................................[CV 4/5; 2/16] START C=0.1, gamma=auto..........................................
[CV 3/5; 1/16] START C=0.1, gamma=scale.........................................

[CV 4/5; 1/16] START C=0.1, gamma=scale.........................................
[CV 4/5; 3/16] START C=0.1, gamma=0.01..........................................
[CV 5/5; 2/16] START C=0.1, gamma=auto..........................................
[CV 2/5; 2/16] START C=0.1, gamma=auto..........................................
[CV 1/5; 2/16] START C=0.1, gamma=auto..........................................[CV 5/5; 3/16] START C=0.1, gamma=0.01..........................................
[CV 5/

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}