In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV, StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.svm import SVC
from scipy.sparse import hstack, csr_matrix, vstack
from surprise import Dataset, Reader, SVD
from train_valid_test_loader import load_train_valid_test_datasets

# Load user and movie information
user_info = pd.read_csv("../data_movie_lens_100k/user_info.csv")
movie_info = pd.read_csv("../data_movie_lens_100k/movie_info.csv")

# Drop unnecessary columns
user_info = user_info.drop(columns=['orig_user_id'])
movie_info = movie_info.drop(columns=['orig_item_id','title'])

# Ensure proper column naming
# NOTE(review): this renames by position, so it assumes the remaining CSV
# columns are in exactly this order -- confirm against the data files.
user_info.columns = ['user_id', 'age', 'is_male']
movie_info.columns = ['item_id', 'release_year']

# Normalize release years with min-max scaling
year_min = movie_info['release_year'].min()
year_span = movie_info['release_year'].max() - year_min
if year_span == 0:
    # Every movie shares the same release year: avoid 0/0 -> NaN features.
    movie_info['year_normalized'] = 0.0
else:
    movie_info['year_normalized'] = (movie_info['release_year'] - year_min) / year_span

# Build the item_id -> normalized-year lookup in a single pass instead of
# re-scanning the whole frame for every item. `drop_duplicates` keeps the first
# row per item_id, which matches the previous `.values[0]` behavior.
year_dict = (
    movie_info
    .drop_duplicates(subset='item_id')
    .set_index('item_id')['year_normalized']
    .to_dict()
)

# Load datasets
train_tuple, valid_tuple, test_tuple, n_users, n_items = load_train_valid_test_datasets()

# Convert tuples to a `surprise` dataset
def tuple_to_surprise_dataset(tupl):
    """Wrap a ratings tuple in a `surprise` Dataset.

    Args:
        tupl: Indexable whose elements 0, 1 and 2 hold the user ids, item ids
            and ratings, respectively (equal-length sequences).

    Returns:
        The object produced by `Dataset.load_from_df` for a frame with the
        columns "userID", "itemID" and "rating", read with a 1-5 rating scale.
    """
    user_ids, item_ids, ratings = tupl[0], tupl[1], tupl[2]
    ratings_frame = pd.DataFrame(
        {"userID": user_ids, "itemID": item_ids, "rating": ratings}
    )
    # The column order is spelled out explicitly when handing the frame over.
    return Dataset.load_from_df(
        ratings_frame[["userID", "itemID", "rating"]],
        Reader(rating_scale=(1, 5)),
    )

# Cache for the user_id -> (is_male, age) lookup used by `create_features`.
# 'frame' remembers which `user_info` object the lookup was built from so the
# lookup is rebuilt if `user_info` is rebound to a different DataFrame.
_USER_DEMOGRAPHICS_CACHE = {'frame': None, 'lookup': {}}


def _user_demographics_lookup():
    """Return a dict mapping user_id -> (is_male, age) built from `user_info`."""
    if _USER_DEMOGRAPHICS_CACHE['frame'] is not user_info:
        # Keep the first row per user, like the former `.iloc[0]` on the filtered frame.
        first_rows = user_info.drop_duplicates(subset='user_id')
        _USER_DEMOGRAPHICS_CACHE['lookup'] = dict(zip(
            first_rows['user_id'],
            zip(first_rows['is_male'], first_rows['age']),
        ))
        _USER_DEMOGRAPHICS_CACHE['frame'] = user_info
    return _USER_DEMOGRAPHICS_CACHE['lookup']


# Function to create feature vectors
def create_features(user_id, item_id, user_factors, item_factors, n_factors):
    """Build the feature row for one (user, item) pair.

    The row is the concatenation of the user's latent factors, the item's
    latent factors, and three side features: `is_male`, `age` (both read from
    the module-level `user_info` frame) and the normalized release year (read
    from the module-level `year_dict`).

    Args:
        user_id: Id looked up in `user_factors` and in `user_info['user_id']`.
        item_id: Id looked up in `item_factors` and in `year_dict`.
        user_factors: Mapping from user id to a latent-factor vector.
        item_factors: Mapping from item id to a latent-factor vector.
        n_factors: Length of the all-zero vector substituted for ids that are
            missing from `user_factors` / `item_factors`.

    Returns:
        A one-row scipy sparse matrix. Unknown users get `is_male = 0` and
        `age = 0`; unknown items get a year feature of 0.
    """
    zero_vector = np.zeros(n_factors)

    user_emb = user_factors.get(user_id, zero_vector)
    item_emb = item_factors.get(item_id, zero_vector)

    # Demographic features: a dictionary lookup replaces the previous
    # boolean-mask scan of `user_info`, which ran once per rating row.
    is_male, age = _user_demographics_lookup().get(user_id, (0, 0))

    # Normalized release-year feature
    year_emb = year_dict.get(item_id, 0)

    demographic_features = np.array([is_male, age, year_emb])

    # Combine all features and return as a sparse matrix
    return hstack([csr_matrix(user_emb), csr_matrix(item_emb), csr_matrix(demographic_features)])

# Prepare the final DataFrame for training
def prepare_features_and_labels(tupl, user_factors, item_factors, n_factors, threshold=4.5):
    """Turn a ratings tuple into a feature matrix and binary labels.

    Args:
        tupl: Indexable whose elements 0, 1 and 2 hold user ids, item ids and
            ratings.
        user_factors: Mapping from user id to latent factors, forwarded to
            `create_features`.
        item_factors: Mapping from item id to latent factors, forwarded to
            `create_features`.
        n_factors: Latent dimensionality, forwarded to `create_features`.
        threshold: A rating at or above this value is labeled 1, otherwise 0.

    Returns:
        (X, y): X is a CSR matrix stacking one `create_features` row per
        rating; y is the array of 0/1 labels in the same order.
    """
    def row_to_features(row):
        # One feature row per rating, built from that rating's user/item pair.
        return create_features(row['userID'], row['itemID'], user_factors, item_factors, n_factors)

    frame = pd.DataFrame({'userID': tupl[0], 'itemID': tupl[1], 'rating': tupl[2]})
    frame['features'] = frame.apply(row_to_features, axis=1)

    is_positive = frame['rating'] >= threshold
    frame['label'] = is_positive.astype(int)

    feature_matrix = vstack(frame['features']).tocsr()
    labels = frame['label'].values
    return feature_matrix, labels

# Function to train and extract features using SVD
def train_svd_and_prepare_features(n_factors, sample_size=20000, random_state=42):
    """Fit an SVD recommender on the training ratings and build SVM inputs.

    Args:
        n_factors: Number of latent factors for the `surprise` SVD model.
        sample_size: Number of rows to keep from the combined train+validation
            set via a stratified draw. If it is None or not smaller than the
            combined set, all rows are kept (previously this case made
            `StratifiedShuffleSplit` raise because `test_size` was <= 0).
        random_state: Seed passed to both the SVD model and the stratified
            sampler so that repeated runs yield the same features. The SVD
            model used to be unseeded, which made results irreproducible.

    Returns:
        (X_combined, y_combined, X_test, y_test): the (optionally subsampled)
        train+validation features and labels, and the test features and labels.
    """
    # Train SVD model using the training set only
    trainset = tuple_to_surprise_dataset(train_tuple).build_full_trainset()
    algo = SVD(n_factors=n_factors, random_state=random_state)
    algo.fit(trainset)

    # Extract user and item factors, keyed by the raw (dataset) ids
    user_factors = {trainset.to_raw_uid(uid): algo.pu[uid] for uid in range(trainset.n_users)}
    item_factors = {trainset.to_raw_iid(iid): algo.qi[iid] for iid in range(trainset.n_items)}

    # Create features and labels
    X_train, y_train = prepare_features_and_labels(train_tuple, user_factors, item_factors, n_factors)
    X_valid, y_valid = prepare_features_and_labels(valid_tuple, user_factors, item_factors, n_factors)
    X_test, y_test = prepare_features_and_labels(test_tuple, user_factors, item_factors, n_factors)

    # Combine training and validation for model selection
    X_combined = vstack([X_train, X_valid]).tocsr()
    y_combined = np.hstack([y_train, y_valid])

    # Stratified subsampling: the "train" side of the split is the kept sample.
    n_combined = len(y_combined)
    if sample_size is not None and sample_size < n_combined:
        sss = StratifiedShuffleSplit(
            n_splits=1,
            test_size=n_combined - sample_size,
            random_state=random_state,
        )
        train_index, _ = next(sss.split(X_combined, y_combined))

        X_combined = X_combined[train_index]
        y_combined = y_combined[train_index]

    return X_combined, y_combined, X_test, y_test

# Function to tune SVM with GridSearchCV
def tune_and_evaluate_svm(X_combined, y_combined, X_test, y_test):
    """Grid-search an RBF SVM by cross-validated ROC AUC and report test metrics.

    The search covers `C` in {0.1, 1, 10} and `gamma` in {'scale', 'auto'}
    with 5-fold stratified cross-validation. The best parameters, the best
    cross-validation AUC, the test-set AUC and a classification report are
    printed; nothing is returned.

    Args:
        X_combined: Feature matrix used for model selection.
        y_combined: Binary labels for `X_combined`.
        X_test: Held-out feature matrix.
        y_test: Binary labels for `X_test`.
    """
    param_grid = {
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto']
    }

    # `probability=True` is intentionally not set: it makes every fit run an
    # additional internal 5-fold cross-validation, and ROC AUC only needs a
    # ranking score, which `decision_function` already provides.
    svm = SVC(kernel='rbf', random_state=42)
    grid_search = GridSearchCV(
        svm,
        param_grid,
        scoring='roc_auc',
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        verbose=10,
        n_jobs=-1,
    )
    grid_search.fit(X_combined, y_combined)

    best_svm = grid_search.best_estimator_
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation AUC Score: {grid_search.best_score_:.4f}")

    # Test the best model: signed distances for ranking, hard labels for the report
    y_score = best_svm.decision_function(X_test)
    y_pred = best_svm.predict(X_test)

    print(f"\nTest Set AUC Score: {roc_auc_score(y_test, y_score):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# Evaluation for different `n_factors` is run in the cells below



Evaluating SVM models with n_factors = 2



KeyboardInterrupt: 

In [4]:
# Build one (X_combined, y_combined, X_test, y_test) tuple per latent dimensionality.
extracted_features = [
    train_svd_and_prepare_features(n_factors, sample_size=20000)
    for n_factors in (2, 10, 50)
]

In [None]:
# Tune and evaluate one SVM per extracted feature set, in the order they were built.
for X_combined, y_combined, X_test, y_test in extracted_features:
    print(f"\nEvaluating SVM models\n")
    tune_and_evaluate_svm(X_combined, y_combined, X_test, y_test)