# Loading data and train SVD for embedding extraction

In [2]:
import pandas as pd
from surprise import Dataset, NormalPredictor, Reader, SVD, accuracy
from surprise.model_selection import cross_validate
import numpy as np


from train_valid_test_loader import load_train_valid_test_datasets

# Load the dataset in the same way as the main problem 
train_tuple, valid_tuple, test_tuple, n_users, n_items = \
        load_train_valid_test_datasets()


def tuple_to_surprise_dataset(tupl):
    """
    This function convert a subset in the tuple form to a `surprise` dataset. 
    """
    ratings_dict = {
        "userID": tupl[0],
        "itemID": tupl[1],
        "rating": tupl[2],
    }

    df = pd.DataFrame(ratings_dict)

    # A reader is still needed but only the rating_scale param is requiered.
    reader = Reader(rating_scale=(1, 5))

    # The columns must correspond to user id, item id and ratings (in that order).
    dataset = Dataset.load_from_df(df[["userID", "itemID", "rating"]], reader)

    return dataset

## Below we train an SVD model and get its vectors 

# train an SVD model using the training set
trainset = tuple_to_surprise_dataset(train_tuple).build_full_trainset()
algo = SVD()
algo.fit(trainset)


# Use an example to show to to slice out user and item vectors learned by the SVD 
uid = valid_tuple[0][0]
iid = valid_tuple[1][0]
rui = valid_tuple[2][0]

# Get model parameters
# NOTE: the SVD model has its own index system because the storage using raw user and item ids
# is not efficient. We need to convert raw ids to internal ids. Please read the few lines below
# carefully 

mu = algo.trainset.global_mean # SVD does not even fit mu -- it directly use the rating mean 
bu = algo.bu[trainset.to_inner_uid(uid)]
bi = algo.bi[trainset.to_inner_iid(iid)] 
pu = algo.pu[trainset.to_inner_uid(uid)] 
qi = algo.qi[trainset.to_inner_iid(iid)]

# Sanity check: we compute our own prediction and compare it against the model's prediction 
# our prediction
my_est = mu + bu + bi + np.dot(pu, qi) 

# the model's prediction
# NOTE: the training of the SVD model is random, so the prediction can be different with 
# different runs -- this is normal.   
svd_pred = algo.predict(uid, iid, r_ui=rui)

# The two predictions should be the same
print("My prediction: " + str(my_est) + ", SVD's prediction: " + str(svd_pred.est) + ", difference: " + str(np.abs(my_est - svd_pred.est)))

assert(np.abs(my_est - svd_pred.est) < 1e-6)

My prediction: 3.0605793703642816, SVD's prediction: 3.0605793703642816, difference: 0.0


# Load in user and movies info

In [3]:
user_info_df = pd.read_csv('../data_movie_lens_100k/user_info.csv')
movie_info_df = pd.read_csv('../data_movie_lens_100k/movie_info.csv')

# Data Clean Up and Feature extraction

In [4]:
feature_vector = []

## Constructing Movies Features

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming you have loaded the movie information into a DataFrame called 'movies_df'
movie_titles = movie_info_df['title'].tolist()

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the movie titles
tfidf_matrix = tfidf_vectorizer.fit_transform(movie_titles)

# Convert the TF-IDF matrix to a dense numpy array
tfidf_features = tfidf_matrix.toarray()

# Combine the TF-IDF features with other movie features
movie_features = np.hstack((tfidf_features, movie_info_df[['release_year']].values))

In [6]:
movie_features = movie_info_df[['release_year']].values

## Constructing User Features

In [12]:
user_features = user_info_df[["age", "is_male"]].values
print(user_features)
print(user_features.shape)

[[  0  24   1]
 [  1  53   0]
 [  2  23   1]
 ...
 [940  20   1]
 [941  48   0]
 [942  22   1]]
(943, 3)


In [15]:
print(train_tuple[0])
print(train_tuple[0].shape)

[662 298  90 ... 574 757 503]
(70000,)
(70000, 3)


In [16]:
# Get the user and item indices from the validation tuple
user_tr = train_tuple[0]
item_tr = train_tuple[1]

# Initialize the combined feature matrix with zeros
n_user_item_pairs = len(user_tr)
n_user_features = user_features.shape[1]
n_movie_features = movie_features.shape[1]
n_user_embeddings = pu.shape[0]
n_item_embeddings = qi.shape[0]
X = np.zeros((n_user_item_pairs, n_user_features + n_movie_features + n_user_embeddings + n_item_embeddings))

# Combine user information, movie information, and embeddings
for i, (user_id, item_id) in enumerate(zip(user_tr, item_tr)):
    user_inner_id = trainset.to_inner_uid(user_id)
    item_inner_id = trainset.to_inner_iid(item_id)
    if  user_inner_id >= 0 and user_inner_id < algo.pu.shape[0]:
        X[i, -n_user_embeddings-n_item_embeddings:-n_item_embeddings] = algo.pu[user_inner_id]
        X[i, :n_user_features] = user_features[user_id]
    if item_inner_id >=0 and item_inner_id < algo.qi.shape[0]:
        X[i, -n_item_embeddings:] = algo.qi[item_inner_id]
        X[i, n_user_features:n_user_features+n_movie_features] = movie_features[item_id]


In [17]:
print(X.shape)

(70000, 2493)


# Create Binary Labeling
We'll be using a 4.5 threshold on the ratings for classification

In [8]:
# Create target vector based on actual ratings
y = np.array(train_tuple[2])

y_binary = (y >= 4.5).astype(int)  # Binarize ratings: >= 4 is positive, < 4 is negative
print(y_binary.shape)

(70000,)


# Model Selection

In [9]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


In [22]:
model = LogisticRegression()
params = {'C': [0.1, 0.01, 0.001]}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(model, params, scoring='roc_auc', cv=kf, n_jobs=-1)
grid_search.fit(X, y_binary)

: 

In [14]:
models = [
    # ('Logistic Regression', LogisticRegression(), {'C': [0.1, 1, 10]}),
    # ('SVM', SVC(probability=True), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}),
    ('Random Forest', RandomForestClassifier(), {'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10]})
]

# Create a KFold object for cross-validation

# Create a scorer object for AUC
auc_scorer = make_scorer(roc_auc_score)

# Perform model selection and hyperparameter tuning
best_model = None
best_params = None
best_auc = 0

for name, model, params in models:
    print(f"Evaluating {name}:")
    
    # Perform grid search with cross-validation
    grid_search.fit(X, y_binary)
    
    # Get the best model and its corresponding hyperparameters
    if grid_search.best_score_ > best_auc:
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        best_auc = grid_search.best_score_
    
    print(f"Best AUC: {grid_search.best_score_:.4f}")
    print(f"Best hyperparameters: {grid_search.best_params_}\n")

print("Best model:", best_model)
print("Best hyperparameters:", best_params)
print(f"Best AUC: {best_auc:.4f}")

Evaluating Random Forest:


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9), SIGKILL(-9)}

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from scipy.sparse import hstack, csr_matrix
from surprise import Dataset, Reader, SVD
from train_valid_test_loader import load_train_valid_test_datasets

# Load user and movie information
user_info = pd.read_csv("../data_movie_lens_100k/user_info.csv")
movie_info = pd.read_csv("../data_movie_lens_100k/movie_info.csv")

# Drop unnecessary columns
user_info = user_info.drop(columns=['orig_user_id'])
movie_info = movie_info.drop(columns=['orig_item_id'])

# Ensure proper column naming
user_info.columns = ['user_id', 'age', 'is_male']
movie_info.columns = ['item_id', 'title', 'release_year']

# Load datasets
train_tuple, valid_tuple, test_tuple, n_users, n_items = load_train_valid_test_datasets()

# Convert tuples to a `surprise` dataset
def tuple_to_surprise_dataset(tupl):
    ratings_dict = {
        "userID": tupl[0],
        "itemID": tupl[1],
        "rating": tupl[2],
    }
    df = pd.DataFrame(ratings_dict)
    reader = Reader(rating_scale=(1, 5))
    dataset = Dataset.load_from_df(df[["userID", "itemID", "rating"]], reader)
    return dataset

# Movie Title Feature Extraction using TF-IDF
titles = movie_info['title'].fillna('')
tfidf = TfidfVectorizer(max_features=100)
title_vectors = csr_matrix(tfidf.fit_transform(titles))
title_dict = {item: title_vectors[i] for i, item in enumerate(movie_info['item_id'])}

# Normalize or bucket movie years
movie_info['year_normalized'] = (movie_info['release_year'] - movie_info['release_year'].min()) / (movie_info['release_year'].max() - movie_info['release_year'].min())
year_dict = {item: movie_info.loc[movie_info['item_id'] == item, 'year_normalized'].values[0] for item in movie_info['item_id']}

# Function to create feature vectors
def create_features(user_id, item_id, user_factors, item_factors, n_factors):
    zero_vector = np.zeros(n_factors)

    user_emb = user_factors.get(user_id, zero_vector)
    item_emb = item_factors.get(item_id, zero_vector)

    # Add demographic features
    user_info_row = user_info[user_info['user_id'] == user_id]
    if not user_info_row.empty:
        is_male = user_info_row.iloc[0]['is_male']
        age = user_info_row.iloc[0]['age']
    else:
        is_male = 0
        age = 0

    # Add movie title and year features
    title_emb = title_dict.get(item_id, csr_matrix((1, 100)))
    year_emb = year_dict.get(item_id, 0)

    demographic_features = np.array([is_male, age, year_emb])

    # Combine all features and return as a sparse matrix
    return hstack([csr_matrix(user_emb), csr_matrix(item_emb), csr_matrix(demographic_features), title_emb])

# Prepare the final DataFrame for training
def prepare_features_and_labels(tupl, user_factors, item_factors, n_factors, threshold=4.5):
    ratings_df = pd.DataFrame({
        'userID': tupl[0],
        'itemID': tupl[1],
        'rating': tupl[2]
    })
    ratings_df['features'] = ratings_df.apply(lambda row: create_features(row['userID'], row['itemID'], user_factors, item_factors, n_factors), axis=1)
    ratings_df['label'] = (ratings_df['rating'] >= threshold).astype(int)
    X = np.vstack(ratings_df['features']).tocsr()
    y = ratings_df['label'].values
    return X, y

# Function to train and extract features using SVD
def train_svd_and_prepare_features(n_factors):
    # Train SVD model using the training set
    trainset = tuple_to_surprise_dataset(train_tuple).build_full_trainset()
    algo = SVD(n_factors=n_factors)
    algo.fit(trainset)

    # Extract user and item factors
    user_factors = {trainset.to_raw_uid(uid): algo.pu[uid] for uid in range(trainset.n_users)}
    item_factors = {trainset.to_raw_iid(iid): algo.qi[iid] for iid in range(trainset.n_items)}

    # Create features and labels
    X_train, y_train = prepare_features_and_labels(train_tuple, user_factors, item_factors, n_factors)
    X_valid, y_valid = prepare_features_and_labels(valid_tuple, user_factors, item_factors, n_factors)
    X_test, y_test = prepare_features_and_labels(test_tuple, user_factors, item_factors, n_factors)

    # Combine training and validation for model selection
    X_combined = np.vstack([X_train, X_valid]).tocsr()
    y_combined = np.hstack([y_train, y_valid])

    return X_combined, y_combined, X_test, y_test

# Function to tune SVM with GridSearchCV
def tune_and_evaluate_svm(X_combined, y_combined, X_test, y_test):
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.01, 0.001]
    }

    svm = SVC(kernel='rbf', probability=True, random_state=42)
    grid_search = GridSearchCV(svm, param_grid, scoring='roc_auc', cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), n_jobs=-1)
    grid_search.fit(X_combined, y_combined)

    best_svm = grid_search.best_estimator_
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation AUC Score: {grid_search.best_score_:.4f}")

    # Test the best model
    y_pred_proba = best_svm.predict_proba(X_test)[:, 1]
    y_pred = best_svm.predict(X_test)

    print(f"\nTest Set AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# Evaluate models for different `n_factors`
for n_factors in [2, 10, 50]:
    print(f"\nEvaluating SVM models with n_factors = {n_factors}\n")
    X_combined, y_combined, X_test, y_test = train_svd_and_prepare_features(n_factors)
    tune_and_evaluate_svm(X_combined, y_combined, X_test, y_test)



Evaluating SVM models with n_factors = 2



NameError: name 'vstack' is not defined