In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..", "eval_framework")))

from data_loader import load_data, split_data
import reporter 


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Load Data

In [3]:
def load_data_mac(sample_size=None):
    """Load MovieLens dataset"""
    print("Loading data...")
    print("Loading MovieLens 100K dataset...")
    
    # Load ratings
    ratings = pd.read_csv('../data/ml-latest-small/ml-latest-small/ratings.csv')
    
    # Load movies
    movies = pd.read_csv('../data/ml-latest-small/ml-latest-small/movies.csv')
    
    
    # Convert genres to list format
    movies['genres'] = movies['genres'].str.split('|')
    movies = movies[['movieId', 'title', 'genres']]
    
    if sample_size:
        # Sample users
        unique_users = ratings['userId'].unique()
        sampled_users = np.random.choice(unique_users, size=sample_size, replace=False)
        ratings = ratings[ratings['userId'].isin(sampled_users)]
        movies = movies[movies['movieId'].isin(ratings['movieId'].unique())]
    
    print(f"Loaded {len(ratings)} ratings and {len(movies)} movies")
    return ratings, movies

In [4]:
import os
print(os.getcwd())

/Users/angela/Desktop/probabilistic-movie-recommender/SPN


In [5]:
ratings, movies = load_data_mac()

# Encode movieId in both datasets
movie_encoder = LabelEncoder()
movies["movieId"] = movie_encoder.fit_transform(movies["movieId"])
ratings["movieId"] = movie_encoder.transform(ratings["movieId"])

# Merge genres with ratings
df = ratings.merge(movies[["movieId", "genres"]], on="movieId", how="left")
df_expanded = df["genres"].str.get_dummies(sep="|")
df = pd.concat([df.drop(columns=["genres"]), df_expanded], axis=1)

# Binarize rating to create label
df["rating_class"] = (df["rating"] >= 4).astype(int)


Loading data...
Loading MovieLens 100K dataset...
Loaded 100836 ratings and 9742 movies


In [6]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

In [7]:
# Dimensionality reduction on genre using PCA
genre_columns = df_expanded.columns.tolist()
scaler = StandardScaler()
pca = PCA(n_components=5)

scaled_genres = scaler.fit_transform(df[genre_columns])
pca_genres = pca.fit_transform(scaled_genres)

pca_columns = [f"pca_genre_{i}" for i in range(pca_genres.shape[1])]
df_pca = pd.DataFrame(pca_genres, columns=pca_columns)
df = pd.concat([df.drop(columns=genre_columns), df_pca], axis=1)


## Train Model

In [8]:
from spn.structure.Base import Context
from spn.structure.StatisticalTypes import MetaType
from spn.algorithms.LearningWrappers import learn_classifier
from spn.structure.leaves.parametric.Parametric import Categorical, Gaussian
from spn.algorithms.LearningWrappers import learn_parametric
import scipy
scipy.NINF = float("-inf")

In [9]:
import time
from memory_profiler import memory_usage
feature_columns = ["userId", "movieId"] + pca_columns
X = df[feature_columns].values.astype(float)
y = df[["rating_class"]].values.astype(float)
data = np.concatenate([X, y], axis=1)
train_data, test_data = split_data(data)

# Define SPN context
param_types = [Categorical, Categorical] + [Gaussian] * len(pca_columns) + [Categorical]
context = Context(parametric_types=param_types)
context.add_domains(train_data)


def train():
    start = time.time()
    # Learn SPN classifier
    spn = learn_classifier(train_data, context, spn_learn_wrapper=learn_parametric, label_idx=train_data.shape[1] - 1)
    end = time.time()
    return spn, end - start

mem_usage, (spn, runtime) = memory_usage(train, retval=True, max_iterations=1)
print(f"Runtime: {runtime:.4f} seconds")
print(f"Memory peak: {max(mem_usage) - min(mem_usage):.2f} MiB")


\nSplitting data...
Training set size: 80668, Test set size: 20168
Runtime: 182.7926 seconds
Memory peak: 4258.97 MiB


In [12]:
from spn.algorithms.MPE import mpe
from sklearn.metrics import classification_report, accuracy_score

start = time.time()
mpe_input = test_data.copy()
mpe_input[:, -1] = np.nan
predictions = mpe(spn, mpe_input)
end = time.time()
print(f"Inference time: {end - start:.2f} s")
pred_classes = predictions[:, -1]

y_true = test_data[:, -1].astype(int)
y_pred = predictions[:, -1].astype(int)

print("\n===== SPN Classification Report =====")
print(classification_report(y_true, y_pred))
print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")


Inference time: 1.36 s

===== SPN Classification Report =====
              precision    recall  f1-score   support

           0       0.67      0.69      0.68     10477
           1       0.66      0.64      0.65      9691

    accuracy                           0.67     20168
   macro avg       0.67      0.66      0.66     20168
weighted avg       0.67      0.67      0.67     20168

Accuracy: 0.6659


In [17]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")

RMSE: 0.5780
MAE:  0.3341
