In [9]:
import numpy as np
import pandas as pd

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error

In [2]:
from surprise import Reader, Dataset, SVD, accuracy

In [2]:
X_train =  np.load("J3_Xtr.npy")
X_val =  np.load("J3_Xval.npy")
X_test =  np.load("J3_Xtest.npy")

In [4]:
# binary label

In [3]:
# load labels
y_train_bin =  np.load("J3_ytrBin.npy")
y_val_bin =  np.load("J3_yvalBin.npy")
y_test_bin =  np.load("J3_ytestBin.npy")

In [11]:
train = np.concatenate([X_train, y_train_bin], axis=1)
val = np.concatenate([X_val, y_val_bin], axis=1)
test = np.concatenate([X_test, y_test_bin], axis=1)

train_df = pd.DataFrame(data = train)
val_df = pd.DataFrame(data = val)                 
test_df = pd.DataFrame(data = test)        

train_df.columns = ['user id', 'item id', 'ratings']
val_df.columns = ['user id', 'item id', 'ratings']
test_df.columns = ['user id', 'item id', 'ratings']

In [12]:
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(0, 1))

# The columns must correspond to user id, item id and ratings (in that order).
trainset = Dataset.load_from_df(train_df, reader).build_full_trainset()

# load as test sets
valset = Dataset.load_from_df(val_df, reader).build_full_trainset().build_testset()
testset = Dataset.load_from_df(test_df, reader).build_full_trainset().build_testset()

In [13]:
# using the famous SVD algorithm
svd = SVD()
svd.fit(trainset)

val_pred = svd.test(valset)
val_rmse = accuracy.rmse(val_pred, verbose=True)
print(val_rmse)
test_pred = svd.test(testset)
test_rmse = accuracy.rmse(test_pred, verbose=True)
print(test_rmse)

RMSE: 0.2299
0.2298564809683357
RMSE: 0.2303
0.23029355937537352


In [4]:
# Simple dummy regressor based on mean of training set

In [5]:
model = DummyRegressor(strategy='mean')
model.fit(X_train, y_train_bin)

val_pred = model.predict(X_val)
val_rmse = mean_squared_error(val_pred, y_val_bin)**0.5
print(val_rmse)

test_pred = model.predict(X_test)
test_rmse = mean_squared_error(test_pred, y_test_bin)**0.5
print(test_rmse)


0.27355387633398764
0.27425805884877025


In [None]:
# Regression Tree

In [10]:
model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, y_train_bin)

val_pred = model.predict(X_val)
val_rmse = mean_squared_error(val_pred, y_val_bin)**0.5
print(val_rmse)

test_pred = model.predict(X_test)
test_rmse = mean_squared_error(test_pred, y_test_bin)**0.5
print(test_rmse)


0.3349711111389338
0.33533793104180865


In [8]:
# continuous label (-10, 10)

In [6]:
# load labels
y_train_cat =  np.load("J3_ytrCat.npy")
y_val_cat =  np.load("J3_yvalCat.npy")
y_test_cat =  np.load("J3_ytestCat.npy")

In [17]:
train = np.concatenate([X_train, y_train_cat], axis=1)
val = np.concatenate([X_val, y_val_cat], axis=1)
test = np.concatenate([X_test, y_test_cat], axis=1)

train_df = pd.DataFrame(data = train)
val_df = pd.DataFrame(data = val)                 
test_df = pd.DataFrame(data = test)        

train_df.columns = ['user id', 'item id', 'ratings']
val_df.columns = ['user id', 'item id', 'ratings']
test_df.columns = ['user id', 'item id', 'ratings']

In [18]:
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(-10, 10))

# The columns must correspond to user id, item id and ratings (in that order).
trainset = Dataset.load_from_df(train_df, reader).build_full_trainset()

# load as test sets
valset = Dataset.load_from_df(val_df, reader).build_full_trainset().build_testset()
testset = Dataset.load_from_df(test_df, reader).build_full_trainset().build_testset()

In [19]:
# using the famous SVD algorithm
svd = SVD()
svd.fit(trainset)

val_pred = svd.test(valset)
val_rmse = accuracy.rmse(val_pred, verbose=True)
print(val_rmse)
test_pred = svd.test(testset)
test_rmse = accuracy.rmse(test_pred, verbose=True)
print(test_rmse)

RMSE: 77.5467
77.5467191083278
RMSE: 77.5337
77.53366844681393


In [7]:
# Simple dummy regressor based on mean of training set

In [8]:
model = DummyRegressor(strategy='mean')
model.fit(X_train, y_train_cat)

val_pred = model.predict(X_val)
val_rmse = mean_squared_error(val_pred, y_val_cat)**0.5
print(val_rmse)

test_pred = model.predict(X_test)
test_rmse = mean_squared_error(test_pred, y_test_cat)**0.5
print(test_rmse)


41.871256782944286
41.90156515416368


In [11]:
# Regression Tree

In [12]:
model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, y_train_cat)

val_pred = model.predict(X_val)
val_rmse = mean_squared_error(val_pred, y_val_cat)**0.5
print(val_rmse)

test_pred = model.predict(X_test)
test_rmse = mean_squared_error(test_pred, y_test_cat)**0.5
print(test_rmse)


47.868795393394045
47.879123711100824
