In [7]:
import numpy as np
import pandas as pd

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error

In [2]:
from surprise import Reader, Dataset, SVD, accuracy

In [None]:
X_train =  np.load("J1_Xtr.npy")
X_val =  np.load("J1_Xval.npy")
X_test =  np.load("J1_Xtest.npy")

In [20]:
# binary label

In [3]:
# load labels
y_train_bin =  np.load("J1_ytrBin.npy")
y_val_bin =  np.load("J1_yvalBin.npy")
y_test_bin =  np.load("J1_ytestBin.npy")

In [38]:
train = np.concatenate([X_train, y_train_bin], axis=1)
val = np.concatenate([X_val, y_val_bin], axis=1)
test = np.concatenate([X_test, y_test_bin], axis=1)

In [5]:
train_df = pd.DataFrame(data = train)
val_df = pd.DataFrame(data = val)                 
test_df = pd.DataFrame(data = test)                 

In [6]:
train_df.columns = ['user id', 'item id', 'ratings']
val_df.columns = ['user id', 'item id', 'ratings']
test_df.columns = ['user id', 'item id', 'ratings']

In [None]:
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(0, 1))

# The columns must correspond to user id, item id and ratings (in that order).
trainset = Dataset.load_from_df(train_df, reader).build_full_trainset()

In [11]:
valset = Dataset.load_from_df(val_df, reader).build_full_trainset().build_testset()
testset = Dataset.load_from_df(test_df, reader).build_full_trainset().build_testset()

In [8]:
# using the famous SVD algorithm
svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f96401ec0b8>

In [17]:
val_pred = svd.test(valset)
val_rmse = accuracy.rmse(val_pred, verbose=True)
print(val_rmse)

RMSE: 0.4354


In [19]:
test_pred = svd.test(testset)
test_rmse = accuracy.rmse(test_pred, verbose=True)
print(test_rmse)

RMSE: 0.4350
0.4354149646036034


In [None]:
# Simple dummy regressor based on mean of training set

In [4]:
model = DummyRegressor(strategy='mean')
model.fit(X_train, y_train_bin)

val_pred = model.predict(X_val)
val_rmse = mean_squared_error(val_pred, y_val_bin)**0.5
print(val_rmse)

test_pred = model.predict(X_test)
test_rmse = mean_squared_error(test_pred, y_test_bin)**0.5
print(test_rmse)


0.4927393332725065
0.4927184032540328


In [None]:
# Regression Tree

In [8]:
model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, y_train_bin)

val_pred = model.predict(X_val)
val_rmse = mean_squared_error(val_pred, y_val_bin)**0.5
print(val_rmse)

test_pred = model.predict(X_test)
test_rmse = mean_squared_error(test_pred, y_test_bin)**0.5
print(test_rmse)


0.6660231778872007
0.6656882371912507


In [None]:
# continuous label (-10, 10)

In [5]:
# load labels
y_train_cat =  np.load("J1_ytrCat.npy")
y_val_cat =  np.load("J1_yvalCat.npy")
y_test_cat =  np.load("J1_ytestCat.npy")

In [32]:
train = np.concatenate([X_train, y_train_cat], axis=1)
val = np.concatenate([X_val, y_val_cat], axis=1)
test = np.concatenate([X_test, y_test_cat], axis=1)

In [33]:
train_df = pd.DataFrame(data = train)
val_df = pd.DataFrame(data = val)                 
test_df = pd.DataFrame(data = test)   

train_df.columns = ['user id', 'item id', 'ratings']
val_df.columns = ['user id', 'item id', 'ratings']
test_df.columns = ['user id', 'item id', 'ratings']

In [35]:
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(-10, 10))

# The columns must correspond to user id, item id and ratings (in that order).
trainset = Dataset.load_from_df(train_df, reader).build_full_trainset()

#load as test sets
valset = Dataset.load_from_df(val_df, reader).build_full_trainset().build_testset()
testset = Dataset.load_from_df(test_df, reader).build_full_trainset().build_testset()

In [36]:
# using the famous SVD algorithm
svd = SVD()
svd.fit(trainset)

val_pred = svd.test(valset)
val_rmse = accuracy.rmse(val_pred, verbose=True)
print(val_rmse)
test_pred = svd.test(testset)
test_rmse = accuracy.rmse(test_pred, verbose=True)
print(test_rmse)

RMSE: 4.3584
4.358351679313397
RMSE: 4.3542
4.354231918267263


In [39]:
# Simple dummy regressor based on mean of training set

In [6]:
model = DummyRegressor(strategy='mean')
model.fit(X_train, y_train_cat)

val_pred = model.predict(X_val)
val_rmse = mean_squared_error(val_pred, y_val_cat)**0.5
print(val_rmse)

test_pred = model.predict(X_test)
test_rmse = mean_squared_error(test_pred, y_test_cat)**0.5
print(test_rmse)


5.299175289592836
5.293947897405884


In [None]:
# Regression Tree

In [9]:
model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, y_train_cat)

val_pred = model.predict(X_val)
val_rmse = mean_squared_error(val_pred, y_val_cat)**0.5
print(val_rmse)

test_pred = model.predict(X_test)
test_rmse = mean_squared_error(test_pred, y_test_cat)**0.5
print(test_rmse)


7.072845958367973
7.067145812984051
