In [1]:
import numpy as np
import pandas as pd

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error

In [2]:
from surprise import Reader, Dataset, SVD, accuracy

In [3]:
# Read the train / validation / test feature matrices from .npy files in the
# working directory. np.load is applied to the paths in the order listed.
# Later cells label the feature columns 'user id' and 'item id'.
X_train, X_val, X_test = map(
    np.load,
    ("ML20_Xtr.npy", "ML20_Xval.npy", "ML20_Xtest.npy"),
)

In [5]:
# Binary labels (0/1 rating scale)

In [9]:
# Binary targets for each split. expand_dims adds a second axis, turning the
# stored (N,) vector into an (N, 1) column so it can be joined column-wise
# with the feature matrix in the next cell.
y_train_bin = np.expand_dims(np.load("ML20_ytrBin.npy"), axis=1)
y_val_bin = np.expand_dims(np.load("ML20_yvalBin.npy"), axis=1)
y_test_bin = np.expand_dims(np.load("ML20_ytestBin.npy"), axis=1)

In [10]:
# Append the label column to the right of each feature matrix so that every
# row reads (user id, item id, rating).
train = np.hstack((X_train, y_train_bin))
val = np.hstack((X_val, y_val_bin))
test = np.hstack((X_test, y_test_bin))

# Wrap each stacked array in a DataFrame, naming the columns at construction.
# Surprise's load_from_df expects exactly this column order.
train_df, val_df, test_df = (
    pd.DataFrame(stacked, columns=['user id', 'item id', 'ratings'])
    for stacked in (train, val, test)
)

In [None]:
# SVD

In [12]:
# Surprise still requires a Reader for DataFrame input, but only its
# rating_scale is used; the binary labels live on a 0-1 scale.
reader = Reader(rating_scale=(0, 1))

# Training data: the DataFrame columns must be user id, item id, ratings
# (in that order).
trainset = (
    Dataset
    .load_from_df(train_df, reader)
    .build_full_trainset()
)

# Validation / test data: build a full trainset first, then convert it into
# the list-of-ratings "testset" form that algo.test() consumes.
valset, testset = (
    Dataset.load_from_df(frame, reader).build_full_trainset().build_testset()
    for frame in (val_df, test_df)
)

In [13]:
# Matrix-factorisation baseline: Surprise's SVD with default hyper-parameters.
# SVD initialises its factors randomly, so the seed is fixed here; without it
# the RMSE values printed below change on every Restart & Run All.
svd = SVD(random_state=0)
svd.fit(trainset)

# accuracy.rmse(verbose=True) prints "RMSE: x.xxxx" itself; the extra print
# shows the returned value at full precision.
val_pred = svd.test(valset)
val_rmse = accuracy.rmse(val_pred, verbose=True)
print(val_rmse)

test_pred = svd.test(testset)
test_rmse = accuracy.rmse(test_pred, verbose=True)
print(test_rmse)

RMSE: 0.4174
0.41743675369333794
RMSE: 0.4176
0.4175909781120648


In [19]:
# Simple dummy regressor based on mean of training set

In [20]:
# Trivial baseline: ignores the features and always predicts the mean of the
# training labels.
model = DummyRegressor(strategy='mean')
model.fit(X_train, y_train_bin)

# RMSE = sqrt(MSE). mean_squared_error is documented as (y_true, y_pred), so
# the ground-truth labels are passed first. The value is the same either way
# for plain MSE, but the documented order keeps the call correct if sample
# weights or a different metric are used later.
val_pred = model.predict(X_val)
val_rmse = mean_squared_error(y_val_bin, val_pred)**0.5
print(val_rmse)

test_pred = model.predict(X_test)
test_rmse = mean_squared_error(y_test_bin, test_pred)**0.5
print(test_rmse)


0.4877543956196794
0.4878512727666714


In [26]:
# Regression Tree

In [27]:
# Regression tree fitted on the feature matrix; random_state=0 pins the
# estimator's internal randomness so the result is repeatable.
model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, y_train_bin)

# RMSE = sqrt(MSE). mean_squared_error is documented as (y_true, y_pred), so
# the ground-truth labels are passed first (numerically identical for MSE,
# but this is the order the API defines).
val_pred = model.predict(X_val)
val_rmse = mean_squared_error(y_val_bin, val_pred)**0.5
print(val_rmse)

test_pred = model.predict(X_test)
test_rmse = mean_squared_error(y_test_bin, test_pred)**0.5
print(test_rmse)


0.6307408173725418
0.630906075368746


In [4]:
# continuous labels on the 1-11 rating scale

In [5]:
# Targets on the 1-11 scale for each split. expand_dims adds a second axis so
# each label vector becomes a single column that can be joined column-wise
# with the feature matrix in the next cell.
y_train_cat = np.expand_dims(np.load("ML20_ytrCat.npy"), axis=1)
y_val_cat = np.expand_dims(np.load("ML20_yvalCat.npy"), axis=1)
y_test_cat = np.expand_dims(np.load("ML20_ytestCat.npy"), axis=1)

In [6]:
# Append the 1-11 label column to the right of each feature matrix so that
# every row reads (user id, item id, rating).
# NOTE: this rebinds train / val / test and the *_df frames, replacing any
# earlier values held under those names.
train = np.hstack((X_train, y_train_cat))
val = np.hstack((X_val, y_val_cat))
test = np.hstack((X_test, y_test_cat))

# Wrap each stacked array in a DataFrame, naming the columns at construction.
# Surprise's load_from_df expects exactly this column order.
train_df, val_df, test_df = (
    pd.DataFrame(stacked, columns=['user id', 'item id', 'ratings'])
    for stacked in (train, val, test)
)

In [7]:
# Surprise still requires a Reader for DataFrame input, but only its
# rating_scale is used; these labels range from 1 to 11.
reader = Reader(rating_scale=(1, 11))

# Training data: the DataFrame columns must be user id, item id, ratings
# (in that order).
trainset = (
    Dataset
    .load_from_df(train_df, reader)
    .build_full_trainset()
)

# Validation / test data: build a full trainset first, then convert it into
# the list-of-ratings "testset" form that algo.test() consumes.
valset, testset = (
    Dataset.load_from_df(frame, reader).build_full_trainset().build_testset()
    for frame in (val_df, test_df)
)

In [8]:
# Matrix-factorisation baseline on the 1-11 labels: Surprise's SVD with
# default hyper-parameters. SVD initialises its factors randomly, so the seed
# is fixed here; without it the RMSE values printed below change on every
# Restart & Run All.
svd = SVD(random_state=0)
svd.fit(trainset)

# accuracy.rmse(verbose=True) prints "RMSE: x.xxxx" itself; the extra print
# shows the returned value at full precision.
val_pred = svd.test(valset)
val_rmse = accuracy.rmse(val_pred, verbose=True)
print(val_rmse)

test_pred = svd.test(testset)
test_rmse = accuracy.rmse(test_pred, verbose=True)
print(test_rmse)

RMSE: 1.6751
1.67510723506543
RMSE: 1.6763
1.6763265024694802


In [9]:
# Simple dummy regressor based on mean of training set

In [10]:
# Trivial baseline on the 1-11 labels: ignores the features and always
# predicts the mean of the training labels.
model = DummyRegressor(strategy='mean')
model.fit(X_train, y_train_cat)

# RMSE = sqrt(MSE). mean_squared_error is documented as (y_true, y_pred), so
# the ground-truth labels are passed first. The value is the same either way
# for plain MSE, but the documented order keeps the call correct if sample
# weights or a different metric are used later.
val_pred = model.predict(X_val)
val_rmse = mean_squared_error(y_val_cat, val_pred)**0.5
print(val_rmse)

test_pred = model.predict(X_test)
test_rmse = mean_squared_error(y_test_cat, test_pred)**0.5
print(test_rmse)


2.104242741821585
2.10419212748119


In [11]:
# Regression Tree

In [12]:
# Regression tree fitted on the feature matrix against the 1-11 labels;
# random_state=0 pins the estimator's internal randomness so the result is
# repeatable.
model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, y_train_cat)

# RMSE = sqrt(MSE). mean_squared_error is documented as (y_true, y_pred), so
# the ground-truth labels are passed first (numerically identical for MSE,
# but this is the order the API defines).
val_pred = model.predict(X_val)
val_rmse = mean_squared_error(y_val_cat, val_pred)**0.5
print(val_rmse)

test_pred = model.predict(X_test)
test_rmse = mean_squared_error(y_test_cat, test_pred)**0.5
print(test_rmse)


2.6582225966888666
2.6598047413416044
