In [14]:
import autograd.numpy as ag_np
import numpy as np
import pandas as pd
import os

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

from CollabFilterOneVectorPerItem import CollabFilterOneVectorPerItem
from train_valid_test_loader import load_train_valid_test_datasets

DATA_DIR = './data_movie_lens_100k'

import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [15]:
train_tuple, valid_tuple, test_tuple, n_users, n_items = load_train_valid_test_datasets()

In [22]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import pandas as pd

user_id_train, item_id_train, y_train = train_tuple
user_id_valid, item_id_valid, y_valid = valid_tuple
user_id_test, item_id_test, y_test = test_tuple

train_df = pd.DataFrame({'user_id': user_id_train, 'item_id': item_id_train, 'rating': y_train})
valid_df = pd.DataFrame({'user_id': user_id_valid, 'item_id': item_id_valid, 'rating': y_valid})
test_df = pd.DataFrame({'user_id': user_id_test, 'item_id': item_id_test})

user_means = train_df.groupby('user_id')['rating'].mean().rename("user_mean_rating")
item_means = train_df.groupby('item_id')['rating'].mean().rename("item_mean_rating")

train_df = train_df.merge(user_means, on='user_id').merge(item_means, on='item_id')
valid_df = valid_df.merge(user_means, on='user_id', how='left').merge(item_means, on='item_id', how='left')
test_df = test_df.merge(user_means, on='user_id', how='left').merge(item_means, on='item_id', how='left')

valid_df.fillna({'user_mean_rating': train_df['rating'].mean(),
                 'item_mean_rating': train_df['rating'].mean()}, inplace=True)
test_df.fillna({'user_mean_rating': train_df['rating'].mean(),
                'item_mean_rating': train_df['rating'].mean()}, inplace=True)

X_train = train_df.drop(columns=["rating"])
y_train = train_df["rating"]
X_valid = valid_df.drop(columns=["rating"])
y_valid = valid_df["rating"]
X_test = test_df 

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
dtest = xgb.DMatrix(X_test)

# tuned hyperparameters
params = {
    "objective": "reg:absoluteerror",
    "eval_metric": "mae",
    "max_depth": 6,
    "eta": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8
}

evals = [(dtrain, "train"), (dvalid, "eval")]
xgb_model = xgb.train(params, dtrain, num_boost_round=1000, evals=evals, early_stopping_rounds=50)

y_pred_test = xgb_model.predict(dtest)

test_df["rating"] = y_test 
test_mae = mean_absolute_error(test_df["rating"], y_pred_test)
print(f"Test MAE: {test_mae:.4f}")

[0]	train-mae:0.86929	eval-mae:0.86908
[1]	train-mae:0.85482	eval-mae:0.85781
[2]	train-mae:0.83460	eval-mae:0.84126
[3]	train-mae:0.82326	eval-mae:0.83173
[4]	train-mae:0.81291	eval-mae:0.82345
[5]	train-mae:0.79748	eval-mae:0.81050
[6]	train-mae:0.78420	eval-mae:0.79944
[7]	train-mae:0.77519	eval-mae:0.79260
[8]	train-mae:0.76548	eval-mae:0.78471
[9]	train-mae:0.76020	eval-mae:0.78053
[10]	train-mae:0.75549	eval-mae:0.77688
[11]	train-mae:0.74873	eval-mae:0.77188
[12]	train-mae:0.74528	eval-mae:0.76928
[13]	train-mae:0.74211	eval-mae:0.76690
[14]	train-mae:0.73808	eval-mae:0.76391
[15]	train-mae:0.73311	eval-mae:0.76024
[16]	train-mae:0.72889	eval-mae:0.75706
[17]	train-mae:0.72504	eval-mae:0.75412
[18]	train-mae:0.72269	eval-mae:0.75251
[19]	train-mae:0.72060	eval-mae:0.75116
[20]	train-mae:0.71920	eval-mae:0.75022
[21]	train-mae:0.71797	eval-mae:0.74943
[22]	train-mae:0.71554	eval-mae:0.74759
[23]	train-mae:0.71344	eval-mae:0.74597
[24]	train-mae:0.71145	eval-mae:0.74457
[25]	train

[203]	train-mae:0.67980	eval-mae:0.73233
[204]	train-mae:0.67972	eval-mae:0.73238
[205]	train-mae:0.67969	eval-mae:0.73236
[206]	train-mae:0.67961	eval-mae:0.73239
[207]	train-mae:0.67950	eval-mae:0.73232
[208]	train-mae:0.67947	eval-mae:0.73232
[209]	train-mae:0.67938	eval-mae:0.73228
[210]	train-mae:0.67930	eval-mae:0.73227
[211]	train-mae:0.67920	eval-mae:0.73224
[212]	train-mae:0.67916	eval-mae:0.73221
[213]	train-mae:0.67914	eval-mae:0.73219
[214]	train-mae:0.67910	eval-mae:0.73219
[215]	train-mae:0.67898	eval-mae:0.73220
[216]	train-mae:0.67890	eval-mae:0.73222
[217]	train-mae:0.67881	eval-mae:0.73227
[218]	train-mae:0.67871	eval-mae:0.73220
[219]	train-mae:0.67866	eval-mae:0.73220
[220]	train-mae:0.67851	eval-mae:0.73218
[221]	train-mae:0.67843	eval-mae:0.73214
[222]	train-mae:0.67841	eval-mae:0.73212
[223]	train-mae:0.67832	eval-mae:0.73204
[224]	train-mae:0.67825	eval-mae:0.73203
[225]	train-mae:0.67820	eval-mae:0.73206
[226]	train-mae:0.67815	eval-mae:0.73210
[227]	train-mae:

In [18]:
import pandas as pd

leaderboard_df = pd.read_csv("data_movie_lens_100k/ratings_masked_leaderboard_set.csv")

leaderboard_df = leaderboard_df.merge(user_means, on="user_id", how="left")
leaderboard_df = leaderboard_df.merge(item_means, on="item_id", how="left")

leaderboard_df.fillna({
    "user_mean_rating": train_df["rating"].mean(),
    "item_mean_rating": train_df["rating"].mean()
}, inplace=True)

X_leaderboard = leaderboard_df.drop(columns=["rating"])

dleaderboard = xgb.DMatrix(X_leaderboard)

In [19]:
y_pred_leaderboard = xgb_model.predict(dleaderboard)

In [20]:
with open("predicted_ratings_leaderboard.txt", "w") as f:
    for pred in y_pred_leaderboard:
        f.write(f"{pred}\n")

print("Predictions saved to predicted_ratings_leaderboard.txt")

Predictions saved to predicted_ratings_leaderboard.txt


In [21]:
predictions = np.loadtxt("predicted_ratings_leaderboard.txt")
print(f"Loaded predictions shape: {predictions.shape}")  # Should print (10000,)

Loaded predictions shape: (10000,)
