In [1]:
import autograd.numpy as ag_np
import numpy as np
import pandas as pd
import os

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

from CollabFilterOneVectorPerItem import CollabFilterOneVectorPerItem
from train_valid_test_loader import load_train_valid_test_datasets

DATA_DIR = './data_movie_lens_100k'

import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split


In [2]:
train_tuple, valid_tuple, test_tuple, n_users, n_items = load_train_valid_test_datasets()

In [3]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import pandas as pd

user_id_train, item_id_train, y_train = train_tuple
user_id_valid, item_id_valid, y_valid = valid_tuple
user_id_test, item_id_test, y_test = test_tuple

train_df = pd.DataFrame({'user_id': user_id_train, 'item_id': item_id_train, 'rating': y_train})
valid_df = pd.DataFrame({'user_id': user_id_valid, 'item_id': item_id_valid, 'rating': y_valid})
test_df = pd.DataFrame({'user_id': user_id_test, 'item_id': item_id_test})

user_means = train_df.groupby('user_id')['rating'].mean().rename("user_mean_rating")
item_means = train_df.groupby('item_id')['rating'].mean().rename("item_mean_rating")

train_df = train_df.merge(user_means, on='user_id').merge(item_means, on='item_id')
valid_df = valid_df.merge(user_means, on='user_id', how='left').merge(item_means, on='item_id', how='left')
test_df = test_df.merge(user_means, on='user_id', how='left').merge(item_means, on='item_id', how='left')

valid_df.fillna({'user_mean_rating': train_df['rating'].mean(),
                 'item_mean_rating': train_df['rating'].mean()}, inplace=True)
test_df.fillna({'user_mean_rating': train_df['rating'].mean(),
                'item_mean_rating': train_df['rating'].mean()}, inplace=True)

X_train = train_df.drop(columns=["rating"])
y_train = train_df["rating"]
X_valid = valid_df.drop(columns=["rating"])
y_valid = valid_df["rating"]
X_test = test_df 

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
dtest = xgb.DMatrix(X_test)

# tuned hyperparameters
params = {
    "objective": "reg:squarederror",
    "eval_metric": "mae",
    "max_depth": 6,
    "eta": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8
}

evals = [(dtrain, "train"), (dvalid, "eval")]
xgb_model = xgb.train(params, dtrain, num_boost_round=1000, evals=evals, early_stopping_rounds=50)

y_pred_test = xgb_model.predict(dtest)

test_df["rating"] = y_test 
test_mae = mean_absolute_error(test_df["rating"], y_pred_test)
print(f"Test MAE: {test_mae:.4f}")

[0]	train-mae:0.91408	eval-mae:0.91812
[1]	train-mae:0.89500	eval-mae:0.90169
[2]	train-mae:0.86940	eval-mae:0.87915
[3]	train-mae:0.85483	eval-mae:0.86626
[4]	train-mae:0.84186	eval-mae:0.85479
[5]	train-mae:0.82207	eval-mae:0.83760
[6]	train-mae:0.80559	eval-mae:0.82335
[7]	train-mae:0.79575	eval-mae:0.81514
[8]	train-mae:0.78397	eval-mae:0.80510
[9]	train-mae:0.77796	eval-mae:0.80003
[10]	train-mae:0.77288	eval-mae:0.79568
[11]	train-mae:0.76517	eval-mae:0.78931
[12]	train-mae:0.76161	eval-mae:0.78652
[13]	train-mae:0.75868	eval-mae:0.78405
[14]	train-mae:0.75452	eval-mae:0.78095
[15]	train-mae:0.75004	eval-mae:0.77750
[16]	train-mae:0.74650	eval-mae:0.77494
[17]	train-mae:0.74352	eval-mae:0.77263
[18]	train-mae:0.74147	eval-mae:0.77112
[19]	train-mae:0.73967	eval-mae:0.77001
[20]	train-mae:0.73826	eval-mae:0.76914
[21]	train-mae:0.73704	eval-mae:0.76843
[22]	train-mae:0.73525	eval-mae:0.76735
[23]	train-mae:0.73353	eval-mae:0.76625
[24]	train-mae:0.73217	eval-mae:0.76536
[25]	train

[203]	train-mae:0.68982	eval-mae:0.75359
[204]	train-mae:0.68970	eval-mae:0.75364
[205]	train-mae:0.68951	eval-mae:0.75341
[206]	train-mae:0.68931	eval-mae:0.75339
[207]	train-mae:0.68913	eval-mae:0.75331
[208]	train-mae:0.68895	eval-mae:0.75328
[209]	train-mae:0.68885	eval-mae:0.75326
[210]	train-mae:0.68869	eval-mae:0.75326
[211]	train-mae:0.68848	eval-mae:0.75318
[212]	train-mae:0.68834	eval-mae:0.75321
[213]	train-mae:0.68822	eval-mae:0.75315
[214]	train-mae:0.68815	eval-mae:0.75310
[215]	train-mae:0.68805	eval-mae:0.75309
[216]	train-mae:0.68790	eval-mae:0.75313
[217]	train-mae:0.68776	eval-mae:0.75330
[218]	train-mae:0.68766	eval-mae:0.75330
[219]	train-mae:0.68753	eval-mae:0.75332
[220]	train-mae:0.68740	eval-mae:0.75341
[221]	train-mae:0.68729	eval-mae:0.75341
[222]	train-mae:0.68720	eval-mae:0.75339
[223]	train-mae:0.68705	eval-mae:0.75347
[224]	train-mae:0.68694	eval-mae:0.75344
[225]	train-mae:0.68676	eval-mae:0.75347
[226]	train-mae:0.68660	eval-mae:0.75338
[227]	train-mae:

[403]	train-mae:0.66450	eval-mae:0.75197
[404]	train-mae:0.66437	eval-mae:0.75196
[405]	train-mae:0.66423	eval-mae:0.75189
[406]	train-mae:0.66414	eval-mae:0.75183
[407]	train-mae:0.66401	eval-mae:0.75181
[408]	train-mae:0.66389	eval-mae:0.75175
[409]	train-mae:0.66379	eval-mae:0.75179
[410]	train-mae:0.66362	eval-mae:0.75183
[411]	train-mae:0.66350	eval-mae:0.75178
[412]	train-mae:0.66348	eval-mae:0.75184
[413]	train-mae:0.66341	eval-mae:0.75191
[414]	train-mae:0.66327	eval-mae:0.75187
[415]	train-mae:0.66315	eval-mae:0.75188
[416]	train-mae:0.66301	eval-mae:0.75187
[417]	train-mae:0.66287	eval-mae:0.75193
[418]	train-mae:0.66278	eval-mae:0.75186
[419]	train-mae:0.66264	eval-mae:0.75183
[420]	train-mae:0.66253	eval-mae:0.75181
[421]	train-mae:0.66240	eval-mae:0.75181
[422]	train-mae:0.66231	eval-mae:0.75172
[423]	train-mae:0.66224	eval-mae:0.75172
[424]	train-mae:0.66209	eval-mae:0.75179
[425]	train-mae:0.66201	eval-mae:0.75177
[426]	train-mae:0.66188	eval-mae:0.75175
[427]	train-mae:

In [4]:
import pandas as pd

leaderboard_df = pd.read_csv("data_movie_lens_100k/ratings_masked_leaderboard_set.csv")

leaderboard_df = leaderboard_df.merge(user_means, on="user_id", how="left")
leaderboard_df = leaderboard_df.merge(item_means, on="item_id", how="left")

leaderboard_df.fillna({
    "user_mean_rating": train_df["rating"].mean(),
    "item_mean_rating": train_df["rating"].mean()
}, inplace=True)

X_leaderboard = leaderboard_df.drop(columns=["rating"])

dleaderboard = xgb.DMatrix(X_leaderboard)

In [5]:
y_pred_leaderboard = xgb_model.predict(dleaderboard)

In [6]:
with open("predicted_ratings_leaderboard.txt", "w") as f:
    for pred in y_pred_leaderboard:
        f.write(f"{pred}\n")

print("Predictions saved to predicted_ratings_leaderboard.txt")

Predictions saved to predicted_ratings_leaderboard.txt


In [7]:
predictions = np.loadtxt("predicted_ratings_leaderboard.txt")
print(f"Loaded predictions shape: {predictions.shape}")  # Should print (10000,)

Loaded predictions shape: (10000,)
