# Ensemble #

### Train the SVD ###

In [None]:

from surprise import SVD, Reader, Dataset
from surprise.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import os

# --- Side information: load it, one-hot encode it, join it onto the ratings ---
# NOTE: the SVD further down only reads user_id / item_id / rating from
# train_data, so the encoded columns built here are not used by that model.
data_folder = 'data_movie_lens_100k'

user_info, movie_info = (
    pd.read_csv(os.path.join(data_folder, csv_name))
    for csv_name in ('user_info.csv', 'movie_info.csv')
)

# Users: bucket the raw age into five labelled bins, then one-hot encode the
# bucket together with the is_male flag.
age_bin_edges = [0, 18, 30, 40, 50, 100]
age_bin_labels = ["0-18", "19-30", "31-40", "41-50", "50+"]
user_info['age_group'] = pd.cut(user_info['age'], bins=age_bin_edges, labels=age_bin_labels)

user_feature_cols = ['age_group', 'is_male']
encoder = OneHotEncoder(sparse_output=False)  # dense array so it drops straight into a DataFrame
encoded_user_info = encoder.fit_transform(user_info[user_feature_cols])
user_info_encoded = pd.DataFrame(
    encoded_user_info,
    columns=encoder.get_feature_names_out(user_feature_cols),
)

# Movies: treat the release year as a categorical (string) value and one-hot it.
movie_info['release_year'] = movie_info['release_year'].astype(str)

movie_feature_cols = ['release_year']
encoder_movie = OneHotEncoder(sparse_output=False)
encoded_movie_info = encoder_movie.fit_transform(movie_info[movie_feature_cols])
movie_info_encoded = pd.DataFrame(
    encoded_movie_info,
    columns=encoder_movie.get_feature_names_out(movie_feature_cols),
)

# Put the encoded columns next to the original ones (row-aligned by position).
user_info = pd.concat([user_info, user_info_encoded], axis=1)
movie_info = pd.concat([movie_info, movie_info_encoded], axis=1)

# Ratings, with user and movie columns attached; left joins keep every rating row.
train_data = (
    pd.read_csv(os.path.join(data_folder, "ratings_all_development_set.csv"))
    .merge(user_info, on="user_id", how="left")
    .merge(movie_info, on="item_id", how="left")
)

# --- Surprise dataset, hold-out split, and SVD hyper-parameter search ---
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split

# Surprise consumes plain (user, item, rating) triples on a declared scale.
rating_columns = ['user_id', 'item_id', 'rating']
reader = Reader(rating_scale=(1, 5))
train_dataset = Dataset.load_from_df(train_data[rating_columns], reader)

# 80/20 split of the ratings; no seed is passed, so the split differs per run.
trainset, testset = train_test_split(train_dataset, test_size=0.2)

# Candidate SVD settings; every combination is cross-validated.
param_grid = dict(
    n_factors=[50, 100, 150, 200],  # latent dimensions
    reg_all=[0.1, 0.2, 0.3, 0.5],   # regularization strength
    lr_all=[0.001, 0.002, 0.005],   # learning rate
    n_epochs=[30, 50, 100],         # training epochs
)

# 3-fold CV over the whole dataset, scored by MAE.
grid_search = GridSearchCV(SVD, param_grid, measures=['mae'], cv=3)
grid_search.fit(train_dataset)


# --- Score the tuned SVD on held-out ratings, then refit it on everything ---
from surprise import accuracy

# Evaluate on ratings the model was NOT fitted on. `trainset` / `testset` are
# the 80/20 split created above; a throw-away model with the winning
# hyper-parameters is fitted on the 80% part and scored on the 20% part.
# (Scoring a model on the very ratings it was trained on reports training
# error, not test error.)
# NOTE(review): the grid search itself cross-validated over the full dataset,
# so the hyper-parameters have still seen these rows; the number below is a
# mildly optimistic estimate, not a clean test score.
holdout_svd = SVD(**grid_search.best_params['mae'])
holdout_svd.fit(trainset)
predictions = holdout_svd.test(testset)

# Evaluate MAE on the held-out split
mae = accuracy.mae(predictions)
print(f"Test MAE: {mae}")

# Final model for downstream use: best configuration refitted on ALL ratings.
best_svd = grid_search.best_estimator['mae']
trainset = train_dataset.build_full_trainset()
best_svd.fit(trainset)



### Train the XGB ###

In [None]:

import autograd.numpy as ag_np
import numpy as np
import pandas as pd
import os

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

from CollabFilterOneVectorPerItem import CollabFilterOneVectorPerItem
from train_valid_test_loader import load_train_valid_test_datasets

DATA_DIR = './data_movie_lens_100k'

import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

train_tuple, valid_tuple, test_tuple, n_users, n_items = load_train_valid_test_datasets()


import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import pandas as pd

# --- Build train / valid / test frames with mean-rating features for XGBoost ---
# Each split tuple unpacks into (user ids, item ids, ratings).
user_id_train, item_id_train, y_train = train_tuple
user_id_valid, item_id_valid, y_valid = valid_tuple
user_id_test, item_id_test, y_test = test_tuple

train_df = pd.DataFrame({'user_id': user_id_train, 'item_id': item_id_train, 'rating': y_train})
valid_df = pd.DataFrame({'user_id': user_id_valid, 'item_id': item_id_valid, 'rating': y_valid})
# The test frame carries no rating column at this point.
test_df = pd.DataFrame({'user_id': user_id_test, 'item_id': item_id_test})

# Per-user and per-item average rating, computed from the training split only.
ratings_by_user = train_df.groupby('user_id')['rating']
ratings_by_item = train_df.groupby('item_id')['rating']
user_means = ratings_by_user.mean().rename("user_mean_rating")
item_means = ratings_by_item.mean().rename("item_mean_rating")

# Attach both averages to every split. Valid/test use left joins so rows whose
# user or item never appears in training are kept (with NaN averages).
train_df = train_df.merge(user_means, on='user_id')
train_df = train_df.merge(item_means, on='item_id')
valid_df = valid_df.merge(user_means, on='user_id', how='left')
valid_df = valid_df.merge(item_means, on='item_id', how='left')
test_df = test_df.merge(user_means, on='user_id', how='left')
test_df = test_df.merge(item_means, on='item_id', how='left')

# Unseen users/items fall back to the overall training mean rating.
overall_train_mean = train_df['rating'].mean()
fallback_means = {
    'user_mean_rating': overall_train_mean,
    'item_mean_rating': overall_train_mean,
}
valid_df = valid_df.fillna(fallback_means)
test_df = test_df.fillna(fallback_means)

# Features are every column except the rating; labels are the rating.
y_train = train_df["rating"]
X_train = train_df.drop(columns=["rating"])
y_valid = valid_df["rating"]
X_valid = valid_df.drop(columns=["rating"])
X_test = test_df  # same object as test_df (no copy)

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
dtest = xgb.DMatrix(X_test)

# --- Train XGBoost with early stopping and score it on the test split ---
# tuned hyperparameters
params = {
    "objective": "reg:absoluteerror",
    "eval_metric": "mae",
    "max_depth": 6,
    "eta": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8
}

# With several eval sets, xgboost's early stopping monitors the LAST one,
# i.e. the validation split here.
evals = [(dtrain, "train"), (dvalid, "eval")]
xgb_model = xgb.train(params, dtrain, num_boost_round=1000, evals=evals, early_stopping_rounds=50)

# The returned booster still contains the rounds trained after the best one
# (up to `early_stopping_rounds` of them). Whether predict() ignores them by
# default depends on the xgboost version, so the range is pinned explicitly.
best_round_count = xgb_model.best_iteration + 1
y_pred_test = xgb_model.predict(dtest, iteration_range=(0, best_round_count))

# The rating column is attached only now, after dtest was built, so the label
# was never part of the feature matrix used for prediction.
test_df["rating"] = y_test
test_mae = mean_absolute_error(test_df["rating"], y_pred_test)
print(f"Test MAE: {test_mae:.4f}")


# import pandas as pd

# leaderboard_df = pd.read_csv("data_movie_lens_100k/ratings_masked_leaderboard_set.csv")

# leaderboard_df = leaderboard_df.merge(user_means, on="user_id", how="left")
# leaderboard_df = leaderboard_df.merge(item_means, on="item_id", how="left")

# leaderboard_df.fillna({
#     "user_mean_rating": train_df["rating"].mean(),
#     "item_mean_rating": train_df["rating"].mean()
# }, inplace=True)

# X_leaderboard = leaderboard_df.drop(columns=["rating"])

# dleaderboard = xgb.DMatrix(X_leaderboard)

# y_pred_leaderboard = xgb_model.predict(dleaderboard)



### Train the KNN ###

In [None]:
import pandas as pd
import os
from surprise import Reader, Dataset, KNNBasic
from surprise.model_selection import GridSearchCV

# --- Side information for the KNN cell: load, encode, merge with ratings ---
# NOTE: KNNBasic below only receives user_id / item_id / rating, so the
# encoded columns produced here are not consumed by that model.
from sklearn.preprocessing import OneHotEncoder

data_folder = 'data_movie_lens_100k'
user_info_path = os.path.join(data_folder, 'user_info.csv')
movie_info_path = os.path.join(data_folder, 'movie_info.csv')
ratings_path = os.path.join(data_folder, "ratings_all_development_set.csv")

user_info = pd.read_csv(user_info_path)
movie_info = pd.read_csv(movie_info_path)

# Derived categorical columns: a five-way age bucket for users and the
# release year as a string for movies.
user_info['age_group'] = pd.cut(
    user_info['age'],
    bins=[0, 18, 30, 40, 50, 100],
    labels=["0-18", "19-30", "31-40", "41-50", "50+"],
)
movie_info['release_year'] = movie_info['release_year'].astype(str)

# One dense one-hot encoder per table.
encoder = OneHotEncoder(sparse_output=False)
encoder_movie = OneHotEncoder(sparse_output=False)

user_categoricals = ['age_group', 'is_male']
movie_categoricals = ['release_year']
encoded_user_info = encoder.fit_transform(user_info[user_categoricals])
encoded_movie_info = encoder_movie.fit_transform(movie_info[movie_categoricals])

# Wrap the encoded arrays in DataFrames named after the encoder's output features.
user_onehot_names = encoder.get_feature_names_out(user_categoricals)
movie_onehot_names = encoder_movie.get_feature_names_out(movie_categoricals)
user_info_encoded = pd.DataFrame(encoded_user_info, columns=user_onehot_names)
movie_info_encoded = pd.DataFrame(encoded_movie_info, columns=movie_onehot_names)

# Append the one-hot columns to the original tables.
user_info = pd.concat([user_info, user_info_encoded], axis=1)
movie_info = pd.concat([movie_info, movie_info_encoded], axis=1)

# Ratings with user and movie columns left-joined on their ids.
train_data = pd.read_csv(ratings_path)
train_data = train_data.merge(user_info, on="user_id", how="left")
train_data = train_data.merge(movie_info, on="item_id", how="left")

# --- Tune KNNBasic by cross-validated MAE, then refit on all ratings ---
reader = Reader(rating_scale=(1, 5))
rating_triples = train_data[['user_id', 'item_id', 'rating']]
surprise_data = Dataset.load_from_df(rating_triples, reader)

# Search space: neighbourhood size x similarity measure x user/item based.
neighbour_counts = [20, 30, 50, 100]
similarity_choices = {
    'name': ['cosine', 'pearson', 'msd'],
    'user_based': [True, False]
}
param_grid = {'k': neighbour_counts, 'sim_options': similarity_choices}

# 5-fold CV on every combination; n_jobs=-1 asks for all available cores.
gs = GridSearchCV(KNNBasic, param_grid, measures=['mae'], cv=5, n_jobs=-1)
gs.fit(surprise_data)

# Report the winning configuration.
print(f"Best MAE: {gs.best_score['mae']}")
print(f"Best Parameters: {gs.best_params['mae']}")

# Fit the winning configuration on the complete dataset for later prediction.
trainset = surprise_data.build_full_trainset()
best_model = gs.best_estimator['mae']
best_model.fit(trainset)




### Weighted Average Ensemble ###

In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
from surprise import Dataset, Reader, KNNBasic, SVD, accuracy
from surprise.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder

# --- Ensemble cell: reload the raw tables and rebuild the joined training frame ---
data_folder = 'data_movie_lens_100k'


def _read_table(csv_name):
    """Read one CSV from the data folder."""
    return pd.read_csv(os.path.join(data_folder, csv_name))


user_info = _read_table('user_info.csv')
movie_info = _read_table('movie_info.csv')
train_data = _read_table("ratings_all_development_set.csv")

# User side: age bucket + is_male, one-hot encoded into a dense array.
user_info['age_group'] = pd.cut(
    user_info['age'],
    bins=[0, 18, 30, 40, 50, 100],
    labels=["0-18", "19-30", "31-40", "41-50", "50+"],
)
encoder = OneHotEncoder(sparse_output=False)
encoded_user_info = encoder.fit_transform(user_info[['age_group', 'is_male']])
user_info_encoded = pd.DataFrame(
    encoded_user_info,
    columns=encoder.get_feature_names_out(['age_group', 'is_male']),
)

# Movie side: release year as a string category, one-hot encoded.
movie_info['release_year'] = movie_info['release_year'].astype(str)
encoder_movie = OneHotEncoder(sparse_output=False)
encoded_movie_info = encoder_movie.fit_transform(movie_info[['release_year']])
movie_info_encoded = pd.DataFrame(
    encoded_movie_info,
    columns=encoder_movie.get_feature_names_out(['release_year']),
)

# Original columns followed by their one-hot expansions.
user_info = pd.concat([user_info, user_info_encoded], axis=1)
movie_info = pd.concat([movie_info, movie_info_encoded], axis=1)

# Every rating row gets its user's and its movie's columns (left joins).
train_data = train_data.merge(user_info, on="user_id", how="left")
train_data = train_data.merge(movie_info, on="item_id", how="left")

# --- Build aligned train / test feature matrices for the ensemble's XGBoost ---
# Relies on user_id_test, item_id_test and y_test from the split unpacked in
# the XGB cell.
test_data = pd.DataFrame({'user_id': user_id_test, 'item_id': item_id_test, 'rating': y_test})
test_data = pd.merge(test_data, user_info, on="user_id", how="left")
test_data = pd.merge(test_data, movie_info, on="item_id", how="left")

non_feature_cols = ['user_id', 'item_id', 'rating']  # ids and the target

# Training matrix defines the reference column set. drop_first=True removes
# the first level of each encoded categorical column.
X_train = train_data.drop(columns=non_feature_cols)
X_train = pd.get_dummies(X_train, drop_first=True)
# Dummy names can collide with existing one-hot columns; keep the first of each.
X_train = X_train.loc[:, ~X_train.columns.duplicated()]

# Test matrix: encode WITHOUT drop_first. With drop_first applied separately,
# the level dropped from the test frame is the first level present in the
# test rows, which need not be the level dropped from training. The reindex
# below would then zero-fill a real category. Encoding all levels and letting
# the reindex keep exactly the training columns gives a consistent encoding:
# the training baseline level maps to all zeros, every other known level maps
# to its own column, and levels never seen in training are discarded.
X_test = test_data.drop(columns=non_feature_cols)
X_test = pd.get_dummies(X_test)
X_test = X_test.loc[:, ~X_test.columns.duplicated()]

# Align columns (and their order) with the training matrix.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Plain NumPy arrays are handed to DMatrix.
X_test_np = X_test.values
X_train_np = X_train.values

dtrain = xgb.DMatrix(X_train_np, label=train_data['rating'].values)
dtest = xgb.DMatrix(X_test_np)  # same column layout as dtrain

# --- Train the ensemble's XGBoost model ---
params = {
    "objective": "reg:absoluteerror",
    "eval_metric": "mae",
    "max_depth": 6,
    "eta": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8
}

# Early stopping is only meaningful when it watches rows the booster is not
# being fitted on; monitoring the training matrix itself is not a signal of
# generalisation. So: hold out 10% of the training rows, use them to find
# the best number of boosting rounds, then refit on all training rows with
# that round count.
holdout_rng = np.random.default_rng(0)  # fixed seed keeps the slice reproducible
shuffled_rows = holdout_rng.permutation(X_train_np.shape[0])
n_holdout = max(1, len(shuffled_rows) // 10)
holdout_rows = shuffled_rows[:n_holdout]
fit_rows = shuffled_rows[n_holdout:]

train_labels = train_data['rating'].values
dfit = xgb.DMatrix(X_train_np[fit_rows], label=train_labels[fit_rows])
dholdout = xgb.DMatrix(X_train_np[holdout_rows], label=train_labels[holdout_rows])

# The last entry of `evals` is the one early stopping monitors.
evals = [(dfit, "train"), (dholdout, "eval")]
round_finder = xgb.train(params, dfit, num_boost_round=1000, evals=evals, early_stopping_rounds=50)
best_round_count = round_finder.best_iteration + 1

# Final model: all training rows, exactly the selected number of rounds, so
# every later predict() call uses the same trees.
xgb_model = xgb.train(params, dtrain, num_boost_round=best_round_count)

# Get predictions from XGBoost model
xgb_pred = xgb_model.predict(dtest)

# --- Fixed-weight average of the XGBoost, KNN and SVD predictions ---
# best_model (KNN) and best_svd come from the earlier cells and are already fitted.
test_pairs = list(zip(test_data['user_id'], test_data['item_id']))

# Surprise predicts one (user, item) pair at a time; `.est` is the estimated rating.
knn_pred = np.array([best_model.predict(uid, iid).est for uid, iid in test_pairs])
svd_pred = np.array([best_svd.predict(uid, iid).est for uid, iid in test_pairs])

# Hand-picked blend weights (they are normalised by their sum below).
w_xgb = 0.5  # xgb_model
w_knn = 0.3  # best_model (KNN)
w_svd = 0.2  # best_svd

weighted_sum = w_xgb * xgb_pred + w_knn * knn_pred + w_svd * svd_pred
weight_total = w_xgb + w_knn + w_svd
ensemble_pred = weighted_sum / weight_total

# Score the blend against the test ratings.
test_mae = mean_absolute_error(y_test, ensemble_pred)
print(f"Ensemble MAE: {test_mae:.4f}")

### Meta-Model Ensemble ###

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

# --- Stacked ensemble: a linear meta-model over the three base predictors ---
# The meta-model must be trained on base predictions for rows the base models
# did NOT see while fitting. Feeding it in-sample predictions rewards whichever
# base model overfits the training ratings most. So the training meta-features
# are built out-of-fold: for each fold, fresh copies of the three base models
# are fitted on the other folds and predict the held-out fold.
#
# Uses objects created in earlier cells: xgb_model / params / X_train_np /
# dtest / test_data / y_test (ensemble cell), gs and best_model (KNN cell),
# grid_search and best_svd (SVD cell), and `reader`.
meta_target_train = train_data['rating'].values
rating_triples = train_data[['user_id', 'item_id', 'rating']]
n_train_rows = len(train_data)
n_boost_rounds = xgb_model.num_boosted_rounds()  # same size as the full model

xgb_pred_train = np.empty(n_train_rows)
knn_pred_train = np.empty(n_train_rows)
svd_pred_train = np.empty(n_train_rows)

fold_splitter = KFold(n_splits=5, shuffle=True, random_state=0)
for fit_rows, held_rows in fold_splitter.split(X_train_np):
    # XGBoost on the fold's fitting rows only.
    fold_dfit = xgb.DMatrix(X_train_np[fit_rows], label=meta_target_train[fit_rows])
    fold_xgb = xgb.train(params, fold_dfit, num_boost_round=n_boost_rounds)
    xgb_pred_train[held_rows] = fold_xgb.predict(xgb.DMatrix(X_train_np[held_rows]))

    # KNN and SVD with the tuned hyper-parameters, fitted on the same rows.
    fold_trainset = Dataset.load_from_df(rating_triples.iloc[fit_rows], reader).build_full_trainset()
    fold_knn = KNNBasic(**gs.best_params['mae'])
    fold_knn.fit(fold_trainset)
    fold_svd = SVD(**grid_search.best_params['mae'])
    fold_svd.fit(fold_trainset)

    held_pairs = list(zip(
        rating_triples['user_id'].iloc[held_rows],
        rating_triples['item_id'].iloc[held_rows],
    ))
    knn_pred_train[held_rows] = [fold_knn.predict(uid, iid).est for uid, iid in held_pairs]
    svd_pred_train[held_rows] = [fold_svd.predict(uid, iid).est for uid, iid in held_pairs]

# Create meta-features for the training set (one column per base model)
meta_features_train = np.column_stack((xgb_pred_train, knn_pred_train, svd_pred_train))

# Train the meta-model
meta_model = LinearRegression()
meta_model.fit(meta_features_train, meta_target_train)

# Test-set meta-features come from the base models fitted on all training data.
test_pairs = list(zip(test_data['user_id'], test_data['item_id']))
xgb_pred_test = xgb_model.predict(dtest)
knn_pred_test = np.array([best_model.predict(uid, iid).est for uid, iid in test_pairs])
svd_pred_test = np.array([best_svd.predict(uid, iid).est for uid, iid in test_pairs])
meta_features_test = np.column_stack((xgb_pred_test, knn_pred_test, svd_pred_test))

# Predict using the meta-model
ensemble_pred = meta_model.predict(meta_features_test)

# Evaluate the ensemble model
# NOTE(review): if the test split is drawn from the same development file the
# base models were fitted on, this score is optimistic — confirm the split.
test_mae = mean_absolute_error(y_test, ensemble_pred)
print(f"Ensemble MAE: {test_mae:.4f}")

# --- Predict the leaderboard set with the stacked ensemble and save to disk ---
# Load the leaderboard dataset and attach the same user / movie columns that
# the training frame received.
leaderboard_set = pd.read_csv(os.path.join(data_folder, 'ratings_masked_leaderboard_set.csv'))
leaderboard_set = pd.merge(leaderboard_set, user_info, on="user_id", how="left")
leaderboard_set = pd.merge(leaderboard_set, movie_info, on="item_id", how="left")

# Feature matrix for XGBoost: everything except the ids and the rating column.
X_leaderboard = leaderboard_set.drop(columns=['user_id', 'item_id', 'rating'])

# Encode every categorical level (no drop_first). The training matrix decides
# which columns survive via the reindex below. Dropping the first level here
# independently could remove a level that training kept, and the reindex
# would then silently zero-fill that real category.
X_leaderboard = pd.get_dummies(X_leaderboard)

# Dummy names can collide with existing one-hot columns; keep the first of each.
X_leaderboard = X_leaderboard.loc[:, ~X_leaderboard.columns.duplicated()]

# Same columns, same order as the training matrix; absent columns become 0.
X_leaderboard = X_leaderboard.reindex(columns=X_train.columns, fill_value=0)

# Plain NumPy array handed to DMatrix.
X_leaderboard_np = X_leaderboard.values
dleaderboard = xgb.DMatrix(X_leaderboard_np)

# Base-model predictions from the already-fitted models.
leaderboard_pairs = list(zip(leaderboard_set['user_id'], leaderboard_set['item_id']))
xgb_pred_leaderboard = xgb_model.predict(dleaderboard)
knn_pred_leaderboard = np.array([best_model.predict(uid, iid).est for uid, iid in leaderboard_pairs])
svd_pred_leaderboard = np.array([best_svd.predict(uid, iid).est for uid, iid in leaderboard_pairs])

# One meta-feature column per base model, in the order the meta-model was trained on.
meta_features_leaderboard = np.column_stack((xgb_pred_leaderboard, knn_pred_leaderboard, svd_pred_leaderboard))

# Predict using the meta-model
ensemble_pred_leaderboard = meta_model.predict(meta_features_leaderboard)

# One prediction per line, fixed-point formatted.
np.savetxt('predicted_ratings_ensemble_leaderboard.txt', ensemble_pred_leaderboard, fmt='%f')

print("Predictions saved as 'predicted_ratings_ensemble_leaderboard.txt'")