In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, classification_report, make_scorer
from sklearn.svm import SVC
from surprise import Dataset, Reader, SVD
from train_valid_test_loader import load_train_valid_test_datasets
from xgboost import XGBClassifier

# Load user and movie information
user_info = pd.read_csv("../data_movie_lens_100k/user_info.csv")
movie_info = pd.read_csv("../data_movie_lens_100k/movie_info.csv")

# Drop unnecessary columns
user_info = user_info.drop(columns=['orig_user_id'])
# movie_info = movie_info.drop(columns=['orig_item_id'])
movie_info = movie_info.drop(columns=['orig_item_id', 'title'])

# Ensure proper column naming
user_info.columns = ['user_id', 'age', 'is_male']
# movie_info.columns = ['item_id', 'title', 'release_year']
movie_info.columns = ['item_id', 'release_year']

# Load datasets
train_tuple, valid_tuple, test_tuple, n_users, n_items = load_train_valid_test_datasets()

# Convert tuples to a `surprise` dataset
def tuple_to_surprise_dataset(tupl):
    ratings_dict = {
        "userID": tupl[0],
        "itemID": tupl[1],
        "rating": tupl[2],
    }
    df = pd.DataFrame(ratings_dict)
    reader = Reader(rating_scale=(1, 5))
    dataset = Dataset.load_from_df(df[["userID", "itemID", "rating"]], reader)
    return dataset

# Movie Title Feature Extraction using TF-IDF
# titles = movie_info['title'].fillna('')
# tfidf = TfidfVectorizer(max_features=100)
# title_vectors = tfidf.fit_transform(titles).toarray()
# title_dict = {item: title_vectors[i] for i, item in enumerate(movie_info['item_id'])}

# # Normalize or bucket movie years
movie_info['year_normalized'] = (movie_info['release_year'] - movie_info['release_year'].min()) / (movie_info['release_year'].max() - movie_info['release_year'].min())
year_dict = {item: movie_info.loc[movie_info['item_id'] == item, 'year_normalized'].values[0] for item in movie_info['item_id']}

# Function to create feature vectors
def create_features(user_id, item_id, user_factors, item_factors, n_factors):
    zero_vector = np.zeros(n_factors)

    user_emb = user_factors.get(user_id, zero_vector)
    item_emb = item_factors.get(item_id, zero_vector)

    # Add demographic features
    user_info_row = user_info[user_info['user_id'] == user_id]
    if not user_info_row.empty:
        is_male = user_info_row.iloc[0]['is_male']
        age = user_info_row.iloc[0]['age']
    else:
        is_male = 0
        age = 0

    # Add movie title and year features
    # title_emb = title_dict.get(item_id, np.zeros(100))
    year_emb = year_dict.get(item_id, 0)

    # return np.concatenate((user_emb, item_emb, [is_male, age], title_emb, [year_emb]))
    return np.concatenate((user_emb, item_emb, [is_male, age], [year_emb]))

# Prepare the final DataFrame for training
def prepare_features_and_labels(tupl, user_factors, item_factors, n_factors, threshold=4.5):
    ratings_df = pd.DataFrame({
        'userID': tupl[0],
        'itemID': tupl[1],
        'rating': tupl[2]
    })
    ratings_df['features'] = ratings_df.apply(lambda row: create_features(row['userID'], row['itemID'], user_factors, item_factors, n_factors), axis=1)
    ratings_df['label'] = (ratings_df['rating'] >= threshold).astype(int)
    X = np.stack(ratings_df['features'])
    y = ratings_df['label']
    return X, y

# Function to train and extract features using SVD
def train_svd_and_prepare_features(n_factors=100):
    # Train SVD model using the training set
    trainset = tuple_to_surprise_dataset(train_tuple).build_full_trainset()
    algo = SVD(n_factors=n_factors)
    algo.fit(trainset)

    # Extract user and item factors
    user_factors = {trainset.to_raw_uid(uid): algo.pu[uid] for uid in range(trainset.n_users)}
    item_factors = {trainset.to_raw_iid(iid): algo.qi[iid] for iid in range(trainset.n_items)}

    # Create features and labels
    X_train, y_train = prepare_features_and_labels(train_tuple, user_factors, item_factors, n_factors)
    X_valid, y_valid = prepare_features_and_labels(valid_tuple, user_factors, item_factors, n_factors)
    X_test, y_test = prepare_features_and_labels(test_tuple, user_factors, item_factors, n_factors)

    # Combine training and validation for model selection
    X_combined = np.vstack((X_train, X_valid))
    y_combined = np.hstack((y_train, y_valid))

    return X_combined, y_combined, X_test, y_test

def tune_and_evaluate_xgboost(X_combined, y_combined, X_test, y_test):
    param_grid = {
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [3, 6, 10],
        'n_estimators': [50, 100, 150],
        'subsample': [0.7, 0.8, 0.9]
    }

    xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    grid_search = GridSearchCV(xgb, param_grid, scoring='roc_auc', cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), verbose=10, n_jobs=-1)
    grid_search.fit(X_combined, y_combined)

    best_xgb = grid_search.best_estimator_
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation AUC Score: {grid_search.best_score_:.4f}")

    # Test the best model
    y_pred_proba = best_xgb.predict_proba(X_test)[:, 1]
    y_pred = best_xgb.predict(X_test)

    print(f"\nTest Set AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# Evaluate models for different `n_factors`

X_combined, y_combined, X_test, y_test = train_svd_and_prepare_features(2)
tune_and_evaluate_xgboost(X_combined, y_combined, X_test, y_test)


Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV 1/5; 4/81] START learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7
[CV 4/5; 2/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8
[CV 5/5; 3/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.9
[CV 3/5; 3/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.9
[CV 2/5; 3/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.9
[CV 4/5; 3/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.9
[CV 2/5; 2/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8
[CV 1/5; 1/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.7
[CV 3/5; 2/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8
[CV 1/5; 2/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8
[CV 4/5; 1/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.7
[C

In [11]:
X_combined, y_combined, X_test, y_test = train_svd_and_prepare_features(10)
tune_and_evaluate_xgboost(X_combined, y_combined, X_test, y_test)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV 1/5; 1/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.7
[CV 2/5; 1/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.7
[CV 3/5; 1/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.7
[CV 4/5; 1/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.7
[CV 5/5; 1/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.7
[CV 1/5; 2/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8
[CV 2/5; 2/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8
[CV 3/5; 2/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8
[CV 4/5; 2/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8
[CV 5/5; 2/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8
[CV 1/5; 3/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.9
[CV

In [12]:

X_combined, y_combined, X_test, y_test = train_svd_and_prepare_features(50)
tune_and_evaluate_xgboost(X_combined, y_combined, X_test, y_test)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV 1/5; 1/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.7
[CV 2/5; 1/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.7
[CV 3/5; 1/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.7
[CV 4/5; 1/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.7
[CV 5/5; 1/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.7
[CV 1/5; 2/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8
[CV 2/5; 2/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8
[CV 3/5; 2/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8
[CV 4/5; 2/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8
[CV 5/5; 2/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8
[CV 1/5; 3/81] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.9
[CV



[CV 1/5; 7/81] START learning_rate=0.01, max_depth=3, n_estimators=150, subsample=0.7
[CV 1/5; 4/81] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7;, score=0.672 total time=   3.7s
[CV 2/5; 7/81] START learning_rate=0.01, max_depth=3, n_estimators=150, subsample=0.7
[CV 3/5; 4/81] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7;, score=0.665 total time=   3.3s
[CV 2/5; 4/81] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7;, score=0.666 total time=   3.3s
[CV 4/5; 4/81] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7;, score=0.668 total time=   3.3s
[CV 3/5; 7/81] START learning_rate=0.01, max_depth=3, n_estimators=150, subsample=0.7
[CV 4/5; 7/81] START learning_rate=0.01, max_depth=3, n_estimators=150, subsample=0.7
[CV 5/5; 7/81] START learning_rate=0.01, max_depth=3, n_estimators=150, subsample=0.7
[CV 5/5; 4/81] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7;, score=0.664 tota

In [13]:
from sklearn.ensemble import GradientBoostingClassifier
def tune_and_evaluate_gboost(X_combined, y_combined, X_test, y_test):
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1.0]
    }

    gb_model = GradientBoostingClassifier(random_state=42)
    gb_grid_search = GridSearchCV(gb_model, param_grid, scoring='roc_auc', cv=5, verbose=10, n_jobs=-1)
    gb_grid_search.fit(X_combined, y_combined)

    best_gb = gb_grid_search.best_estimator_
    print(f"Best Parameters: {gb_grid_search.best_params_}")
    print(f"Best Cross-Validation AUC Score: {gb_grid_search.best_score_:.4f}")

    # Test the best model
    y_pred_proba = best_gb.predict_proba(X_test)[:, 1]
    y_pred = best_gb.predict(X_test)

    print(f"\nTest Set AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

In [14]:
for n_factors in [2,10,50]:
    X_combined, y_combined, X_test, y_test = train_svd_and_prepare_features(n_factors)
    tune_and_evaluate_gboost(X_combined, y_combined, X_test, y_test)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV 5/5; 3/54] START learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8[CV 2/5; 2/54] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=1.0
[CV 1/5; 2/54] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=1.0
[CV 4/5; 1/54] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8
[CV 3/5; 2/54] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=1.0[CV 5/5; 2/54] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=1.0

[CV 4/5; 2/54] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=1.0[CV 1/5; 4/54] START learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0
[CV 4/5; 3/54] START learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8


[CV 5/5; 1/54] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8
[CV 1/5; 1/54] START learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8


In [15]:
from sklearn.linear_model import LogisticRegression

def tune_and_evaluate_lr(X_combined, y_combined, X_test, y_test):
    param_grid = {
        'C': [0.01, 0.1, 1, 10, 100],
    }

    gb_model = LogisticRegression(max_iter=1000)
    gb_grid_search = GridSearchCV(gb_model, param_grid, scoring='roc_auc', cv=5, verbose=10, n_jobs=-1)
    gb_grid_search.fit(X_combined, y_combined)

    best_xgb = gb_grid_search.best_estimator_
    print(f"Best Parameters: {gb_grid_search.best_params_}")
    print(f"Best Cross-Validation AUC Score: {gb_grid_search.best_score_:.4f}")

    # Test the best model
    y_pred_proba = best_xgb.predict_proba(X_test)[:, 1]
    y_pred = best_xgb.predict(X_test)

    print(f"\nTest Set AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

In [16]:
for n_factors in [2,10,50]:
    X_combined, y_combined, X_test, y_test = train_svd_and_prepare_features(n_factors)
    tune_and_evaluate_lr(X_combined, y_combined, X_test, y_test)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5; 1/5] START C=0.01......................................................
[CV 2/5; 1/5] START C=0.01......................................................
[CV 3/5; 1/5] START C=0.01......................................................
[CV 4/5; 1/5] START C=0.01......................................................
[CV 5/5; 1/5] START C=0.01......................................................
[CV 1/5; 1/5] END .......................C=0.01;, score=0.615 total time=   0.1s
[CV 1/5; 2/5] START C=0.1.......................................................
[CV 2/5; 1/5] END .......................C=0.01;, score=0.597 total time=   0.2s
[CV 1/5; 4/5] START C=10........................................................
[CV 3/5; 1/5] END .......................C=0.01;, score=0.603 total time=   0.2s
[CV 3/5; 4/5] START C=10........................................................
[CV 4/5; 1/5] END .......................C=0.01;,

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5; 1/5] START C=0.01......................................................
[CV 2/5; 1/5] START C=0.01......................................................
[CV 3/5; 1/5] START C=0.01......................................................
[CV 4/5; 1/5] START C=0.01......................................................
[CV 5/5; 1/5] START C=0.01......................................................
[CV 1/5; 2/5] START C=0.1.......................................................
[CV 2/5; 2/5] START C=0.1.......................................................
[CV 3/5; 2/5] START C=0.1.......................................................
[CV 4/5; 2/5] START C=0.1.......................................................
[CV 5/5; 2/5] START C=0.1.......................................................
[CV 1/5; 3/5] START C=1.........................................................
[CV 2/5; 3/5] START C=1..........................

In [23]:
import matplotlib.pyplot as plt

num_factors = [2,10,50]

classifiers_metrics = {
    "GradientBoostingClassifier": [0.7708, 0.7999, 0.8418],
    "LogisticRegression": [0.6039, 0.6273, 0.6641]
}

metrics_df = pd.DataFrame(classifiers_metrics, index=num_factors)
metrics_df = metrics_df.transpose()
metrics_df.index.name = 'Classifier'
metrics_df.columns.name = 'Number of Factors'

print(metrics_df)

Number of Factors               2       10      50
Classifier                                        
GradientBoostingClassifier  0.7708  0.7999  0.8418
LogisticRegression          0.6039  0.6273  0.6641


In [24]:
best_gd = GradientBoostingClassifier(
    learning_rate= 0.2, max_depth= 5, n_estimators= 200, subsample = 0.8
)


In [31]:

# Train SVD model using the training set
trainset = tuple_to_surprise_dataset(train_tuple).build_full_trainset()
algo = SVD(n_factors=50)
algo.fit(trainset)

# Extract user and item factors
user_factors = {trainset.to_raw_uid(uid): algo.pu[uid] for uid in range(trainset.n_users)}
item_factors = {trainset.to_raw_iid(iid): algo.qi[iid] for iid in range(trainset.n_items)}

# Create features and labels
X_train, y_train = prepare_features_and_labels(train_tuple, user_factors, item_factors, n_factors)
X_valid, y_valid = prepare_features_and_labels(valid_tuple, user_factors, item_factors, n_factors)
X_test, y_test = prepare_features_and_labels(test_tuple, user_factors, item_factors, n_factors)


In [32]:
# Combine training and validation for model selection
X_combined = np.vstack((X_train, X_valid, X_test))
y_combined = np.hstack((y_train, y_valid, y_test))

In [33]:
best_gd.fit(X_combined, y_combined)


In [41]:
final_set_te_df = pd.read_csv("../data_movie_lens_100k/ratings_masked_leaderboard_set.csv")

final_te_data_tuple = (
    final_set_te_df["user_id"].values,
    final_set_te_df["item_id"].values
 )


In [43]:
ratings_dict = {
    "userID": final_te_data_tuple[0],
    "itemID": final_te_data_tuple[1],
}
df = pd.DataFrame(ratings_dict)
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(df[["userID", "itemID"]], reader)

te_set = tuple_to_surprise_dataset(final_te_data_tuple).build_full_trainset()
user_factors = {te_set.to_raw_uid(uid): algo.pu[uid] for uid in range(te_set.n_users)}
item_factors = {te_set.to_raw_iid(iid): algo.qi[iid] for iid in range(te_set.n_items)}

IndexError: tuple index out of range

In [None]:
# Create features and labels
X_test, y_test = prepare_features_and_labels(final_te_data_tuple, user_factors, item_factors, n_factors)