In [None]:
# latest

In [None]:
!pip install surprise

import pandas as pd
from surprise import Dataset
from surprise import Reader

from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths read later in this
# notebook live under that mount point.
drive.mount('/content/drive')

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163350 sha256=6f3fadb7c91a3cbc0f14bc46035b4df6e8b2ae13f29c211dd7bebbb5e7fe5ad3
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1
Mounted at /content/drive


In [None]:
# Load the news and behaviors tab-separated files from the mounted Drive
# folder. Both are read with header=None, so column names are given explicitly.
news_columns = [
    'NewsID', 'Category', 'Subcategory', 'Title',
    'Abstract', 'URL', 'Title Entities', 'Abstract Entities',
]
behavior_columns = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']

news_df = pd.read_csv(
    '/content/drive/MyDrive/295A/MINDlarge_train/news.tsv',
    sep='\t',
    header=None,
    names=news_columns,
)
behaviors_df = pd.read_csv(
    '/content/drive/MyDrive/295A/MINDlarge_train/behaviors.tsv',
    sep='\t',
    header=None,
    names=behavior_columns,
)

In [None]:
# Narrow both tables: keep just the article id from the news table, and the
# user id plus the raw impression string from the behaviors table.
news_df = news_df.loc[:, ['NewsID']]
behaviors_df = behaviors_df.loc[:, ['UserID', 'Impressions']]

# Preview the first rows of the trimmed behaviors table.
behaviors_df.head()

Unnamed: 0,UserID,Impressions
0,U87243,N78206-0 N26368-0 N7578-0 N58592-0 N19858-0 N5...
1,U598644,N47996-0 N82719-0 N117066-0 N8491-0 N123784-0 ...
2,U532401,N103852-0 N53474-0 N127836-0 N47925-1
3,U593596,N38902-0 N76434-0 N71593-0 N100073-0 N108736-0...
4,U239687,N76209-0 N48841-0 N67937-0 N62235-0 N6307-0 N3...


In [None]:
# (rows, columns) of the behaviors table, shown before any filtering or sampling.
behaviors_df.shape

(2232748, 2)

In [None]:
# Keep only impression logs that contain BOTH labels: at least one item tagged
# "-1" and at least one tagged "-0" (presumably clicked / not clicked -- the
# integer suffix becomes the rating later on). Note that "balanced" here means
# "both classes present", not "equal class counts".
#
# regex=False: the patterns are plain substrings, so skip regex compilation.
# na=False:    a missing Impressions value counts as "no match" instead of
#              putting NaN into the mask, which would make the boolean
#              indexing below raise.
has_positive = behaviors_df['Impressions'].str.contains('-1', regex=False, na=False)
has_negative = behaviors_df['Impressions'].str.contains('-0', regex=False, na=False)
mask = has_positive & has_negative
balanced_behaviors = behaviors_df[mask]

In [None]:
# Sample 15% from balanced behaviors. The seed is fixed so the sample -- and
# every metric computed from it further down -- is reproducible after a
# kernel restart.
sampled_behaviors = balanced_behaviors.sample(frac=0.15, random_state=42)
print(sampled_behaviors.shape)

# Extract interactions from sampled behaviors: each impression string
# ("N123-0 N456-1 ...") expands into one [UserID, NewsID, Rating] row per
# listed article.
#   * zip over the two columns avoids the per-row Series that iterrows() builds.
#   * split() tolerates repeated or trailing whitespace between tokens.
#   * rpartition('-') splits on the LAST hyphen, so only the trailing label is
#     parsed as the integer rating.
interactions = [
    [user_id, news_id, int(label)]
    for user_id, impressions in zip(
        sampled_behaviors['UserID'], sampled_behaviors['Impressions']
    )
    for news_id, _, label in (
        token.rpartition('-') for token in impressions.split()
    )
]

interactions_df = pd.DataFrame(interactions, columns=['UserID', 'NewsID', 'Rating'])

(401895, 2)


In [None]:
# Wrap the interaction table in a Surprise Dataset. Ratings are the 0/1 labels
# parsed from the impression strings, hence the (0, 1) rating scale.
rating_columns = ['UserID', 'NewsID', 'Rating']
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(interactions_df[rating_columns], reader)

In [None]:
from surprise import SVD, SVDpp, accuracy
from surprise.model_selection import train_test_split

# Split data into train and test set (20% held out). The split is seeded so
# the same interactions land in the test set on every run.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Build and train the SVD model with default hyper-parameters; seeding the
# model makes its random initialisation, and so the metrics, reproducible.
svd = SVD(random_state=42)
svd.fit(trainset)
predictions_svd = svd.test(testset)

# verbose=False: accuracy.rmse/mae print their value by default, which
# duplicated the labelled prints below.
rmse_svd = accuracy.rmse(predictions_svd, verbose=False)
mae_svd = accuracy.mae(predictions_svd, verbose=False)

print("SVD RMSE:", rmse_svd)
print("SVD MAE:", mae_svd)

RMSE: 0.2153
MAE:  0.1077
SVD RMSE: 0.21531060385526446
SVD MAE: 0.10772619204522202


In [None]:
# Train SVD++ on the same train/test split as the SVD model. The model is
# seeded so its random initialisation, and so the metrics, are reproducible.
svdpp = SVDpp(random_state=42)
svdpp.fit(trainset)
predictions_svdpp = svdpp.test(testset)

# accuracy.rmse / accuracy.mae print the metric and also return it.
rmse_svdpp = accuracy.rmse(predictions_svdpp)
mae_svdpp = accuracy.mae(predictions_svdpp)

RMSE: 0.2177
MAE:  0.1112
SVD RMSE: 0.21771650003964746
SVD MAE: 0.11119217052632945


In [None]:
# Report the SVD++ error metrics computed in the previous cell.
for label, value in (("SVDpp RMSE:", rmse_svdpp), ("SVDpp MAE:", mae_svdpp)):
    print(label, value)

SVDpp RMSE: 0.21771650003964746
SVDpp MAE: 0.11119217052632945


In [None]:
from sklearn.metrics import roc_auc_score
import numpy as np

def compute_auc(predictions):
    """Return the ROC AUC of estimated scores against true ratings.

    Each element of ``predictions`` must expose ``r_ui`` (the true rating)
    and ``est`` (the model's estimate). All predictions are pooled into a
    single AUC computation rather than being grouped per user.
    """
    y_true = []
    y_score = []
    for prediction in predictions:
        y_true.append(prediction.r_ui)
        y_score.append(prediction.est)
    return roc_auc_score(y_true, y_score)

def compute_ndcg(predictions, k=10, skip_users_without_positives=False):
    """Return the mean per-user NDCG@k over a collection of predictions.

    Predictions are grouped by user id. For every user the items are ranked
    by estimated score (descending), the DCG of the top ``k`` true ratings is
    computed with a ``1 / log2(rank + 1)`` discount (rank starting at 1), and
    the result is divided by the DCG of the ideal ordering (items sorted by
    true rating).

    Parameters
    ----------
    predictions : iterable of 5-tuples
        Each element unpacks as ``(uid, iid, true_r, est, details)``; only
        ``uid``, ``true_r`` and ``est`` are used.
    k : int, default 10
        Rank cutoff.
    skip_users_without_positives : bool, default False
        A user whose ideal DCG is 0 (no positive true rating in the top
        ``k`` of the ideal ordering) has an undefined NDCG. By default such
        users contribute 0 to the mean, which is the historical behaviour of
        this function. Pass ``True`` to leave them out of the mean instead.

    Returns
    -------
    float
        Mean NDCG@k, or ``0.0`` when there is no user to average over
        (previously this produced NaN and a NumPy RuntimeWarning).
    """
    # Group (estimate, true rating) pairs per user in a single pass.
    by_user = {}
    for uid, _, true_r, est, _ in predictions:
        by_user.setdefault(uid, []).append((est, true_r))

    def dcg_at_k(relevances):
        # rank is 0-based, hence log2(rank + 2) for the standard discount.
        return sum(rel / np.log2(rank + 2) for rank, rel in enumerate(relevances[:k]))

    per_user_scores = []
    for items in by_user.values():
        ideal_dcg = dcg_at_k(sorted((true_r for _, true_r in items), reverse=True))
        if ideal_dcg == 0:
            if not skip_users_without_positives:
                per_user_scores.append(0.0)
            continue
        # sorted() is stable, so tied estimates keep their input order.
        ranked = sorted(items, key=lambda pair: pair[0], reverse=True)
        per_user_scores.append(dcg_at_k([true_r for _, true_r in ranked]) / ideal_dcg)

    if not per_user_scores:
        return 0.0
    return float(np.mean(per_user_scores))

In [None]:
# Ranking-quality metrics for the SVD predictions: AUC over all test
# predictions, then NDCG at cutoffs 10 and 5.
auc_svd = compute_auc(predictions_svd)
ndcg10_svd, ndcg5_svd = (
    compute_ndcg(predictions_svd, k=cutoff) for cutoff in (10, 5)
)

# Printing the metrics
for label, value in (
    ("SVD AUC:", auc_svd),
    ("SVD NDCG@10:", ndcg10_svd),
    ("SVD NDCG@5:", ndcg5_svd),
):
    print(label, value)

SVD AUC: 0.6742085303899098
SVD NDCG@10: 0.20859465962934035
SVD NDCG@5: 0.1877216642889651
