In [None]:
!pip install surprise

import pandas as pd
from surprise import Dataset
from surprise import Reader

from google.colab import drive
drive.mount('/content/drive')

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163346 sha256=133bf8862e7cd1bd3cd9888a2fb7a6fda35b01fca39a0022abc4acc0b131468b
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1
Mounted at /content/drive


In [None]:
# Article metadata: tab-separated, no header row, so column names are supplied here.
news_df = pd.read_csv(
    '/content/drive/MyDrive/295A/MINDlarge_train/news.tsv',
    sep='\t',
    header=None,
    names=[
        'NewsID', 'Category', 'Subcategory', 'Title',
        'Abstract', 'URL', 'Title Entities', 'Abstract Entities',
    ],
)

# Impression logs: one row per impression, with the shown items packed into 'Impressions'.
behaviors_df = pd.read_csv(
    '/content/drive/MyDrive/295A/MINDlarge_train/behaviors.tsv',
    sep='\t',
    header=None,
    names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions'],
)

In [None]:
# Narrow both frames to the columns the rest of the notebook reads.
news_df = news_df.loc[:, ['NewsID']]
behaviors_df = behaviors_df.loc[:, ['UserID', 'Impressions']]

# Preview the first rows of the narrowed behaviour log.
behaviors_df.head()

Unnamed: 0,UserID,Impressions
0,U87243,N78206-0 N26368-0 N7578-0 N58592-0 N19858-0 N5...
1,U598644,N47996-0 N82719-0 N117066-0 N8491-0 N123784-0 ...
2,U532401,N103852-0 N53474-0 N127836-0 N47925-1
3,U593596,N38902-0 N76434-0 N71593-0 N100073-0 N108736-0...
4,U239687,N76209-0 N48841-0 N67937-0 N62235-0 N6307-0 N3...


In [None]:
# (rows, columns) of the behaviour log after it was narrowed to UserID / Impressions.
behaviors_df.shape

(2232748, 2)

In [None]:
# Keep only impression logs that contain at least one item suffixed "-1" and at
# least one suffixed "-0", i.e. rows with both label values present.
# regex=False does a plain substring match; na=False turns a missing Impressions
# value into False instead of NaN, which would make the boolean indexing raise.
mask = (
    behaviors_df['Impressions'].str.contains('-1', regex=False, na=False)
    & behaviors_df['Impressions'].str.contains('-0', regex=False, na=False)
)
balanced_behaviors = behaviors_df[mask]

In [None]:
# Sample 15% of the mixed-label impression logs. The fixed seed makes the subset,
# and every number computed from it, repeatable after a kernel restart.
sampled_behaviors = balanced_behaviors.sample(frac=0.15, random_state=42)
print(sampled_behaviors.shape)

# Flatten each impression log into one [user, news, label] record per shown item.
# Zipping the two columns avoids the per-row Series construction of iterrows().
interactions = []
for user_id, impression_log in zip(sampled_behaviors['UserID'], sampled_behaviors['Impressions']):
    # split() with no argument tolerates repeated or trailing whitespace.
    for impression in impression_log.split():
        # The label is whatever follows the last hyphen, so split from the right once.
        news_id, interaction = impression.rsplit('-', 1)
        interactions.append([user_id, news_id, int(interaction)])

interactions_df = pd.DataFrame(interactions, columns=['UserID', 'NewsID', 'Rating'])

(334912, 2)


In [None]:
# Ratings are the 0/1 impression labels, so the rating scale is declared as 0..1.
reader = Reader(rating_scale=(0, 1))

# Surprise expects the columns in (user, item, rating) order.
data = Dataset.load_from_df(
    interactions_df.loc[:, ['UserID', 'NewsID', 'Rating']],
    reader,
)

In [None]:
from surprise import SVD, SVDpp, NMF, accuracy
from surprise.model_selection import train_test_split

# Hold out 20% of the interactions for evaluation; the fixed seed keeps the split repeatable.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Build and train the NMF model (seeded so the factor initialisation is repeatable).
nmf = NMF(random_state=42)
nmf.fit(trainset)
predictions_nmf = nmf.test(testset)

# verbose=False: the accuracy helpers otherwise print the metric themselves,
# duplicating the labelled prints below.
rmse_nmf = accuracy.rmse(predictions_nmf, verbose=False)
mae_nmf = accuracy.mae(predictions_nmf, verbose=False)

print("NMF RMSE:", rmse_nmf)
print("NMF MAE:", mae_nmf)

RMSE: 0.2003
MAE:  0.0448
NMF RMSE: 0.20034395387571743
NMF MAE: 0.04481896349614981


In [None]:
from sklearn.metrics import roc_auc_score
import numpy as np

def compute_auc(predictions):
    """Return the ROC AUC of the estimated ratings against the true ratings.

    Reads ``r_ui`` (used as the label) and ``est`` (used as the score) from
    every prediction and hands both sequences to ``roc_auc_score``.
    """
    labels = list(map(lambda prediction: prediction.r_ui, predictions))
    estimates = list(map(lambda prediction: prediction.est, predictions))
    return roc_auc_score(labels, estimates)

def compute_ndcg(predictions, k=10):
    """Return the mean per-user NDCG@k over a collection of predictions.

    Each prediction is unpacked as a 5-tuple ``(uid, iid, true_rating, est,
    details)``; only the user id, the true rating and the estimate are used.

    For every user the items are ranked by estimate (highest first) and the
    DCG of the top ``k`` true ratings is divided by the DCG of the ideal
    ordering (items ranked by true rating). A user whose ideal DCG is zero
    contributes 0 to the mean rather than being skipped.
    """
    def dcg_at_k(ranked):
        # 0-based position p is discounted by log2(p + 2), so the top slot has weight 1.
        total = 0.0
        for position, (_, relevance) in enumerate(ranked[:k]):
            total += relevance / np.log2(position + 2)
        return total

    def single_user_ndcg(pairs):
        by_estimate = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        by_truth = sorted(by_estimate, key=lambda pair: pair[1], reverse=True)
        achieved = dcg_at_k(by_estimate)
        best_possible = dcg_at_k(by_truth)
        if best_possible == 0:
            return 0
        return achieved / best_possible

    # Group (estimate, true rating) pairs by user, preserving first-seen order.
    per_user = {}
    for uid, _, true_r, est, _ in predictions:
        per_user.setdefault(uid, []).append((est, true_r))

    return np.mean([single_user_ndcg(pairs) for pairs in per_user.values()])

In [None]:
# Ranking-quality metrics for the NMF predictions: AUC first, then NDCG at cutoffs 10 and 5.
auc_nmf = compute_auc(predictions_nmf)
ndcg10_nmf, ndcg5_nmf = (
    compute_ndcg(predictions_nmf, k=cutoff) for cutoff in (10, 5)
)

# Report each metric on its own labelled line.
for label, value in (
    ("NMF AUC:", auc_nmf),
    ("NMF NDCG@10:", ndcg10_nmf),
    ("NMF NDCG@5:", ndcg5_nmf),
):
    print(label, value)

NMF AUC: 0.4959536676562538
NMF NDCG@10: 0.20115974379559498
NMF NDCG@5: 0.17960114144442693
