# Homework 2. Hybrid News RecSys : CS6365 Fall 2025

Justin Mittereder - G49843234

This approach will use the news article embeddings from the content based approach in Homework2_content.ipynb and the article_article_similarity matrix from Homework2_cf.ipynb to hopefully get better evaluation results on warm and cold users by combining both approaches. 

In [1]:
import pandas as pd
import datetime
import numpy as np

col_names  = ['Impression ID', 'User ID', 'Time', 'History', 'Impressions']
train_behaviors_df = pd.read_csv('data/MINDsmall_train/behaviors.tsv', sep='\t', header=None, names=col_names)
train_behaviors_df.sort_values(by=['User ID', 'Time'], inplace=True)
train_behaviors_df.reset_index(drop=True, inplace=True)
val_behaviors_df = pd.read_csv('data/MINDsmall_dev/behaviors.tsv', sep='\t', header=None, names=col_names)
val_behaviors_df.sort_values(by=['User ID', 'Time'], inplace=True)
val_behaviors_df.reset_index(drop=True, inplace=True)

col_names  = ['News ID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'Title Entities', 'Abstract Entities']
train_news_df = pd.read_csv('data/MINDsmall_train/news.tsv', sep='\t', header=None, names=col_names)
val_news_df = pd.read_csv('data/MINDsmall_dev/news.tsv', sep='\t', header=None, names=col_names)


In [2]:
train_behaviors_df["unix_timestamp"] = pd.to_datetime(
    train_behaviors_df["Time"], 
    format="%m/%d/%Y %I:%M:%S %p"
).astype("int64") // 1_000_000_000

val_behaviors_df["unix_timestamp"] = pd.to_datetime(
    val_behaviors_df["Time"], 
    format="%m/%d/%Y %I:%M:%S %p"
).astype("int64") // 1_000_000_000

earliest_train_time = datetime.datetime.fromtimestamp(train_behaviors_df['unix_timestamp'].min())
latest_train_time = datetime.datetime.fromtimestamp(train_behaviors_df['unix_timestamp'].max())
earliest_val_time = datetime.datetime.fromtimestamp(val_behaviors_df['unix_timestamp'].min())
latest_val_time = datetime.datetime.fromtimestamp(val_behaviors_df['unix_timestamp'].max())

print(f"Training impressions are from between {earliest_train_time} and {latest_train_time} . ")
print(f"Validation impressions are from between {earliest_val_time} and {latest_val_time} . ")


Training impressions are from between 2019-11-08 19:00:19 and 2019-11-14 18:59:13 . 
Validation impressions are from between 2019-11-14 19:00:01 and 2019-11-15 18:58:03 . 


In [3]:
import datetime

#get full df of all user impressions
def get_impressions(df, include_history=False):
    rows = []
    for idx, row in df.iterrows():
        user_id = row["User ID"]
        time = row["unix_timestamp"]
        impressions = row["Impressions"]

        # split impressions
        for impression in impressions.split():
            article, click = impression.split("-")
            click = int(click)

            rows.append({
                "user_id": user_id,
                "timestamp": time,
                "article": article,
                "impression": click
            })

        if(include_history):
            # split impressions
            history = str(row['History'])
            for article in history.split(" "):
                rows.append({
                    "user_id": user_id,
                    "timestamp": "N/A",
                    "article": article,
                    "impression": 1 
                })
        
    impressions_df = pd.DataFrame(rows)
    impressions_df.sort_values(by=['user_id', 'timestamp'], inplace=True)
    impressions_df.reset_index(drop=True, inplace=True)
    return impressions_df    


impressions_df = get_impressions(train_behaviors_df)
impressions_df = impressions_df[impressions_df['impression'] == 1]
print("Only Considering Clicked on Articles")
print("Train: Number of Impressions: ", len(impressions_df))
print("Train: Number of articles: ", impressions_df['article'].nunique())
print("Train: Number of Users: ", impressions_df['user_id'].nunique())
print(impressions_df.head())

val_impressions_df = get_impressions(val_behaviors_df)
val_impressions_df = val_impressions_df[val_impressions_df['impression'] == 1]
print("Only Considering Clicked on Articles")
print("Val: Number of Impressions: ", len(val_impressions_df))
print("Val: Number of articles: ", val_impressions_df['article'].nunique())
print("Val: Number of Users: ", val_impressions_df['user_id'].nunique())
print(val_impressions_df.head())

Only Considering Clicked on Articles
Train: Number of Impressions:  236344
Train: Number of articles:  7713
Train: Number of Users:  50000
    user_id   timestamp article  impression
3      U100  1573544052   N7800           1
134   U1000  1573686978  N53875           1
166   U1000  1573693256  N29739           1
180   U1000  1573693256   N7670           1
307   U1000  1573771041  N58656           1
Only Considering Clicked on Articles
Val: Number of Impressions:  111383
Val: Number of articles:  2212
Val: Number of Users:  50000
   user_id   timestamp article  impression
1       U1  1573802905  N20036           1
13     U10  1573798095  N32536           1
59  U10000  1573807178  N31958           1
63  U10000  1573811650  N50775           1
93  U10000  1573823809  N60215           1


Load Embeddings for each article in the validation set 

In [4]:
#load generated embeddings
val_news_df = pd.read_pickle("data/MINDsmall_dev/news_w_embeddings.pkl")
type(val_news_df.loc[0, "embedding"])

numpy.ndarray

In [5]:
#for validation users that have no history, set their History to empty list
val_behaviors_df["History"] = val_behaviors_df["History"].fillna("")
val_behaviors_df['History'] = val_behaviors_df['History'].apply(lambda x: x.split())

Get embeddings for each user that are the average of the embeddings for each article in the user's history

In [6]:
val_article_emb_dict = dict(zip(val_news_df["News ID"], val_news_df["embedding"]))
val_user_embeddings = {}

for user, rows in val_behaviors_df.groupby("User ID"):
    clicked = rows["History"].sum()  # concatenates all lists of clicked items
    clicked_embs = []
    for article_id in clicked: 
        if article_id in val_article_emb_dict: # if we have embedding for this article
            clicked_embs.append(val_article_emb_dict[article_id])

    if len(clicked_embs) == 0: #no clicked articles in user history
        continue
    #print(clicked_embs)
    val_user_embeddings[user] = np.mean(clicked_embs, axis=0) #gets average of the embeddings of all articles in validation user's history

Get User and article similarity by getting the dot product of both embeddings

In [7]:
#get all item ids
item_ids = list(val_article_emb_dict.keys())
#dictionary that maps item ids as keys and index as values
item_id_to_index = {item_id: idx for idx, item_id in enumerate(item_ids)}
#get all embeddings for each article into a matrix
item_matrix = np.vstack([val_article_emb_dict[i] for i in item_ids])  # shape (N_items, embedding_dim)
# transpose to (embedding_dim, N_items)
item_matrix = item_matrix.T  

In [8]:
user_item_scores = {}

for user, user_emb in val_user_embeddings.items():
    scores = np.dot(user_emb, item_matrix)    # shape (N_items,)
    user_item_scores[user] = scores

Map all User IDs and Article IDs to integers so that we can use them as row and columns in user-article matrix


In [9]:
from sklearn.preprocessing import LabelEncoder

user_encoder = LabelEncoder()
article_encoder = LabelEncoder()

impressions_df['user_idx'] = user_encoder.fit_transform(impressions_df['user_id']) #Map all User IDs to integers so we can use them for the rows in the user-article matrix 
impressions_df['article_idx'] = article_encoder.fit_transform(impressions_df['article']) #Map all Article IDs to integers so we can use them for the columns in the user-article matrix 

In [10]:
from scipy.sparse import csr_matrix
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

num_users = impressions_df['user_idx'].nunique()
num_articles = impressions_df['article_idx'].nunique()

user_article_matrix = csr_matrix((np.ones(len(impressions_df)),(impressions_df['user_idx'], impressions_df['article_idx'])),shape=(num_users, num_articles))
print("User Article Matrix Shape: ", user_article_matrix.shape)

#calculate the similarity between articles based on which users have interacted with which articles using cosine similarity function on transpose(user-article) matrix -> article-user matrix
article_article_sim = cosine_similarity(user_article_matrix.T) #items are similar if they've been clicekd on by similar users/have similar user impression vectors

User Article Matrix Shape:  (50000, 7713)


In [11]:
#dictionary that maps article Ids to their indices for CF article_article sim matrix
article_id_to_index = {
    article_id: idx
    for idx, article_id in enumerate(article_encoder.classes_)
}

#dictionary that maps indices to their article Ids , reverse of above
index_to_article_id = {
    idx: article_id
    for idx, article_id in enumerate(article_encoder.classes_)
}

def to_clicked_ids(clicked_idx):
    return [index_to_article_id[i] for i in clicked_idx if i in index_to_article_id]


In [12]:
val_behaviors_df.head()

Unnamed: 0,Impression ID,User ID,Time,History,Impressions,unix_timestamp
0,8158,U1,11/15/2019 7:28:25 AM,"[N23571, N58267, N25682, N10646, N32607, N5773...",N14637-0 N20036-1,1573802905
1,71327,U10,11/15/2019 6:08:15 AM,"[N27612, N36699, N64777, N9120, N9803, N57967,...",N33397-0 N46917-0 N11930-0 N58612-0 N47612-0 N...,1573798095
2,5144,U10000,11/15/2019 1:16:49 PM,"[N10059, N46978, N53234, N3345, N3345, N9155, ...",N48740-0 N51470-0 N1952-0 N23675-0 N56969-0 N6...,1573823809
3,41696,U10000,11/15/2019 8:39:38 AM,"[N10059, N46978, N53234, N3345, N3345, N9155, ...",N29393-0 N20036-0 N30290-0 N31958-1 N23513-0 N...,1573807178
4,1307,U10000,11/15/2019 9:54:10 AM,"[N10059, N46978, N53234, N3345, N3345, N9155, ...",N50775-1 N31958-0 N53572-0 N5472-0 N58251-0 N3...,1573811650


In [13]:
#split articles previously clicked by users in validation set
#val_behaviors_df["History_list"] = val_behaviors_df["History"].apply(lambda x: x.split() if isinstance(x, str) else [])
val_behaviors_df['History_list'] = val_behaviors_df['History']

#take all impressions in validation set that user actually clicked on
def get_clicked_articles(impressions_str):
    clicks = []
    for imp in impressions_str.split():
        article, clicked = imp.split("-")
        if int(clicked) == 1:
            clicks.append(article)
    return clicks

#create new col in val set called clicked_articles with all articles user viewed
val_behaviors_df['clicked_articles'] = val_behaviors_df['Impressions'].apply(get_clicked_articles)

In [14]:
val_behaviors_df.head()

Unnamed: 0,Impression ID,User ID,Time,History,Impressions,unix_timestamp,History_list,clicked_articles
0,8158,U1,11/15/2019 7:28:25 AM,"[N23571, N58267, N25682, N10646, N32607, N5773...",N14637-0 N20036-1,1573802905,"[N23571, N58267, N25682, N10646, N32607, N5773...",[N20036]
1,71327,U10,11/15/2019 6:08:15 AM,"[N27612, N36699, N64777, N9120, N9803, N57967,...",N33397-0 N46917-0 N11930-0 N58612-0 N47612-0 N...,1573798095,"[N27612, N36699, N64777, N9120, N9803, N57967,...",[N32536]
2,5144,U10000,11/15/2019 1:16:49 PM,"[N10059, N46978, N53234, N3345, N3345, N9155, ...",N48740-0 N51470-0 N1952-0 N23675-0 N56969-0 N6...,1573823809,"[N10059, N46978, N53234, N3345, N3345, N9155, ...",[N60215]
3,41696,U10000,11/15/2019 8:39:38 AM,"[N10059, N46978, N53234, N3345, N3345, N9155, ...",N29393-0 N20036-0 N30290-0 N31958-1 N23513-0 N...,1573807178,"[N10059, N46978, N53234, N3345, N3345, N9155, ...",[N31958]
4,1307,U10000,11/15/2019 9:54:10 AM,"[N10059, N46978, N53234, N3345, N3345, N9155, ...",N50775-1 N31958-0 N53572-0 N5472-0 N58251-0 N3...,1573811650,"[N10059, N46978, N53234, N3345, N3345, N9155, ...",[N50775]


In [15]:
print("Total articles in validation candidate set:", len(item_ids))

Total articles in validation candidate set: 42416


In [16]:
# helper function
def to_idx_list(article_list, article_id_to_index):
    return [article_id_to_index[a] for a in article_list if a in article_id_to_index]

# history and clicked articles as indices
val_behaviors_df['history_idx'] = val_behaviors_df['History_list'].apply(lambda x: to_idx_list(x, article_id_to_index))
val_behaviors_df['clicked_idx'] = val_behaviors_df['clicked_articles'].apply(lambda x: to_idx_list(x, article_id_to_index))

# Pre-filter history indices to only include articles in CF similarity matrix
val_behaviors_df['history_idx_cf'] = val_behaviors_df['history_idx'].apply(
    lambda idxs: [i for i in idxs if i < article_article_sim.shape[0]]
)

In [17]:
alpha = 0.3  # weight for CF vs CB

results = []

for i, row in val_behaviors_df.iterrows():
    user = row['User ID']
    history_idx = row['history_idx_cf']
    clicked_idx = row['clicked_idx']

    #collaborative filtering scores if there is history in article_article sim matrix for user
    cf_scores = []
    for item_id in item_ids:  # all validation articles
        if item_id in article_id_to_index:  # article exists in CF (training set)
            c = article_id_to_index[item_id]  # index in CF matrix
            if len(history_idx) > 0:
                sims = article_article_sim[c, history_idx]
                cf_scores.append(sims.max())
            else:
                cf_scores.append(0.0)  # user has no CF-known history
        else:
            cf_scores.append(0.0)  # validation-only article, CF not defined
    cf_scores = np.array(cf_scores)

    #content based scores using the users embedding and every other article in val set
    if user in val_user_embeddings:
        user_emb = val_user_embeddings[user]  # mean of history embeddings
        cb_scores = np.dot(user_emb, item_matrix)  # dot product with all articles
    else:
        cb_scores = np.zeros(len(item_ids))  # user has no history at all

    #combined score using cf and cb
    hybrid_scores = alpha * cf_scores + (1 - alpha) * cb_scores

    #rank all articles for recommendations, get top 50 to reduce memory load
    top_k = 50  # store top 50 for eval metrics
    ranked_idx = np.argpartition(hybrid_scores, -top_k)[-top_k:]
    ranked_idx = ranked_idx[np.argsort(hybrid_scores[ranked_idx])[::-1]]
    ranked_items = [item_ids[j] for j in ranked_idx]

    clicked_ids = to_clicked_ids(clicked_idx) #convert encoded clicked articles back to regular articles for eval metrics

    results.append({
        'Impression ID': row['Impression ID'],
        'ranked_items': ranked_items,
        'scores': hybrid_scores[ranked_idx],
        'clicked_idx': clicked_ids
    })


In [18]:
def hit_at_k_multi(ranked_items, clicked_idx, k):
    return int(any(item in ranked_items[:k] for item in clicked_idx))

def reciprocal_rank_multi(ranked_items, clicked_idx):
    for rank, item in enumerate(ranked_items, 1):
        if item in clicked_idx:
            return 1.0 / rank
    return 0.0

def ndcg_at_k_multi(ranked_items, clicked_idx, k):
    dcg = 0.0
    for rank, item in enumerate(ranked_items[:k], 1):
        if item in clicked_idx:
            dcg += 1.0 / np.log2(rank + 1)
    # ideal DCG
    idcg = sum(1.0 / np.log2(i + 1) for i in range(1, min(k, len(clicked_idx)) + 1))
    return dcg / idcg if idcg > 0 else 0.0


In [19]:
mrr_list = []
hit5_list = []
hit10_list = []
ndcg5_list = []
ndcg10_list = []

for row in results:
    ranked_items = row['ranked_items']
    clicked_idx = row['clicked_idx']

    mrr_list.append(reciprocal_rank_multi(ranked_items, clicked_idx))
    hit5_list.append(hit_at_k_multi(ranked_items, clicked_idx, 5))
    hit10_list.append(hit_at_k_multi(ranked_items, clicked_idx, 10))
    ndcg5_list.append(ndcg_at_k_multi(ranked_items, clicked_idx, 5))
    ndcg10_list.append(ndcg_at_k_multi(ranked_items, clicked_idx, 10))

metrics = {
    'MRR': np.mean(mrr_list),
    'HitRate@5': np.mean(hit5_list),
    'HitRate@10': np.mean(hit10_list),
    'nDCG@5': np.mean(ndcg5_list),
    'nDCG@10': np.mean(ndcg10_list)
}

print("Evaluation Metrics:")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")


Evaluation Metrics:
MRR: 0.0026
HitRate@5: 0.0032
HitRate@10: 0.0044
nDCG@5: 0.0020
nDCG@10: 0.0023
