# Homework 2. Collaborative Filtering News RecSys : CS6365 Fall 2025

Justin Mittereder - G49843234

In [28]:
import pandas as pd
import datetime
import numpy

col_names  = ['Impression ID', 'User ID', 'Time', 'History', 'Impressions']
train_behaviors_df = pd.read_csv('data/MINDsmall_train/behaviors.tsv', sep='\t', header=None, names=col_names)
train_behaviors_df.sort_values(by=['User ID', 'Time'], inplace=True)
train_behaviors_df.reset_index(drop=True, inplace=True)
val_behaviors_df = pd.read_csv('data/MINDsmall_dev/behaviors.tsv', sep='\t', header=None, names=col_names)
val_behaviors_df.sort_values(by=['User ID', 'Time'], inplace=True)
val_behaviors_df.reset_index(drop=True, inplace=True)

col_names  = ['News ID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'Title Entities', 'Abstract Entities']
train_news_df = pd.read_csv('data/MINDsmall_train/news.tsv', sep='\t', header=None, names=col_names)
val_news_df = pd.read_csv('data/MINDsmall_dev/news.tsv', sep='\t', header=None, names=col_names)


In [29]:
train_behaviors_df["unix_timestamp"] = pd.to_datetime(
    train_behaviors_df["Time"], 
    format="%m/%d/%Y %I:%M:%S %p"
).astype("int64") // 1_000_000_000

val_behaviors_df["unix_timestamp"] = pd.to_datetime(
    val_behaviors_df["Time"], 
    format="%m/%d/%Y %I:%M:%S %p"
).astype("int64") // 1_000_000_000

earliest_train_time = datetime.datetime.fromtimestamp(train_behaviors_df['unix_timestamp'].min())
latest_train_time = datetime.datetime.fromtimestamp(train_behaviors_df['unix_timestamp'].max())
earliest_val_time = datetime.datetime.fromtimestamp(val_behaviors_df['unix_timestamp'].min())
latest_val_time = datetime.datetime.fromtimestamp(val_behaviors_df['unix_timestamp'].max())

print(f"Training impressions are from between {earliest_train_time} and {latest_train_time} . ")
print(f"Validation impressions are from between {earliest_val_time} and {latest_val_time} . ")


Training impressions are from between 2019-11-08 19:00:19 and 2019-11-14 18:59:13 . 
Validation impressions are from between 2019-11-14 19:00:01 and 2019-11-15 18:58:03 . 


In [30]:
#get full df of all user impressions
import datetime
import pandas as pd

def get_impressions(df, include_history=False):
    rows = []
    for idx, row in df.iterrows():
        user_id = row["User ID"]
        time = row["unix_timestamp"]
        impressions = row["Impressions"]

        # split impressions
        for impression in impressions.split():
            article, click = impression.split("-")
            click = int(click)

            rows.append({
                "user_id": user_id,
                "timestamp": time,
                "article": article,
                "impression": click
            })

        if(include_history):
            # split impressions
            history = str(row['History'])
            for article in history.split(" "):
                rows.append({
                    "user_id": user_id,
                    "timestamp": "N/A",
                    "article": article,
                    "impression": 1 
                })
        
    impressions_df = pd.DataFrame(rows)
    impressions_df.sort_values(by=['user_id', 'timestamp'], inplace=True)
    impressions_df.reset_index(drop=True, inplace=True)
    return impressions_df    


impressions_df = get_impressions(train_behaviors_df)
impressions_df = impressions_df[impressions_df['impression'] == 1]
print("Only Considering Clicked on Articles")
print("Train: Number of Impressions: ", len(impressions_df))
print("Train: Number of articles: ", impressions_df['article'].nunique())
print("Train: Number of Users: ", impressions_df['user_id'].nunique())
print(impressions_df.head())

val_impressions_df = get_impressions(val_behaviors_df)
val_impressions_df = val_impressions_df[val_impressions_df['impression'] == 1]
print("Only Considering Clicked on Articles")
print("Val: Number of Impressions: ", len(val_impressions_df))
print("Val: Number of articles: ", val_impressions_df['article'].nunique())
print("Val: Number of Users: ", val_impressions_df['user_id'].nunique())
print(val_impressions_df.head())

Only Considering Clicked on Articles
Train: Number of Impressions:  236344
Train: Number of articles:  7713
Train: Number of Users:  50000
    user_id   timestamp article  impression
3      U100  1573544052   N7800           1
134   U1000  1573686978  N53875           1
166   U1000  1573693256  N29739           1
180   U1000  1573693256   N7670           1
307   U1000  1573771041  N58656           1
Only Considering Clicked on Articles
Val: Number of Impressions:  111383
Val: Number of articles:  2212
Val: Number of Users:  50000
   user_id   timestamp article  impression
1       U1  1573802905  N20036           1
13     U10  1573798095  N32536           1
59  U10000  1573807178  N31958           1
63  U10000  1573811650  N50775           1
93  U10000  1573823809  N60215           1


Get Most Popular items based on number of successful impressions in training set

Map all User IDs and Article IDs to integers so that we can use them as row and columns in user-article matrix


In [31]:
from sklearn.preprocessing import LabelEncoder

user_encoder = LabelEncoder()
article_encoder = LabelEncoder()

impressions_df['user_idx'] = user_encoder.fit_transform(impressions_df['user_id']) #Map all User IDs to integers so we can use them for the rows in the user-article matrix 
impressions_df['article_idx'] = article_encoder.fit_transform(impressions_df['article']) #Map all Article IDs to integers so we can use them for the columns in the user-article matrix 

In [32]:
from scipy.sparse import csr_matrix
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

num_users = impressions_df['user_idx'].nunique()
num_articles = impressions_df['article_idx'].nunique()

user_article_matrix = csr_matrix((np.ones(len(impressions_df)),(impressions_df['user_idx'], impressions_df['article_idx'])),shape=(num_users, num_articles))
print("User Article Matrix Shape: ", user_article_matrix.shape)

#calculate the similarity between articles based on which users have interacted with which articles using cosine similarity function on transpose(user-article) matrix -> article-user matrix
article_article_sim = cosine_similarity(user_article_matrix.T) #items are similar if they've been clicekd on by similar users/have similar user impression vectors

User Article Matrix Shape:  (50000, 7713)


In [33]:
#dictionary that maps article Ids to their indices
article_id_to_index = {
    article_id: idx
    for idx, article_id in enumerate(article_encoder.classes_)
}

#dictionary that maps indices to their article Ids , reverse of above
index_to_article_id = {
    idx: article_id
    for idx, article_id in enumerate(article_encoder.classes_)
}


To recommend items for a user:
- Look at the items the user clicked
- Gather similar items from item_item_sim
- Rank by similarity score
- Remove already-clicked items
- Take Top-K

In [34]:
val_behaviors_df.head()

Unnamed: 0,Impression ID,User ID,Time,History,Impressions,unix_timestamp
0,8158,U1,11/15/2019 7:28:25 AM,N23571 N58267 N25682 N10646 N32607 N57737 N523...,N14637-0 N20036-1,1573802905
1,71327,U10,11/15/2019 6:08:15 AM,N27612 N36699 N64777 N9120 N9803 N57967 N2945,N33397-0 N46917-0 N11930-0 N58612-0 N47612-0 N...,1573798095
2,5144,U10000,11/15/2019 1:16:49 PM,N10059 N46978 N53234 N3345 N3345 N9155 N9653 N...,N48740-0 N51470-0 N1952-0 N23675-0 N56969-0 N6...,1573823809
3,41696,U10000,11/15/2019 8:39:38 AM,N10059 N46978 N53234 N3345 N3345 N9155 N9653 N...,N29393-0 N20036-0 N30290-0 N31958-1 N23513-0 N...,1573807178
4,1307,U10000,11/15/2019 9:54:10 AM,N10059 N46978 N53234 N3345 N3345 N9155 N9653 N...,N50775-1 N31958-0 N53572-0 N5472-0 N58251-0 N3...,1573811650


In [35]:
#split articles previously clicked by users in validation set
val_behaviors_df["History_list"] = val_behaviors_df["History"].apply(lambda x: x.split() if isinstance(x, str) else [])

#take all impressions in validation set that user actually clicked on
def get_clicked_articles(impressions_str):
    clicks = []
    for imp in impressions_str.split():
        article, clicked = imp.split("-")
        if int(clicked) == 1:
            clicks.append(article)
    return clicks

#create new col in val set called clicked_articles with all articles user viewed
val_behaviors_df['clicked_articles'] = val_behaviors_df['Impressions'].apply(get_clicked_articles)

In [36]:
val_behaviors_df.head()

Unnamed: 0,Impression ID,User ID,Time,History,Impressions,unix_timestamp,History_list,clicked_articles
0,8158,U1,11/15/2019 7:28:25 AM,N23571 N58267 N25682 N10646 N32607 N57737 N523...,N14637-0 N20036-1,1573802905,"[N23571, N58267, N25682, N10646, N32607, N5773...",[N20036]
1,71327,U10,11/15/2019 6:08:15 AM,N27612 N36699 N64777 N9120 N9803 N57967 N2945,N33397-0 N46917-0 N11930-0 N58612-0 N47612-0 N...,1573798095,"[N27612, N36699, N64777, N9120, N9803, N57967,...",[N32536]
2,5144,U10000,11/15/2019 1:16:49 PM,N10059 N46978 N53234 N3345 N3345 N9155 N9653 N...,N48740-0 N51470-0 N1952-0 N23675-0 N56969-0 N6...,1573823809,"[N10059, N46978, N53234, N3345, N3345, N9155, ...",[N60215]
3,41696,U10000,11/15/2019 8:39:38 AM,N10059 N46978 N53234 N3345 N3345 N9155 N9653 N...,N29393-0 N20036-0 N30290-0 N31958-1 N23513-0 N...,1573807178,"[N10059, N46978, N53234, N3345, N3345, N9155, ...",[N31958]
4,1307,U10000,11/15/2019 9:54:10 AM,N10059 N46978 N53234 N3345 N3345 N9155 N9653 N...,N50775-1 N31958-0 N53572-0 N5472-0 N58251-0 N3...,1573811650,"[N10059, N46978, N53234, N3345, N3345, N9155, ...",[N50775]


In [37]:
import numpy as np

def score_candidates(history_idx, candidate_idx, sim_matrix):
    """
    Compute item-item CF scores for candidate articles given user's history.
    
    history_idx: list of article indices in history (must exist in sim_matrix)
    candidate_idx: list of article indices to score
    sim_matrix: article-article similarity matrix
    """
    if len(history_idx) == 0:
        return np.zeros(len(candidate_idx))  # no known history
    
    scores = []
    for c in candidate_idx:
        sims = sim_matrix[c, history_idx]
        scores.append(sims.max())
    return np.array(scores)


In [38]:
# helper function
def to_idx_list(article_list, article_id_to_index):
    return [article_id_to_index[a] for a in article_list if a in article_id_to_index]

# history and clicked articles as indices
val_behaviors_df['history_idx'] = val_behaviors_df['History_list'].apply(lambda x: to_idx_list(x, article_id_to_index))
val_behaviors_df['clicked_idx'] = val_behaviors_df['clicked_articles'].apply(lambda x: to_idx_list(x, article_id_to_index))

# candidate articles (all articles in impression) as indices
def extract_candidate_ids(impression_str):
    return [x.split("-")[0] for x in impression_str.split()]

val_behaviors_df['candidate_articles'] = val_behaviors_df['Impressions'].apply(extract_candidate_ids)
val_behaviors_df['candidate_idx'] = val_behaviors_df['candidate_articles'].apply(lambda x: to_idx_list(x, article_id_to_index))



In [39]:
results = []

for i, row in val_behaviors_df.iterrows():
    history_idx = row['history_idx']
    if len(history_idx) == 0:
        continue  # skip users with unknown history

    candidate_idx = row['candidate_idx']
    clicked_idx = row['clicked_idx']

    # score all candidate articles
    scores = score_candidates(history_idx, candidate_idx, article_article_sim)

    # rank candidates descending
    ranked_idx = np.argsort(scores)[::-1]
    ranked_items = [candidate_idx[j] for j in ranked_idx]

    results.append({
        'Impression ID': row['Impression ID'],
        'ranked_items': ranked_items,
        'scores': scores[ranked_idx],
        'clicked_idx': clicked_idx
    })


In [41]:
def hit_at_k_multi(ranked_items, clicked_idx, k):
    return int(any(item in ranked_items[:k] for item in clicked_idx))

def reciprocal_rank_multi(ranked_items, clicked_idx):
    for rank, item in enumerate(ranked_items, 1):
        if item in clicked_idx:
            return 1.0 / rank
    return 0.0

def ndcg_at_k_multi(ranked_items, clicked_idx, k):
    dcg = 0.0
    for rank, item in enumerate(ranked_items[:k], 1):
        if item in clicked_idx:
            dcg += 1.0 / np.log2(rank + 1)
    # ideal DCG
    idcg = sum(1.0 / np.log2(i + 1) for i in range(1, min(k, len(clicked_idx)) + 1))
    return dcg / idcg if idcg > 0 else 0.0


In [43]:
mrr_list = []
hit5_list = []
hit10_list = []
ndcg5_list = []
ndcg10_list = []

for row in results:
    ranked_items = row['ranked_items']
    clicked_idx = row['clicked_idx']

    mrr_list.append(reciprocal_rank_multi(ranked_items, clicked_idx))
    hit5_list.append(hit_at_k_multi(ranked_items, clicked_idx, 5))
    hit10_list.append(hit_at_k_multi(ranked_items, clicked_idx, 10))
    ndcg5_list.append(ndcg_at_k_multi(ranked_items, clicked_idx, 5))
    ndcg10_list.append(ndcg_at_k_multi(ranked_items, clicked_idx, 10))

metrics = {
    'MRR': np.mean(mrr_list),
    'HitRate@5': np.mean(hit5_list),
    'HitRate@10': np.mean(hit10_list),
    'nDCG@5': np.mean(ndcg5_list),
    'nDCG@10': np.mean(ndcg10_list)
}

print("Evaluation Metrics:")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")


Evaluation Metrics:
MRR: 0.1920
HitRate@5: 0.3049
HitRate@10: 0.4052
nDCG@5: 0.1904
nDCG@10: 0.2246


Since there are some users that have all cold start items included in their article history, we aren't able to figure out what this user likes. Their historical items are not included in the article_article similarity matrix, so we are stuck excluding these users for now. For all other users with atleast 1 article from their history included in the similarity matrix, we are able to take this item and get items most similar to it. These are users where we can do CF recommendations. The evaluation metrics reflect users with known items in their history which we could use to find other articles liked by similar users.  

Notably, the evaluation metrics for the CF approach are optimistically high. This approach only scores articles included in the impression. Since we are are excluding cold start users, and also only considering the articles included in the Impression (impression candidates) for each user (rather than the entire dataset of possible articles), the ranking metrics are higher than they would be for a content-based approach.


Metrics (HitRate, MRR, nDCG) are computed only over the set of articles included in the impression, so the userâ€™s clicked articles have a high chance of being ranked highly if the CF approach works.
If during evaluation, we looped over all articles in the validation set, most articles will not be relevant to the user, so the clicked articles become extremely sparse among all possible candidates.
<br>
Example: if there are 7,000 articles in validation and only 2 are clicked:
- HitRate@5 becomes very low because the clicked articles are unlikely to be in the top 5 of a ranking over 7,000 items.
- MRR will drop for the same reason.
- nDCG also decreases because relevant articles are buried deep in the ranking.

#TODO: Need to address users where their validation items are not included in the training set/not included in the article-article similarity matrix. These users picked articles that could not have been recommended because the CF model didn't have these articles included in the training set. Therefore, they couldn't have been included in the recommendations given for this user. 

In [44]:
val_behaviors_df.head()

Unnamed: 0,Impression ID,User ID,Time,History,Impressions,unix_timestamp,History_list,clicked_articles,history_idx,clicked_idx,candidate_articles,candidate_idx
0,8158,U1,11/15/2019 7:28:25 AM,N23571 N58267 N25682 N10646 N32607 N57737 N523...,N14637-0 N20036-1,1573802905,"[N23571, N58267, N25682, N10646, N32607, N5773...",[N20036],[426],[1251],"[N14637, N20036]",[1251]
1,71327,U10,11/15/2019 6:08:15 AM,N27612 N36699 N64777 N9120 N9803 N57967 N2945,N33397-0 N46917-0 N11930-0 N58612-0 N47612-0 N...,1573798095,"[N27612, N36699, N64777, N9120, N9803, N57967,...",[N32536],[],[],"[N33397, N46917, N11930, N58612, N47612, N2362...","[3004, 4787, 234, 4887, 5205, 6109, 7611, 3464..."
2,5144,U10000,11/15/2019 1:16:49 PM,N10059 N46978 N53234 N3345 N3345 N9155 N9653 N...,N48740-0 N51470-0 N1952-0 N23675-0 N56969-0 N6...,1573823809,"[N10059, N46978, N53234, N3345, N3345, N9155, ...",[N60215],[7117],[],"[N48740, N51470, N1952, N23675, N56969, N62365...","[1195, 7400, 7303, 2820, 6345, 6617, 234, 313]"
3,41696,U10000,11/15/2019 8:39:38 AM,N10059 N46978 N53234 N3345 N3345 N9155 N9653 N...,N29393-0 N20036-0 N30290-0 N31958-1 N23513-0 N...,1573807178,"[N10059, N46978, N53234, N3345, N3345, N9155, ...",[N31958],[7117],[2820],"[N29393, N20036, N30290, N31958, N23513, N1999...","[1251, 2604, 2820, 1709, 3464]"
4,1307,U10000,11/15/2019 9:54:10 AM,N10059 N46978 N53234 N3345 N3345 N9155 N9653 N...,N50775-1 N31958-0 N53572-0 N5472-0 N58251-0 N3...,1573811650,"[N10059, N46978, N53234, N3345, N3345, N9155, ...",[N50775],[7117],[],"[N50775, N31958, N53572, N5472, N58251, N37352...",[2820]


In [45]:
num_unreachable = (val_behaviors_df['clicked_idx'].apply(lambda x: all(i == -1 for i in x))).sum()
print(f"Number of rows where all clicked items are unseen in training: {num_unreachable}")

Number of rows where all clicked items are unseen in training: 32595


Note: There are 32,595 rows where all the articles that users have read are not seen in the training set; therefore, we were not able to see these items when computing the item-item similarity matrix, and we did not see these items when getting the most popular items. These rows with only cold start items were not included in the metrics, so we will need to generate content-based recommendations for these remaining rows. 