In [19]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("arashnic/mind-news-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/mind-news-dataset


In [18]:
print(articles_df.columns)
print(user_histories_df.columns)
print(behaviors_df.columns)

Index(['id', 'category', 'subcategory', 'title', 'abstract', 'url', 'entity',
       'misc', 'abstract_clean', 'content'],
      dtype='object')
Index(['user_id', 'clicked_articles', 'history'], dtype='object')
Index(['impression_id', 'user_id', 'timestamp', 'history', 'impressions',
       'clicked_articles', 'all_articles'],
      dtype='object')


In [26]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import ast
from tqdm import tqdm

# === Adjust this path to where kagglehub saved the dataset ===
DATA_DIR = "/kaggle/working/arashnic-mind-news-dataset"  # or wherever your dataset is extracted

# === Load and concatenate news files ===
def load_news():
    dfs = []
    for split in ['train', 'dev', 'test']:
        path = os.path.join(DATA_DIR, split, '/kaggle/input/mind-news-dataset/news.tsv/news.tsv')
        df = pd.read_csv(path, sep='\t', names=['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'TitleEntities', 'AbstractEntities'])
        dfs.append(df)
    news_df = pd.concat(dfs, ignore_index=True)
    # Drop duplicates just in case
    news_df.drop_duplicates(subset=['NewsID'], inplace=True)
    return news_df

# === Load and concatenate behaviors files ===
def load_behaviors():
    dfs = []
    for split in ['train', 'dev', 'test']:
        path = os.path.join(DATA_DIR, split, '/kaggle/input/mind-news-dataset/MINDsmall_train/behaviors.tsv')
        df = pd.read_csv(path, sep='\t', names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions'])
        dfs.append(df)
    behaviors_df = pd.concat(dfs, ignore_index=True)
    return behaviors_df

# === Parse clicked news from impressions string ===
def parse_clicked(impression_str):
    # impression_str example: "N12345-1 N23456-0 N34567-1"
    clicked = []
    for item in impression_str.strip().split():
        try:
            news_id, click = item.split('-')
            if click == '1':
                clicked.append(news_id)
        except:
            continue
    return clicked

# === Parse user history string ===
def parse_history(history_str):
    if pd.isna(history_str) or history_str.strip() == '':
        return []
    return history_str.strip().split()

# === Build TF-IDF matrix for news content (title + abstract) ===
def build_tfidf(news_df):
    news_df['content'] = news_df['Title'].fillna('') + " " + news_df['Abstract'].fillna('')
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(news_df['content'])
    return tfidf_matrix, vectorizer

# === Build user profile vectors by averaging clicked news vectors ===
def build_user_profiles(behaviors_df, news2idx, tfidf_matrix):
    user_profiles = {}
    skipped_no_history = 0
    for user_id, group in tqdm(behaviors_df.groupby('UserID'), desc="Building user profiles"):
        # Aggregate all clicked news from history + impressions
        clicked_news = set()
        for _, row in group.iterrows():
            clicked_news.update(parse_history(row['History']))
            clicked_news.update(parse_clicked(row['Impressions']))
        # Filter clicked news to those in news2idx
        valid_indices = [news2idx[nid] for nid in clicked_news if nid in news2idx]
        if not valid_indices:
            skipped_no_history += 1
            continue
        # Average vectors
        profile_vec = np.asarray(tfidf_matrix[valid_indices].mean(axis=0)).ravel()
        user_profiles[user_id] = profile_vec
    print(f"Skipped {skipped_no_history} users with no valid clicked news.")
    return user_profiles

# === Popularity baseline for fallback recommendations ===
def get_popularity(behaviors_df):
    click_counts = defaultdict(int)
    for _, row in behaviors_df.iterrows():
        clicked = parse_clicked(row['Impressions'])
        for nid in clicked:
            click_counts[nid] += 1
    popular_news = [nid for nid, _ in sorted(click_counts.items(), key=lambda x: x[1], reverse=True)]
    return popular_news

# === Recommend top-k news for a user based on profile vector ===
def recommend(user_vec, tfidf_matrix, news_df, news2idx, popular_news, clicked_news_set, k=10):
    if user_vec is None:
        recs = [nid for nid in popular_news if nid not in clicked_news_set]
        return recs[:k]
    user_vec = np.asarray(user_vec)
    if user_vec.ndim == 1:
        user_vec = user_vec.reshape(1, -1)
    sims = cosine_similarity(user_vec, tfidf_matrix).flatten()
    top_indices = sims.argsort()[::-1]
    recs = []
    
    for idx in top_indices:
        nid = news_df.iloc[idx]['NewsID']
        if nid not in clicked_news_set:
            recs.append(nid)
        if len(recs) == k:
            break
    # If not enough recs, fill with popular news
    if len(recs) < k:
        for nid in popular_news:
            if nid not in clicked_news_set and nid not in recs:
                recs.append(nid)
            if len(recs) == k:
                break
    return recs

# === Evaluation metrics ===
def precision_at_k(actual, predicted, k):
    actual_set = set(actual)
    pred_k = predicted[:k]
    return len(set(pred_k) & actual_set) / k

def recall_at_k(actual, predicted, k):
    actual_set = set(actual)
    pred_k = predicted[:k]
    return len(set(pred_k) & actual_set) / len(actual_set) if actual_set else 0

def ndcg_at_k(actual, predicted, k):
    actual_set = set(actual)
    dcg = 0.0
    for i, p in enumerate(predicted[:k]):
        if p in actual_set:
            dcg += 1 / np.log2(i + 2)
    idcg = sum([1 / np.log2(i + 2) for i in range(min(len(actual_set), k))])
    return dcg / idcg if idcg > 0 else 0

# === Main pipeline ===
def main():
    print("Loading news...")
    news_df = load_news()
    print(f"Total news articles: {len(news_df)}")

    print("Loading behaviors...")
    behaviors_df = load_behaviors()
    print(f"Total behavior records: {len(behaviors_df)}")

    print("Building TF-IDF matrix...")
    tfidf_matrix, vectorizer = build_tfidf(news_df)

    news2idx = {nid: idx for idx, nid in enumerate(news_df['NewsID'])}

    print("Building user profiles...")
    user_profiles = build_user_profiles(behaviors_df, news2idx, tfidf_matrix)

    print("Calculating popularity baseline...")
    popular_news = get_popularity(behaviors_df)

    print("Generating recommendations and evaluating...")
    precisions, recalls, ndcgs = [], [], []
    evaluated_users = 0
    skipped_users = 0

    for user_id, group in tqdm(behaviors_df.groupby('UserID')):
        # Aggregate clicked news in impressions for evaluation
        actual_clicked = []
        for _, row in group.iterrows():
            actual_clicked.extend(parse_clicked(row['Impressions']))
        actual_clicked = list(set(actual_clicked))
        if not actual_clicked:
            skipped_users += 1
            continue

        user_vec = user_profiles.get(user_id, None)
        # Also get clicked news from history + impressions to exclude from recommendations
        clicked_news = set()
        for _, row in group.iterrows():
            clicked_news.update(parse_history(row['History']))
            clicked_news.update(parse_clicked(row['Impressions']))

        recs = recommend(user_vec, tfidf_matrix, news_df, news2idx, popular_news, clicked_news, k=10)

        precisions.append(precision_at_k(actual_clicked, recs, 10))
        recalls.append(recall_at_k(actual_clicked, recs, 10))
        ndcgs.append(ndcg_at_k(actual_clicked, recs, 10))
        evaluated_users += 1

    # Save recommendations for all users
    print("Saving recommendations for all users...")
    recs_output = []
    for user_id, group in behaviors_df.groupby('UserID'):
        user_vec = user_profiles.get(user_id, None)
        clicked_news = set()
        for _, row in group.iterrows():
            clicked_news.update(parse_history(row['History']))
            clicked_news.update(parse_clicked(row['Impressions']))
        recs = recommend(user_vec, tfidf_matrix, news_df, news2idx, popular_news, clicked_news, k=10)
        recs_output.append({'user_id': user_id, 'recommended_news_ids': recs})

    recs_df = pd.DataFrame(recs_output)
    recs_df.to_csv("mind_tfidf_recommendations.csv", index=False)
    print("Recommendations saved to mind_tfidf_recommendations.csv")

if __name__ == "__main__":
    main()


Loading news...
Total news articles: 51282
Loading behaviors...
Total behavior records: 470895
Building TF-IDF matrix...
Building user profiles...


Building user profiles: 100%|██████████| 50000/50000 [00:49<00:00, 1005.23it/s]


Skipped 0 users with no valid clicked news.
Calculating popularity baseline...
Generating recommendations and evaluating...


100%|██████████| 50000/50000 [09:02<00:00, 92.20it/s] 


Saving recommendations for all users...
Recommendations saved to mind_tfidf_recommendations.csv


# **Evaluation**

In [27]:
import pandas as pd
import ast

# Load saved recommendations
recs_df = pd.read_csv("/kaggle/working/mind_tfidf_recommendations.csv")

# Convert string representation of list back to Python list
def parse_recs(rec_str):
    try:
        return ast.literal_eval(rec_str)
    except:
        return []

recs_df['recommended_news_ids'] = recs_df['recommended_news_ids'].apply(parse_recs)

# Load behaviors (ground truth)
behaviors_df = pd.read_csv("/kaggle/input/mind-news-dataset/MINDsmall_train/behaviors.tsv", sep='\t', names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions'])

# Parsing function for actual clicked news
def parse_clicked(impression_str):
    if pd.isna(impression_str) or impression_str.strip() == '':
        return []
    clicked = []
    for item in impression_str.strip().split():
        try:
            news_id, click = item.split('-')
            if click == '1':
                clicked.append(news_id)
        except:
            continue
    return clicked
from collections import defaultdict

actual_clicks_per_user = defaultdict(set)

for _, row in behaviors_df.iterrows():
    user = row['UserID']
    clicked = parse_clicked(row['Impressions'])
    actual_clicks_per_user[user].update(clicked)
def hit_rate_at_k(actual, predicted, k):
    actual_set = set(actual)
    pred_k = predicted[:k]
    return 1.0 if len(set(pred_k) & actual_set) > 0 else 0.0

def mrr_at_k(actual, predicted, k):
    actual_set = set(actual)
    for i, p in enumerate(predicted[:k]):
        if p in actual_set:
            return 1.0 / (i + 1)
    return 0.0
hit_rates = []
mrrs = []

for _, row in recs_df.iterrows():
    user = row['user_id']
    recs = row['recommended_news_ids']
    actual = list(actual_clicks_per_user.get(user, []))
    if not actual:
        continue  # Skip users with no actual clicks
    hit_rates.append(hit_rate_at_k(actual, recs, 10))
    mrrs.append(mrr_at_k(actual, recs, 10))

print(f"Hit Rate@10: {sum(hit_rates)/len(hit_rates):.4f}")
print(f"MRR@10: {sum(mrrs)/len(mrrs):.4f}")


Hit Rate@10: 0.0000
MRR@10: 0.0000


In [28]:
print("Sample user recommendations and actual clicks:")

for _, row in recs_df.head(5).iterrows():
    user = row['user_id']
    recs = row['recommended_news_ids']
    actual = list(actual_clicks_per_user.get(user, []))
    print(f"User: {user}")
    print(f"Recommended: {recs}")
    print(f"Actual clicked: {actual}")
    print("-" * 40)


Sample user recommendations and actual clicks:
User: U100
Recommended: ['N13927', 'N19429', 'N52303', 'N64622', 'N42746', 'N6464', 'N28081', 'N12412', 'N19267', 'N19285']
Actual clicked: ['N7800']
----------------------------------------
User: U1000
Recommended: ['N6785', 'N39535', 'N23084', 'N51257', 'N41835', 'N50553', 'N45324', 'N60374', 'N17196', 'N288']
Actual clicked: ['N29739', 'N53875', 'N58656', 'N7670']
----------------------------------------
User: U10001
Recommended: ['N6482', 'N4812', 'N54662', 'N19893', 'N6677', 'N58090', 'N17668', 'N55172', 'N60064', 'N28219']
Actual clicked: ['N10833', 'N35937', 'N1031']
----------------------------------------
User: U10003
Recommended: ['N64773', 'N36602', 'N45970', 'N43123', 'N30269', 'N46921', 'N11971', 'N31681', 'N63706', 'N38758']
Actual clicked: ['N57090', 'N18708', 'N55689']
----------------------------------------
User: U10008
Recommended: ['N49034', 'N1178', 'N56411', 'N46773', 'N23886', 'N31129', 'N13692', 'N21631', 'N15809', 

In [29]:
def normalize_ids(id_list):
    return set(str(i).strip().upper() for i in id_list if i)

# When you compare:
actual_norm = normalize_ids(actual)
rec_norm = normalize_ids(recs)
overlap = actual_norm & rec_norm
print(f"Overlap: {overlap}")


Overlap: set()


In [30]:
for _, row in behaviors_df.head(5).iterrows():
    print(f"User {row['UserID']} clicked: {parse_clicked(row['Impressions'])}")


User U13740 clicked: ['N55689']
User U91836 clicked: ['N17059']
User U73700 clicked: ['N23814']
User U34670 clicked: ['N49685']
User U8125 clicked: ['N8400']
