In [19]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("arashnic/mind-news-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/mind-news-dataset


In [18]:
print(articles_df.columns)
print(user_histories_df.columns)
print(behaviors_df.columns)

Index(['id', 'category', 'subcategory', 'title', 'abstract', 'url', 'entity',
       'misc', 'abstract_clean', 'content'],
      dtype='object')
Index(['user_id', 'clicked_articles', 'history'], dtype='object')
Index(['impression_id', 'user_id', 'timestamp', 'history', 'impressions',
       'clicked_articles', 'all_articles'],
      dtype='object')


In [35]:
pip install -U sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 k

In [1]:
pip install faiss-cpu sentence-transformers pandas tqdm


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB

In [3]:
import os
import pandas as pd
import numpy as np
import faiss
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from collections import defaultdict

DATA_DIR = "/kaggle/working/arashnic-mind-news-dataset"

# === Load news ===
def load_news():
    dfs = []
    for split in ['train', 'dev', 'test']:
        path = os.path.join(DATA_DIR, split, '/kaggle/input/mind-news-dataset/news.tsv/news.tsv')
        df = pd.read_csv(path, sep='\t', names=['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'TitleEntities', 'AbstractEntities'])
        dfs.append(df)
    news_df = pd.concat(dfs, ignore_index=True).drop_duplicates(subset=['NewsID'])
    return news_df

# === Load behaviors ===
def load_behaviors():
    dfs = []
    for split in ['train', 'dev', 'test']:
        path = os.path.join(DATA_DIR, split, '/kaggle/input/mind-news-dataset/MINDsmall_train/behaviors.tsv')
        df = pd.read_csv(path, sep='\t', names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions'])
        dfs.append(df)
    behaviors_df = pd.concat(dfs, ignore_index=True)
    return behaviors_df

# === Parse strings ===
def parse_clicked(impression_str):
    clicked = []
    for item in impression_str.strip().split():
        try:
            nid, click = item.split('-')
            if click == '1':
                clicked.append(nid)
        except:
            continue
    return clicked

def parse_history(history_str):
    return [] if pd.isna(history_str) or history_str.strip() == '' else history_str.strip().split()

# === Get BERT embeddings ===
def compute_news_embeddings(news_df, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    model = SentenceTransformer(model_name)
    news_df['content'] = news_df['Title'].fillna('') + " " + news_df['Abstract'].fillna('')
    embeddings = model.encode(news_df['content'].tolist(), show_progress_bar=True, convert_to_numpy=True)
    return embeddings

# === User profile from mean of clicked news vectors ===
def build_user_profiles(behaviors_df, news2idx, news_embeddings):
    user_profiles = {}
    for user_id, group in tqdm(behaviors_df.groupby('UserID'), desc="Building user profiles"):
        clicked = set()
        for _, row in group.iterrows():
            clicked.update(parse_history(row['History']))
            clicked.update(parse_clicked(row['Impressions']))
        valid_idx = [news2idx[nid] for nid in clicked if nid in news2idx]
        if valid_idx:
            profile_vec = np.mean(news_embeddings[valid_idx], axis=0)
            user_profiles[user_id] = profile_vec
    return user_profiles

# === Build FAISS index ===
def build_faiss_index(news_embeddings):
    dim = news_embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)  # Inner Product = Cosine similarity after normalization
    faiss.normalize_L2(news_embeddings)
    index.add(news_embeddings)
    return index

# === Popular baseline ===
def get_popular_news(behaviors_df):
    click_counts = defaultdict(int)
    for _, row in behaviors_df.iterrows():
        for nid in parse_clicked(row['Impressions']):
            click_counts[nid] += 1
    return [nid for nid, _ in sorted(click_counts.items(), key=lambda x: x[1], reverse=True)]

# === Recommend with FAISS ===
def recommend_faiss(user_vec, index, news_df, news2idx, clicked_news, top_k=10, popular_news=[]):
    if user_vec is None:
        return [nid for nid in popular_news if nid not in clicked_news][:top_k]
    vec = user_vec.reshape(1, -1)
    faiss.normalize_L2(vec)
    D, I = index.search(vec, top_k * 2)
    recs = []
    for idx in I[0]:
        nid = news_df.iloc[idx]['NewsID']
        if nid not in clicked_news:
            recs.append(nid)
        if len(recs) == top_k:
            break
    return recs

# === Evaluation metrics ===
def precision_at_k(actual, predicted, k):
    return len(set(actual) & set(predicted[:k])) / k

def recall_at_k(actual, predicted, k):
    return len(set(actual) & set(predicted[:k])) / len(actual) if actual else 0

def ndcg_at_k(actual, predicted, k):
    dcg, idcg = 0.0, sum([1 / np.log2(i + 2) for i in range(min(len(actual), k))])
    for i, p in enumerate(predicted[:k]):
        if p in actual:
            dcg += 1 / np.log2(i + 2)
    return dcg / idcg if idcg else 0

# === Main pipeline ===
def main():
    print("🔄 Loading data...")
    news_df = load_news()
    behaviors_df = load_behaviors()
    news2idx = {nid: idx for idx, nid in enumerate(news_df['NewsID'])}

    print("🧠 Computing BERT embeddings...")
    news_embeddings = compute_news_embeddings(news_df)

    print("⚙️ Building FAISS index...")
    index = build_faiss_index(news_embeddings)

    print("👤 Building user profiles...")
    user_profiles = build_user_profiles(behaviors_df, news2idx, news_embeddings)

    print("🔥 Computing popularity baseline...")
    popular_news = get_popular_news(behaviors_df)

    print("🎯 Generating recommendations and evaluating...")
    k = 10
    precision, recall, ndcg = [], [], []
    for user_id, group in tqdm(behaviors_df.groupby('UserID')):
        actual_clicked = set()
        clicked_all = set()
        for _, row in group.iterrows():
            actual_clicked.update(parse_clicked(row['Impressions']))
            clicked_all.update(parse_clicked(row['Impressions']))
            clicked_all.update(parse_history(row['History']))
        if not actual_clicked:
            continue
        user_vec = user_profiles.get(user_id, None)
        recs = recommend_faiss(user_vec, index, news_df, news2idx, clicked_all, k, popular_news)
        precision.append(precision_at_k(actual_clicked, recs, k))
        recall.append(recall_at_k(actual_clicked, recs, k))
        ndcg.append(ndcg_at_k(actual_clicked, recs, k))

    print(f"\n✅ Evaluation Results (k={k})")
    print(f"Precision@{k}: {np.mean(precision):.4f}")
    print(f"Recall@{k}:    {np.mean(recall):.4f}")
    print(f"nDCG@{k}:      {np.mean(ndcg):.4f}")

    print("📁 Saving recommendations...")
    recs_out = []
    for user_id, group in behaviors_df.groupby('UserID'):
        clicked_all = set()
        for _, row in group.iterrows():
            clicked_all.update(parse_history(row['History']))
            clicked_all.update(parse_clicked(row['Impressions']))
        user_vec = user_profiles.get(user_id, None)
        recs = recommend_faiss(user_vec, index, news_df, news2idx, clicked_all, k, popular_news)
        recs_out.append({'user_id': user_id, 'recommended_news_ids': recs})
    pd.DataFrame(recs_out).to_csv("bert_faiss_recommendations.csv", index=False)
    print("✅ Recommendations saved to bert_faiss_recommendations.csv")

if __name__ == "__main__":
    main()


🔄 Loading data...
🧠 Computing BERT embeddings...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1603 [00:00<?, ?it/s]

⚙️ Building FAISS index...
👤 Building user profiles...


Building user profiles: 100%|██████████| 50000/50000 [00:31<00:00, 1581.96it/s]


🔥 Computing popularity baseline...
🎯 Generating recommendations and evaluating...


100%|██████████| 50000/50000 [05:46<00:00, 144.14it/s]



✅ Evaluation Results (k=10)
Precision@10: 0.0000
Recall@10:    0.0000
nDCG@10:      0.0000
📁 Saving recommendations...
✅ Recommendations saved to bert_faiss_recommendations.csv


# **Evaluation**

In [28]:
print("Sample user recommendations and actual clicks:")

for _, row in recs_df.head(5).iterrows():
    user = row['user_id']
    recs = row['recommended_news_ids']
    actual = list(actual_clicks_per_user.get(user, []))
    print(f"User: {user}")
    print(f"Recommended: {recs}")
    print(f"Actual clicked: {actual}")
    print("-" * 40)


Sample user recommendations and actual clicks:
User: U100
Recommended: ['N13927', 'N19429', 'N52303', 'N64622', 'N42746', 'N6464', 'N28081', 'N12412', 'N19267', 'N19285']
Actual clicked: ['N7800']
----------------------------------------
User: U1000
Recommended: ['N6785', 'N39535', 'N23084', 'N51257', 'N41835', 'N50553', 'N45324', 'N60374', 'N17196', 'N288']
Actual clicked: ['N29739', 'N53875', 'N58656', 'N7670']
----------------------------------------
User: U10001
Recommended: ['N6482', 'N4812', 'N54662', 'N19893', 'N6677', 'N58090', 'N17668', 'N55172', 'N60064', 'N28219']
Actual clicked: ['N10833', 'N35937', 'N1031']
----------------------------------------
User: U10003
Recommended: ['N64773', 'N36602', 'N45970', 'N43123', 'N30269', 'N46921', 'N11971', 'N31681', 'N63706', 'N38758']
Actual clicked: ['N57090', 'N18708', 'N55689']
----------------------------------------
User: U10008
Recommended: ['N49034', 'N1178', 'N56411', 'N46773', 'N23886', 'N31129', 'N13692', 'N21631', 'N15809', 

In [30]:
for _, row in behaviors_df.head(5).iterrows():
    print(f"User {row['UserID']} clicked: {parse_clicked(row['Impressions'])}")


User U13740 clicked: ['N55689']
User U91836 clicked: ['N17059']
User U73700 clicked: ['N23814']
User U34670 clicked: ['N49685']
User U8125 clicked: ['N8400']
