# Import Packages

In [None]:
import numpy as np
import pandas as pd
import ast
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Mount Google Drive at /content/drive so files stored there can be read (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')
# Change the working directory to the Colab Notebooks folder. The data-loading cells
# visible in this notebook use absolute /content/drive/MyDrive/... paths, so they do
# not depend on this directory change.
%cd /content/drive/My Drive/Colab Notebooks/

Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks


# Data Understanding and Cleaning

### Behaviour Dataset

In [None]:
# Impression logs of the MIND-large training split: tab-separated, no header row.
# Loading takes a while because the file is large.
# NOTE(review): the .info() output reports an index running 1..N instead of a
# RangeIndex, so the file presumably has one more leading column than the four
# names given here and pandas uses it as the index — confirm against the data.
behaviors_path = '/content/drive/MyDrive/BT4222/MIND_Data/MINDlarge_train/behaviors.tsv'
behaviors_columns = ["User ID", "Time", "History", "Impressions"]
behaviors_df = pd.read_csv(behaviors_path, sep='\t', names=behaviors_columns)

In [None]:
# Summarise the behaviours frame: number of rows, column names, dtypes and memory use.
behaviors_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2232748 entries, 1 to 2232748
Data columns (total 4 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   User ID      object
 1   Time         object
 2   History      object
 3   Impressions  object
dtypes: object(4)
memory usage: 85.2+ MB


In [None]:
# Count how many rows share the same (User ID, History, Impressions) combination.
# `grouped` is only built for inspection; nothing below in this cell reads it.
duplicate_check_columns = ['User ID', 'History', 'Impressions']
grouped = (
    behaviors_df
    .groupby(duplicate_check_columns)
    .size()
    .reset_index(name='count')
)

# Keep only the first row of every (User ID, Impressions) pair.
is_repeated_row = behaviors_df.duplicated(subset=['User ID', 'Impressions'])
behaviors_df = behaviors_df[~is_repeated_row]

behaviors_df.info()  # 2232117 rows

<class 'pandas.core.frame.DataFrame'>
Index: 2232117 entries, 1 to 2232748
Data columns (total 4 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   User ID      object
 1   Time         object
 2   History      object
 3   Impressions  object
dtypes: object(4)
memory usage: 85.1+ MB


On closer inspection, we realised that each user has a fixed ("preset") history, so when dropping duplicates we ignore the History column and de-duplicate on User ID and Impressions only.

### News Dataset

In [None]:
# News metadata of the MIND-large training split: tab-separated, no header row.
news_path = '/content/drive/MyDrive/BT4222/MIND_Data/MINDlarge_train/news.tsv'
news_columns = [
    "News ID",
    "Category",
    "SubCategory",
    "Title",
    "Abstract",
    "URL",
    "Title Entities",
    "Abstract Entities",
]
news_df = pd.read_csv(news_path, sep='\t', names=news_columns)

# Column overview; the non-null counts show which fields have missing values.
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101527 entries, 0 to 101526
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   News ID            101527 non-null  object
 1   Category           101527 non-null  object
 2   SubCategory        101527 non-null  object
 3   Title              101527 non-null  object
 4   Abstract           96112 non-null   object
 5   URL                101527 non-null  object
 6   Title Entities     101524 non-null  object
 7   Abstract Entities  101521 non-null  object
dtypes: object(8)
memory usage: 6.2+ MB


In [None]:
# Articles without an abstract are discarded, and only the columns used
# by the rest of the notebook are kept (in this order).
has_abstract = news_df['Abstract'].notna()
kept_columns = ['News ID', 'Title', 'Abstract', 'Category', 'SubCategory']
news_df = news_df.loc[has_abstract, kept_columns]

news_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 96112 entries, 0 to 101524
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   News ID      96112 non-null  object
 1   Title        96112 non-null  object
 2   Abstract     96112 non-null  object
 3   Category     96112 non-null  object
 4   SubCategory  96112 non-null  object
dtypes: object(5)
memory usage: 4.4+ MB


In [None]:
# Remove articles whose Title is repeated within the same Category (first occurrence wins).
title_category_key = ['Title', 'Category']

# Occurrence count per (Title, Category); built for inspection only.
grouped = (
    news_df
    .groupby(title_category_key)
    .size()
    .reset_index(name='count')
)

is_repeated_title = news_df.duplicated(subset=title_category_key)
news_df = news_df[~is_repeated_title]

# Number of remaining articles per category, largest first.
category_value = news_df['Category'].value_counts()

news_df.info()
print(category_value)

<class 'pandas.core.frame.DataFrame'>
Index: 93419 entries, 0 to 101524
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   News ID      93419 non-null  object
 1   Title        93419 non-null  object
 2   Abstract     93419 non-null  object
 3   Category     93419 non-null  object
 4   SubCategory  93419 non-null  object
dtypes: object(5)
memory usage: 4.3+ MB
Category
news             28578
sports           28391
finance           5686
travel            4542
video             4493
foodanddrink      4257
lifestyle         4177
weather           3596
health            2791
autos             2724
tv                1284
music             1210
movies             799
entertainment      791
kids                96
middleeast           2
games                1
northamerica         1
Name: count, dtype: int64


In [None]:
# These categories contain only one or two articles each, so they are removed.
categories_to_drop = ['middleeast', 'games', 'northamerica']
in_kept_category = ~news_df['Category'].isin(categories_to_drop)

# Filter the rows and add two derived text columns:
#   Content       = "<Title> - <Abstract>"
#   Full Category = "<Category> - <SubCategory>"
news_df = news_df.loc[in_kept_category].assign(
    **{
        'Content': lambda frame: frame['Title'] + ' - ' + frame['Abstract'],
        'Full Category': lambda frame: frame['Category'] + ' - ' + frame['SubCategory'],
    }
)

# Confirm the row count and the two new columns.
news_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93415 entries, 0 to 101524
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   News ID        93415 non-null  object
 1   Title          93415 non-null  object
 2   Abstract       93415 non-null  object
 3   Category       93415 non-null  object
 4   SubCategory    93415 non-null  object
 5   Content        93415 non-null  object
 6   Full Category  93415 non-null  object
dtypes: object(7)
memory usage: 5.7+ MB


# using s-bert method

In [None]:
# Rows of the behaviours log that belong to the user under study.
# .copy() makes this an independent frame, so adding columns to it later does not
# raise pandas' SettingWithCopyWarning (it would otherwise be a slice of behaviors_df).
specific_user = behaviors_df[behaviors_df['User ID'] == 'U123981'].copy()

# History is a whitespace-separated string of news IDs; flatten all rows into one list.
su_history = specific_user['History']
all_history_list = su_history.str.split().explode().tolist()

# Articles from the user's history that survived the news cleaning steps.
clicked_news = news_df[news_df['News ID'].isin(all_history_list)]
unique_clicked_categories = clicked_news['Full Category'].unique()

# Every article that shares a "Category - SubCategory" with something the user clicked.
filtered_news = news_df[news_df['Full Category'].isin(unique_clicked_categories)]
filtered_news.info()

# filtered_news contains every clicked article by construction, so filtering it by
# the history IDs again would return the same rows as clicked_news; reuse it directly.
clicked_news_details = clicked_news[['News ID', 'Content', 'Full Category']]
print(clicked_news_details)

<class 'pandas.core.frame.DataFrame'>
Index: 9497 entries, 0 to 101522
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   News ID        9497 non-null   object
 1   Title          9497 non-null   object
 2   Abstract       9497 non-null   object
 3   Category       9497 non-null   object
 4   SubCategory    9497 non-null   object
 5   Content        9497 non-null   object
 6   Full Category  9497 non-null   object
dtypes: object(7)
memory usage: 593.6+ KB
       News ID                                            Content  \
8640   N125741  14 Bad Habits You Can Break Right Now to Live ...   
9224   N127629  Eminem Says He Sides With Chris Brown Over Rih...   
34779   N39414  Prince Charles Hit by One of the Most Incredib...   
37953  N104277  Trump tweets fake image of him putting medal a...   
56060  N106692  Yahoo data breach $117.5 million settlement: G...   
59280  N107944  Simone Biles ties world gymnastics cha

In [None]:
clicked_news_ids = clicked_news_details['News ID'].tolist()

# Step 1: Load the Sentence Transformer model
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

# Step 2: Generate embeddings for all articles.
# Row k of `embeddings` corresponds to the k-th ROW of news_df (positional order),
# not to news_df's index label: the label index has gaps after the cleaning steps.
batch_size = 1000
all_content = news_df['Content'].tolist()
embeddings = []

# Encode in batches so progress is visible and each call stays small.
for i in tqdm(range(0, len(all_content), batch_size)):
    batch = all_content[i:i + batch_size]
    batch_embeddings = model.encode(batch)
    embeddings.extend(batch_embeddings)

embeddings = np.array(embeddings)

# Step 3: Generate recommendations without building the full similarity matrix.
top_k = 5  # number of neighbours collected per history article

# Map each News ID to its row POSITION in news_df (first occurrence wins), so the
# history indices live in the same positional space as `embeddings`, `unread_indices`
# and the final `.iloc` lookup.
news_id_to_position = {}
for position, news_id in enumerate(news_df['News ID']):
    news_id_to_position.setdefault(news_id, position)

user_history_indices = [
    news_id_to_position[news_id]
    for news_id in clicked_news_ids if news_id in news_id_to_position
]

# Candidate pool: every article position the user has not already read.
history_positions = set(user_history_indices)
unread_indices = [
    position for position in range(len(news_df))
    if position not in history_positions
]
# The candidate matrix is the same for every history article, so build it once.
unread_embeddings = embeddings[unread_indices]

recommended_indices = set()

for index in tqdm(user_history_indices):
    # Similarity between this history article and every unread article
    sim_scores = cosine_similarity([embeddings[index]], unread_embeddings)[0]

    # Positions (within the unread pool) of the top_k most similar articles,
    # translated back to positions in news_df
    top_indices_in_unread = np.argsort(sim_scores)[::-1][:top_k]
    recommended_indices.update(unread_indices[i] for i in top_indices_in_unread)

# Step 4: Retrieve recommended articles (sorted positions give a reproducible order)
recommended_articles = news_df.iloc[sorted(recommended_indices)]
print("Recommended Articles:\n", recommended_articles[['News ID', 'Title']])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
100%|██████████| 94/94 [00:33<00:00,  2.77it/s]
100%|██████████| 8/8 [00:01<00:00,  4.98it/s]

Recommended Articles:
        News ID                                              Title
44323   N13706               Hard Rock cranes to come down Sunday
14790   N31569  The Tastiest, Most Festive Snacks To Serve On ...
52685  N116297  Halloween Trick: Granbury woman believes her w...
21517   N25174  Red Wings provide Brendan Perlini fresh slate,...
50851   N56315  Brad Stevens Reveals Text He Sent Grant Willia...
32509  N109827  New video sheds light on deadly Fresno police ...
7308    N77348              55 Last-Minute Christmas Dinner Ideas
22915  N116610  Celtics have a foul opening night in Philadelphia
88361  N121439  Wendy's New Birthday Cake Frosty Is a Must-Try...
28783   N98214  Starbucks Isn't Bringing Back The Gingerbread ...
80196   N51070                   The 15 Best Islands in the World
56894   N40877  The best time to visit Iceland and 49 other to...
94024   N98404  Celtics Notes: Robert Williams Has Career Nigh...
57721   N65823  'Time was of the essence': Politician




# To test for accuracy

In [None]:
# News IDs of the recommended articles, as a plain Python list in frame order
recommended_article_ids = recommended_articles['News ID'].to_list()

# Helper that parses one 'Impressions' string into shown and clicked article IDs
def extract_impressions(impressions):
    """Split an impressions string into shown and clicked article IDs.

    Args:
        impressions: Whitespace-separated tokens, each read as an article ID
            followed by '-' and a flag; a token ending in '-1' counts as clicked.

    Returns:
        A tuple ``(all_impressions, clicked_impressions)`` of two lists holding
        the text before the first '-' of every token, and of every clicked
        token, respectively, in their original order.
    """
    shown_ids = []
    clicked_ids = []
    for token in impressions.split():
        article_id = token.split('-')[0]
        shown_ids.append(article_id)
        if token.endswith('-1'):
            clicked_ids.append(article_id)
    return shown_ids, clicked_ids

# Parse every 'Impressions' string once, then attach the two lists as new columns.
# assign() returns a new frame, so no column is written onto a slice of behaviors_df
# (the in-place column assignment used before raised SettingWithCopyWarning).
parsed_impressions = [extract_impressions(raw) for raw in specific_user['Impressions']]
specific_user = specific_user.assign(
    **{
        'All Impressions': pd.Series(
            [shown for shown, _ in parsed_impressions], index=specific_user.index
        ),
        'Clicked Impressions': pd.Series(
            [clicked for _, clicked in parsed_impressions], index=specific_user.index
        ),
    }
)

# Flatten the per-row lists into one list / one set per kind of impression
recommended_set = set(recommended_articles['News ID'])
clicked_impressions_list = specific_user['Clicked Impressions'].explode().tolist()
all_impressions_list = specific_user['All Impressions'].explode().tolist()
clicked_impressions_set = set(clicked_impressions_list)
all_impressions_set = set(all_impressions_list)

print(f"Recommended articles: {recommended_set}")
print(f"Clicked impressions: {clicked_impressions_set}")
print(f"All impressions: {all_impressions_set}")


# Recommended articles the user clicked in any impression row. One set lookup per
# article replaces a scan over every row's list; order follows recommended_article_ids.
clicked_recommended_articles = [
    article_id for article_id in recommended_article_ids
    if article_id in clicked_impressions_set
]

# Print the recommended article IDs that appeared in Clicked Impressions
print("Recommended articles that appeared in Clicked Impressions:", clicked_recommended_articles)

# Recommended articles that were shown to the user at all (clicked or not)
all_recommended_articles = [
    article_id for article_id in recommended_article_ids
    if article_id in all_impressions_set
]

# Print the recommended article IDs that appeared in All Impressions
print("Recommended articles that appeared in All Impressions:", all_recommended_articles)

Recommended articles: {'N83631', 'N67468', 'N89146', 'N112765', 'N40004', 'N32931', 'N89289', 'N95476', 'N5379', 'N45604'}
Clicked impressions: {'N118883', 'N7277'}
All impressions: {'N24150', 'N109605', 'N36512', 'N43729', 'N55944', 'N118883', 'N90853', 'N110523', 'N3664', 'N9262', 'N16485', 'N58059', 'N27097', 'N7277', 'N51158', 'N109083', 'N38678'}
Recommended articles that appeared in Clicked Impressions: []
Recommended articles that appeared in All Impressions: []


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  specific_user[['All Impressions', 'Clicked Impressions']] = specific_user.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  specific_user[['All Impressions', 'Clicked Impressions']] = specific_user.apply(


In [None]:
# Gather the distinct "Category - SubCategory" labels for three article groups,
# then print them one after another.

# 1) Articles produced by the recommender
recommended_categories = recommended_articles['Full Category'].unique()

# 2) Articles the user clicked in the impression logs
is_clicked_impression = news_df['News ID'].isin(clicked_impressions_list)
clicked_news = news_df[is_clicked_impression]
clicked_news_categories = clicked_news['Full Category'].unique()

# 3) Every article shown to the user in the impression logs
is_shown_impression = news_df['News ID'].isin(all_impressions_list)
all_news = news_df[is_shown_impression]
all_news_categories = all_news['Full Category'].unique()

print("Recommended Categories:")
print(recommended_categories)

print("Clicked Categories based on Impressions:")
print(clicked_news_categories)

print("All Categories based on Impressions:")
print(all_news_categories)

Recommended Categories:
['sports - more_sports' 'news - newspolitics']
Clicked Categories based on Impressions:
['news - newsus' 'news - newspolitics']
All Categories based on Impressions:
['foodanddrink - recipes' 'travel - travelnews' 'news - newsus'
 'news - newscrime' 'news - newsworld' 'sports - golf'
 'finance - finance-companies' 'movies - movienews' 'news - newspolitics'
 'music - music-celebrity']


In [None]:
# Category-level evaluation: compare the set of categories the user clicked
# with the set of categories that were recommended.
clicked_news_categories = set(clicked_news['Full Category'])
recommended_categories = set(recommended_articles['Full Category'])

# Categories present in both sets
true_positives = len(clicked_news_categories & recommended_categories)

# Precision: share of recommended categories that the user actually clicked
if recommended_categories:
    precision = true_positives / len(recommended_categories)
else:
    precision = 0

# Recall: share of clicked categories that were recommended
if clicked_news_categories:
    recall = true_positives / len(clicked_news_categories)
else:
    recall = 0

# "Accuracy" here is overlap divided by the size of the union of both sets
# (intersection over union), not a classification accuracy.
union_size = len(clicked_news_categories) + len(recommended_categories) - true_positives
if union_size > 0:
    accuracy = true_positives / union_size
else:
    accuracy = 0

# Report the three scores
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Accuracy: {accuracy:.4f}")

Precision: 0.5000
Recall: 0.5000
Accuracy: 0.3333
