In [19]:
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
from datasketch import MinHash
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex
import numpy as np
from tqdm import tqdm
import pyarrow

In [2]:
# Column layouts for the two MIND tab-separated files. Both are read with
# header=None, so the names are supplied here instead of coming from the file.
news_columns = [
    "news_id", "category", "subcategory", "title", "abstract",
    "url", "title_entities", "abstract_entities"
]
behavior_columns = [
    "impression_id", "user_id", "timestamp", "history", "impressions"
]

# Article metadata table.
news_df = pd.read_csv(
    'MINDsmall_train/news.tsv', sep='\t', header=None, names=news_columns
)

# Impression log table.
behaviors_df = pd.read_csv(
    'MINDsmall_train/behaviors.tsv', sep='\t', header=None, names=behavior_columns
)

news_df.head(5)

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [3]:
# Preview the first rows of the impression log (head() defaults to 5 rows).
behaviors_df.head()

Unnamed: 0,impression_id,user_id,timestamp,history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [4]:
# Summarize dtypes and non-null counts of the impression log; this is where
# columns with missing values (e.g. `history`) become visible.
behaviors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156965 entries, 0 to 156964
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   impression_id  156965 non-null  int64 
 1   user_id        156965 non-null  object
 2   timestamp      156965 non-null  object
 3   history        153727 non-null  object
 4   impressions    156965 non-null  object
dtypes: int64(1), object(4)
memory usage: 6.0+ MB


In [5]:
# Summarize dtypes and non-null counts of the article table to see which
# columns contain missing values before any text processing.
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51282 entries, 0 to 51281
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   news_id            51282 non-null  object
 1   category           51282 non-null  object
 2   subcategory        51282 non-null  object
 3   title              51282 non-null  object
 4   abstract           48616 non-null  object
 5   url                51282 non-null  object
 6   title_entities     51279 non-null  object
 7   abstract_entities  51278 non-null  object
dtypes: object(8)
memory usage: 3.1+ MB


In [6]:
# Replace missing titles and abstracts with empty strings so the string
# operations that follow never encounter NaN. Other columns are left untouched.
news_df = news_df.fillna({'title': '', 'abstract': ''})

In [7]:
# Join title and abstract into one string per article, separated by ". ".
news_df['text'] = news_df['title'].str.cat(news_df['abstract'], sep='. ')

In [8]:
# Keep only the first article for each distinct combined text.
is_repeat = news_df.duplicated(subset='text', keep='first')
news_df = news_df[~is_repeat]


In [9]:
# Drop articles that carry no text at all. The check is made on the source
# columns: `text` is built as title + ". " + abstract, so it always contains
# the separator and can never be blank after stripping whitespace.
has_title = news_df['title'].str.strip() != ''
has_abstract = news_df['abstract'].str.strip() != ''

# Renumber rows 0..n-1 so row labels agree with row positions once duplicates
# and blank rows have been removed.
news_df = news_df[has_title | has_abstract].reset_index(drop=True)


# **Preprocessing for MinHash LSH** 

### **Remove special characters, drop stop words, and tokenize the title column**

In [None]:
# Fetch the NLTK stop-word corpus (no-op when already present) and keep the
# English list as a set for fast membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def normalize_title(title):
    """Turn a title string into a list of lowercase tokens.

    The title is lowercased, every character other than a-z, 0-9 or
    whitespace is deleted (not replaced, so "well-known" becomes
    "wellknown"), the result is split on whitespace, and English stop
    words are removed.

    Args:
        title: The raw title string.

    Returns:
        list[str]: The remaining tokens in their original order.
    """
    cleaned = re.sub(r'[^a-z0-9\s]', '', title.lower())
    return [token for token in cleaned.split() if token not in stop_words]


news_df['tokens'] = news_df['title'].map(normalize_title)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\resea\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Sentence-embedding model used to encode the article text.
embedding_model_name = 'all-MiniLM-L6-v2'
embedder = SentenceTransformer(embedding_model_name)


In [15]:
# Encode every article's combined text in batches of 64, with a progress bar.
article_texts = news_df['text'].tolist()
embeddings = embedder.encode(article_texts, batch_size=64, show_progress_bar=True)

# Store one embedding vector per row, in the same order as the rows.
news_df['embedding'] = list(embeddings)

Batches: 100%|██████████| 792/792 [10:49<00:00,  1.22it/s]


In [20]:
embedding_dim = 384  # must equal the length of each vector in news_df['embedding']
n_trees = 50

annoy_index = AnnoyIndex(embedding_dim, 'angular')
# Fix the seed before adding items so rebuilding the index gives the same trees.
annoy_index.set_seed(42)

# Annoy item ids are the row positions 0..n-1, in DataFrame row order.
for i, emb in tqdm(enumerate(news_df['embedding']), total=len(news_df)):
    annoy_index.add_item(i, emb)

annoy_index.build(n_trees)
annoy_index.save('MINDsmall_train/news_articles.ann')

# Record the same positional ids that were passed to add_item. Row labels from
# news_df.index only equal row positions if the frame was re-indexed after
# filtering, so they are not a safe source for the Annoy ids.
news_df['annoy_id'] = np.arange(len(news_df))
news_df.to_parquet('news_articles.parquet', engine='pyarrow', index=False)

100%|██████████| 50669/50669 [00:01<00:00, 41120.51it/s]
