In [1]:
import pandas as pd
import re

In [2]:
# Colab-only: mount Google Drive at /content/drive; the paths used later in this
# notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# lines=True: the file is parsed as newline-delimited JSON (one record per line).
df = pd.read_json('/content/drive/MyDrive/Colab Notebooks/news/data/News_Category_Dataset_v3.json', lines=True)

In [4]:
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [5]:
len(df)

209527

In [6]:
df.columns

Index(['link', 'headline', 'category', 'short_description', 'authors', 'date'], dtype='object')

In [7]:
def _clean_text(text):
    if isinstance(text, str):
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,!?\'"-]', '', text)
        text = text.strip()
        if not text:
            return 'no data'
        return text
    return 'no data'

# Text columns that are searched later on; each one is cleaned element-wise
# and written back onto the same frame.
search_columns = ['headline', 'short_description', 'authors']
for column_name in search_columns:
    df[column_name] = df[column_name].map(_clean_text)

In [8]:
# Articles per category (non-null links), most frequent first.
df.groupby('category')['link'].count().sort_values(ascending=False)

Unnamed: 0_level_0,link
category,Unnamed: 1_level_1
POLITICS,35602
WELLNESS,17945
ENTERTAINMENT,17362
TRAVEL,9900
STYLE & BEAUTY,9814
PARENTING,8791
HEALTHY LIVING,6694
QUEER VOICES,6347
FOOD & DRINK,6340
BUSINESS,5992


## BM25 search

In [9]:
%%capture
!pip install bm25s[full]

In [15]:
import bm25s
import Stemmer
import numpy as np
from typing import List, Optional, Dict
from operator import itemgetter
from datetime import datetime

In [11]:
class BM25Retriever:
    """Lexical (BM25) search over a news DataFrame, one index per text column.

    A separate ``bm25s.BM25`` index is kept for every column listed in
    ``search_columns``. A query is run against each index and the hits are
    merged, so one article can appear several times (once per matching column).
    Hit ids are interpreted as positional row numbers of ``self.data``.
    """

    def __init__(self, df: pd.DataFrame):
        """Store the frame and create the stemmer; no index is built yet.

        Args:
            df: frame holding the search columns plus ``link``, ``category``
                and ``date``; rows are addressed by position.
        """
        self.data = df
        self.stemmer = Stemmer.Stemmer("english")
        self.retrievers = {}
        # Defined up front so save() also works on an instance that was only
        # populated through load() (index() is the only other place that sets it).
        self.corpus = {}
        self.search_columns = ['headline', 'short_description', 'authors']

    def index(self):
        """Tokenise every search column and build one BM25 index per column."""
        self.corpus = {col: self.data[col].fillna('').tolist() for col in self.search_columns}

        for col, docs in self.corpus.items():
            tokens = bm25s.tokenize(docs, stopwords="en", stemmer=self.stemmer)
            retriever = bm25s.BM25()
            retriever.index(tokens)
            self.retrievers[col] = retriever

    def save(self, path: str):
        """Persist every index to ``f"{path}_{col}"``.

        The raw column texts are stored alongside when they are available on
        this instance; otherwise ``corpus=None`` is passed (the library default).
        """
        for col, retriever in self.retrievers.items():
            retriever.save(f"{path}_{col}", corpus=self.corpus.get(col))

    def load(self, path: str):
        """Load the per-column indexes (with their corpus) written by ``save``."""
        for col in self.search_columns:
            self.retrievers[col] = bm25s.BM25.load(f"{path}_{col}", load_corpus=True)

    def search(self, query: str, k: int = 5, category: Optional[str] = None, not_older_than: Optional[datetime] = None):
        """Run ``query`` against every column index and return the merged hits.

        Args:
            query: free-text query.
            k: number of hits requested *per column*; the merged list can hold
                up to ``k * len(self.retrievers)`` entries before filtering.
            category: keep only rows whose category equals this value. A
                collection of category names is accepted as well. Matching is
                exact (a plain ``in`` on a string would be a substring test).
            not_older_than: keep only rows with ``date >= not_older_than``.

        Returns:
            A list of dicts (row fields plus ``score`` and ``matched_fields``)
            sorted by ``score`` descending. ``score`` is the sigmoid of the
            z-score of the raw BM25 score, computed over all hits of this call.
            Filters are applied after retrieval, so fewer than ``k`` rows (or
            none) may remain.
        """
        # Same stop-word setting as index() so query and documents are tokenised alike.
        query_tokens = bm25s.tokenize(query, stopwords="en", stemmer=self.stemmer)

        matches = []
        for col, retriever in self.retrievers.items():
            results, scores = retriever.retrieve(query_tokens, k=k)
            for rank in range(results.shape[1]):
                score = scores[0, rank]
                if score <= 0:
                    continue
                hit = results[0, rank]
                # An index that carries a corpus (as after load(..., load_corpus=True))
                # yields records exposing 'id'; a freshly built index has no corpus
                # attached and yields plain integer positions.
                doc_id = hit['id'] if isinstance(hit, dict) else int(hit)
                matches.append({'id': doc_id, 'score': score, 'matched_field': col})

        if matches:
            raw_scores = np.array([m['score'] for m in matches])
            mean_score = np.mean(raw_scores)
            std_score = np.std(raw_scores)
            if not std_score > 0:
                std_score = 1  # avoid division by zero when all scores are equal
            for m in matches:
                m['score'] = 1 / (1 + np.exp(-((m['score'] - mean_score) / std_score)))

        filtered_results = []
        for m in matches:
            row = self.data.iloc[m['id']]
            filtered_results.append({
                'link': row['link'],
                'headline': row['headline'],
                'category': row['category'],
                'short_description': row['short_description'],
                'authors': row['authors'],
                'date': row['date'],
                'score': m['score'],
                'matched_fields': m['matched_field']
            })

        if category:
            allowed = {category} if isinstance(category, str) else set(category)
            filtered_results = [c for c in filtered_results if c['category'] in allowed]
        if not_older_than:
            filtered_results = [c for c in filtered_results if c['date'] >= not_older_than]

        return sorted(filtered_results, key=itemgetter('score'), reverse=True)

In [12]:
# Prefix for the persisted BM25 indexes; BM25Retriever.save()/load() append
# "_<column>" to it, one directory per search column.
RETRIVERS_PATH = '/content/drive/MyDrive/Colab Notebooks/news/retrivers/'
lexical = BM25Retriever(df)
# Build the per-column indexes from the cleaned frame, then persist them.
lexical.index()
lexical.save(RETRIVERS_PATH)

Split strings:   0%|          | 0/209527 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/209527 [00:00<?, ?it/s]

DEBUG:bm25s:Building index from IDs objects


BM25S Count Tokens:   0%|          | 0/209527 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/209527 [00:00<?, ?it/s]

Split strings:   0%|          | 0/209527 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/209527 [00:00<?, ?it/s]

DEBUG:bm25s:Building index from IDs objects


BM25S Count Tokens:   0%|          | 0/209527 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/209527 [00:00<?, ?it/s]

Split strings:   0%|          | 0/209527 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/209527 [00:00<?, ?it/s]

DEBUG:bm25s:Building index from IDs objects


BM25S Count Tokens:   0%|          | 0/209527 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/209527 [00:00<?, ?it/s]

Finding newlines for mmindex:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Finding newlines for mmindex:   0%|          | 0.00/29.0M [00:00<?, ?B/s]

Finding newlines for mmindex:   0%|          | 0.00/11.8M [00:00<?, ?B/s]

In [16]:
lexical = BM25Retriever(df)
lexical.load(RETRIVERS_PATH)

In [17]:
lexical.search("Woman Who Called Cops On Black Bird-Watcher", k=3, not_older_than=datetime(2017,1,1))

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

[{'link': 'https://www.huffpost.com/entry/amy-cooper-loses-discrimination-lawsuit-franklin-templeton_n_632c6463e4b09d8701bd227e',
  'headline': 'Woman Who Called Cops On Black Bird-Watcher Loses Lawsuit Against Ex-Employer',
  'category': 'U.S. NEWS',
  'short_description': 'Amy Cooper accused investment firm Franklin Templeton of unfairly firing her and branding her a racist after video of the Central Park encounter went viral.',
  'authors': 'Nina Golgowski',
  'date': Timestamp('2022-09-22 00:00:00'),
  'score': 0.917582910338135,
  'matched_fields': 'headline'},
 {'link': 'https://www.huffpost.com/entry/trevor-noah-second-amendment-is-not-intended-for-black-people_n_5bfe1f02e4b0f43bf2662986',
  'headline': "Trevor Noah 'The Second Amendment Is Not Intended For Black People'",
  'category': 'COMEDY',
  'short_description': 'Cops are called into a situation, they see a black person and then immediately they shoot.',
  'authors': 'Carla Baranauckas',
  'date': Timestamp('2018-11-28 00

## Semantic search

In [18]:
%%capture
!pip install annoy
!sudo apt-get install libomp-dev
!pip install faiss-cpu

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
import faiss
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer

In [20]:
# Directory prefix used below both for the per-column embedding files
# (f"{path}{col}.npz") and for the saved FAISS/Annoy indexes.
path = '/content/drive/MyDrive/Colab Notebooks/news/data/'
# Sentence-embedding model, referenced by the (commented-out) embedding cell below.
model = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [21]:
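# One-off embedding step, kept for reference: it produced the f'{path}{col}.npz'
# files that SemanticSearch.build() reads. Uncomment and run once if they are missing.
# search_columns = ['headline', 'short_description', 'authors']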
# for col in search_columns:
#     sentences = df[col].to_list()
#     embeddings = model.encode(sentences)
#     np.savez_compressed(f'{path}{col}.npz', embeddings)
#     del sentences
#     del embeddings

In [22]:
class SemanticSearch():
    """Embedding-based nearest-neighbour search with FAISS and/or Annoy.

    One index per column in ``search_columns`` is kept for each engine. Index
    ids are interpreted as positional row numbers of ``self.data``.
    """

    def __init__(self, data_path: str, model: str = "all-MiniLM-L6-v2", dimmensions: int = 384):
        """Set up the encoder and empty index registries.

        Args:
            data_path: the news DataFrame to search. (The parameter name is
                kept for compatibility; callers in this notebook pass the
                frame itself.)
            model: SentenceTransformer model name used to embed queries.
            dimmensions: embedding dimensionality used to size the indexes.
        """
        if isinstance(data_path, pd.DataFrame):
            self.data = data_path
        else:
            # Legacy behaviour: the argument used to be ignored and the
            # notebook-level ``df`` was read instead. Kept as a fallback only.
            self.data = df
        self.search_columns = ['headline', 'short_description', 'authors']
        self.model = SentenceTransformer(model)
        self.dimmensions = dimmensions
        self.faiss_indices = {}
        self.annoy_indices = {}

    def build(self, engine: str, embed_path: str):
        """Build the per-column indexes for one engine from saved embeddings.

        Args:
            engine: ``"faiss"`` (exact, ``IndexFlatL2``) or ``"annoy"``
                (angular metric, 20 trees).
            embed_path: prefix of the embedding files; ``f"{embed_path}{col}.npz"``
                is read for every search column (array stored under ``arr_0``).

        Raises:
            ValueError: if ``engine`` is neither ``"faiss"`` nor ``"annoy"``.
        """
        if engine not in ("faiss", "annoy"):
            raise ValueError('No such engine')

        for col in self.search_columns:
            embeddings = np.load(f"{embed_path}{col}.npz")['arr_0']
            if engine == "faiss":
                index = faiss.IndexFlatL2(self.dimmensions)
                index.add(embeddings)
                self.faiss_indices[col] = index
            else:
                index = AnnoyIndex(self.dimmensions, 'angular')
                for i, vec in enumerate(embeddings):
                    index.add_item(i, vec)
                index.build(20)
                self.annoy_indices[col] = index
            # Release the column's embeddings before loading the next one.
            del embeddings

    def save(self, path: str):
        """Write every built index to ``f"{path}_{col}.index"`` / ``f"{path}_{col}.ann"``."""
        for col, index in self.faiss_indices.items():
            faiss.write_index(index, f"{path}_{col}.index")
        for col, index in self.annoy_indices.items():
            index.save(f"{path}_{col}.ann")

    def load(self, path: str):
        """Load whichever index files exist under the ``path`` prefix.

        ``save`` only writes the engines that were built, so a missing file for
        one engine is tolerated here.

        Raises:
            FileNotFoundError: if no index file at all was found.
        """
        import os  # local import: only this method needs it

        loaded = 0
        for col in self.search_columns:
            faiss_file = f"{path}_{col}.index"
            if os.path.exists(faiss_file):
                self.faiss_indices[col] = faiss.read_index(faiss_file)
                loaded += 1

            annoy_file = f"{path}_{col}.ann"
            if os.path.exists(annoy_file):
                index = AnnoyIndex(self.dimmensions, 'angular')
                index.load(annoy_file)
                self.annoy_indices[col] = index
                loaded += 1

        if not loaded:
            raise FileNotFoundError(f"No FAISS or Annoy index files found for prefix {path!r}")

    def search(self, query: str, k: int = 5, engine: Optional[str] = 'both', category: Optional[str] = None, not_older_than: Optional[datetime] = None):
        """Embed ``query`` and return the nearest rows from the chosen engine(s).

        Args:
            query: free-text query.
            k: neighbours requested per column and per engine.
            engine: ``"faiss"``, ``"annoy"`` or ``"both"``; an engine without a
                built/loaded index for a column is skipped.
            category: keep only rows whose category equals this value. A
                collection of category names is accepted as well. Matching is
                exact (a plain ``in`` on a string would be a substring test).
            not_older_than: keep only rows with ``date >= not_older_than``.

        Returns:
            A list of dicts (row fields plus ``score`` and ``matched_fields``)
            sorted by ``score`` descending. Filters are applied after
            retrieval, so fewer than ``k`` rows may remain.

        NOTE(review): the two engines score on different scales
        (FAISS: ``1 / (1 + distance)``; Annoy: ``(2 - dist**2) / 2``), and with
        ``engine='both'`` the same row can be returned by each. Presumably the
        FAISS score should be mapped to the same similarity scale - that
        depends on whether the stored embeddings are unit-normalised; confirm
        before changing.
        """
        query_embedding = self.model.encode(query)
        matches = []

        for col in self.search_columns:
            faiss_index = self.faiss_indices.get(col)
            if engine in ["faiss", "both"] and faiss_index is not None:
                distances, indices = faiss_index.search(query_embedding.reshape(1, -1), k)
                for idx, distance in zip(indices[0], distances[0]):
                    # FAISS pads with -1 when fewer than k neighbours exist;
                    # iloc[-1] would silently return the last row instead.
                    if idx < 0:
                        continue
                    matches.append({'id': int(idx), 'score': 1 / (1 + distance), 'matched_field': col})

            annoy_index = self.annoy_indices.get(col)
            if engine in ["annoy", "both"] and annoy_index is not None:
                indices, distances = annoy_index.get_nns_by_vector(query_embedding, k, include_distances=True)
                for idx, dist in zip(indices, distances):
                    matches.append({'id': idx, 'score': (2 - (dist**2)) / 2, 'matched_field': col})

        filtered_results = []
        for m in matches:
            row = self.data.iloc[m['id']]
            filtered_results.append({
                'link': row['link'],
                'headline': row['headline'],
                'category': row['category'],
                'short_description': row['short_description'],
                'authors': row['authors'],
                'date': row['date'],
                'score': m['score'],
                'matched_fields': m['matched_field']
            })

        if category:
            allowed = {category} if isinstance(category, str) else set(category)
            filtered_results = [c for c in filtered_results if c['category'] in allowed]
        if not_older_than:
            filtered_results = [c for c in filtered_results if c['date'] >= not_older_than]

        return sorted(filtered_results, key=itemgetter('score'), reverse=True)


In [23]:
semantic = SemanticSearch(df)

In [24]:
semantic.build('faiss', embed_path=path)

In [25]:
semantic.build('annoy', embed_path=path)

In [26]:
semantic.save(path)

In [27]:
semantic = SemanticSearch(df)
semantic.load(path)

In [28]:
semantic.search("Woman Who Called Cops On Black Bird-Watcher", k=3, engine='faiss')

[{'link': 'https://www.huffpost.com/entry/amy-cooper-loses-discrimination-lawsuit-franklin-templeton_n_632c6463e4b09d8701bd227e',
  'headline': 'Woman Who Called Cops On Black Bird-Watcher Loses Lawsuit Against Ex-Employer',
  'category': 'U.S. NEWS',
  'short_description': 'Amy Cooper accused investment firm Franklin Templeton of unfairly firing her and branding her a racist after video of the Central Park encounter went viral.',
  'authors': 'Nina Golgowski',
  'date': Timestamp('2022-09-22 00:00:00'),
  'score': 0.6302691335141306,
  'matched_fields': 'headline'},
 {'link': 'https://www.huffingtonpost.com/entry/gwyneth-paltrow-tattoo-photos-pictures_us_5b9cbcc3e4b03a1dcc815bea',
  'headline': 'Gwyneth Paltrow Tattoo? Star Shows Off New Body Art PHOTOS',
  'category': 'STYLE & BEAUTY',
  'short_description': 'Did the actress really put a bird on it?',
  'authors': 'Rebecca Adams',
  'date': Timestamp('2013-01-20 00:00:00'),
  'score': 0.5292071457974201,
  'matched_fields': 'short_de

In [29]:
semantic.search("Woman Who Called Cops On Black Bird-Watcher", engine='annoy')

[{'link': 'https://www.huffingtonpost.com/entry/issa-rae-insecure-black-tv-shows_us_57f2b38ee4b0c2407cdf3d96',
  'headline': "How Issa Rae's 'Insecure' Validates, Expands On The Black Narrative",
  'category': 'BLACK VOICES',
  'short_description': 'She took cues from the best of black TV and weaved them into the show.',
  'authors': 'Rahel Gebreyes',
  'date': Timestamp('2016-10-06 00:00:00'),
  'score': 0.49684624837342994,
  'matched_fields': 'short_description'},
 {'link': 'https://www.huffingtonpost.com/entry/lady-bird-trailer-but-every-line-is-screamed_us_5a6c904be4b0ddb658c6cfc2',
  'headline': "So Here's The 'Lady Bird' Trailer Except Every Line Is Screamed",
  'category': 'ENTERTAINMENT',
  'short_description': "You're allowed to enjoy this -- it's the weekend.",
  'authors': 'Andy McDonald',
  'date': Timestamp('2018-01-27 00:00:00'),
  'score': 0.4640134996075176,
  'matched_fields': 'headline'},
 {'link': 'https://www.huffingtonpost.com/entry/deirdre-orozco-road-rage_n_6382

## Hybrid search

In [30]:
def hybrid_search(query, k=5, alpha: float = 0.5, category=None, not_older_than=None,
                  lexical_retriever=None, semantic_retriever=None):
    """
    Combine lexical and semantic search results, weighted by ``alpha``.

    α = 1 - only semantic search
    α = 0 - only lexical search
    0 < α < 1 - lexical and semantic search

    Args:
        query: free-text query, passed to both retrievers.
        k, category, not_older_than: forwarded unchanged to each retriever's
            ``search`` method.
        alpha: weight of the semantic score; the lexical score gets ``1 - alpha``.
        lexical_retriever: object with a ``search(query, k=, category=,
            not_older_than=)`` method. Defaults to the notebook-level ``lexical``.
        semantic_retriever: same contract. Defaults to the notebook-level
            ``semantic``.

    Returns:
        For α in {0, 1}: the chosen retriever's result list, unchanged.
        Otherwise: the merged results, each with ``adjusted_score`` (weighted
        score) and ``type`` ("lexical" or "semantic") added, sorted by
        ``adjusted_score`` descending and de-duplicated by ``link``.

    NOTE(review): a document returned by both retrievers keeps only its higher
    weighted entry; the two contributions are not summed.
    """
    assert 0 <= alpha <= 1, "Alpha must be between 0 and 1."

    search_kwargs = dict(k=k, category=category, not_older_than=not_older_than)

    # The global fallbacks are resolved lazily so that a pure-lexical or
    # pure-semantic call does not require the other retriever to exist.
    if alpha == 0:
        lexical_engine = lexical if lexical_retriever is None else lexical_retriever
        return lexical_engine.search(query, **search_kwargs)
    if alpha == 1:
        semantic_engine = semantic if semantic_retriever is None else semantic_retriever
        return semantic_engine.search(query, **search_kwargs)

    lexical_engine = lexical if lexical_retriever is None else lexical_retriever
    semantic_engine = semantic if semantic_retriever is None else semantic_retriever
    lexical_results = lexical_engine.search(query, **search_kwargs)
    semantic_results = semantic_engine.search(query, **search_kwargs)

    # Weight each source's score and tag where the hit came from.
    for item in lexical_results:
        item["adjusted_score"] = item["score"] * (1 - alpha)
        item["type"] = "lexical"
    for item in semantic_results:
        item["adjusted_score"] = item["score"] * alpha
        item["type"] = "semantic"

    # Walk the hits best-first and keep the first (highest-weighted) entry per link.
    ranked = sorted(lexical_results + semantic_results,
                    key=lambda hit: hit["adjusted_score"], reverse=True)
    seen_links = set()
    final_results = []
    for item in ranked:
        if item["link"] in seen_links:
            continue
        seen_links.add(item["link"])
        final_results.append(item)

    return final_results



In [31]:
hybrid_search("Woman Who Called Cops On Black Bird-Watcher", alpha=0.6)

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

[{'link': 'https://www.huffpost.com/entry/amy-cooper-loses-discrimination-lawsuit-franklin-templeton_n_632c6463e4b09d8701bd227e',
  'headline': 'Woman Who Called Cops On Black Bird-Watcher Loses Lawsuit Against Ex-Employer',
  'category': 'U.S. NEWS',
  'short_description': 'Amy Cooper accused investment firm Franklin Templeton of unfairly firing her and branding her a racist after video of the Central Park encounter went viral.',
  'authors': 'Nina Golgowski',
  'date': Timestamp('2022-09-22 00:00:00'),
  'score': 0.954927427292993,
  'matched_fields': 'headline',
  'adjusted_score': 0.38197097091719723,
  'type': 'lexical'},
 {'link': 'https://www.huffingtonpost.com/entry/gwyneth-paltrow-tattoo-photos-pictures_us_5b9cbcc3e4b03a1dcc815bea',
  'headline': 'Gwyneth Paltrow Tattoo? Star Shows Off New Body Art PHOTOS',
  'category': 'STYLE & BEAUTY',
  'short_description': 'Did the actress really put a bird on it?',
  'authors': 'Rebecca Adams',
  'date': Timestamp('2013-01-20 00:00:00'),

In [32]:
hybrid_search("Bitcoin", alpha=0.6, not_older_than=datetime(2017,1,1))

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

[{'link': 'https://www.huffingtonpost.com/entry/the-bitcoin-hoax_us_5a3fd6dce4b025f99e17bb2f',
  'headline': 'The Bitcoin Hoax',
  'category': 'TECH',
  'short_description': 'We should hardly be surprised that Bitcoin is on a wild speculative ridethats the essence of privatized credit creation.',
  'authors': 'Robert Kuttner, ColumnistColumnist',
  'date': Timestamp('2017-12-24 00:00:00'),
  'score': 0.6098416791123972,
  'matched_fields': 'headline',
  'adjusted_score': 0.3659050074674383,
  'type': 'semantic'},
 {'link': 'https://www.huffingtonpost.com/entry/tiny-towns-small-states-bet-on-bitcoin-even-as-some_us_5ac7a421e4b0150d9bfe77ba',
  'headline': 'Tiny Towns, Small States Bet On Bitcoin Even As Some Shun Its Miners',
  'category': 'POLITICS',
  'short_description': 'Bitcoin enthusiasts envision a world in which the entire economy runs on blockchain technology, allowing people to buy their',
  'authors': 'Stateline, Editorial Partner',
  'date': Timestamp('2018-04-06 00:00:00'),