<a href="https://colab.research.google.com/github/kosmobiker/search_problem/blob/master/search_problem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import re

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the news dataset from the mounted Drive; lines=True reads the file as
# JSON Lines (one JSON record per line).
df = pd.read_json('/content/drive/MyDrive/Colab Notebooks/news/data/News_Category_Dataset_v3.json', lines=True)

In [6]:
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [7]:
len(df)

209527

In [8]:
df.columns

Index(['link', 'headline', 'category', 'short_description', 'authors', 'date'], dtype='object')

In [9]:
def _clean_text(text):
    if isinstance(text, str):
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,!?\'"-]', '', text)
        text = text.strip()
        if not text:
            return 'no data'
        return text
    return 'no data'

# Normalize, in place, every text column that the search indexes are built from.
search_columns = ['headline', 'short_description', 'authors']
for col in search_columns:
    df[col] = df[col].map(_clean_text)

In [10]:
df.groupby('category').count()['link'].sort_values(ascending=False)

Unnamed: 0_level_0,link
category,Unnamed: 1_level_1
POLITICS,35602
WELLNESS,17945
ENTERTAINMENT,17362
TRAVEL,9900
STYLE & BEAUTY,9814
PARENTING,8791
HEALTHY LIVING,6694
QUEER VOICES,6347
FOOD & DRINK,6340
BUSINESS,5992


## BM25 search

In [13]:
%%capture
# %pip installs into the environment of the running kernel; the requirement is
# quoted so the shell cannot interpret the [full] extra as a glob pattern.
%pip install "bm25s[full]"

In [23]:
import bm25s
import Stemmer
import numpy as np
from typing import List, Optional, Dict
from operator import itemgetter
from datetime import datetime

In [35]:
class BM25Retriever:
    """Lexical (BM25) search over the text columns of a news DataFrame.

    One independent ``bm25s`` index is kept per column in ``search_columns``.
    A query is run against every index and the hits are merged into a single
    list ranked by a normalized score.
    """

    def __init__(self, df: pd.DataFrame):
        """Store the data and prepare empty per-column state.

        Args:
            df: Frame holding the ``search_columns`` plus ``link``,
                ``category`` and ``date``. Hits are mapped back to rows by
                position (``iloc``), so the row order must match the order
                the indexes were built from.
        """
        self.data = df
        self.stemmer = Stemmer.Stemmer("english")
        self.retrievers = {}
        # Filled by index(); defined here so save() never hits a missing attribute.
        self.corpus = {}
        self.search_columns = ['headline', 'short_description', 'authors']

    def index(self):
        """Tokenize every search column and build one BM25 index per column."""
        self.corpus = {col: self.data[col].fillna('').tolist() for col in self.search_columns}

        for col, docs in self.corpus.items():
            tokens = bm25s.tokenize(docs, stopwords="en", stemmer=self.stemmer)
            retriever = bm25s.BM25()
            retriever.index(tokens)
            self.retrievers[col] = retriever

    def save(self, path: str):
        """Persist every index (and its corpus, when available) under ``{path}_{column}``."""
        for col, retriever in self.retrievers.items():
            # corpus is None for retrievers that were loaded rather than built here.
            retriever.save(f"{path}_{col}", corpus=self.corpus.get(col))

    def load(self, path: str):
        """Load the per-column indexes previously written by ``save`` with the same ``path``."""
        for col in self.search_columns:
            self.retrievers[col] = bm25s.BM25.load(f"{path}_{col}", load_corpus=True)

    def search(self, query: str, k: int = 5, limit: int = 10, category: Optional[str] = None, not_older_than: Optional[datetime] = None):
        """Run ``query`` against every column index and return the merged hits.

        Args:
            query: Free-text query.
            k: Number of candidates requested from each column index.
            limit: Maximum number of results returned.
            category: A category name, or an iterable of names; only rows whose
                category is exactly one of them are kept.
            not_older_than: When given, rows with an earlier ``date`` are dropped.

        Returns:
            A list of dicts (``link``, ``headline``, ``category``,
            ``short_description``, ``authors``, ``date``, ``score``,
            ``matched_fields``) sorted by descending score. The same row can
            appear more than once when it matched in several columns. Filters
            are applied after retrieval, so fewer than ``limit`` rows may come
            back.
        """
        query_tokens = bm25s.tokenize(query, stemmer=self.stemmer)
        matches = []
        for col, retriever in self.retrievers.items():
            results, scores = retriever.retrieve(query_tokens, k=k)
            for hit, score in zip(results[0], scores[0]):
                if score > 0:
                    # With a corpus attached (after load()) bm25s returns
                    # {'id': ..., 'text': ...} records; without one (right
                    # after index()) it returns bare document indices.
                    doc_id = hit['id'] if isinstance(hit, dict) else hit
                    matches.append({
                        'id': int(doc_id),
                        'score': score,
                        'matched_field': col
                    })

        if matches:  # Normalization of BM25 score using sigmoid
            raw_scores = np.array([m['score'] for m in matches])
            mean_score = np.mean(raw_scores)
            spread = np.std(raw_scores)
            std_score = spread if spread > 0 else 1
            for m in matches:
                m['score'] = 1 / (1 + np.exp(-((m['score'] - mean_score) / std_score)))

        # Exact category matching: a plain ``in`` test against a string would
        # be a substring test ('ARTS' in 'ARTS & CULTURE').
        allowed_categories = None
        if category:
            allowed_categories = {category} if isinstance(category, str) else set(category)

        filtered_results = []
        for m in matches:
            row = self.data.iloc[m['id']]
            if allowed_categories is not None and row['category'] not in allowed_categories:
                continue
            if not_older_than and not (row['date'] >= not_older_than):
                continue
            filtered_results.append({
                'link': row['link'],
                'headline': row['headline'],
                'category': row['category'],
                'short_description': row['short_description'],
                'authors': row['authors'],
                'date': row['date'],
                'score': m['score'],
                'matched_fields': m['matched_field']
            })

        filtered_results = sorted(filtered_results, key=itemgetter('score'), reverse=True)
        return filtered_results[:limit]

In [36]:
# Path prefix for the persisted BM25 indexes. BM25Retriever.save()/load() append
# "_<column>" to it, so with the trailing slash each index is stored under
# ".../retrivers/_<column>".
RETRIVERS_PATH = '/content/drive/MyDrive/Colab Notebooks/news/retrivers/'
# Build one index per search column from the DataFrame, then persist them.
lexical = BM25Retriever(df)
lexical.index()
lexical.save(RETRIVERS_PATH)

Split strings:   0%|          | 0/209527 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/209527 [00:00<?, ?it/s]

DEBUG:bm25s:Building index from IDs objects


BM25S Count Tokens:   0%|          | 0/209527 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/209527 [00:00<?, ?it/s]

Split strings:   0%|          | 0/209527 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/209527 [00:00<?, ?it/s]

DEBUG:bm25s:Building index from IDs objects


BM25S Count Tokens:   0%|          | 0/209527 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/209527 [00:00<?, ?it/s]

Split strings:   0%|          | 0/209527 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/209527 [00:00<?, ?it/s]

DEBUG:bm25s:Building index from IDs objects


BM25S Count Tokens:   0%|          | 0/209527 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/209527 [00:00<?, ?it/s]

Finding newlines for mmindex:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Finding newlines for mmindex:   0%|          | 0.00/29.0M [00:00<?, ?B/s]

Finding newlines for mmindex:   0%|          | 0.00/11.8M [00:00<?, ?B/s]

In [37]:
lexical = BM25Retriever(df)
lexical.load(RETRIVERS_PATH)

In [39]:
lexical.search("Woman Who Called Cops On Black Bird-Watcher", limit=3, not_older_than=datetime(2017,1,1))

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

[{'link': 'https://www.huffpost.com/entry/amy-cooper-loses-discrimination-lawsuit-franklin-templeton_n_632c6463e4b09d8701bd227e',
  'headline': 'Woman Who Called Cops On Black Bird-Watcher Loses Lawsuit Against Ex-Employer',
  'category': 'U.S. NEWS',
  'short_description': 'Amy Cooper accused investment firm Franklin Templeton of unfairly firing her and branding her a racist after video of the Central Park encounter went viral.',
  'authors': 'Nina Golgowski',
  'date': Timestamp('2022-09-22 00:00:00'),
  'score': 0.954927427292993,
  'matched_fields': 'headline'},
 {'link': 'https://www.huffpost.com/entry/trevor-noah-second-amendment-is-not-intended-for-black-people_n_5bfe1f02e4b0f43bf2662986',
  'headline': "Trevor Noah 'The Second Amendment Is Not Intended For Black People'",
  'category': 'COMEDY',
  'short_description': 'Cops are called into a situation, they see a black person and then immediately they shoot.',
  'authors': 'Carla Baranauckas',
  'date': Timestamp('2018-11-28 00

## Semantic Search

In [80]:
%%capture
# %pip targets the environment of the running kernel; -y lets apt-get proceed
# without waiting for an interactive confirmation.
%pip install annoy
!sudo apt-get install -y libomp-dev
%pip install faiss-cpu

In [81]:
import faiss
import torch
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer

In [82]:
path = '/content/drive/MyDrive/Colab Notebooks/news/data/'
model = SentenceTransformer("all-MiniLM-L6-v2")

In [83]:
torch.cuda.is_available()

False

In [84]:
# Run only when a GPU is available; on CPU this cell is a no-op because the
# embeddings were already computed and saved under `path` (per the original note).
if torch.cuda.is_available():
    search_columns = ['headline', 'short_description', 'authors']
    for col in search_columns:
        sentences = df[col].to_list()
        # Encode the whole column with the sentence-transformer loaded above.
        embeddings = model.encode(sentences)
        # The array is passed positionally, so NumPy stores it under the key 'arr_0'.
        np.savez_compressed(f'{path}{col}.npz', embeddings)
        # Drop the large intermediates before moving on to the next column.
        del sentences
        del embeddings

In [85]:
class SemanticSearch():
    """Embedding-based search over ``headline`` and ``short_description``.

    Pre-computed sentence embeddings are indexed per column with FAISS
    (exact L2) and/or Annoy (approximate angular); queries are embedded with
    the same sentence-transformer model and matched against those indexes.
    """

    def __init__(self, data_path: str, model: str = "all-MiniLM-L6-v2", dimmensions: int = 384):
        """Prepare the encoder and empty index containers.

        Args:
            data_path: The news DataFrame that hits are mapped back to by row
                position. (The name is historical: callers in this notebook
                pass the DataFrame itself.) When something other than a
                DataFrame is given, the notebook-level ``df`` is used, as before.
            model: Sentence-transformer model name used to embed queries.
            dimmensions: Dimensionality of the embeddings.
        """
        self.data = data_path if isinstance(data_path, pd.DataFrame) else df
        self.search_columns = ['headline', 'short_description']
        self.model = SentenceTransformer(model)
        self.dimmensions = dimmensions
        self.faiss_indices = {}
        self.annoy_indices = {}

    def build(self, engine: str, embed_path: str):
        """Build one index per search column from ``{embed_path}{column}.npz``.

        Args:
            engine: ``"faiss"`` or ``"annoy"``.
            embed_path: Path prefix of the saved embedding archives.

        Raises:
            ValueError: If ``engine`` is neither ``"faiss"`` nor ``"annoy"``.
        """
        if engine not in ("faiss", "annoy"):
            raise ValueError('No such engine')

        for col in self.search_columns:
            embeddings = np.load(f"{embed_path}{col}.npz")['arr_0']
            if engine == "faiss":
                index = faiss.IndexFlatL2(self.dimmensions)
                # FAISS expects contiguous float32 input.
                index.add(np.ascontiguousarray(embeddings, dtype=np.float32))
                self.faiss_indices[col] = index
            else:
                index = AnnoyIndex(self.dimmensions, 'angular')
                for i, vec in enumerate(embeddings):
                    index.add_item(i, vec)
                index.build(20)
                self.annoy_indices[col] = index
            del embeddings

    def save(self, path: str):
        """Write every index that has been built to ``{path}_{column}.index`` / ``.ann``."""
        if self.faiss_indices:
            for col, index in self.faiss_indices.items():
                faiss.write_index(index, f"{path}_{col}.index")
        if self.annoy_indices:
            for col, index in self.annoy_indices.items():
                index.save(f"{path}_{col}.ann")

    def load(self, path: str):
        """Load whichever index files exist for the given path prefix.

        ``save`` only writes the engines that were built, so a missing file
        for one engine is not an error here.

        Raises:
            FileNotFoundError: If no index file at all is found.
        """
        import os

        for col in self.search_columns:
            faiss_file = f"{path}_{col}.index"
            if os.path.exists(faiss_file):
                self.faiss_indices[col] = faiss.read_index(faiss_file)

            annoy_file = f"{path}_{col}.ann"
            if os.path.exists(annoy_file):
                index = AnnoyIndex(self.dimmensions, 'angular')
                index.load(annoy_file)
                self.annoy_indices[col] = index

        if not self.faiss_indices and not self.annoy_indices:
            raise FileNotFoundError(f"No FAISS or Annoy index files found for prefix {path!r}")

    def search(self, query: str, k: int = 5, limit: int = 10, engine: Optional[str] = 'both', category: Optional[str] = None, not_older_than: Optional[datetime] = None):
        """Embed ``query`` and return the nearest rows from the selected engine(s).

        Args:
            query: Free-text query.
            k: Number of neighbours requested per column and engine.
            limit: Maximum number of results returned.
            engine: ``"faiss"``, ``"annoy"`` or ``"both"``; ``None`` means ``"both"``.
            category: A category name, or an iterable of names; only rows whose
                category is exactly one of them are kept.
            not_older_than: When given, rows with an earlier ``date`` are dropped.

        Returns:
            A list of result dicts sorted by descending ``score``. FAISS hits
            are scored ``1 / (1 + distance)`` and Annoy hits by the cosine
            similarity recovered from the angular distance, so with ``"both"``
            the two score scales are mixed and a row can appear more than once.

        Raises:
            ValueError: If ``engine`` is not one of the supported values.
        """
        if engine is None:
            engine = 'both'
        if engine not in ('faiss', 'annoy', 'both'):
            raise ValueError('No such engine')

        query_embedding = self.model.encode(query)
        matches = []

        for col in self.search_columns:
            if engine in ("faiss", "both") and col in self.faiss_indices:
                distances, indices = self.faiss_indices[col].search(query_embedding.reshape(1, -1), k)
                for idx, distance in zip(indices[0], distances[0]):
                    # FAISS pads with label -1 when fewer than k vectors exist;
                    # iloc[-1] would silently return the last row instead.
                    if idx < 0:
                        continue
                    matches.append({'id': int(idx), 'score': 1 / (1 + distance), 'matched_field': col})

            if engine in ("annoy", "both") and col in self.annoy_indices:
                indices, distances = self.annoy_indices[col].get_nns_by_vector(query_embedding, k, include_distances=True)
                # Annoy's angular distance is sqrt(2 * (1 - cos)); invert it to get cos.
                for idx, dist in zip(indices, distances):
                    matches.append({'id': idx, 'score': (2 - (dist**2)) / 2, 'matched_field': col})

        # Exact category matching: a plain ``in`` test against a string would
        # be a substring test ('ARTS' in 'ARTS & CULTURE').
        allowed_categories = None
        if category:
            allowed_categories = {category} if isinstance(category, str) else set(category)

        filtered_results = []
        for m in matches:
            row = self.data.iloc[m['id']]
            if allowed_categories is not None and row['category'] not in allowed_categories:
                continue
            if not_older_than and not (row['date'] >= not_older_than):
                continue
            filtered_results.append({
                'link': row['link'],
                'headline': row['headline'],
                'category': row['category'],
                'short_description': row['short_description'],
                'authors': row['authors'],
                'date': row['date'],
                'score': m['score'],
                'matched_fields': m['matched_field']
            })

        return sorted(filtered_results, key=itemgetter('score'), reverse=True)[:limit]


In [86]:
semantic = SemanticSearch(df)

In [87]:
semantic.build('faiss', embed_path=path)

In [88]:
semantic.build('annoy', embed_path=path)

In [89]:
semantic.save(path)

In [90]:
semantic = SemanticSearch(df)
semantic.load(path)

In [91]:
semantic.search("Woman Who Called Cops On Black Bird-Watcher", limit=5, engine='faiss')

[{'link': 'https://www.huffpost.com/entry/amy-cooper-loses-discrimination-lawsuit-franklin-templeton_n_632c6463e4b09d8701bd227e',
  'headline': 'Woman Who Called Cops On Black Bird-Watcher Loses Lawsuit Against Ex-Employer',
  'category': 'U.S. NEWS',
  'short_description': 'Amy Cooper accused investment firm Franklin Templeton of unfairly firing her and branding her a racist after video of the Central Park encounter went viral.',
  'authors': 'Nina Golgowski',
  'date': Timestamp('2022-09-22 00:00:00'),
  'score': 0.6302691335141306,
  'matched_fields': 'headline'},
 {'link': 'https://www.huffingtonpost.com/entry/gwyneth-paltrow-tattoo-photos-pictures_us_5b9cbcc3e4b03a1dcc815bea',
  'headline': 'Gwyneth Paltrow Tattoo? Star Shows Off New Body Art PHOTOS',
  'category': 'STYLE & BEAUTY',
  'short_description': 'Did the actress really put a bird on it?',
  'authors': 'Rebecca Adams',
  'date': Timestamp('2013-01-20 00:00:00'),
  'score': 0.5292071457974201,
  'matched_fields': 'short_de

In [92]:
semantic.search("Woman Who Called Cops On Black Bird-Watcher", limit=5, engine='annoy')

[{'link': 'https://www.huffingtonpost.com/entry/issa-rae-insecure-black-tv-shows_us_57f2b38ee4b0c2407cdf3d96',
  'headline': "How Issa Rae's 'Insecure' Validates, Expands On The Black Narrative",
  'category': 'BLACK VOICES',
  'short_description': 'She took cues from the best of black TV and weaved them into the show.',
  'authors': 'Rahel Gebreyes',
  'date': Timestamp('2016-10-06 00:00:00'),
  'score': 0.49684624837342994,
  'matched_fields': 'short_description'},
 {'link': 'https://www.huffingtonpost.com/entry/lady-bird-trailer-but-every-line-is-screamed_us_5a6c904be4b0ddb658c6cfc2',
  'headline': "So Here's The 'Lady Bird' Trailer Except Every Line Is Screamed",
  'category': 'ENTERTAINMENT',
  'short_description': "You're allowed to enjoy this -- it's the weekend.",
  'authors': 'Andy McDonald',
  'date': Timestamp('2018-01-27 00:00:00'),
  'score': 0.4640134996075176,
  'matched_fields': 'headline'},
 {'link': 'https://www.huffingtonpost.com/entry/deirdre-orozco-road-rage_n_6382

## Hybrid search

In [244]:
def hybrid_search(query, k=5, alpha: float = 0.5, limit: int = 10, engine='faiss', category=None, not_older_than=None):
    """Combine lexical (BM25) and semantic results into one ranked list.

    α = 1 - only semantic search
    α = 0 - only lexical search
    0 < α < 1 - lexical and semantic search

    Lexical scores are weighted by ``1 - alpha`` and semantic scores by
    ``alpha``. When the same link is returned more than once, only its
    best-weighted entry is kept (the scores are not summed).

    Args:
        query: Free-text query.
        k: Candidates requested per column from each searcher.
        alpha: Weight of the semantic side, between 0 and 1 inclusive.
        limit: Maximum number of results returned.
        engine: Semantic engine forwarded to ``semantic.search``.
        category: Category filter forwarded to both searchers.
        not_older_than: Date filter forwarded to both searchers.

    Returns:
        Result dicts extended with ``adjusted_score``, ``type``
        (``"lexical"`` or ``"semantic"``) and a 1-based ``original_rank``.

    Raises:
        AssertionError: If ``alpha`` is outside ``[0, 1]``.
    """
    assert 0 <= alpha <= 1, "Alpha must be between 0 and 1."

    # Never request fewer candidates than the searchers' default of 10, so
    # de-duplication below still has spare entries to fill ``limit`` with.
    candidate_limit = max(limit, 10)
    shared = dict(k=k, limit=candidate_limit, category=category, not_older_than=not_older_than)

    combined_results = []
    if alpha < 1:
        for item in lexical.search(query, **shared):
            combined_results.append({**item, "adjusted_score": item["score"] * (1 - alpha), "type": "lexical"})
    if alpha > 0:
        # ``engine`` is forwarded in every mode, including the mixed one.
        for item in semantic.search(query, engine=engine, **shared):
            combined_results.append({**item, "adjusted_score": item["score"] * alpha, "type": "semantic"})

    seen_links = set()
    final_results = []
    # sorted() is stable, so on ties lexical entries stay ahead of semantic ones.
    for item in sorted(combined_results, key=lambda x: x["adjusted_score"], reverse=True):
        if item["link"] not in seen_links:
            seen_links.add(item["link"])
            final_results.append(item)

    for rank, item in enumerate(final_results, start=1):
        item["original_rank"] = rank

    return final_results[:limit]



In [245]:
hybrid_search("Woman Who Called Cops On Black Bird-Watcher", alpha=0.67)

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

[{'link': 'https://www.huffpost.com/entry/amy-cooper-loses-discrimination-lawsuit-franklin-templeton_n_632c6463e4b09d8701bd227e',
  'headline': 'Woman Who Called Cops On Black Bird-Watcher Loses Lawsuit Against Ex-Employer',
  'category': 'U.S. NEWS',
  'short_description': 'Amy Cooper accused investment firm Franklin Templeton of unfairly firing her and branding her a racist after video of the Central Park encounter went viral.',
  'authors': 'Nina Golgowski',
  'date': Timestamp('2022-09-22 00:00:00'),
  'score': 0.6302691335141306,
  'matched_fields': 'headline',
  'adjusted_score': 0.42228031945446753,
  'type': 'semantic',
  'original_rank': 1},
 {'link': 'https://www.huffingtonpost.com/entry/gwyneth-paltrow-tattoo-photos-pictures_us_5b9cbcc3e4b03a1dcc815bea',
  'headline': 'Gwyneth Paltrow Tattoo? Star Shows Off New Body Art PHOTOS',
  'category': 'STYLE & BEAUTY',
  'short_description': 'Did the actress really put a bird on it?',
  'authors': 'Rebecca Adams',
  'date': Timestamp

## Reranking

In [239]:
from sentence_transformers import CrossEncoder

rerank_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

In [246]:
def reranking(query, k=5, alpha: float = 0.5, limit: int = 10, engine=None, category=None, not_older_than=None,
              rerank_model=None):
    """Run ``hybrid_search`` and reorder its results with a cross-encoder.

    Each result is scored against the query using the text of the field it
    matched on (``matched_fields``).

    Args:
        query: Free-text query.
        k: Candidates requested per column from each searcher.
        alpha: Semantic weight forwarded to ``hybrid_search``.
        limit: Maximum number of results retrieved and reranked.
        engine: Semantic engine; ``None`` leaves the choice to
            ``hybrid_search``'s own default.
        category: Category filter forwarded to ``hybrid_search``.
        not_older_than: Date filter forwarded to ``hybrid_search``.
        rerank_model: Object with a ``rank(query, passages)`` method. When
            omitted, the notebook-level ``rerank_model`` is reused if it
            exists; otherwise the MS MARCO MiniLM cross-encoder is loaded
            once and stored under that name.

    Returns:
        The hybrid results in cross-encoder order, each extended with
        ``new_rank`` (1-based) and ``reranked_score``. An empty list when the
        search found nothing.
    """
    search_kwargs = dict(query=query, alpha=alpha, k=k, limit=limit, category=category, not_older_than=not_older_than)
    if engine is not None:
        search_kwargs['engine'] = engine
    results = hybrid_search(**search_kwargs)
    if not results:
        return []

    if rerank_model is None:
        # Resolve the default lazily: a CrossEncoder(...) default argument is
        # built when the function is defined and loads a second model copy.
        rerank_model = globals().get('rerank_model')
        if rerank_model is None:
            rerank_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
            globals()['rerank_model'] = rerank_model

    passages = [r[r.get('matched_fields') or 'headline'] for r in results]

    # rank() returns entries ordered best-first; corpus_id is the position of
    # the passage (and therefore of the result) in the list that was passed in.
    ranks = rerank_model.rank(query, passages)

    reranked_results = []
    for new_rank, ranked in enumerate(ranks, start=1):
        item = results[ranked['corpus_id']]
        item['new_rank'] = new_rank
        item['reranked_score'] = ranked['score']
        reranked_results.append(item)

    return reranked_results


In [253]:
reranking("Woman Who Called Cops On Black Bird-Watcher", alpha=0.75)

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

[{'link': 'https://www.huffpost.com/entry/amy-cooper-loses-discrimination-lawsuit-franklin-templeton_n_632c6463e4b09d8701bd227e',
  'headline': 'Woman Who Called Cops On Black Bird-Watcher Loses Lawsuit Against Ex-Employer',
  'category': 'U.S. NEWS',
  'short_description': 'Amy Cooper accused investment firm Franklin Templeton of unfairly firing her and branding her a racist after video of the Central Park encounter went viral.',
  'authors': 'Nina Golgowski',
  'date': Timestamp('2022-09-22 00:00:00'),
  'score': 0.6302691335141306,
  'matched_fields': 'headline',
  'adjusted_score': 0.47270185013559796,
  'type': 'semantic',
  'original_rank': 1,
  'new_rank': 1,
  'reranked_score': 8.602636},
 {'link': 'https://www.huffpost.com/entry/trevor-noah-second-amendment-is-not-intended-for-black-people_n_5bfe1f02e4b0f43bf2662986',
  'headline': "Trevor Noah 'The Second Amendment Is Not Intended For Black People'",
  'category': 'COMEDY',
  'short_description': 'Cops are called into a situa