## Naive Reverse Index


In [1]:
from collections import Counter
from contextlib import closing
from dataclasses import dataclass, field
from nltk.corpus import stopwords
from typing import List, Set
import itertools
import json
import nltk
import pandas as pd
import requests
import sqlite3
import string
import uuid

In [2]:
# https://www.nltk.org/api/nltk.tokenize.punkt.html
nltk.download("punkt")
# https://www.nltk.org/howto/corpus.html?highlight=stopwords#word-lists-and-lexicons
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /Users/j/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/j/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
@dataclass(frozen=True)
class Document:
    id: str = field(repr=False)
    url: str
    blob: bytes = field(repr=False, hash=False)
    score: float = field(default=0.0)


class Crawler:
    """Crawl and index documents into (id, url, blob)"""

    def __init__(self, db_path):
        self.db_path = db_path

        create_sql = """
            CREATE TABLE IF NOT EXISTS docs
                (id TEXT PRIMARY KEY, url TEXT UNIQUE, blob BLOB)"""

        with closing(sqlite3.connect(self.db_path)) as conn:
            cursor = conn.cursor()
            cursor.execute(create_sql)

    def __call__(self, url) -> Document:
        insert_sql = "INSERT INTO docs (id, url, blob) VALUES (?, ?, ?)"
        check_sql = "SELECT id, url, blob FROM docs WHERE url = ?"
        blob = requests.get(url, stream=True).content

        with closing(sqlite3.connect(self.db_path)) as conn:
            cursor = conn.cursor()

            exists = cursor.execute(check_sql, (url,)).fetchone()
            if exists:
                (id, url, blob) = exists
                return Document(id, url, blob)
            else:
                doc = Document(str(uuid.uuid4()), url, blob)
                cursor.execute(insert_sql, (doc.id, url, blob))
                conn.commit()
                return doc

In [4]:
urls = [
    "https://blog.jabid.in",
    "https://blog.jabid.in/2020/02/29/track.html",
    "https://blog.jabid.in/2019/10/25/why.html",
    "https://blog.jabid.in/2019/01/13/monzo.html",
    "https://blog.jabid.in/2019/05/10/amsterdam.html",
]

crawler = Crawler("search.db")
docs = [crawler(url) for url in urls]

In [5]:
def tokenize(doc) -> Set[str]:
    """Tokenize a document to a list of words."""

    text = str(doc.blob) if isinstance(doc, Document) else str(doc)
    exclude = set(string.punctuation) - set(stopwords.words("english"))
    tokens = set(nltk.word_tokenize(text)) - exclude
    return {token.lower() for token in tokens}

In [6]:
class Indexer:
    """Build a reverse index from token => [documents]"""

    def __init__(self, db_path):
        self.db_path = db_path

        drop_sql = "DROP TABLE reverse_index;"
        create_sql = "CREATE TABLE IF NOT EXISTS reverse_index \
                (token TEXT PRIMARY KEY, documents JSON)"

        with closing(sqlite3.connect(self.db_path)) as conn:
            conn.execute("PRAGMA foreign_keys = ON")
            conn.execute(drop_sql)
            conn.execute(create_sql)
            conn.commit()

    def __call__(self, doc: Document):
        tokens = tokenize(doc)

        select_sql = "SELECT documents FROM reverse_index WHERE token = ?"
        insert_sql = "REPLACE INTO reverse_index (token, documents) VALUES (?, json(?))"

        with closing(sqlite3.connect(self.db_path)) as conn:
            for token in tokens:
                result = conn.execute(select_sql, (token,)).fetchone()
                documents = set(json.loads(result[0]) if result else [])
                documents.add(doc.id)
                conn.execute(insert_sql, (token, json.dumps(list(documents))))

            conn.commit()

In [7]:
indexer = Indexer("search.db")
[indexer(doc) for doc in docs]

[None, None, None, None, None]

In [8]:
class Search:
    def __init__(self, db_path):
        self.db = sqlite3.connect(f"file:{db_path}?mode=ro")

    def fetch_one(self, id: str) -> Document:
        sql = "SELECT id, url, blob FROM docs WHERE id = ?;"
        if exists := self.db.execute(sql, (id,)).fetchone():
            (id, url, blob) = exists
            return Document(id, url, blob)
        else:
            return None

    def fetch_many(self, ids: List[str] | Set[str]) -> List[Document]:
        return [self.fetch_one(id) for id in ids]

    def candidates(self, tokens: List[str]):
        "Candidates for the query from reverse index"

        def one(token: str):
            reverse_index_sql = "select token, documents from reverse_index where token = ?"
            row = self.db.execute(reverse_index_sql, (token,)).fetchone()
            ids = set(json.loads(row[1]) if row else [])
            return self.fetch_many(ids)

        # For each token, find all candidates and merge them.
        return list(itertools.chain(*[one(token) for token in tokens if token]))

    def rank_word_frequency(self, tokens, candidates: List[Document]):
        "Candidates with tokens repeated most often"

        df = pd.DataFrame(candidates)
        freqs = df.apply(lambda x: Counter(tokenize(x.blob)), axis="columns")
        scores = freqs.apply(lambda counter: sum(counter[token] for token in tokens))
        df["score"] = scores / scores.sum()
        topk = df[df["score"] > 0].sort_values(by="score", ascending=False)
        return topk

    def __call__(self, query):
        tokens = tokenize(query)
        candidates = self.candidates(tokens)
        return self.rank_word_frequency(tokens, candidates)


search = Search("search.db")
search("monzo").drop(columns=["id", "blob"])

Unnamed: 0,url,score
0,https://blog.jabid.in/2019/01/13/monzo.html,0.5
1,https://blog.jabid.in,0.5


## Ranking with 🤗 Transformers


In [9]:
from transformers import pipeline

In [10]:
# Sentiment Analysis
# 1. Fast enough, but not what I want for search
# 2. Should definitely hook this up with my Obsidian notes


classifier = pipeline(
    "sentiment-analysis", model="distilbert/distilbert-base-uncased-finetuned-sst-2-english", revision="af0f99b"
)

classifier("I've been waiting for a HuggingFace course my whole life.")

[{'label': 'POSITIVE', 'score': 0.9598053097724915}]

In [11]:
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", revision="c626438")

classifier(
    "This is a course about the Transformers library",
    candidate_labels=["education", "politics", "business"],
)

{'sequence': 'This is a course about the Transformers library',
 'labels': ['education', 'business', 'politics'],
 'scores': [0.8445975184440613, 0.11197522282600403, 0.043427180498838425]}

In [12]:
classifier(
    str(docs[1].blob),
    candidate_labels=["amsterdam", "netherlands", "travel", "tourism", "education", "politics", "business"],
)

{'sequence': 'b\'<!DOCTYPE html>\\n<html>\\n  <head>\\n  <meta charset="utf-8">\\n  <meta http-equiv="X-UA-Compatible" content="chrome=1">\\n  <meta name="viewport" content="width=device-width, initial-scale=1">\\n\\n  <title>This blog doesn&#39;t track you anymore!</title>\\n  <meta name="description" content="I avoid 3rd party tracking on the web as much as I can with several ad blockersbut then forcing it on the readers of this blog felt very hypocritical.">\\n\\n  <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.8.2/css/all.css" integrity="sha384-oS3vJWv+0UjzBfQzYUhtDYW+Pj2yciDJxpsK1OYPAYjqT085Qq/1cq5FLXAZQ7Ay" crossorigin="anonymous">\\n\\n  <link rel="stylesheet" href="/css/main.css">\\n  <link rel="canonical" href="https://blog.jabid.in/2020/02/29/track.html">\\n  <link rel="alternate" type="application/rss+xml" title="Jaseem Abid" href="https://blog.jabid.in/feed.xml">\\n</head>\\n\\n\\n  <body>\\n\\n    <div class="wrapper">\\n      <header>\\n\\n  <a href=