## Naive Reverse Index


In [1]:
from contextlib import closing
from dataclasses import dataclass, field
from nltk.corpus import stopwords
from typing import List, Set
import json
import nltk
import requests
import itertools
import sqlite3
import uuid

In [2]:
# Tokenizer data for nltk.word_tokenize, which Tokenizer calls further down.
# NOTE(review): newer NLTK releases may look for "punkt_tab" instead — confirm
# against the installed version.
# https://www.nltk.org/api/nltk.tokenize.punkt.html
nltk.download("punkt")
# Stopword lists; Search removes the English ones from each query.
# https://www.nltk.org/howto/corpus.html?highlight=stopwords#word-lists-and-lexicons
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /Users/j/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/j/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
@dataclass
class Document:
    """A crawled page, mirroring one row of the `docs` table."""

    id: str  # unique identifier; Crawler assigns a uuid4 string
    url: str  # address the page was requested from
    blob: bytes = field(repr=False)  # raw response body; excluded from repr()


class Crawler:
    """Fetch pages over HTTP and cache them in the `docs` table as (id, url, blob).

    Calling the instance with a URL returns the stored Document when that URL
    has been crawled before; otherwise the page is downloaded, stored under a
    fresh uuid4 id and returned.
    """

    def __init__(self, db_path, timeout: float = 30.0):
        """Create the `docs` table in the SQLite database at *db_path* if missing.

        Args:
            db_path: Path of the SQLite database file.
            timeout: Seconds to wait on the server before a download fails.
        """
        self.db_path = db_path
        self.timeout = timeout

        create_sql = """
            CREATE TABLE IF NOT EXISTS docs
                (id TEXT PRIMARY KEY, url TEXT UNIQUE, blob BLOB)"""

        with closing(sqlite3.connect(self.db_path)) as conn:
            conn.execute(create_sql)
            # Commit explicitly so the table persists however the connection
            # treats transactions around DDL (and to match Indexer).
            conn.commit()

    def __call__(self, url) -> Document:
        """Return the Document for *url*, downloading it only on a cache miss.

        Args:
            url: Address of the page to crawl.

        Returns:
            The cached Document if *url* is already stored, else a newly
            downloaded and stored one.

        Raises:
            requests.RequestException: If the download fails, times out, or
                the server answers with an HTTP error status.
        """
        select_sql = "SELECT id, url, blob FROM docs WHERE url = ?"
        insert_sql = "INSERT INTO docs (id, url, blob) VALUES (?, ?, ?)"

        with closing(sqlite3.connect(self.db_path)) as conn:
            row = conn.execute(select_sql, (url,)).fetchone()
            if row:
                return Document(*row)

            # Cache miss: only now pay for the network round trip.
            response = requests.get(url, timeout=self.timeout)
            # Never cache an error page: url is UNIQUE, so a stored 404/500
            # body would be served for this URL on every later call.
            response.raise_for_status()

            doc = Document(str(uuid.uuid4()), url, response.content)
            conn.execute(insert_sql, (doc.id, doc.url, doc.blob))
            conn.commit()
            return doc

In [4]:
# Seed pages: the blog root plus four individual posts.
urls = [
    "https://blog.jabid.in",
    "https://blog.jabid.in/2020/02/29/track.html",
    "https://blog.jabid.in/2019/10/25/why.html",
    "https://blog.jabid.in/2019/01/13/monzo.html",
    "https://blog.jabid.in/2019/05/10/amsterdam.html",
]

# Crawl each URL into search.db; URLs already stored come back from the table.
crawler = Crawler("search.db")
docs = [crawler(url) for url in urls]
# Trailing expression so the notebook displays the Document list.
docs

[Document(id='ac9f8a49-4c70-4e0e-97cd-b0785f2a5306', url='https://blog.jabid.in'),
 Document(id='ae076f02-9e6a-4610-8d63-dc8438a666b5', url='https://blog.jabid.in/2020/02/29/track.html'),
 Document(id='c1753123-e154-423e-adfd-bb36f6b633ba', url='https://blog.jabid.in/2019/10/25/why.html'),
 Document(id='e9637117-77eb-49e7-881c-fbf797e9daa8', url='https://blog.jabid.in/2019/01/13/monzo.html'),
 Document(id='794c26f5-7a8e-412d-9f49-6064a1689b4e', url='https://blog.jabid.in/2019/05/10/amsterdam.html')]

In [5]:
class Tokenizer:
    """Split a document or plain text into its set of distinct word tokens."""

    def __call__(self, doc) -> Set[str]:
        """Return the distinct NLTK word tokens found in *doc*.

        Args:
            doc: A Document (its blob is tokenized), raw bytes, or any other
                object whose str() form is the text, e.g. a query string.

        Returns:
            The set of tokens; duplicates and ordering are discarded.
        """
        payload = doc.blob if isinstance(doc, Document) else doc

        if isinstance(payload, (bytes, bytearray)):
            # str() on bytes yields the repr ("b'...'" with escaped newlines
            # and \x.. sequences), which pollutes the tokens. Decode instead;
            # assumes UTF-8 and replaces anything undecodable.
            text = payload.decode("utf-8", errors="replace")
        else:
            text = str(payload)

        return set(nltk.word_tokenize(text))

In [6]:
class Indexer:
    """Maintain the `reverse_index` table mapping each token to document ids.

    The ids for a token are stored as a JSON array in the `documents` column.
    """

    def __init__(self, db_path, tokenizer: Tokenizer):
        """Remember the database path and tokenizer, and ensure the table exists."""
        self.tokenizer = tokenizer
        self.db_path = db_path

        schema = """
            CREATE TABLE IF NOT EXISTS reverse_index
                (token TEXT PRIMARY KEY, documents JSON)"""

        connection = sqlite3.connect(self.db_path)
        with closing(connection):
            for statement in ("PRAGMA foreign_keys = ON", schema):
                connection.execute(statement)
            connection.commit()

    def __call__(self, doc: Document):
        """Add *doc*'s id to the entry of every token the tokenizer finds in it."""
        lookup = "SELECT documents FROM reverse_index WHERE token = ?"
        store = "REPLACE INTO reverse_index (token, documents) VALUES (?, json(?))"

        words = self.tokenizer(doc)

        connection = sqlite3.connect(self.db_path)
        with closing(connection):
            for word in words:
                row = connection.execute(lookup, (word,)).fetchone()
                # Start from the ids already recorded for this token, if any.
                if row:
                    doc_ids = set(json.loads(row[0]))
                else:
                    doc_ids = set([])
                doc_ids.add(doc.id)
                payload = json.dumps(list(doc_ids))
                connection.execute(store, (word, payload))

            # One commit for the whole document.
            connection.commit()

In [7]:
# Add every crawled document to the reverse index. Indexing is a side effect
# with no return value, so a plain loop is used and the cell shows no output.
indexer = Indexer("search.db", Tokenizer())
for doc in docs:
    indexer(doc)

In [8]:
@dataclass
class Result(Document):
    """A Document extended with a score field."""

    score: float  # score for this document; nothing in the visible code sets it yet


class Search:
    """Answer queries from the `reverse_index` and `docs` tables.

    The database is opened read-only. Call close() when done, or use the
    instance as a context manager, to release the connection.
    """

    def __init__(self, db_path, tokenizer: Tokenizer):
        """Open *db_path* read-only and keep *tokenizer* for splitting queries."""
        # uri=True makes sqlite3 interpret the name as a URI. Without it,
        # "?mode=ro" is not guaranteed to be honoured and the whole string may
        # be taken as a literal file name.
        self.db = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
        self.tokenizer = tokenizer

    def close(self) -> None:
        """Close the underlying database connection."""
        self.db.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def fetch_one(self, id: str) -> Document | None:
        """Return the Document stored under *id*, or None if there is none."""
        sql = "SELECT id, url, blob FROM docs WHERE id = ?;"
        row = self.db.execute(sql, (id,)).fetchone()
        return Document(*row) if row else None

    def fetch_many(self, ids: List[str] | Set[str]) -> List[Document]:
        """Return the Documents for *ids*, skipping ids with no stored row."""
        found = (self.fetch_one(doc_id) for doc_id in ids)
        return [doc for doc in found if doc is not None]

    def candidates(self, query: str):
        """Return an iterator over the documents matching any query token.

        English stopwords are removed from the query first. Each document is
        yielded once, even when several query tokens point at it.
        """
        tokens = set(self.tokenizer(query)) - set(stopwords.words("english"))
        reverse_index_sql = "select token, documents from reverse_index where token = ?"

        # A dict serves as an insertion-ordered set of ids. Tokens are visited
        # in sorted order so the result order does not depend on set hashing.
        doc_ids = {}
        for token in sorted(tokens):
            if not token:
                continue
            row = self.db.execute(reverse_index_sql, (token,)).fetchone()
            if row:
                doc_ids.update(dict.fromkeys(json.loads(row[1])))

        return iter(self.fetch_many(list(doc_ids)))

    def __call__(self, query):
        """Run *query* and return the matching documents as a list."""
        return list(self.candidates(query))

In [9]:
# Look up a single-term query in the index built above.
search = Search("search.db", Tokenizer())
query_results = search("amsterdam")
# Trailing expression so the notebook displays the matching documents.
query_results

[Document(id='ac9f8a49-4c70-4e0e-97cd-b0785f2a5306', url='https://blog.jabid.in'),
 Document(id='794c26f5-7a8e-412d-9f49-6064a1689b4e', url='https://blog.jabid.in/2019/05/10/amsterdam.html')]

## Ranking with 🤗 Transformers


In [10]:
from transformers import pipeline

In [11]:
# Sentiment analysis pipeline.
# 1. Fast enough, but not what I want for search.
# 2. Should definitely hook this up with my Obsidian notes.


# Model and revision are passed explicitly rather than left to the pipeline's defaults.
classifier = pipeline(
    "sentiment-analysis", model="distilbert/distilbert-base-uncased-finetuned-sst-2-english", revision="af0f99b"
)

# Smoke test on one sentence; the result is a list with a label and a score.
classifier("I've been waiting for a HuggingFace course my whole life.")

[{'label': 'POSITIVE', 'score': 0.9598053097724915}]

In [12]:
# Zero-shot classification: the text is scored against labels given at call time.
# NOTE: this rebinds `classifier`, replacing the sentiment pipeline defined earlier.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", revision="c626438")

# Smoke test; the result lists the labels ordered by descending score.
classifier(
    "This is a course about the Transformers library",
    candidate_labels=["education", "politics", "business"],
)

{'sequence': 'This is a course about the Transformers library',
 'labels': ['education', 'business', 'politics'],
 'scores': [0.8445975184440613, 0.11197522282600403, 0.043427180498838425]}

In [13]:
# Score the second search hit's page against topic labels.
# NOTE(review): str() on a bytes blob produces its repr (b'...' with escaped
# newlines and \x.. bytes), as the echoed 'sequence' in the output shows, and
# the input is unparsed HTML. Decoding the bytes and stripping the markup
# would presumably give the model cleaner text; confirm before relying on
# these scores.
# NOTE(review): index 1 relies on the order of the search results; verify it
# still selects the intended page.
classifier(
    str(query_results[1].blob),
    candidate_labels=["amsterdam", "netherlands", "travel", "tourism", "education", "politics", "business"],
)

{'sequence': 'b\'<!DOCTYPE html>\\n<html>\\n  <head>\\n  <meta charset="utf-8">\\n  <meta http-equiv="X-UA-Compatible" content="chrome=1">\\n  <meta name="viewport" content="width=device-width, initial-scale=1">\\n\\n  <title>Amsterdam, A history of the world&#39;s most liberal city \\xf0\\x9f\\x93\\x9a</title>\\n  <meta name="description" content="I finished reading \\xe2\\x80\\x9cAmsterdam, A history of the world\\xe2\\x80\\x99s most liberalcity\\xe2\\x80\\x9d last week and absolutely loved it. 5/5 \\xf0\\x9f\\x8c\\x9f">\\n\\n  <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.8.2/css/all.css" integrity="sha384-oS3vJWv+0UjzBfQzYUhtDYW+Pj2yciDJxpsK1OYPAYjqT085Qq/1cq5FLXAZQ7Ay" crossorigin="anonymous">\\n\\n  <link rel="stylesheet" href="/css/main.css">\\n  <link rel="canonical" href="https://blog.jabid.in/2019/05/10/amsterdam.html">\\n  <link rel="alternate" type="application/rss+xml" title="Jaseem Abid" href="https://blog.jabid.in/feed.xml">\\n</head>\\n\\n\\n  <bo