In [None]:
# Initialization

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sentence_transformers import util
import pandas as pd

import pysrt

import ebooklib
from ebooklib import epub, utils
import re
from bs4 import BeautifulSoup

# Single checkpoint identifier shared by the tokenizer and the encoder, so the
# two are always loaded from the same place.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

# Module-level objects used by get_embeddings().
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions kept by the attention mask.

    Args:
        model_output: Indexable model result; element 0 is taken as the
            per-token embeddings tensor.
        attention_mask: Mask tensor. It gets a trailing axis and is expanded to
            the size of the token embeddings, so positions with a 0 mask add
            nothing to the sum.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total over
        the same dimension.
    """
    hidden_states = model_output[0]
    # Expand the mask across the feature axis so it can weight every component.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * weights, 1)
    # Clamp the denominator so a fully masked row does not divide by zero.
    mask_total = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_sum / mask_total

def get_embeddings(sentences):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    Args:
        sentences: Input passed straight to the tokenizer.

    Returns:
        The mean-pooled model output, L2-normalized along dimension 1.

    Note:
        The tokenizer is called with ``truncation=True``, so over-long input is
        cut down rather than rejected.
    """
    batch = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**batch)
    pooled = mean_pooling(outputs, batch['attention_mask'])
    return F.normalize(pooled, p=2, dim=1)

def get_epub_df(path):
    """Read an EPUB and return one row per document item with its embedding.

    Args:
        path: Location of the EPUB file, passed to ``epub.read_epub``.

    Returns:
        A DataFrame with a ``text`` column (the plain text of each
        ``ITEM_DOCUMENT`` item, markup stripped) and an ``embedding`` column
        (the result of ``get_embeddings`` for that text).

    Note:
        ``get_embeddings`` tokenizes with ``truncation=True``, so for a long
        document only the leading part of the text contributes to its
        embedding.
    """
    book = epub.read_epub(path)
    chapters = []
    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        # errors="replace" keeps one mis-encoded document from aborting the
        # whole book; valid UTF-8 content decodes exactly as before.
        markup = item.get_content().decode("utf-8", errors="replace")
        chapters.append(BeautifulSoup(markup, "html.parser").get_text())
    df = pd.DataFrame(chapters, columns=["text"])
    df['embedding'] = df['text'].apply(get_embeddings)
    return df

def search_embeddings(term, df):
    """Rank the rows of ``df`` by cosine similarity to a search term.

    Args:
        term: Query text; embedded once with ``get_embeddings``.
        df: DataFrame with an ``embedding`` column holding one embedding per
            row. It is not modified.

    Returns:
        A new DataFrame with an added ``relevance`` column of plain floats,
        sorted from most to least relevant.
    """
    query_embedding = get_embeddings(term)
    # util.cos_sim returns a tensor; .item() extracts the single score as a
    # Python float so sorting compares numbers rather than tensor objects.
    # (.item() requires exactly one value, i.e. one query vs. one row vector.)
    relevance = df["embedding"].apply(
        lambda embedding: util.cos_sim(embedding, query_embedding).item()
    )
    # assign() works on a copy, leaving the caller's frame untouched.
    ranked = df.assign(relevance=relevance)
    return ranked.sort_values(by=["relevance"], ascending=False)

In [None]:
# Index EPUBs in Directory (Recursively)

import glob, os

# Root of the e-book library to index. Set the EPUB_LIBRARY_DIR environment
# variable to point at a different library without editing the notebook; the
# original location is kept as the default (also used when the variable is
# empty). Joining with "" guarantees a trailing separator, so the value can be
# concatenated directly with a glob pattern.
path = os.path.join(
    os.environ.get("EPUB_LIBRARY_DIR") or "/run/media/c/Slem500Ext/CalibreLibrary/",
    "",
)

def get_book_name(path):
    """Return the part of ``path`` after its last "/".

    The whole string is returned when it contains no "/", and an empty string
    when it ends with "/".
    """
    # Only the final separator matters, so split once from the right.
    return path.rsplit("/", 1)[-1]

# Column order of the combined index: title, text, embedding, path.
index_columns = ["title", "text", "embedding", "path"]

# Collect one frame per book and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
book_frames = []

# recursive=True is required for "**" to match any number of nested
# directories; without it "**" behaves like a single "*".
for p in glob.glob(os.path.join(path, "**", "*.epub"), recursive=True):

    book_name = get_book_name(p)
    print(f"Found: '{book_name}'. Getting Embeddings...")

    book_df = get_epub_df(p)

    book_df["title"] = book_name
    book_df["path"] = p

    book_frames.append(book_df)

if book_frames:
    # ignore_index=True gives every chapter a unique row label instead of
    # restarting at 0 for each book.
    dfs = pd.concat(book_frames, axis=0, ignore_index=True)[index_columns]
else:
    # No EPUBs found: keep an empty frame with the expected columns.
    dfs = pd.DataFrame(columns=index_columns)
    



In [None]:
# Perform a Vector Search on the Results

query = "how do i test a function?"

# Number of top-ranked chapters to print.
max_results = 30

df = search_embeddings(query, dfs)

print(f"Top Results for Query: '{query}'")

top_results = df.head(max_results)

# Read the columns by name so the output does not depend on column order.
for title, text in zip(top_results["title"], top_results["text"]):
    # Preview: first 80 characters of the chapter, collapsed onto one line.
    preview = " ".join(line.strip() for line in text[:80].splitlines()).strip()
    print()
    print(f'Book: {title}')
    print()
    print(f'Chapter: "{preview}..."')
    print()
    print("--- --- ---")