<a href="https://colab.research.google.com/github/jasmeet0817/booklm/blob/main/finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [None]:
!pip install EbookLib
!pip install -U sentence-transformers
!pip install llama-index --upgrade

Collecting EbookLib
  Downloading EbookLib-0.18.tar.gz (115 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/115.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.5/115.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: EbookLib
  Building wheel for EbookLib (setup.py) ... [?25l[?25hdone
  Created wheel for EbookLib: filename=EbookLib-0.18-py3-none-any.whl size=38778 sha256=2dd8f7e2fb5c21818a736d47d6865ab5c97f2e3d727467ed1f831abec451d257
  Stored in directory: /root/.cache/pip/wheels/0f/38/cc/a3728bb72a315d9d8766fb71d362136372066fc25ad838f8fa
Successfully built EbookLib
Installing collected packages: EbookLib
Successfully installed EbookLib-0.18
Collecting sentence-transformers
  Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [None]:
from google.colab import drive
# Mount at an absolute path so DATA_FOLDER is valid regardless of the kernel's
# current working directory (a relative 'data' only resolves to '/content/data'
# while the cwd is /content).
MOUNT_POINT = '/content/data'
drive.mount(MOUNT_POINT)
DATA_FOLDER = MOUNT_POINT + '/MyDrive/Colab Notebooks/book-llm/data/'

Mounted at data


In [None]:
import os

# Read the key from the environment rather than pasting it into the notebook,
# so it can never end up in a saved or committed copy. Falls back to "" (the
# previous placeholder value) when OPENAI_API_KEY is not set.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "")

## Generate Dataset

train_dataset uses SimpleNodeParser

train_dataset_v2 uses SentenceSplitter (chunk_size=512, chunk_overlap=50)

train_dataset_v3 uses SentenceSplitter (chunk_size=256, chunk_overlap=50)



In [None]:
import ebooklib

from bs4 import BeautifulSoup
from ebooklib import epub
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Document
from llama_index.core.schema import MetadataMode


def html_to_text(html):
    """Parse ``html`` with BeautifulSoup's "html.parser" and return its text."""
    return BeautifulSoup(html, "html.parser").get_text()

def get_book_content(file_path, search_str, chunk_size, chunk_overlap):
    """Read an EPUB file and split its text into chunks.

    Args:
        file_path: path of the EPUB file to read.
        search_str: if not None, reading stops after the first document item
            whose text contains this string (that item is still included).
        chunk_size: ``chunk_size`` passed to ``SentenceSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``SentenceSplitter``.

    Returns:
        dict mapping each node's ``node_id`` to its content (no metadata).
    """
    documents = []
    for item in epub.read_epub(file_path).get_items_of_type(ebooklib.ITEM_DOCUMENT):
        page_text = html_to_text(item.get_content().decode('utf-8')).strip()
        if not page_text:
            # Skip items that contain no text at all.
            continue
        documents.append(Document(text=page_text))
        if search_str is not None and search_str in page_text:
            break

    splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    corpus = {}
    for node in splitter.get_nodes_from_documents(documents):
        corpus[node.node_id] = node.get_content(metadata_mode=MetadataMode.NONE)
    return corpus

# Build the v3 corpora: whole book (no early stop), 256-sized chunks, overlap 50.
training_corpus = get_book_content(
    DATA_FOLDER + 'training_data.epub',
    search_str=None,
    chunk_size=256,
    chunk_overlap=50,
)
val_corpus = get_book_content(
    DATA_FOLDER + 'validation_data.epub',
    search_str=None,
    chunk_size=256,
    chunk_overlap=50,
)

In [None]:
import json

TRAIN_CORPUS_FPATH = DATA_FOLDER + 'train_corpus_v3.json'
VAL_CORPUS_FPATH = DATA_FOLDER + 'val_corpus_v3.json'

# Persist both corpora as JSON so later stages can reuse them.
for _fpath, _payload in (
    (TRAIN_CORPUS_FPATH, training_corpus),
    (VAL_CORPUS_FPATH, val_corpus),
):
    with open(_fpath, 'w+') as f:
        json.dump(_payload, f)

In [None]:
import openai

# Client used by generate_queries; authenticates with the key set in OPENAI_KEY.
client = openai.OpenAI(api_key=OPENAI_KEY)

In [None]:
import re
import uuid
from tqdm import tqdm


def generate_queries(
    corpus,
    num_questions_per_chunk=2,
    prompt_template=None,
):
    """
    Automatically generate hypothetical questions that could be answered with
    doc in the corpus.

    Args:
        corpus: mapping of node id -> chunk text.
        num_questions_per_chunk: number of questions requested in the prompt
            for each chunk (the reply is not checked against this number).
        prompt_template: optional format string using the ``{context_str}``
            and ``{num_questions_per_chunk}`` placeholders; a built-in
            template is used when omitted.

    Returns:
        ``(queries, relevant_docs)``: ``queries`` maps a fresh UUID string to
        one generated question; ``relevant_docs`` maps the same UUID to a
        one-element list holding the id of the chunk it was generated from.
    """

    prompt_template = prompt_template or """\
    Context information is below.

    ---------------------
    {context_str}
    ---------------------

    Given the context information and not prior knowledge.
    generate only questions based on the below query.

    You are a Teacher/ Professor. Your task is to setup \
    {num_questions_per_chunk} questions for an upcoming \
    quiz/examination. The questions should be diverse in nature \
    across the document. Restrict the questions to the \
    context information provided."
    """

    queries = {}
    relevant_docs = {}
    for node_id, text in tqdm(corpus.items()):
        query = prompt_template.format(context_str=text, num_questions_per_chunk=num_questions_per_chunk)
        response = client.chat.completions.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "user", "content": query},
            ]
        )
        # The generated text is in choices[0].message.content. str(response)
        # is the repr of the whole response object (newlines escaped), so
        # splitting that on line breaks never yields individual questions.
        answer = response.choices[0].message.content or ""
        result = answer.strip().split("\n")
        # Drop a leading "1." / "2)" / "3 " style enumeration from each line.
        questions = [
            re.sub(r"^\d+[\).\s]", "", question).strip() for question in result
        ]
        questions = [question for question in questions if len(question) > 0]

        for question in questions:
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [node_id]
    return queries, relevant_docs

In [None]:
# Ask for three questions per training chunk.
train_queries, train_relevant_docs = generate_queries(
    corpus=training_corpus,
    num_questions_per_chunk=3,
)

100%|██████████| 664/664 [23:54<00:00,  2.16s/it]


In [None]:
# Ask for three questions per validation chunk.
val_queries, val_relevant_docs = generate_queries(
    corpus=val_corpus,
    num_questions_per_chunk=3,
)


100%|██████████| 232/232 [10:46<00:00,  2.79s/it]


In [None]:
TRAIN_QUERIES_FPATH = DATA_FOLDER + 'train_queries_v3.json'
TRAIN_RELEVANT_DOCS_FPATH = DATA_FOLDER + 'train_relevant_docs_v3.json'

VAL_QUERIES_FPATH = DATA_FOLDER + 'val_queries_v3.json'
VAL_RELEVANT_DOCS_FPATH = DATA_FOLDER + 'val_relevant_docs_v3.json'

# Persist the generated questions and their question -> chunk mappings.
for _fpath, _payload in (
    (TRAIN_QUERIES_FPATH, train_queries),
    (TRAIN_RELEVANT_DOCS_FPATH, train_relevant_docs),
    (VAL_QUERIES_FPATH, val_queries),
    (VAL_RELEVANT_DOCS_FPATH, val_relevant_docs),
):
    with open(_fpath, 'w+') as f:
        json.dump(_payload, f)

In [None]:
TRAIN_DATASET_FPATH = DATA_FOLDER + 'train_dataset_v3.json'
VAL_DATASET_FPATH = DATA_FOLDER + 'val_dataset_v3.json'

# Bundle queries, corpus and the query -> chunk mapping into one dict per split.
train_dataset = dict(
    queries=train_queries,
    corpus=training_corpus,
    relevant_docs=train_relevant_docs,
)

val_dataset = dict(
    queries=val_queries,
    corpus=val_corpus,
    relevant_docs=val_relevant_docs,
)

for _fpath, _payload in (
    (TRAIN_DATASET_FPATH, train_dataset),
    (VAL_DATASET_FPATH, val_dataset),
):
    with open(_fpath, 'w+') as f:
        json.dump(_payload, f)

## Finetune

With **MultipleNegativesRankingLoss**:

finetuned_bge_small and finetuned_bge_small_v2 are both trained on train_dataset (v1)

finetuned_bge_small_v3 is trained on train_dataset_v2 with batch size 48

finetuned_bge_small_v4 is trained on train_dataset_v3 with batch size 48

finetuned_bge_small_v5 is trained on train_dataset_v3 with batch size 64

With **CachedMultipleNegativesRankingLoss**:

finetuned_bge_small_v6 is trained on train_dataset_v2 with batch size 64

finetuned_bge_small_v7 is trained on train_dataset_v2 with batch size 128

In [None]:
import json

TRAIN_DATASET_FPATH = DATA_FOLDER + 'train_dataset_v2.json'
VAL_DATASET_FPATH = DATA_FOLDER + 'val_dataset_v2.json'


# Open read-only: the files are only parsed here, and 'r+' would needlessly
# require write permission on them.
with open(TRAIN_DATASET_FPATH, 'r') as f:
    train_dataset = json.load(f)

with open(VAL_DATASET_FPATH, 'r') as f:
    val_dataset = json.load(f)

In [None]:
from sentence_transformers import SentenceTransformer

# Base checkpoint that gets fine-tuned below.
model_id = "BAAI/bge-large-en-v1.5"
model = SentenceTransformer(model_name_or_path=model_id)

In [None]:
from sentence_transformers import InputExample

dataset = train_dataset

corpus = dataset['corpus']
queries = dataset['queries']
relevant_docs = dataset['relevant_docs']

# One (question, source chunk) pair per question; only the first relevant
# doc id of each question is used.
examples = [
    InputExample(texts=[question, corpus[relevant_docs[qid][0]]])
    for qid, question in queries.items()
]

In [None]:
BATCH_SIZE = 12

from torch.utils.data import DataLoader

# Shuffle the pairs. `examples` is built in question order, so without
# shuffling the questions generated from one chunk sit next to each other and
# land in the same batch, where the ranking loss treats every other pair's
# passage as a negative -- including the very chunk that answers the question.
loader = DataLoader(
    examples, batch_size=BATCH_SIZE, shuffle=True
)

### Define Loss


MultipleNegativesRankingLoss is a great loss function if you only have positive pairs, for example, only pairs of similar texts like pairs of paraphrases, pairs of duplicate questions, pairs of (query, response), or pairs of (source_language, target_language).

This loss function works well for training embeddings in retrieval setups where you only have positive pairs (e.g. (query, relevant_doc)): for each query, the other n-1 documents in the same batch are used as negatives.

The performance usually increases with increasing batch sizes. The cell below uses CachedMultipleNegativesRankingLoss, a variant of the same loss that uses gradient caching so that larger batch sizes fit in memory.

In [None]:
from sentence_transformers import losses

# Training objective for the (question, chunk) pairs.
loss = losses.CachedMultipleNegativesRankingLoss(model=model)


### Evaluator

In [None]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

dataset = val_dataset

# Rebind the working names to the validation split.
corpus, queries, relevant_docs = (
    dataset[key] for key in ('corpus', 'queries', 'relevant_docs')
)

evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
)

### Training

In [None]:
# Number of epochs passed to model.fit; also used to size the warmup below.
EPOCHS = 20

In [None]:
# Warm up during the first 10% of all training steps (batches per epoch x epochs).
total_steps = len(loader) * EPOCHS
warmup_steps = int(total_steps * 0.1)

model.fit(
    train_objectives=[(loader, loss)],
    evaluator=evaluator,
    evaluation_steps=50,
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='exp_finetune',
    show_progress_bar=True,
)

In [None]:
# Save the in-memory model next to the datasets on the mounted drive.
# NOTE(review): model.fit above also writes to 'exp_finetune'; confirm which
# of the two copies is the one intended for evaluation.
model_save_path = DATA_FOLDER + 'finetuned_bge_large_v2'
model.save(model_save_path)

## Model Evaluation

### Setup

In [None]:
import json

TRAIN_DATASET_FPATH = DATA_FOLDER + 'train_dataset_v2.json'
VAL_DATASET_FPATH = DATA_FOLDER + 'val_dataset_v2.json'


# Open read-only: the files are only parsed here, and 'r+' would needlessly
# require write permission on them.
with open(TRAIN_DATASET_FPATH, 'r') as f:
    train_dataset = json.load(f)

with open(VAL_DATASET_FPATH, 'r') as f:
    val_dataset = json.load(f)

#### Define eval function
We use the InformationRetrievalEvaluator from sentence_transformers.

This provides a more comprehensive suite of metrics, but it can only be run against sentence-transformers-compatible models (the open-source models and our finetuned models, not the OpenAI embedding model).

In [None]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer

def evaluate(
    dataset,
    model_id,
    name,
):
    """Run an InformationRetrievalEvaluator over ``dataset`` for one model.

    Args:
        dataset: dict with 'corpus', 'queries' and 'relevant_docs' entries.
        model_id: model name or path handed to ``SentenceTransformer``.
        name: label forwarded to the evaluator as ``name``.

    Returns:
        Whatever the evaluator returns when called on the loaded model.
        ``DATA_FOLDER`` is passed as the evaluator's ``output_path``.
    """
    corpus, queries, relevant_docs = (
        dataset[key] for key in ('corpus', 'queries', 'relevant_docs')
    )
    ir_evaluator = InformationRetrievalEvaluator(
        queries, corpus, relevant_docs, name=name
    )
    candidate = SentenceTransformer(model_id)
    return ir_evaluator(candidate, output_path=DATA_FOLDER)

### Run evals

#### BGE Small

In [None]:
# Baseline: off-the-shelf BGE small.
evaluate(
    dataset=val_dataset,
    model_id='BAAI/bge-small-en',
    name='bge-small',
)

0.5725032359681482

#### BGE Base

In [None]:
# Baseline: off-the-shelf BGE base. The result name now matches the model
# being evaluated (it was 'bge-large', mislabelling this run's results).
evaluate(val_dataset, 'BAAI/bge-base-en-v1.5', name='bge-base')

0.3740791548245909

#### BGE Large

In [None]:
# Baseline: off-the-shelf BGE large. The result name now matches the model
# being evaluated (it was 'bge-base', mislabelling this run's results).
evaluate(val_dataset, 'BAAI/bge-large-en-v1.5', name='bge-large')

0.5892367632164855

#### Finetuned BGE Small

In [None]:
# Fine-tuned BGE small (first run).
evaluate(
    dataset=val_dataset,
    model_id=DATA_FOLDER + 'finetuned_bge_small',
    name='finetuned_bge_small',
)

0.8444047619047619

In [None]:
# Fine-tuned BGE small, v2.
evaluate(
    dataset=val_dataset,
    model_id=DATA_FOLDER + 'finetuned_bge_small_v2',
    name='finetuned_bge_small_v2',
)

0.8495880574452002

In [None]:
# Fine-tuned BGE small, v3.
evaluate(
    dataset=val_dataset,
    model_id=DATA_FOLDER + 'finetuned_bge_small_v3',
    name='finetuned_bge_small_v3',
)

0.9225507825507826

In [None]:
# Fine-tuned BGE small, v4.
evaluate(
    dataset=val_dataset,
    model_id=DATA_FOLDER + 'finetuned_bge_small_v4',
    name='finetuned_bge_small_v4',
)

0.950933908045977

In [None]:
# Fine-tuned BGE small, v5.
evaluate(
    dataset=val_dataset,
    model_id=DATA_FOLDER + 'finetuned_bge_small_v5',
    name='finetuned_bge_small_v5',
)

0.9333692528735632

In [None]:
# Fine-tuned BGE small, v6.
evaluate(
    dataset=val_dataset,
    model_id=DATA_FOLDER + 'finetuned_bge_small_v6',
    name='finetuned_bge_small_v6',
)

0.920124716553288

In [None]:
# Fine-tuned BGE small, v7.
evaluate(
    dataset=val_dataset,
    model_id=DATA_FOLDER + 'finetuned_bge_small_v7',
    name='finetuned_bge_small_v7',
)

0.923550061050061

#### Finetuned BGE Base

In [None]:
# Fine-tuned BGE base.
evaluate(
    dataset=val_dataset,
    model_id=DATA_FOLDER + 'finetuned_bge_base',
    name='finetuned_bge_base',
)

0.7751256613756613

#### Finetuned BGE Large

In [None]:
# Fine-tuned BGE large.
evaluate(
    dataset=val_dataset,
    model_id=DATA_FOLDER + 'finetuned_bge_large',
    name='finetuned_bge_large',
)

0.7390289802789802

## Cleanup

In [None]:
import torch

import gc

# Collect unreachable Python objects first so any GPU tensors they still hold
# are released, then ask PyTorch to give its unused cached GPU memory back.
# Objects that are still bound to a name (e.g. a loaded model) are not freed;
# `del` them beforehand if their memory should be reclaimed too.
gc.collect()
torch.cuda.empty_cache()
