<a href="https://colab.research.google.com/github/jasmeet0817/booklm/blob/main/finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [1]:
!pip install EbookLib
!pip install -U sentence-transformers
!pip install llama-index --upgrade

Collecting EbookLib
  Downloading EbookLib-0.18.tar.gz (115 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/115.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m112.6/115.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.5/115.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: EbookLib
  Building wheel for EbookLib (setup.py) ... [?25l[?25hdone
  Created wheel for EbookLib: filename=EbookLib-0.18-py3-none-any.whl size=38778 sha256=3d31930c05b4a00e41c4e88e3b0ca35a1d1678205fedb344d00314d266186518
  Stored in directory: /root/.cache/pip/wheels/0f/38/cc/a3728bb72a315d9d8766fb71d362136372066fc25ad838f8fa
Successfully built EbookLib
Installing collected packages: EbookLib
Successfully installed EbookLib-0.18
Collecting sentence-transformers


In [2]:
from google.colab import drive
drive.mount('data')

Mounted at data


In [3]:
DATA_FOLDER = '/content/data/MyDrive/Colab Notebooks/book-llm/data/'

## Generate Dataset

In [None]:
from bs4 import BeautifulSoup
from ebooklib import epub
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core import Document
from llama_index.core.schema import MetadataMode


def html_to_text(html):
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text()

def get_book_content(file_path, search_str, chunk_size, chunk_overlap):
    book = epub.read_epub(file_path)
    documents = []
    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        content = html_to_text(item.get_content().decode('utf-8')).strip()
        if content == '':
            continue
        documents.append(Document(text=content))
        if search_str is not None and search_str in content:
            break

    # parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    nodes = SimpleNodeParser.from_defaults().get_nodes_from_documents(documents)
    corpus = {node.node_id: node.get_content(metadata_mode=MetadataMode.NONE) for node in nodes}
    return corpus

training_corpus = get_book_content(DATA_FOLDER + 'training_data.epub', None, 512, 50)
val_corpus = get_book_content(DATA_FOLDER + 'validation_data.epub', None, 512, 50)



In [None]:
import json

TRAIN_CORPUS_FPATH = DATA_FOLDER + 'train_corpus.json'
VAL_CORPUS_FPATH = DATA_FOLDER + 'val_corpus.json'

with open(TRAIN_CORPUS_FPATH, 'w+') as f:
    json.dump(training_corpus, f)

with open(VAL_CORPUS_FPATH, 'w+') as f:
    json.dump(val_corpus, f)

In [None]:
import openai

OPENAI_KEY = "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"
client = openai.OpenAI(
    # This is the default and can be omitted
    api_key=OPENAI_KEY,
)

In [None]:
import re
import uuid
from tqdm import tqdm


def generate_queries(
    corpus,
    num_questions_per_chunk=2,
    prompt_template=None,
):
    """
    Automatically generate hypothetical questions that could be answered with
    doc in the corpus.
    """

    prompt_template = prompt_template or """\
    Context information is below.

    ---------------------
    {context_str}
    ---------------------

    Given the context information and not prior knowledge.
    generate only questions based on the below query.

    You are a Teacher/ Professor. Your task is to setup \
    {num_questions_per_chunk} questions for an upcoming \
    quiz/examination. The questions should be diverse in nature \
    across the document. Restrict the questions to the \
    context information provided."
    """

    queries = {}
    relevant_docs = {}
    for node_id, text in tqdm(corpus.items()):
        query = prompt_template.format(context_str=text, num_questions_per_chunk=num_questions_per_chunk)
        response = client.chat.completions.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "user", "content": query},
            ]
        )
        result = str(response).strip().split("\n")
        questions = [
            re.sub(r"^\d+[\).\s]", "", question).strip() for question in result
        ]
        questions = [question for question in questions if len(question) > 0]

        for question in questions:
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [node_id]
    return queries, relevant_docs

In [None]:
train_queries, train_relevant_docs = generate_queries(training_corpus, num_questions_per_chunk=3)

100%|██████████| 174/174 [08:11<00:00,  2.82s/it]


In [None]:
val_queries, val_relevant_docs = generate_queries(val_corpus, num_questions_per_chunk=3)


100%|██████████| 60/60 [03:12<00:00,  3.21s/it]


In [None]:
TRAIN_QUERIES_FPATH = DATA_FOLDER + 'train_queries.json'
TRAIN_RELEVANT_DOCS_FPATH = DATA_FOLDER + 'train_relevant_docs.json'

VAL_QUERIES_FPATH = DATA_FOLDER + 'val_queries.json'
VAL_RELEVANT_DOCS_FPATH = DATA_FOLDER + 'val_relevant_docs.json'

with open(TRAIN_QUERIES_FPATH, 'w+') as f:
    json.dump(train_queries, f)

with open(TRAIN_RELEVANT_DOCS_FPATH, 'w+') as f:
    json.dump(train_relevant_docs, f)

with open(VAL_QUERIES_FPATH, 'w+') as f:
    json.dump(val_queries, f)

with open(VAL_RELEVANT_DOCS_FPATH, 'w+') as f:
    json.dump(val_relevant_docs, f)

In [None]:
TRAIN_DATASET_FPATH = DATA_FOLDER + 'train_dataset.json'
VAL_DATASET_FPATH = DATA_FOLDER + 'val_dataset.json'

train_dataset = {
    'queries': train_queries,
    'corpus': training_corpus,
    'relevant_docs': train_relevant_docs,
}

val_dataset = {
    'queries': val_queries,
    'corpus': val_corpus,
    'relevant_docs': val_relevant_docs,
}


with open(TRAIN_DATASET_FPATH, 'w+') as f:
    json.dump(train_dataset, f)

with open(VAL_DATASET_FPATH, 'w+') as f:
    json.dump(val_dataset, f)

## Finetune

In [4]:
import json

TRAIN_DATASET_FPATH = DATA_FOLDER + 'train_dataset.json'
VAL_DATASET_FPATH = DATA_FOLDER + 'val_dataset.json'


with open(TRAIN_DATASET_FPATH, 'r+') as f:
    train_dataset = json.load(f)

with open(VAL_DATASET_FPATH, 'r+') as f:
    val_dataset = json.load(f)

In [5]:
from sentence_transformers import SentenceTransformer

model_id = "BAAI/bge-small-en"
model = SentenceTransformer(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
from sentence_transformers import InputExample

dataset = train_dataset

corpus = dataset['corpus']
queries = dataset['queries']
relevant_docs = dataset['relevant_docs']

examples = []
for query_id, query in queries.items():
    node_id = relevant_docs[query_id][0]
    text = corpus[node_id]
    example = InputExample(texts=[query, text])
    examples.append(example)

In [7]:
# We use a very small batchsize to run this toy example on a local machine.
# This should typically be much larger.
BATCH_SIZE = 40

from torch.utils.data import DataLoader

loader = DataLoader(
    examples, batch_size=BATCH_SIZE
)

### Define Loss


MultipleNegativesRankingLoss is a great loss function if you only have positive pairs, for example, only pairs of similar texts like pairs of paraphrases, pairs of duplicate questions, pairs of (query, response), or pairs of (source_language, target_language).

This loss function works great to train embeddings for retrieval setups where you have positive pairs (e.g. (query, relevant_doc)) as it will sample in each batch n-1 negative docs randomly.

The performance usually increases with increasing batch sizes.

In [8]:
from sentence_transformers import losses

loss = losses.MultipleNegativesRankingLoss(model)


### Evaluator

In [9]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

dataset = val_dataset

corpus = dataset['corpus']
queries = dataset['queries']
relevant_docs = dataset['relevant_docs']

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

### Training

In [10]:
# We train the model for very few epochs in this toy example.
# This should typically be higher for better performance.
EPOCHS = 20

In [11]:
warmup_steps = int(len(loader) * EPOCHS * 0.1)

model.fit(
    train_objectives=[(loader, loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='exp_finetune',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50,
)

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

In [12]:
model_save_path = DATA_FOLDER + 'finetuned_bge_small'
model.save(model_save_path)

In [13]:
!ls "{model_save_path}"


1_Pooling    config_sentence_transformers.json	README.md		   tokenizer_config.json
2_Normalize  model.safetensors			sentence_bert_config.json  tokenizer.json
config.json  modules.json			special_tokens_map.json    vocab.txt


In [14]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(model_save_path)

In [15]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)