## LLM recaps
This notebook generates recaps for books of the BOOKSUM test split, for use in the experiments in https://github.com/jecGrimm/Recap.git. The recaps are generated by prompting Gemma-2-2b-it with the book titles.

Please connect to a T4 GPU runtime before running the notebook.

In [None]:
!pip install transformers torch huggingface_hub
!pip install datasets

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
!git clone https://github.com/jecGrimm/Recap.git

Cloning into 'Recap'...
remote: Enumerating objects: 232, done.[K
remote: Counting objects: 100% (232/232), done.[K
remote: Compressing objects: 100% (171/171), done.[K
remote: Total 232 (delta 120), reused 163 (delta 54), pack-reused 0 (from 0)[K
Receiving objects: 100% (232/232), 8.93 MiB | 15.09 MiB/s, done.
Resolving deltas: 100% (120/120), done.


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import notebook_login
import torch
from datasets import load_dataset, Dataset
import re
from Recap.data import RecapData
from collections import defaultdict
from tqdm import tqdm
import json

In [None]:
# Log in to the Hugging Face Hub: please provide your HuggingFace access token in the widget below.
# The model loaded in this notebook is "google/gemma-2-2b-it"; go to
# https://huggingface.co/google/gemma-2-2b-it to get access to it before running the following cells.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### 1. Load Data and Model

In [None]:
# Load model
# Release cached GPU memory that is currently unused (e.g. left over from an
# earlier run of this cell) before the weights are loaded.
torch.cuda.empty_cache()

it_model_name = "google/gemma-2-2b-it"
# device_map="auto" lets the library decide where to place the weights;
# the weights are loaded in bfloat16.
it_model = AutoModelForCausalLM.from_pretrained(
    it_model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
# The tokenizer is loaded without trust_remote_code: the Gemma tokenizer ships
# with transformers, so there is no need to allow code from the Hub repository
# to be executed.
it_model_tokenizer = AutoTokenizer.from_pretrained(it_model_name)

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [None]:
# Load mapped last chapters in BOOKSUM
# NOTE(review): the absolute path assumes the Recap repository was cloned into
# /content (presumably the Google Colab working directory) — adjust it when
# running elsewhere.
test_summs = RecapData("/content/Recap/data/summ_test.jsonl", split = "test")
# Select the "test" entry of the mapped summaries; its "bid" column is used
# further down to select the matching BOOKSUM instances.
dataset = test_summs.mapped_summs["test"]

In [None]:
# Load the test split of the BOOKSUM dataset ("kmfoda/booksum")
kmfoda_test = load_dataset(path="kmfoda/booksum", split="test")

Generating train split:   0%|          | 0/9600 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1484 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1431 [00:00<?, ? examples/s]

In [None]:
# filter out any books without a last chapter:
# keep only the BOOKSUM instances whose book id also occurs in the mapped
# last-chapter data. The predicate has to return a single boolean; a list of
# comparisons would always be truthy and therefore keep every instance.
recap_bids = set(dataset["bid"])
kmfoda_recap_summs = kmfoda_test.filter(lambda inst: inst["bid"] in recap_bids)

Filter:   0%|          | 0/1431 [00:00<?, ? examples/s]

In [None]:
# fetch book titles
def _title_from_book_id(inst):
  '''
  This function extracts the book title from the book id of one instance.

  @param inst: instance with the field "book_id"
  @returns dictionary with the new field "book_title" (text of the book id before its first period)
  '''
  title_match = re.match(r"(.*?)\.", inst["book_id"])
  return {"book_title": title_match.group(1)}

recap_book_titles = kmfoda_recap_summs.map(_title_from_book_id)

Map:   0%|          | 0/1431 [00:00<?, ? examples/s]

In [None]:
# pair every book id with its title; the set removes repeated (bid, title) pairs
bid_column = recap_book_titles["bid"]
title_column = recap_book_titles["book_title"]
book_titles = {(bid, title) for bid, title in zip(bid_column, title_column)}

In [None]:
# filter out double book titles: keep exactly one title per book id
# (recorded for the original run: 24 books in book_titles -> 21 in book_titles_cleaned)
book_titles_cleaned = set()
included_bids = set()
# Iterate in sorted order so that the title kept for a book id with several
# titles does not depend on the arbitrary iteration order of the set and is
# therefore the same in every run.
for bid, title in sorted(book_titles, key=lambda pair: (str(pair[0]), pair[1])):
  if bid in included_bids:
    continue
  book_titles_cleaned.add((bid, title))
  included_bids.add(bid)

### 2. Create Recaps

In [None]:
def create_messages(title):
  '''
  This function builds the three user prompts that ask for a recap of one book.

  @param title: title of one book
  @returns messages: list with three chat messages (dictionaries with the keys "role" and "content")
  '''
  prompt_texts = (
      f"Please generate a recap for the last chapter of the book {title}.",
      f"What do I need to know before reading the last chapter of the book {title}?",
      f"Please summarize the book {title} excluding the last chapter.",
  )
  # every prompt is sent as a separate message with the role "user"
  return [{"role": "user", "content": prompt_text} for prompt_text in prompt_texts]

In [None]:
def store_recaps(filename, recaps):
  '''
  This function writes the generated recaps to a JSON file.

  @params
    filename: path of the output file (an existing file is overwritten)
    recaps: generated recaps
  '''
  with open(filename, 'w', encoding="utf-8") as out_file:
    # serialize with an indentation of four spaces, then write the text in one go
    serialized = json.dumps(recaps, indent=4)
    out_file.write(serialized)

In [None]:
def create_llm_recaps(book_titles, max_new_tokens=506):
  '''
  This function generates recaps with Gemma-2-2b-it for several books.
  Every prompt returned by create_messages is sent to the model as a separate
  single-turn conversation.

  @params
    book_titles: set of tuples with the book id and its title
    max_new_tokens: maximum number of tokens generated per prompt
  @returns recaps: dictionary with the generated recaps mapped to its book id
  '''
  recaps = defaultdict(list)

  for bid, title in tqdm(book_titles, desc="Generating recaps"):
    for message in create_messages(title):
      # add_generation_prompt=True appends the header of the model turn, so the
      # model answers the user message instead of continuing the user turn.
      model_inputs = it_model_tokenizer.apply_chat_template(
          [message],
          add_generation_prompt=True,
          return_tensors="pt",
          return_dict=True,
      ).to(it_model.device)  # same device as the model instead of a hard-coded "cuda"
      outputs = it_model.generate(**model_inputs, max_new_tokens=max_new_tokens)
      # The whole returned sequence is decoded; it contains the prompt and the
      # special tokens in addition to the generated answer.
      recap = it_model_tokenizer.decode(outputs[0])
      recaps[bid].append(recap)

  return recaps

In [None]:
# generate the recaps for all de-duplicated books and save them as JSON
llm_recaps = create_llm_recaps(book_titles=book_titles_cleaned)
store_recaps(filename="./test_llm.json", recaps=llm_recaps)

Generating recaps: 100%|██████████| 21/21 [24:59<00:00, 71.42s/it]
