<a href="https://colab.research.google.com/github/jonathantcallahan/guidance/blob/main/book_processing_gpt_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install openai
%pip install chardet
%pip install ftfy
%pip install torch
%pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
%pip install --no-deps xformers trl peft accelerate bitsandbytes

In [None]:
from unsloth import FastLanguageModel
import torch
# Load the pre-quantized "unsloth/llama-3-8b-bnb-4bit" checkpoint and its tokenizer via unsloth.
# The three settings below are passed straight through to from_pretrained.
# NOTE(review): `model` and `tokenizer` are not referenced by any other cell visible in this notebook.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [30]:
from openai import OpenAI
import os
import re
import uuid
import json
from ftfy import fix_encoding

# Shared OpenAI client used by answer_gpt and chunk_gpt. No API key is passed here.
# NOTE(review): presumably the SDK reads it from the environment (OPENAI_API_KEY) - confirm.
client = OpenAI()

In [5]:
import chardet

def detect_encoding(file_path, fallback='utf-8'):
    """Guess the text encoding of a file with chardet.

    Args:
        file_path: Path of the file to inspect; it is read fully in binary mode.
        fallback: Encoding name returned when chardet cannot make a guess
            (it reports ``None`` for e.g. an empty file).

    Returns:
        The encoding name reported by ``chardet.detect``, or ``fallback``
        when chardet reports no encoding.
    """
    with open(file_path, 'rb') as file:
        raw_data = file.read()
    result = chardet.detect(raw_data)
    # chardet yields encoding=None when it has nothing to go on. Passing None
    # on to open(encoding=...) would silently select the platform default,
    # so hand back an explicit encoding instead.
    return result['encoding'] or fallback


In [6]:
# Collect the book files to process from the local 'books' directory.
# os.listdir returns entries in arbitrary order, so sort them: the debug-mode
# limit in process_books then always selects the same subset of books.
# Non-file entries (e.g. a checkpoint directory) are dropped because
# process_books opens every entry as a text file.
filenames = sorted(
    name
    for name in os.listdir('books')
    if os.path.isfile(os.path.join('books', name))
)

print(f'Books in list: {len(filenames)}')

Books in list: 43


In [7]:
#generate questions for the "answers" extracted from the text
def answer_gpt(answer, model="gpt-4o"):
    """Ask the chat model for a question that `answer` would plausibly answer.

    Args:
        answer: Text extracted from a book, treated as the answer.
        model: Chat model name passed to the completions endpoint.

    Returns:
        The generated question with surrounding whitespace stripped, or an
        empty string when the response carries no text content.
    """
    # NOTE(review): the prompts contain typos ("Alan Watt's", "would be an
    # appropriate."). They are left untouched because editing them changes
    # what the model receives - fix deliberately if desired.
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are assisting in the generation of training data for fine-tuning. You will receive a chunk of text, and will respond with a short casually phrased question to which the chunk of text you received would be an expected answer. The person who generated the answer is Alan Watt's and often he will give a response that answers a question only indirectly. The question should not exactly contain the subject matter of the answer. The question you create should be one to which the answer would be a correct indirect or metaphorical answer."
            },
            {
                "role": "user",
                "content": f"Return a short casually phrased question to which this text would be an appropriate. The question should not reference the core subject matter of the answer in an overt way. : {answer}"
            }
        ]
    )
    # message.content is optional in the API response; guard against None
    # so a content-less reply does not raise AttributeError on .strip().
    content = response.choices[0].message.content
    return (content or "").strip()

In [61]:
#split chunks into sections that could reasonably be the answer to a question
def chunk_gpt(text, model="gpt-3.5-turbo-0125"):
    """Ask the chat model to extract answer-like passages from `text`.

    Args:
        text: A piece of book text to split into coherent passages.
        model: Chat model name passed to the completions endpoint.

    Returns:
        A list of strings: the stripped response split on line breaks.
        Blank entries are not removed here (process_questions drops short
        ones). A response without text content yields ``['']``, the same as
        an empty response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are a document processor used to create fine-tuning data. You will receive a paragraph from a book, and extract portions of text that would be coherent as the answer to a theoretical, unspecified question. Each answer can be up to 200 words. The cohesive answers within the text may directly following each other and there may be space between them that needs to be removed. The response you provide should strictly be the series of cohesive thoughts identified within the content separated by line breaks. Minor grammatical may be made as needed. "
            },
            {
                "role": "user",
                "content": f"Extract chunks of text from this page that would be coherent as responses to an unspecified question. :\n\n{text}"
            }
        ]
    )
    # message.content is optional in the API response; guard against None
    # so a content-less reply does not raise AttributeError on .strip().
    content = response.choices[0].message.content
    return (content or "").strip().split('\n')

In [62]:
#loop through all of the processed "answers" and generate questions
def process_questions(processed_answers, book_name):
    """Generate a question for each answer and append a record to processed_json.

    Relies on the notebook globals `debugger` (test-mode flag), `answer_gpt`
    and `processed_json` (the list the records are appended to).

    Args:
        processed_answers: Iterable of answer strings, e.g. from chunk_gpt.
        book_name: Stored under the "book" key of every record.

    Returns:
        None. Records are appended to the global `processed_json`.
    """
    for i, answer in enumerate(processed_answers):

        #limiting cycles for testing: only indices 0-2 are considered, so
        #stop outright instead of spinning through the remaining entries
        #(any truthy `debugger` enables the limit)
        if i > 2 and debugger:
            break

        #remove blanks (anything shorter than 30 characters is dropped)
        if len(answer) < 30:
            continue

        question = answer_gpt(answer)

        processed_json.append({
            "book" : book_name,
            "instruction" : "You are English author and intellectual Alan Watts. Please answer the following question using your standard speech patterns but do not over-embellish.",
            "input" : question,
            "output" : answer
        })

In [63]:
#loop through the chunks of a book
def process_book_chunks(text_chunks, book_name):
    """Run the extract-answers / generate-questions pipeline over a book's chunks.

    Relies on the notebook globals `debugger`, `fix_encoding`, `chunk_gpt`
    and `process_questions`.

    Args:
        text_chunks: Sequence of text chunks making up one book.
        book_name: Name forwarded to process_questions and used in the logs.

    Returns:
        None. Results are accumulated by process_questions.
    """
    total = len(text_chunks)
    print(f'Processing {total} chunks for {book_name}')
    for i, raw_chunk in enumerate(text_chunks):

        #skip the first and last pages which are usually credits and other miscellaneous content
        #NOTE(review): this drops indices 0-7 (eight chunks) at the start but
        #only the last seven at the end - confirm the asymmetry is intended
        if i > total - 8 or i < 8:
            continue

        #limiting requests for testing purposes: nothing past index 9 is
        #processed in debug mode, so stop the loop here
        #(any truthy `debugger` enables the limit)
        if i > 9 and debugger:
            break

        #only repair the encoding of chunks that are actually sent on
        chunk = fix_encoding(raw_chunk)
        processed_answers = chunk_gpt(chunk)
        process_questions(processed_answers, book_name)
    print(f'Completed processing {book_name}')

In [66]:
#process books into paragraph-sized chunks and queue a request for each
def process_books():
    """Split every book into paragraphs and pass the usable ones to chunk_entry.

    Relies on the notebook globals `filenames`, `debugger`, `detect_encoding`,
    `chunk_entry`, `processed_json` and `fix_encoding`. Files are read from
    the relative 'books' directory. In debug mode only the first eleven
    entries of `filenames` are handled (any truthy `debugger` enables the
    limit).

    A paragraph is kept when it is longer than 100 characters, is not pure
    whitespace, and contains neither a run of two or more digits nor any of
    "publi", "watts", "alan" (case-insensitive).

    Returns:
        None. Chunks are handed to chunk_entry, and the current contents of
        `processed_json` are printed as JSON at the end.
    """
    # Removes every newline that is not immediately followed by another
    # newline, which collapses each block of wrapped lines onto one line and
    # leaves a single '\n' between blocks.
    # NOTE(review): wrapped lines are joined with no separator, so the last
    # word of one line runs into the first word of the next unless the source
    # text keeps trailing spaces - confirm against the input files.
    pattern_lineb = re.compile(r'\n(?!\n)')
    # Compiled once here rather than once per paragraph.
    pattern_skip = re.compile(r'\d{2,}|publi|watts|alan', re.IGNORECASE)

    for i, filename in enumerate(filenames):

        #limiting cycles for testing
        if i > 10 and debugger:
            break

        path = os.path.join('books', filename)
        encoding = detect_encoding(path)
        with open(path, 'r', encoding=encoding, errors='replace') as file:
            contents = pattern_lineb.sub('', file.read()).split('\n')

        text_chunks = [
            paragraph
            for paragraph in contents
            if len(paragraph) > 100
            and paragraph.strip()
            and not pattern_skip.search(paragraph)
        ]

        for chunk in text_chunks:
            chunk_entry(chunk, filename)

    print(fix_encoding(json.dumps(processed_json, indent=4, ensure_ascii=False)))

In [69]:
# Requests queued by chunk_entry, one dict per chunk.
chunk_list = []
def chunk_entry(chunk, book, model="gpt-3.5-turbo-0125", max_tokens=4096):
    """Build one chat-completion request for `chunk` and append it to chunk_list.

    The entry has the custom_id/method/url/body layout of an OpenAI Batch API
    request line. The system prompt asks the model to extract answer-like
    passages from the text.

    Args:
        chunk: Text to be processed by the model.
        book: Book identifier; combined with a random UUID to form custom_id.
        model: Chat model name placed in the request body.
        max_tokens: Completion-token limit placed in the request body. The
            default is 4096 because gpt-3.5-turbo-0125 cannot return more
            than that; the previous hard-coded 5000 exceeded the cap.

    Returns:
        None. The request dict is appended to the global `chunk_list`.
    """
    # NOTE(review): the prompt still says "5000 characters worth of a book"
    # although process_books passes single paragraphs. It is left untouched
    # because editing it changes what the model receives.
    system_prompt = "You are a document processor used to create fine-tuning data. You will receive 5000 characters worth of a book, and extract portions of text that would be coherent as the answer to a theoretical, unspecified question. Each answer can be up to 200 words. The cohesive answers within the text may directly following each other and there may be space between them that needs to be removed. The response you provide should strictly be the series of cohesive thoughts identified within the content separated by line breaks. Minor grammatical may be made as needed."
    user_prompt = f"Extract chunks of text from this page that would be coherent as responses to an unspecified question. :\n\n{chunk}"

    bulk_entry = {
        "custom_id": f"{book}-{str(uuid.uuid4())}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            "max_tokens": max_tokens,
        },
    }
    chunk_list.append(bulk_entry)

In [67]:
# Accumulator for the question/answer records appended by process_questions
# and printed as JSON by process_books. Re-running this cell empties it.
processed_json = []

In [68]:
# Test-mode flag read by process_books, process_book_chunks and process_questions:
# when True they skip everything past a small index limit.
# Note: process_books calls chunk_entry, which is defined in a cell placed after
# the one that defines process_books - that cell must have been run before this one.
debugger = True
process_books()

Processing 994 chunks for Alan Watts in the academy, essays and lecture (2017)_djvu.txt
Completed processing Alan Watts in the academy, essays and lecture (2017)_djvu.txt
Processing 401 chunks for Alan Watts_ Mark Watts - Talking Zen_ Reflections on Mind, Myth, and the Magic of Life-Shambhala (2022)_djvu.txt
Completed processing Alan Watts_ Mark Watts - Talking Zen_ Reflections on Mind, Myth, and the Magic of Life-Shambhala (2022)_djvu.txt
Processing 434 chunks for Alan-Watts-Mark-Watts-Talking-Zen_djvu.txt
Completed processing Alan-Watts-Mark-Watts-Talking-Zen_djvu.txt
Processing 994 chunks for Columbus, Peter (ed.) - Alan Watts_ In the Academy (SUNY, 2017)_djvu.txt
Completed processing Columbus, Peter (ed.) - Alan Watts_ In the Academy (SUNY, 2017)_djvu.txt
Processing 466 chunks for Snelling, John (ed.) - Early Writings of Alan Watts (Celestial Arts, 1987)_djvu.txt
Completed processing Snelling, John (ed.) - Early Writings of Alan Watts (Celestial Arts, 1987)_djvu.txt
Processing 329 

In [None]:
# Merge two previously saved JSON files into one file.
# NOTE(review): nothing visible in this notebook writes 'chunks_3_through_189' or
# 'more_chunks'; they are assumed to already exist in the working directory.
with open('chunks_3_through_189','r') as f1:
    book_one = json.load(f1)

with open('more_chunks', 'r') as f2:
    book_two = json.load(f2)

# `+` presumes both files hold JSON arrays (lists) - TODO confirm.
combined_data = book_one + book_two

with open('first_batch_total', 'w') as f_combined:
    json.dump(combined_data, f_combined, indent=4)

In [None]:
# Dump every accumulated record as pretty-printed JSON (non-ASCII characters kept as-is),
# passing the text through ftfy's fix_encoding before printing.
print(fix_encoding(json.dumps(processed_json, indent=4, ensure_ascii=False)))

[
    {
        "book": "Alan W. Watts_ Joan Watts_ Anne Watts - -New World Library (2018)_djvu.txt",
        "instruction": "You are English author and intellectual Alan Watts. Please answer the following question using your standard speech patterns but do not over-embellish.",
        "input": "What's something unique you've noticed about certain symbolic representations?",
        "output": "Apart from the six divisions, another feature, which distinguishes it from the usual mandala is its centre. Almost all the examples you showed, except some produced by pathological cases, had at the centre some kind of “holy of holies” — a temple, an egg, or a golden ball. The Buddhist Wheel, however, has a cock, a snake, and a hog, the symbols of lust (raga), ill-will (dosa), and stupidity (moha). I have never come across any instance of its being used for magical purposes."
    },
    {
        "book": "Alan W. Watts_ Joan Watts_ Anne Watts - -New World Library (2018)_djvu.txt",
        "instr