# Goals
Models are capped at either `1024` or `512` tokens. To summarize our notes, let's split each note into sections by reading its headers. Each section will be summarized separately, and the resulting strings will then be concatenated together.

In [14]:
import markdown
from transformers import pipeline, AutoTokenizer
from functions import summarize, summarize_string

In [15]:
# Presumably the deepest header level (number of leading '#' characters) to consider.
# NOTE(review): not referenced by any other cell visible in this notebook — confirm it is still needed.
MAX_HEADER_DEPTH = 6

In [16]:
# Path of the markdown note that is loaded and split in the cells below.
# NOTE(review): absolute, machine-specific path — the notebook cannot be re-run on
# another machine without editing this line.
long_note = "/Users/joachimpfefferkorn/repos/daily_note_organizer/test_media/2023-10-17.md"

In [17]:

def clean_md(md_content: str):
    """
    Replace markdown checkbox, image-embed and wiki-link markers with plain words.

    Args:
        md_content: Raw markdown text of a note.

    Returns:
        A copy of the text where '- [x]' / '- [ ]' become 'Completed:' / 'To Do:',
        '![[' becomes 'Image file:', and any remaining '[[' / ']]' are removed.
    """
    #TODO
    # Remove hyperlinks
    # Disregard edge cases, anything encapsulated in a code block

    # Order matters: '![[' has to be rewritten before the bare '[[' rule runs,
    # otherwise the brackets are already gone and the image rule can never match.
    replacements = (
        ('- [x]', 'Completed:'),
        ('- [ ]', 'To Do:'),
        ('![[', 'Image file:'),
        ('[[', ''),
        (']]', ''),
    )
    clean_note = md_content
    for old, new in replacements:
        clean_note = clean_note.replace(old, new)
    return clean_note

def prepare_note(md_path):
    """Read the file at `md_path` and return its contents after passing them through `clean_md`."""
    with open(md_path, 'r') as handle:
        return clean_md(handle.read())

In [18]:
# Read the test note from disk and run it through clean_md; the resulting string
# is what the later cells tokenize and split.
prepared_note = prepare_note(str(long_note))
# print(type(prepared_note))
#print(prepared_note)

In [19]:
def init_note_sections(note, max_header_depth=6):
    """
    Splits the note into preamble and body, returns a list to be used in splitting functions.
    Preamble consists of everything before the first header, body is everything from the
    first header line onwards.

    Args:
        note: Full text of the note.
        max_header_depth: Largest number of leading '#' characters (followed by a space)
            that is treated as a header marker.

    Returns:
        [preamble, body]. Each preamble line is followed by a single space (newlines are
        not kept); body is the untouched remainder of `note`, or "" when the note
        contains no header at all.
    """
    # str.startswith accepts a tuple of prefixes. The earlier `"# " or "## " or ...`
    # expression evaluated to just "# ", so deeper headers were never recognised.
    header_prefixes = tuple("#" * depth + " " for depth in range(1, max_header_depth + 1))
    lines = note.split("\n")

    for index, line in enumerate(lines):
        if line.startswith(header_prefixes):
            preamble = "".join(text + " " for text in lines[:index])
            body = "\n".join(lines[index:])
            return [preamble, body]

    # No header anywhere: everything is preamble and there is no body to split.
    return ["".join(text + " " for text in lines), ""]

In [20]:
#GPT copypasta
def find_longest_header(text):
    """
    Return the marker of the deepest header found in `text`.

    A header is a line that begins with one or more '#' characters directly
    followed by a space. The result is that run of '#' plus a single trailing
    space (e.g. '### '), or None when the text holds no header line.
    """
    deepest = 0
    for row in text.splitlines():
        remainder = row.lstrip('#')
        depth = len(row) - len(remainder)  # number of leading '#' characters
        # Lines such as '#hashtag' have no space after the hashes and are skipped
        if depth and remainder.startswith(' '):
            deepest = max(deepest, depth)

    if deepest == 0:
        return None
    return '#' * deepest + ' '

# Example usage
# The sample mixes real headers with a '#hashtag' line (no space after the '#'),
# which find_longest_header does not count as a header.
text = """
# Header 1
## Header 2
### Header 3
#### Header 4
#thisgarbageisa really long hashtag
# Short Header
"""

# NOTE(review): `longest_header` is assigned again by the later splitting cell.
longest_header = find_longest_header(text)
print(f"The longest header is: '{longest_header}'")


The longest header is: '#### '


In [25]:
# Summarization model; the same name is used for the pipeline and the tokenizer.
model_name = "knkarthick/MEETING_SUMMARY"
summarizer = pipeline("summarization", model=model_name)
TOKENIZER = AutoTokenizer.from_pretrained(model_name)
# Maximum input length (in tokens) reported by the tokenizer for this model.
MAX_TOKENS = TOKENIZER.model_max_length

#print(f"Max Tokens: {MAX_TOKENS}")

# Tokenize the note and count its tokens. The ids are read by key, the same way
# tokens_small_enough does; integer indexing on the tokenizer output returns a
# per-sample encoding object and is only available with fast tokenizers.
num_tokens = len(TOKENIZER(prepared_note)['input_ids'])

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Token indices sequence length is longer than the specified maximum sequence length for this model (2718 > 1024). Running this sequence through the model will result in indexing errors


In [47]:
def build_subsections(header_positions, lines):
    """
    Join the lines between consecutive header positions into one string per subsection.

    Args:
        header_positions: Indices into `lines` at which a subsection starts. They may
            be unsorted or contain duplicates; out-of-range indices are ignored.
        lines: The note body split into individual lines.

    Returns:
        List of subsections in document order, each being its lines joined with a
        single space. Together the subsections cover every line: any text before the
        first header forms its own leading subsection, and the last header's
        subsection runs to the end of `lines`.
    """
    print("🏭 Building Subsections")
    print("👑 Header positions: ", header_positions)

    # Sort and de-duplicate so slices never run backwards or overlap.
    starts = sorted({pos for pos in header_positions if 0 <= pos < len(lines)})
    # Keep text that precedes the first header instead of silently dropping it.
    if not starts or starts[0] != 0:
        starts.insert(0, 0)
    # Closing boundary so the final subsection (last header -> end) is included.
    boundaries = starts + [len(lines)]

    output = []
    for start, end in zip(boundaries, boundaries[1:]):
        if start == end:
            continue  # only happens when `lines` is empty
        subsection = ' '.join(lines[start:end])
        print("🎬 START", start, "🔚 END", end)
        print("🪅 SUBSECTION: \n", subsection)
        output.append(subsection)
    return output

def tokens_small_enough(sections):
    """
    Report whether every section fits within MAX_TOKENS.

    Each section is tokenized with TOKENIZER and the largest token count is
    compared against MAX_TOKENS. Returns True when the largest section does not
    exceed the limit (also for an empty `sections`), False otherwise.
    """
    largest_section_length = 0
    for section in sections:
        print("🪙 SECTION to tokenize: ", section)
        num_tokens = len(TOKENIZER(section)['input_ids'])
        print(f"💶 section has {num_tokens}")
        largest_section_length = max(largest_section_length, num_tokens)

    fits = not (largest_section_length > MAX_TOKENS)
    if fits:
        print(f"🦋 Largest Tokens Size is {largest_section_length}, which is Small Enough (max tokens are {MAX_TOKENS})")
    else:
        print(f"🐘 Largest Token size is {largest_section_length}, which is Too Big (max tokens are {MAX_TOKENS})")
    return fits


def split_at_header(body: str, header_hashes: str) -> list:
    """
    Split `body` at markdown headers, one header level deeper at a time.

    Starting with '# ' and adding one '#' per pass, the positions of all matching
    header lines are collected (cumulatively across levels) and handed to
    build_subsections. The loop stops as soon as tokens_small_enough accepts the
    resulting subsections, or when the levels in `header_hashes` are exhausted.

    Args:
        body: Note text to split.
        header_hashes: Run of '#' characters for the deepest level to try,
            without a trailing space (e.g. '###').

    Returns:
        List of subsection strings from the last level tried. When `header_hashes`
        is empty there is nothing to split at and the body is returned as a single
        section (or an empty list for an empty body).
    """
    print(f"\n⛓️‍💥 Splitting headers starting at hashes 🟩{header_hashes}🟩")

    lines = body.split("\n")
    print("INITIAL LINES:", len(lines))

    header_positions = []
    subsections = [body] if body else []

    for depth in range(1, len(header_hashes) + 1):
        header = header_hashes[:depth] + ' '
        print(f"🗿 header: 🟩{header}🟩")
        header_positions += [
            linenum for linenum, line in enumerate(lines) if line.startswith(header)
        ]
        # Deeper headers are appended after shallower ones; restore document order
        # so consecutive positions always delimit a real span of lines.
        header_positions.sort()
        subsections = build_subsections(header_positions, lines)
        if tokens_small_enough(subsections):
            break

    return subsections


In [48]:

# If the note is longer than max_tokens, split the note into each header
summary = "No summary yet... initialized value"

if num_tokens <= MAX_TOKENS:
    print(f"😎 All good:\n{num_tokens} tokens less than {MAX_TOKENS} maximum")
    summary = summarize_string(prepared_note, summarizer, TOKENIZER)
else:
    print(f"⛑️ {num_tokens} is greater than {MAX_TOKENS}, we need to split up this note")

    note_sections = init_note_sections(prepared_note)
    preamble = [str(note_sections[0])]
    body = str(note_sections[1])
    longest_header = find_longest_header(body)
    print(f"🏄 Longest header: 🟩{longest_header}🟩")

    if longest_header is None:
        # find_longest_header returns None when the body has no header line;
        # there is nothing to split at, so keep the body as a single section.
        new_sections = [body] if body else []
    else:
        # Drop the trailing space: split_at_header expects only the '#' run.
        new_sections = split_at_header(body, longest_header[:-1])

    print(f"🫀 there are {len(new_sections)} new sections after splitting the body")
    # Each section is a whole string. list.extend() on a str would add it one
    # character at a time, so concatenate the lists instead.
    all_sections = preamble + list(new_sections)

    subsection_summaries = []
    print("SECTIONS:")
    print(all_sections)
    # for i, section in enumerate(all_sections):
    #     print(i)
    #     print(section)
#         subsection_summaries += [summarize_string(section, summarizer, TOKENIZER)]
#     summary = "\n".join(subsection_summaries)

# print("💅 Final, summarized note:\n", summary)
    
    
# if no headers are left and the note is longer than max_tokens, split at the middle line break
# Keep splitting each sub-section at the middle line break until there is a single line
# If the single line is longer than max_tokens, split the line at the middle char
# Keep splitting each sub line at the middle char until it is less than max_tokens


⛑️ 2718 is greater than 1024, we need to split up this note
🏄 Longest header: 🟩##### 🟩

⛓️‍💥 Splitting headers starting at hashes 🟩#####🟩
INITIAL LINES: 199
🗿 header: 🟩# 🟩
🏭 Building Subsections
👑 Header positions:  [0, 13, 186]
🎬 START 0 🔚 END 13
🪅 SUBSECTION: 
 # Afternoon To-Do #meta #todoList  ## Priority Completed: Complete Python module Completed: Complete Bias and Variance ## Bonus/tomorrow: Completed: Note cleanup with Aliases (see Career and Study To - Do) To Do: Linear Algebra Note Migration Completed: Integrate with this daily note 		To Do: Training, Test, and Dev Sets 		To Do: regularization (#merge and integrate) 		To Do: Cost and Loss Completed: Big overhaul of Bias and Variance with regards to todays notes. 
🎬 START 13 🔚 END 186
🪅 SUBSECTION: 
 # Machine Learning Specialization Notes: These notes will be integrated into other notes but kept here in their entirety. ## Bias and Variance Machine Learning Specialization Advanced Machine Learning Algorithms, Week 3  Models al

In [77]:
# Send each section to the summarizer
# Perhaps, if a section is too small (like the hashtag preamble in some notes), don't send it at all.