In [1]:
import numpy as np
from transformers import pipeline, AutoTokenizer
from functions import summarize_string

In [2]:
long_note = "/Users/joachimpfefferkorn/repos/daily_note_organizer/test_media/2023-10-17.md"

In [3]:

def clean_md(md_content: str):
    clean_note = md_content
    clean_note = clean_note.replace('- [x]', 'Completed:').replace('- [ ]','To Do:')
    clean_note = clean_note.replace('[[', '').replace(']]','')
    clean_note = clean_note.replace('![[', 'Image file:')
    return clean_note

def prepare_note(md_path):
    with open(md_path, 'r') as note:
        md_content = note.read()
        cleaned_md = clean_md(md_content)
        return cleaned_md

In [4]:
prepared_note = prepare_note(str(long_note))

In [5]:
model_name = "knkarthick/MEETING_SUMMARY"
summarizer = pipeline("summarization", model=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [6]:
class Section:
    def __init__(self, content):
        self.content = content
        self.tokens = tokenizer(content)
        self.num_tokens = len(self.tokens['input_ids']) #TODO there is a direct way to get these, slight hack for now

print(Section(prepared_note).num_tokens)

Token indices sequence length is longer than the specified maximum sequence length for this model (2718 > 1024). Running this sequence through the model will result in indexing errors


2718


In [7]:
def split(section: Section, amount: int):
    lines = np.asarray(section.content.split("\n"), dtype=str)

    sub_arrays = np.array_split(lines, amount)

    sections = []
    #Make each sub array a string
    for array in sub_arrays:
        text = "\n".join(array)
    # then a section
    #add each section to a list
        sections.append(Section(text))
    return sections

In [8]:
def biggest_section(sections):
    biggest_section = Section("")
    for section in sections:
        if section.num_tokens > biggest_section.num_tokens:
            biggest_section = section
    return biggest_section

output_sections = []
og_section = Section(prepared_note)
def recursive_split(input_section, split_amount):

    if input_section.num_tokens > tokenizer.model_max_length and split_amount < 99:
        split_amount += 1
        print(f"🐘 Original section must be split by {split_amount}! Section tokens: {input_section.num_tokens} Max model length: {tokenizer.model_max_length}")
        new_sections = split(og_section, split_amount)
        print(f"🪸 New sections updated with split subsections, 🔃 recursively split is recursing")
        for i, subsection in enumerate(new_sections):
            print(f"⚔️ Splitting subsection {i}")
            recursive_split(subsection, split_amount)
    else:
        print(f"🦋 Section is small enough!")
        print(f"🏄 Small enough section added to new_sections")
        output_sections.append(input_section)
        return 0



split_amount = 1
input_sections = split(Section(prepared_note), split_amount)
print(type(input_sections[0]))
# for section in sections:
#     print(section.content)

recursive_split(og_section, split_amount)
print(output_sections)


<class '__main__.Section'>
🐘 Original section must be split by 2! Section tokens: 2718 Max model length: 1024
🪸 New sections updated with split subsections, 🔃 recursively split is recursing
⚔️ Splitting subsection 0
🐘 Original section must be split by 3! Section tokens: 1394 Max model length: 1024
🪸 New sections updated with split subsections, 🔃 recursively split is recursing
⚔️ Splitting subsection 0
🦋 Section is small enough!
🏄 Small enough section added to new_sections
⚔️ Splitting subsection 1
🦋 Section is small enough!
🏄 Small enough section added to new_sections
⚔️ Splitting subsection 2
🦋 Section is small enough!
🏄 Small enough section added to new_sections
⚔️ Splitting subsection 1
🐘 Original section must be split by 3! Section tokens: 1325 Max model length: 1024
🪸 New sections updated with split subsections, 🔃 recursively split is recursing
⚔️ Splitting subsection 0
🦋 Section is small enough!
🏄 Small enough section added to new_sections
⚔️ Splitting subsection 1
🦋 Section is s

In [19]:
summary = ""
for i, section in enumerate(output_sections):
    print(f"📇 Summarizing section {i}")
    summary += summarize_string(section.content, summarizer, tokenizer)[0]['summary_text'] + "\n"
    #summary += summarize_string(section.content, summarizer, tokenizer)['summary_text'] + "\n"

📇 Summarizing section 0
📇 Summarizing section 1
📇 Summarizing section 2
📇 Summarizing section 3
📇 Summarizing section 4
📇 Summarizing section 5


In [20]:
print(summary)

In Machine Learning Specialization Advanced Machine Learning Algorithms, Week 3, the author explains how to fix the first time the model doesn't work.
If the regularization parameter is large, then the learning curve gets oversimplified. If it's low, the learning algorithm underfits the data. The gap between baseline and training error shows high bias. The human level performance is 10.6%, whereas the speech recognition program's is 14.8%.
If a learning algorithm suffers from high bias, getting more training data will help. If it does not, you need to make it more powerful and flexible to fit more complex functions.
In Machine Learning Specialization Advanced Machine Learning Algorithms, Week 3, the author explains how to fix the first time the model doesn't work.
If the regularization parameter is large, then the learning curve gets oversimplified. If it's low, the learning algorithm underfits the data. The gap between baseline and training error shows high bias. The human level perfo