In [6]:
import openai, os
from tqdm import tqdm
from glob import glob

# Introduction
Takes natural language text input (novels, articles, etc.) and outputs a one sentence and one word summary.

Model documentation: https://beta.openai.com/docs/introduction

## Setup 
Export api key before running: `export OPENAI_API_KEY=<your_api_key>`

# Function definitions

## Loading and formatting

In [7]:
def text2paragraphs(file_path: str):
    """
    Reads a text file and returns a list of paragraphs.
    """
    with open(file_path, "r") as f:
        text = f.read()
    return text.split("\n\n")

def removeIntegerOnlyLines(paragraphs: list):
    """
    Removes paragraphs that contain only integers.
    """
    return [p for p in paragraphs if not p.isdigit()]

def removeEmptyLines(paragraphs: list):
    """
    Removes paragraphs that are empty.
    """
    return [p for p in paragraphs if p]

def removeNewLines(paragraphs: list):
    """
    Removes new lines.
    """
    return [p.replace("\n", " ") for p in paragraphs]

def joinParagraphsMaxLength(paragraphs: list, max_length: int = 800):
    """
    Joins paragraphs that are longer than max_length.
    """
    joined_paragraphs = [" "]
    for p in paragraphs:
        para_length = len(p.split(" "))
        previous_length = len(joined_paragraphs[-1].split(" "))
        if para_length + previous_length > max_length:
            joined_paragraphs.append(p)
        else:
            joined_paragraphs[-1] += " " + p

    return joined_paragraphs

def replaceSlashInParagraphs(paragraphs: list):
    """
    Replaces slashes in paragraphs.
    """
    return [p.replace("\'", "'") for p in paragraphs]

def loadAndFormatText(file_path: str):
    """
    Load and format text into paragraphs.
    """

    paragraphs = text2paragraphs(file_path)
    paragraphs = removeIntegerOnlyLines(paragraphs)
    paragraphs = removeEmptyLines(paragraphs)
    paragraphs = removeNewLines(paragraphs)
    paragraphs = joinParagraphsMaxLength(paragraphs)
    paragraphs = replaceSlashInParagraphs(paragraphs)

    return paragraphs

In [17]:
def dictToMarkdownFile(d: dict, file_path: str):
    """
    Converts a dictionary to a Markdown file.
    """
    with open(file_path, "w") as f:
        
        f.write("# " + d['Title'] + "\n")
        for key in d:
            if key != 'title':
                f.write("\n## " + key + "\n")
                f.write(d[key] + "\n")

def changeFileExtension(file_path: str, new_extension: str):
    """
    Changes the file extension of a file.
    """
    return os.path.splitext(file_path)[0] + new_extension

def countWordsInString(s: str):
    """
    Counts the number of words in a string.
    """
    return len(s.split(" "))

def longestWordInString(s: str):
    """
    Returns the longest word in a string.
    """
    return max(s.split(" "), key = len)

## NLP prediction

In [9]:
def tldrSummary(input_text:str, engine:str="curie", summary_prompt:str="\n\ntl;dr:", max_tokens:int=64, temperature:float=0.1):
    """
    Makes a summarisation text prediction
    """
    openai.api_key = os.getenv("OPENAI_API_KEY")
    response = openai.Completion.create(
        engine=engine,
        prompt=input_text+summary_prompt,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=["\n"]
    )
    return response.choices[0].text

In [19]:
def main(file_path: str, engine_1:str = "davinci", engine_2:str = "davinci"):
    """
    Load a book, make a summarisation prediction and save the result.
    """

    output = {}
    b = os.path.basename(file_path)
    output['Title'] = os.path.splitext(b)[0]

    paragraphs = loadAndFormatText(file_path)

    # generate long summary
    summaries = []
    for p in tqdm(paragraphs):
        summaries.append(tldrSummary(p, engine=engine_1))
    # join them
    joined_summaries = joinParagraphsMaxLength(summaries)
    joined_summaries = removeEmptyLines(joined_summaries)
    joined_summaries = removeNewLines(joined_summaries)
    output['Long summary'] = joined_summaries[0].strip()

    # generate short summary
    one_sentence_summary = tldrSummary(
        joined_summaries[0], 
        temperature=0.1,
        engine=engine_2
    )
    output['Short summary'] = one_sentence_summary.strip()

    
    # generate one word summary
    one_word_summary = longestWordInString(one_sentence_summary)
    attempts = 0
    while attempts < 10:
        # try n times for a 1 word summary, else just use longest word
        one_word_attempt = tldrSummary(
            joined_summaries[0], 
            max_tokens=10,
            temperature=0.5,
            summary_prompt="\n\nTo summarize in one word:",
            engine=engine_2
        )
        if countWordsInString(one_word_attempt) == 1:
            one_word_summary = one_word_attempt
            print(f'One word from model: {one_word_summary}')
            break
        attempts += 1

    output['One word summary'] = one_word_summary.strip()

    # save it in a Markdown file
    dictToMarkdownFile(
        output,
        file_path=changeFileExtension(file_path, ".md").replace('books', 'summaries')
    )

    return output
    

# Main

In [20]:
# run for all 'books'
for book in glob("books/*.txt"):
    _ = main(book)

100%|██████████| 25/25 [01:14<00:00,  2.99s/it]


One word from model: 


100%|██████████| 9/9 [00:26<00:00,  2.94s/it]


One word from model: 


100%|██████████| 12/12 [00:28<00:00,  2.39s/it]


One word from model: 


100%|██████████| 6/6 [00:26<00:00,  4.35s/it]


One word from model: 


100%|██████████| 7/7 [00:18<00:00,  2.58s/it]


One word from model: 


100%|██████████| 10/10 [00:35<00:00,  3.53s/it]


One word from model: 


100%|██████████| 6/6 [00:22<00:00,  3.77s/it]


One word from model: 
