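Companion notebook for creating the dataset: it fetches pages from a Notion database, extracts and normalizes their text, and writes the result to `data.json`. Refer to the full paper for a complete walkthrough.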

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
import os 
import spacy
import requests
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

from ratelimiter import RateLimiter

In [None]:
# Load required settings from environment variables into module-level globals.
# Indexing os.environ raises KeyError when a variable is unset, so a missing
# setting fails in this cell rather than at the first API request.
# INSIGHTS_ID is the database id passed to the database query call below.
INSIGHTS_ID = os.environ["INSIGHTS_ID"]
# NOTION_KEY is handed to Client, which sends it as the bearer token.
NOTION_KEY = os.environ["NOTION_KEY"]

# classes for interacting with notion API
class Client:
    """Entry point for the Notion API helpers in this notebook.

    Holds one ``requests.Session`` (exposed as ``request``) whose default
    headers carry the bearer token, the pinned ``Notion-Version`` and a JSON
    content type, and exposes the endpoint groups as ``databases`` and
    ``pages``.

    Args:
        key: Notion integration token placed in the ``Authorization`` header.
    """

    def __init__(self, key):
        default_headers = {
            "Authorization": f"Bearer {key}",
            "Notion-Version": "2021-08-16",
            "Content-Type": "application/json",
        }

        # One shared session so every request reuses the same headers.
        session = requests.Session()
        session.headers.update(default_headers)
        self.request = session

        # Endpoint groups keep a reference back to this client to reach the session.
        self.databases = Databases(self)
        self.pages = Pages(self)


class Databases:
    """Database endpoints of the Notion API, bound to a ``Client``."""

    def __init__(self, client):
        # Keep the owning client; requests go through its ``request`` session.
        self.client = client

    def query(self, database_id, cursor=None):
        """POST a query for ``database_id`` and return the decoded JSON body.

        Args:
            database_id: Id interpolated into the query URL.
            cursor: Optional pagination cursor; when truthy it is sent as
                ``start_cursor`` in the JSON body, otherwise no body is sent.

        Returns:
            Whatever ``response.json()`` yields for the request.
        """
        endpoint = f"https://api.notion.com/v1/databases/{database_id}/query"
        session = self.client.request

        # First page: plain POST without a body.
        if not cursor:
            return session.post(endpoint).json()

        # Follow-up page: resume from the given cursor.
        return session.post(endpoint, json={"start_cursor": cursor}).json()


class Pages:
    """Block-children endpoint of the Notion API, bound to a ``Client``."""

    def __init__(self, client):
        # Keep the owning client; requests go through its ``request`` session.
        self.client = client

    def query_content(self, block_id, cursor=None):
        """GET up to 100 children of ``block_id`` and return the decoded JSON body.

        Args:
            block_id: Id interpolated into the children URL.
            cursor: Optional pagination cursor; when truthy it is appended to
                the query string as ``start_cursor``.

        Returns:
            Whatever ``response.json()`` yields for the request.
        """
        endpoint = f"https://api.notion.com/v1/blocks/{block_id}/children?page_size=100"

        # Only continuation requests carry a cursor in the query string.
        if cursor:
            endpoint = f"{endpoint}&start_cursor={cursor}"

        return self.client.request.get(endpoint).json()


In [None]:
# Build the Notion client used by the fetch cells below, authenticated with NOTION_KEY.
client = Client(NOTION_KEY)

# fetch all pages
def fetch_all_pages(pages=None, cursor=None):
    """Fetch every page of the ``INSIGHTS_ID`` database, following pagination.

    Queries the database through the module-level ``client`` and keeps
    requesting with the returned ``next_cursor`` until it is falsy. A progress
    line is printed before each request.

    Args:
        pages: Optional list to append results to. It is mutated in place and
            returned. When omitted, a fresh list is created on every call, so
            re-running the cell does not accumulate pages from earlier runs.
        cursor: Optional cursor to start from instead of the first page.

    Returns:
        The list of page objects collected so far. If the API answers with an
        error object, the error is printed and the partial list is returned.
    """
    # A mutable default ([]) would be shared between calls; use a sentinel instead.
    if pages is None:
        pages = []

    while True:
        print("fetching...", len(pages), cursor)
        result = client.databases.query(INSIGHTS_ID, cursor)

        # Best effort: report the error and hand back what was fetched so far.
        if result["object"] == "error":
            print(result)
            return pages

        pages.extend(result["results"])

        cursor = result["next_cursor"]
        if not cursor:
            return pages

# Download every page of the INSIGHTS_ID database; a progress line is printed per request.
pages = fetch_all_pages()

fetching... 0 None
fetching... 100 e37e85c9-c6e9-4ae5-89d6-a7626801aef5
fetching... 200 b024f2c9-61cc-474f-9880-77190482ec1f
fetching... 300 da25ca6f-c7f9-4e81-81e3-b6c9eed074b0
fetching... 400 cda4e2e0-7c71-4141-b74a-9c42f2d5c607
fetching... 500 964890a5-08ba-4414-b051-34a9fa74fd48
fetching... 600 8a97037b-7686-42d8-8be3-410c25e883e7
fetching... 700 f65df2d0-c210-4b82-a76d-7e51015716ed
fetching... 800 cf2bbd53-20dd-42b5-ab19-273f838d820d
fetching... 900 b9bfb760-8952-4775-ba11-d180f34f85d9


In [None]:
# fetch page content
@RateLimiter(max_calls=3, period=1)
def query_content_recursively(block_id, blocks, cursor=None):
    """Collect the children of ``block_id`` into ``blocks``, descending into nested blocks.

    A child whose ``has_children`` flag is falsy is appended as-is. A child
    with children is appended as a two-item list ``[child, nested]``, where
    ``nested`` is the result of a recursive call on the child's id. When the
    response carries a ``next_cursor``, the function calls itself again with
    that cursor and the same ``blocks`` list.

    Args:
        block_id: Id of the block (or page) whose children are requested.
        blocks: List that results are appended to; mutated in place.
        cursor: Optional pagination cursor for a continuation request.

    Returns:
        The ``blocks`` list that was passed in.
    """
    # NOTE(review): the limiter decorates the whole function, so nested and
    # continuation calls all pass through it — confirm this throttles the
    # HTTP requests as intended.
    listing = client.pages.query_content(block_id, cursor)

    for child in listing["results"]:
        if not child["has_children"]:
            blocks.append(child)
            continue

        # Pair the parent with its own (recursively fetched) children.
        nested = query_content_recursively(child["id"], [])
        blocks.append([child, nested])

    next_cursor = listing["next_cursor"]
    if not next_cursor:
        return blocks

    # More children remain for this block: continue into the same list.
    return query_content_recursively(block_id, blocks, next_cursor)


# Fetch the (possibly nested) block content of every page, keeping each page
# object next to the blocks found under it.
items = [
    {"page": page, "blocks": query_content_recursively(page["id"], [])}
    for page in pages
]


In [None]:
# extract data from page content
def extract_title(page):
    """Return the ``plain_text`` of each fragment in the page's ``Title`` property.

    Args:
        page: Page object with ``properties -> Title -> title`` holding a
            list of fragments.

    Returns:
        List of fragment strings, in order (not joined).
    """
    fragments = page["properties"]["Title"]["title"]
    return [fragment["plain_text"] for fragment in fragments]

def extract_text(blocks, output):
    """Append the text of every paragraph and quote block in ``blocks`` to ``output``.

    Nested lists are walked recursively. Blocks whose ``type`` is neither
    ``paragraph`` nor ``quote`` are skipped. For a matching block, the
    ``plain_text`` of its text fragments is concatenated into one string.

    Args:
        blocks: Iterable of block dicts and/or nested lists of them.
        output: List that extracted strings are appended to; mutated in place.

    Returns:
        The ``output`` list that was passed in.
    """
    for entry in blocks:
        # A list is a nested group of blocks: descend into it.
        if isinstance(entry, list):
            extract_text(entry, output)
            continue

        kind = entry["type"]
        if kind in ["paragraph", "quote"]:
            fragments = entry[kind]["text"]
            output.append("".join(fragment["plain_text"] for fragment in fragments))

    return output

def extract_data(item):
    """Turn one fetched item into its title sentence and list of text blocks.

    Args:
        item: Dict with a ``page`` object and the ``blocks`` fetched for it.

    Returns:
        Dict with ``title`` (the joined title fragments followed by a
        period) and ``blocks`` (the strings produced by ``extract_text``).
    """
    title_fragments = extract_title(item["page"])
    return {
        "title": "".join(title_fragments) + ".",
        "blocks": extract_text(item["blocks"], []),
    }

# Flatten all items into one list of strings: each page contributes its title
# sentence, followed by those of its text blocks that end in '.', '!' or '?'.
dataset = []
for item in items:
    record = extract_data(item)

    dataset.append(record["title"])
    dataset.extend(
        text for text in record["blocks"] if text.endswith(('.', '!', '?'))
    )


In [None]:
# Preview the first ten extracted strings.
dataset[:10]

['Interactive suggestion systems should increase writers’ feeling of ownership.',
 'In this case, the designer could consider devising ways to keep the fraction of text written by writers to text written by the Language Model relatively high.',
 'Additionally, this can act as a feedback signal on whether to increase suggestion velocity or not. If the user has accepted too many suggestions, then perhaps the system should factor that in and not make any more suggestions until a threshold has been crossed. Otherwise, we may as well have the entire passage generated by a language model.',
 'Interactive suggestion systems should increase writers’ productivity.',
 'Similar to Ideoscope - Paul Bricman and Visualizing Item and Mnemonic Metrics Over Time, we should gather concrete metrics on how Interactive Suggestion Systemss help increase writer’s productivity. This can also include linguistic metrics specific to the collaboration process such as frequency of errors and vocabulary diversity.'

In [None]:
# TODO: add typo tolerance.

In [None]:
def normalize_text(text):
    """Normalize one string for the dataset.

    Steps, in order: drop every ``Jarvis:`` marker, surround each run of
    ``. , ! ?`` with spaces, replace every run of characters other than
    ASCII letters, ``? , . !`` and the two apostrophes (``'`` and ``’``)
    with a single space, collapse repeated spaces, and strip the ends.
    Digits and other symbols are therefore removed.

    Args:
        text: Raw string to normalize.

    Returns:
        The normalized string.
    """
    # Remove Jarvis quote markers.
    without_marker = text.replace("Jarvis:", "")

    # Put a space on both sides of each punctuation run so it splits as its own token.
    spaced = re.sub(r'([.,!?]+)', r" \1 ", without_marker)

    # Anything outside the allowed character set becomes a single space.
    allowed_only = re.sub(r"[^a-zA-Z?,.!'’]+", " ", spaced)

    # Compact spaces (this character class also matches double quotes), then trim.
    return re.sub(r'[" "]+', " ", allowed_only).strip()

In [None]:
# Apply normalize_text to every extracted string, preserving order.
normalized_dataset = [normalize_text(data) for data in dataset]

In [None]:
# Preview the first ten normalized strings.
normalized_dataset[:10]

['Interactive suggestion systems should increase writers’ feeling of ownership .',
 'In this case , the designer could consider devising ways to keep the fraction of text written by writers to text written by the Language Model relatively high .',
 'Additionally , this can act as a feedback signal on whether to increase suggestion velocity or not . If the user has accepted too many suggestions , then perhaps the system should factor that in and not make any more suggestions until a threshold has been crossed . Otherwise , we may as well have the entire passage generated by a language model .',
 'Interactive suggestion systems should increase writers’ productivity .',
 'Similar to Ideoscope Paul Bricman and Visualizing Item and Mnemonic Metrics Over Time , we should gather concrete metrics on how Interactive Suggestion Systemss help increase writer’s productivity . This can also include linguistic metrics specific to the collaboration process such as frequency of errors and vocabulary d

In [None]:
# Token-count bounds for the filter below. Both comparisons are strict, so a
# string is kept only when its count is greater than MIN_TOKENS and less than
# MAX_TOKENS. Tokens are the pieces obtained by splitting on a single space.
MIN_TOKENS = 6
MAX_TOKENS = 512

filtered_dataset = [
    text
    for text in normalized_dataset
    if MIN_TOKENS < len(text.split(" ")) < MAX_TOKENS
]

In [None]:
# Persist the filtered strings as a JSON array in ./data.json (overwrites any existing file).
with open('./data.json', 'w') as output_file:
    json.dump(filtered_dataset, output_file)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=ad10b37e-1254-49b1-9814-3334468ab840' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>