In [39]:
from dotenv import load_dotenv, find_dotenv; load_dotenv(find_dotenv())
import os, notion_client

# Notion API client, authenticated with the integration token read from the
# environment (the .env file is loaded on the first line of this cell).
notion = notion_client.Client(auth=os.environ['NOTION_TOKEN'])
# Notion database IDs. INPUT is the database that is inspected and written to
# further down; CLIP is presumably a second database (not used in the cells shown here).
# NOTE(review): the variable names are spelled "DATABSE" (sic). They have to match
# the keys in .env, so fix the spelling in both places at once or not at all.
CLIP = os.environ['CLIP_DATABSE_ID']
INPUT = os.environ['INPUT_DATABSE_ID']

In [40]:
def remove_keys(d):
    """Strip noisy metadata keys from a nested dict, in place.

    The keys ``'id'``, ``'color'`` and ``'description'`` are deleted from ``d``
    and from every dict reachable through dict values or through dicts that
    sit directly inside list values. Lists nested inside lists are left
    untouched. Nothing is returned; ``d`` itself is modified.
    """
    for unwanted in ('id', 'color', 'description'):
        d.pop(unwanted, None)

    for value in d.values():
        if isinstance(value, dict):
            children = [value]
        elif isinstance(value, list):
            # Only dicts that are direct members of the list are visited.
            children = [item for item in value if isinstance(item, dict)]
        else:
            continue
        for child in children:
            remove_keys(child)


In [41]:
# Fetch the INPUT database object (including its property schema) from the Notion API.
database = notion.databases.retrieve(database_id=INPUT)
# Drop the 'id' / 'color' / 'description' keys in place so the schema below is easier to read.
remove_keys(database)
# display(database)
# Cell output: the property definitions of the database.
database['properties']

{'URL': {'name': 'URL', 'type': 'url', 'url': {}},
 'Year': {'name': 'Year',
  'type': 'multi_select',
  'multi_select': {'options': [{'name': '2024'},
    {'name': '2024.05'},
    {'name': '2024.04'},
    {'name': '2024.03'},
    {'name': '2024.02'},
    {'name': '2024.01'},
    {'name': '2023'},
    {'name': '2023.12'},
    {'name': '2023.11'},
    {'name': '2023.10'},
    {'name': '2023.09'},
    {'name': '2023.08'},
    {'name': '2023.07'},
    {'name': '2023.06'},
    {'name': '2023.05'},
    {'name': '2023.03'},
    {'name': '2023.01'},
    {'name': '2022'},
    {'name': '2021'},
    {'name': '2020'},
    {'name': '2019'},
    {'name': '2016'},
    {'name': '2015'},
    {'name': '2013'}]}},
 'Priority': {'name': 'Priority',
  'type': 'select',
  'select': {'options': [{'name': 'SuperHigh'},
    {'name': 'High'},
    {'name': 'Medium'},
    {'name': 'Low'},
    {'name': 'Waiting'}]}},
 'Type': {'name': 'Type',
  'type': 'multi_select',
  'multi_select': {'options': [{'name': 'Twee

In [42]:
# Names of the options already defined on the database's "Tags" and "Type"
# multi-select properties, collected in that order.
tags, types = (
    [option['name'] for option in database['properties'][prop_name]['multi_select']['options']]
    for prop_name in ('Tags', 'Type')
)

In [43]:
import requests, datetime
from bs4 import BeautifulSoup
from typing import Optional
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_community.document_loaders.html import UnstructuredHTMLLoader
from langchain_core.messages import SystemMessage
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.utils.function_calling import convert_to_openai_function, convert_to_openai_tool

In [44]:
# Schema of the structured data the model is asked to return for one web page.
# NOTE: the class docstring and every Field description are emitted into the
# OpenAI function definition (see the cell output below), so they are prompt
# text read by the model — edit them with that in mind.
class PageProperties(BaseModel):
    """Extract web page data"""
    title: str = Field(..., description='page title')
    short_title: str = Field(..., description='page title (short ver.)')
    # year / month are optional: the model may omit them when the page has no visible date.
    year: Optional[int] = Field(description='Posted year of the page')
    month: Optional[int] = Field(description='Posted month of the page')
    # date: Optional[datetime.date] = Field(description='Posted date of the page in format "%Y/%m/%d"')
    # `types` / `tags` below are the option-name lists read from the Notion database.
    # NOTE(review): in the generated schema the 'enum' ends up on the array itself,
    # next to 'type': 'array', not on its string items, and the descriptions also
    # invite new values — confirm which constraint is actually intended.
    types: list[str] = Field(..., description='page types in list. You can create new.', enum=types)
    tags: list[str] = Field(..., description='tags in list. You can create new in Camel case.', enum=tags)
    summary: str = Field(..., description='page content in Japanese.')
    # code: Optional[str] = Field(description='link url to the Code. (Github, GitLab, ...)')
    # pdf: Optional[str] = Field(description='link url to the PDF.')

# Convert the model class into an OpenAI function definition (a plain dict).
extract_func = convert_to_openai_function(PageProperties)
# Cell output: the generated function definition.
extract_func

{'name': 'PageProperties',
 'description': 'Extract web page data',
 'parameters': {'type': 'object',
  'properties': {'title': {'description': 'page title', 'type': 'string'},
   'short_title': {'description': 'page title (short ver.)', 'type': 'string'},
   'year': {'description': 'Posted year of the page', 'type': 'integer'},
   'month': {'description': 'Posted month of the page', 'type': 'integer'},
   'types': {'description': 'page types in list. You can create new.',
    'enum': ['Tweet',
     'Announce',
     'Slides',
     'Medium',
     'Web',
     'Qiita',
     'Zenn',
     'Notion',
     'note',
     'Book',
     'Paper',
     'GitHub',
     'HuggingFace',
     'Demo',
     'KAKEN',
     'Course',
     'Tutorial',
     'Resources',
     'GPTs',
     'Thought',
     'Youtube',
     'Webinar',
     'Kaggle',
     'Service',
     'MemberOnly'],
    'type': 'array',
    'items': {'type': 'string'}},
   'tags': {'description': 'tags in list. You can create new in Camel case.',
  

In [45]:
def load_doc(url, timeout=30):
    """Download ``url`` and return the text extracted from its HTML.

    The response body is written to a file inside a private temporary
    directory (removed again before returning), parsed with
    ``UnstructuredHTMLLoader``, and blank-line pairs in the extracted text are
    collapsed to single newlines.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The extracted page text, or ``''`` when the loader yields no document.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so
            an error page is never passed on as if it were the article.
        requests.RequestException: on connection problems or timeout.
    """
    import tempfile  # function-scope import keeps this cell self-contained

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # A fresh temporary directory avoids leaving a stray "temp.html" in the
    # working directory and avoids clashes between concurrent calls.
    with tempfile.TemporaryDirectory() as tmp_dir:
        html_path = os.path.join(tmp_dir, 'page.html')
        # Explicit UTF-8: the platform default encoding cannot represent
        # every page (e.g. Japanese text on a cp1252 locale).
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(response.text)
        docs = UnstructuredHTMLLoader(html_path).load()

    if not docs:
        return ''
    return docs[0].page_content.replace('\n\n', '\n')

def extract_urls(url, timeout=30):
    """Fetch ``url`` and return every ``<a>`` element that has an ``href``.

    Note that the result holds the parsed anchor elements as returned by
    ``BeautifulSoup.find_all``, not plain URL strings; read ``element['href']``
    to get the link target.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the call indefinitely.

    Returns:
        The collection of matching anchor elements (possibly empty).

    Raises:
        requests.RequestException: on connection problems or timeout.
    """
    # Browser-like User-Agent; presumably some sites reject the default client UA.
    headers = {
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    data = BeautifulSoup(response.text, 'html.parser')
    return data.find_all('a', href=True)

In [46]:
# Chat model used for the metadata extraction.
model = ChatOpenAI(model='gpt-3.5-turbo')
# Bind the extraction function AND force the model to call it. Without
# `function_call` the model is free to answer in plain text, in which case the
# downstream JsonOutputFunctionsParser has no function arguments to parse.
model_with_func = model.bind_functions([extract_func], function_call=extract_func['name'])

In [47]:
# System prompt: sets the model's role for the extraction task.
system_images_extract = '''\
You are a data extractor from a web page.
'''

# Human prompt: filled with the page URL ({source_url}) and the text
# extracted from the page ({web_text}).
template = '''\
## SourceURL
{source_url}

## WebText
{web_text}
'''

# Disabled prompt section: the links found in the page are currently not
# included in the prompt.
# ## URLsInPage
# {urls}

# Two-message chat prompt: the fixed system message followed by the templated human message.
chat_template = ChatPromptTemplate.from_messages([
    SystemMessage(content=system_images_extract),
    HumanMessagePromptTemplate.from_template(template),
])

In [48]:
def _to_int(value):
    """Return ``value`` as an int, or None when it is missing or not numeric."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


def create_page(x, database_id=None):
    """Build the Notion ``pages.create`` payload for one extracted web page.

    Args:
        x: Mapping with the extracted fields. ``short_title``, ``title``,
            ``types``, ``tags`` and ``summary`` are required. ``year`` and
            ``month`` are optional and may be absent or ``None``.
        database_id: ID of the database the page is created in. When omitted,
            the module-level ``INPUT`` database is used.

    Returns:
        A dict with ``parent`` and ``properties`` keys that can be passed as
        ``notion.pages.create(**page)``. The ``Year`` property is included
        only when a usable year is present:

        * year >= 2023 with a valid month (1-12): ``"YYYY.MM"`` and ``"YYYY"``
        * any other usable year: ``"YYYY"`` only

    Raises:
        KeyError: if one of the required fields is missing from ``x``.
    """
    page = {
        "parent": {"database_id": INPUT if database_id is None else database_id},
        "properties": {
            'Status': {'status': {'name': 'Imported'}},
            'Issue': {'title': [{'text': {'content': x['short_title']}}]},
            'Type': {"multi_select": [{'name': typ} for typ in x['types']]},
            'Tags': {"multi_select": [{'name': tag} for tag in x['tags']]},
            'Abstract': {'rich_text': [{'text': {'content': x['summary']}}]},
            'Title': {'rich_text': [{'text': {'content': x['title']}}]},
        }
    }

    # The model may omit year/month or return null for them, so never index
    # or compare the raw values directly.
    year = _to_int(x.get('year'))
    if year is not None:
        month = _to_int(x.get('month'))
        year_options = []
        # Monthly "YYYY.MM" options are only kept for 2023 onward; older
        # pages are tagged by year alone.
        if year >= 2023 and month is not None and 1 <= month <= 12:
            year_options.append({'name': f"{year:04d}.{month:02d}"})
        # The plain year is always recorded, including for recent pages whose
        # month is unknown.
        year_options.append({'name': str(year)})
        page['properties']['Year'] = {"multi_select": year_options}
    return page

In [49]:
# End-to-end pipeline: URL string in, Notion page payload out.
#   1. fan the URL out into the prompt inputs,
#   2. render the chat prompt and call the function-bound model,
#   3. parse the function-call arguments as JSON,
#   4. turn them into a `pages.create` payload.
# 'urls' is still computed although the active prompt template has no {urls} placeholder.
chain = (
    {
        'source_url': RunnablePassthrough(),
        'web_text': RunnableLambda(load_doc),
        'urls': RunnableLambda(extract_urls),
    }
    | chat_template
    | model_with_func
    | JsonOutputFunctionsParser()
    | RunnableLambda(create_page)
)

In [50]:
# Page to import; the commented-out line is an alternative sample URL.
# url = 'https://zenn.dev/ysksatoo/articles/66fd26893a6cdd'
url = 'https://juu7g.hatenablog.com/entry/Python/blog/notion-api#%E3%83%87%E3%83%BC%E3%82%BF%E3%83%99%E3%83%BC%E3%82%B9%E3%81%AE%E3%83%97%E3%83%AD%E3%83%91%E3%83%86%E3%82%A3%E3%81%AB%E5%90%88%E3%81%A3%E3%81%9F%E3%83%97%E3%83%AD%E3%83%91%E3%83%86%E3%82%A3%E5%80%A4%E3%81%AE%E4%BD%9C%E3%82%8A%E6%96%B9'
# Run the whole pipeline (page download, model call, payload construction) for this URL.
res = chain.invoke(url)

In [51]:
# Inspect the generated page payload before it is sent to Notion.
res

{'parent': {'database_id': 'b97a725c57d74939b9b414aeb57979a7'},
 'properties': {'Status': {'status': {'name': 'Imported'}},
  'Issue': {'title': [{'text': {'content': 'PythonでのNotion API'}}]},
  'Type': {'multi_select': [{'name': 'Web'}]},
  'Tags': {'multi_select': [{'name': 'Python'},
    {'name': 'Notion'},
    {'name': 'API'}]},
  'Abstract': {'rich_text': [{'text': {'content': 'PythonでNotion APIを操作する方法について紹介しています。記事ではnotion-clientパッケージを使用して、Notionデータベースにデータを追加、更新、削除する方法やデータベースのプロパティ値の作り方などについて解説しています。また、Notion APIの各種設定値や環境変数の設定方法、実際のプログラムの実践例も紹介しています。'}}]},
  'Title': {'rich_text': [{'text': {'content': 'PythonでのNotion APIの使い方'}}]},
  'Year': {'multi_select': [{'name': '2022'}]}}}

In [52]:
# Create the page in Notion; `res` supplies the `parent` and `properties` keyword arguments.
# NOTE(review): nothing here checks for an existing entry, so re-running this cell
# presumably creates another page for the same URL — confirm before re-executing.
notion.pages.create(**res)

{'object': 'page',
 'id': '56121ddc-ce3d-46dd-aa8a-7384db57cee2',
 'created_time': '2024-05-20T23:37:00.000Z',
 'last_edited_time': '2024-05-20T23:37:00.000Z',
 'created_by': {'object': 'user',
  'id': '79657b4f-a71f-4ee8-8fc7-b5b4d6e6f385'},
 'last_edited_by': {'object': 'user',
  'id': '79657b4f-a71f-4ee8-8fc7-b5b4d6e6f385'},
 'cover': None,
 'icon': None,
 'parent': {'type': 'database_id',
  'database_id': 'b97a725c-57d7-4939-b9b4-14aeb57979a7'},
 'archived': False,
 'in_trash': False,
 'properties': {'URL': {'id': 'CcZw', 'type': 'url', 'url': None},
  'Year': {'id': 'ECqv',
   'type': 'multi_select',
   'multi_select': [{'id': 'e542b25a-23ba-4a6f-bae2-f4746380969f',
     'name': '2022',
     'color': 'red'}]},
  'Priority': {'id': 'G%60Y%7B', 'type': 'select', 'select': None},
  'Type': {'id': 'UZhN',
   'type': 'multi_select',
   'multi_select': [{'id': 'badbe344-c582-4ad1-882d-a6e78c96530a',
     'name': 'Web',
     'color': 'brown'}]},
  'Reports': {'id': '%5CxHy',
   'type': '