In [1]:
from dotenv import load_dotenv, find_dotenv; load_dotenv(find_dotenv())
import os, notion_client, requests, datetime
from bs4 import BeautifulSoup
from typing import Optional
from operator import itemgetter
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_community.document_loaders.html import UnstructuredHTMLLoader
from langchain_core.messages import SystemMessage
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.utils.function_calling import convert_to_openai_function, convert_to_openai_tool

# Notion API client; the integration token is read from the environment
# (which load_dotenv above may have populated from a .env file).
notion = notion_client.Client(auth=os.environ['NOTION_TOKEN'])
# ID of the Notion database that is queried for not-yet-checked clipped items.
CLIP = os.environ['CLIP_DATABASE_ID']
# ID of the Notion database that supplies the tag/type options and receives the created pages.
INPUT = os.environ['INPUT_DATABASE_ID']

In [2]:
# Retrieve the destination database schema and collect the option names that
# already exist on its 'Tags' and 'Type' multi-select properties; these lists
# are later attached to the extraction schema as suggested values.
input_database = notion.databases.retrieve(database_id=INPUT)
tags = [o['name'] for o in input_database['properties']['Tags']['multi_select']['options']]
types = [o['name'] for o in input_database['properties']['Type']['multi_select']['options']]

In [3]:
class PageProperties(BaseModel):
    """Extract web page data"""
    # NOTE: the docstring above and every Field description below are runtime
    # data, not just documentation: convert_to_openai_function copies them into
    # the function schema sent to the model, so edit them deliberately.
    # Long and short variants of the page title.
    title: str = Field(..., description='page title (long ver.)')
    short_title: str = Field(..., description='page title (short ver.)')
    # Publication year/month; optional, so the model may omit them.
    year: Optional[int] = Field(description='Posted year of the page')
    month: Optional[int] = Field(description='Posted month of the page')
    # date: Optional[datetime.date] = Field(description='Posted date of the page in format "%Y/%m/%d"')
    # `enum` is seeded with the option names read from the Notion database.
    # NOTE(review): in the rendered schema the 'enum' key sits next to
    # 'type': 'array' rather than inside 'items', so it presumably does not
    # constrain individual elements -- consistent with "You can create new",
    # but confirm this is the intended schema.
    types: list[str] = Field(..., description='page types in list. You can create new.', enum=types)
    tags: list[str] = Field(..., description='tags in list. You can create new in Camel case.', enum=tags)
    # The description asks the model to write the page content in Japanese.
    summary: str = Field(..., description='page content in Japanese.')
    # code: Optional[str] = Field(description='link url to the Code. (Github, GitLab, ...)')
    # pdf: Optional[str] = Field(description='link url to the PDF.')

# Convert the pydantic model into an OpenAI function-calling schema and
# display it as the cell output for inspection.
extract_func = convert_to_openai_function(PageProperties)
extract_func

{'name': 'PageProperties',
 'description': 'Extract web page data',
 'parameters': {'type': 'object',
  'properties': {'title': {'description': 'page title (long ver.)',
    'type': 'string'},
   'short_title': {'description': 'page title (short ver.)', 'type': 'string'},
   'year': {'description': 'Posted year of the page', 'type': 'integer'},
   'month': {'description': 'Posted month of the page', 'type': 'integer'},
   'types': {'description': 'page types in list. You can create new.',
    'enum': ['Tweet',
     'Announce',
     'Slides',
     'Medium',
     'Web',
     'Qiita',
     'Zenn',
     'Notion',
     'note',
     'Book',
     'Paper',
     'GitHub',
     'HuggingFace',
     'Demo',
     'KAKEN',
     'Course',
     'Tutorial',
     'Resources',
     'GPTs',
     'Thought',
     'Youtube',
     'Webinar',
     'Kaggle',
     'Service',
     'MemberOnly',
     'Note'],
    'type': 'array',
    'items': {'type': 'string'}},
   'tags': {'description': 'tags in list. You can c

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def get_tweet_content_and_links(tweet_url):
    """Render a tweet in headless Chrome and return its text and embedded links.

    Args:
        tweet_url: URL of the tweet page to open.

    Returns:
        A ``(tweet_text, tweet_links)`` tuple. ``tweet_text`` is the text of the
        first ``div[data-testid="tweetText"]`` element and ``tweet_links`` the
        ``href`` values of the anchors inside it. ``("", [])`` is returned when
        the page contains no such element.

    Raises:
        Whatever Selenium raises when the browser cannot be started, the page
        cannot be loaded, or no ``<article>`` becomes visible within 15 seconds.
    """
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')

    # No explicit driver path is given; Selenium is left to locate ChromeDriver.
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(tweet_url)

        # Wait (up to 15 seconds) until an <article> element is visible.
        WebDriverWait(driver, 15).until(EC.visibility_of_element_located((By.TAG_NAME, 'article')))

        page_source = driver.page_source
    finally:
        # Always shut the browser down, even when loading or waiting fails;
        # otherwise every failed tweet leaves a headless Chrome process behind.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')
    tweet_container = soup.find("div", {"data-testid": "tweetText"})
    if tweet_container is None:
        return "", []

    tweet_text = tweet_container.get_text()
    # href=True skips anchors that carry no href attribute (would be a KeyError).
    tweet_links = [anchor['href'] for anchor in tweet_container.find_all("a", href=True)]
    return tweet_text, tweet_links

In [5]:
def extract_urls(url, timeout=30):
    """Fetch a page and return every anchor element that has an ``href``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the notebook indefinitely.

    Returns:
        The BeautifulSoup result of ``find_all('a', href=True)``, i.e. the
        anchor elements themselves rather than plain URL strings.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    headers = {
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    data = BeautifulSoup(response.text, 'html.parser')
    return data.find_all('a', href=True)

def extract_text(url, timeout=30):
    """Download a page and return its text as extracted by UnstructuredHTMLLoader.

    The HTML is written to a uniquely named temporary file (the loader takes a
    file path), which is removed again once loading has finished.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page content of the first loaded document with every blank-line
        separator (``'\\n\\n'``) collapsed to a single newline, or ``''`` when
        the loader yields no document.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    import tempfile  # local import so this block does not rely on a new top-level import

    response = requests.get(url, timeout=timeout)

    fd, temp_path = tempfile.mkstemp(suffix='.html')
    try:
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent characters that occur in the downloaded page.
        with os.fdopen(fd, 'w', encoding='utf-8') as f:
            f.write(response.text)
        docs = UnstructuredHTMLLoader(temp_path).load()
    finally:
        os.remove(temp_path)

    if not docs:
        return ''
    return docs[0].page_content.replace('\n\n', '\n')

In [6]:
def load_web_text(url: str):
    """Return the textual content for ``url``.

    URLs beginning with ``https://x.com/`` are read through the Selenium based
    tweet scraper; every other URL goes through ``extract_text``. Only the text
    is returned -- links found by the tweet scraper are discarded.
    """
    if not url.startswith('https://x.com/'):
        # links = extract_urls(url)
        return extract_text(url)

    tweet_text, _tweet_links = get_tweet_content_and_links(url)
    return tweet_text

def create_page(data):
    """Build the keyword arguments for creating a page in the INPUT database.

    Args:
        data: Mapping with two keys:
            ``'url'``  -- the source URL, stored in the ``URL`` property;
            ``'page'`` -- the extracted fields: ``title``, ``short_title``,
            ``summary``, ``types`` and ``tags`` (each a list of names or a
            single name), and optionally ``year`` and ``month``.

    Returns:
        A dict with ``parent`` and ``properties`` entries. A ``Year``
        multi-select is added only when a year is available: it always holds
        the plain year, preceded by a ``YYYY.MM`` entry when the year is 2023
        or later and a month is available.
    """
    url = data['url']
    x = data['page']

    def _as_options(value):
        # A single bare name is accepted as well as a list of names.
        names = value if isinstance(value, list) else [value]
        return [{'name': name} for name in names]

    page = {
        "parent": { "database_id": INPUT},
        "properties": {
            'Status': {'status': {'name': 'Not started'}},
            'Import': {'checkbox': True},
            'URL': {'url': url},
            'Issue': {'title': [{'text': {'content': x['short_title']}}]},
            'Type': {"multi_select": _as_options(x['types'])},
            'Tags': {"multi_select": _as_options(x['tags'])},
            'Abstract': {'rich_text': [{'text': {'content': x['summary']}}]},
            'Title': {'rich_text': [{'text': {'content': x['title']}}]},
        }
    }

    # Both fields are optional in the extraction schema, so they may be
    # missing or None; comparing None with an int would raise TypeError.
    year = x.get('year')
    month = x.get('month')
    if year is not None:
        year_options = []
        if year >= 2023 and month is not None:
            year_options.append({'name': f"{str(year).zfill(4)}.{str(month).zfill(2)}"})
        # The plain year is always recorded, including for recent years that
        # come without a month (previously those ended up with no Year at all).
        year_options.append({'name': str(year)})
        page['properties']['Year'] = {"multi_select": year_options}
    return page

# Chat model bound to the PageProperties function schema. function_call pins
# the model to that function: the JsonOutputFunctionsParser at the end of the
# chain can only parse a reply that actually contains a function call.
model = ChatOpenAI(model='gpt-3.5-turbo')
model_with_func = model.bind_functions([extract_func], function_call=extract_func['name'])

system_images_extract = '''\
You are a data extractor from a web page.
'''

template = '''\
## SourceURL
{source_url}

## WebText
{web_text}
'''

chat_template = ChatPromptTemplate.from_messages([
    SystemMessage(content=system_images_extract),
    HumanMessagePromptTemplate.from_template(template),
])

# Pipeline: URL -> {source_url, web_text} -> {url, page (model output)} -> Notion page payload.
#
# The former 'urls' branch (extract_urls) was dropped: the prompt only consumes
# 'source_url' and 'web_text', so that branch cost one extra HTTP request per
# item and could fail the whole chain without contributing anything.
#
# The bare RunnablePassthrough() between the two dict literals is required:
# it turns the first dict into a runnable. Without it `dict | dict` would be
# evaluated by Python as a plain dictionary merge instead of a pipeline step.
chain = {
    'source_url': RunnablePassthrough(),
    'web_text': RunnableLambda(load_web_text),
} | RunnablePassthrough() \
| {
    'url': itemgetter('source_url'),
    'page': chat_template | model_with_func | JsonOutputFunctionsParser()
}\
| RunnableLambda(create_page)

In [7]:
from tqdm import tqdm

# Clipped items whose "Check" checkbox is still unticked.
# NOTE(review): a single query call is made and the recorded progress bar
# shows 100 items, so presumably only one page of results is handled per run
# -- confirm whether the remaining pages should be fetched as well.
items = notion.databases.query(database_id=CLIP, filter={
    "property": "Check","checkbox":{"equals": False}
})
for item in tqdm(items['results']):
    url = None  # kept for the error message if reading the property itself fails
    try:
        url = item['properties']['URL']['url']
        if not url:
            # Nothing to fetch; say so explicitly instead of failing deep inside the chain.
            tqdm.write(f"[skipped] id={item['id']}: URL property is empty")
            continue
        page = chain.invoke(url)
        # Carry the clipped item's child blocks over to the new page unchanged.
        # The recorded output shows the create call rejecting some image
        # blocks; in that case the exception below fires before the delete,
        # so the source item is kept.
        blocks = notion.blocks.children.list(block_id=item['id'])
        page['children'] = blocks['results']
        notion.pages.create(**page)
        # Remove the source item only after the new page was created successfully.
        notion.blocks.delete(block_id=item['id'])
    except Exception as e:
        # Best-effort batch: report which item failed and carry on with the
        # next one. tqdm.write keeps the message from garbling the progress bar.
        tqdm.write(f"[failed] id={item['id']} url={url}: {e}")

  1%|          | 1/100 [00:07<11:44,  7.12s/it]

body failed validation: body.children[0].image.external should be defined, instead was `undefined`.


 14%|█▍        | 14/100 [02:11<20:21, 14.20s/it]

body failed validation: body.children[24].image.external.url.length should be ≤ `2000`, instead was `10738`.


 16%|█▌        | 16/100 [02:24<14:37, 10.45s/it]

In [None]:
# Debug aid: display the child-block listing most recently fetched by the processing loop.
blocks

{'object': 'list',
 'results': [{'object': 'block',
   'id': '04586813-aee7-4ab5-9ffe-ff49fb09f6c5',
   'parent': {'type': 'page_id',
    'page_id': '525e3deb-c6c7-4cdf-b884-5414fc68be9b'},
   'created_time': '2024-05-19T15:07:00.000Z',
   'last_edited_time': '2024-05-19T15:07:00.000Z',
   'created_by': {'object': 'user',
    'id': 'f51f285e-16d4-41fe-ab93-750d33107cb3'},
   'last_edited_by': {'object': 'user',
    'id': 'f51f285e-16d4-41fe-ab93-750d33107cb3'},
   'has_children': False,
   'archived': False,
   'in_trash': False,
   'type': 'image',
   'image': {'caption': [],
    'type': 'file',
    'file': {'url': 'https://prod-files-secure.s3.us-west-2.amazonaws.com/36f2205a-559e-4a1f-973b-38bea623c469/42bd8dc6-a30f-4ddc-ac65-a87cb49b5144/slide_0.jpg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIAT73L2G45HZZMZUHI%2F20240521%2Fus-west-2%2Fs3%2Faws4_request&X-Amz-Date=20240521T013751Z&X-Amz-Expires=3600&X-Amz-Signature=c8f1c141f7d39a6434b20