In [9]:
from dotenv import load_dotenv, find_dotenv; load_dotenv(find_dotenv())
import os, notion_client, requests, datetime
from bs4 import BeautifulSoup
from typing import Optional
from operator import itemgetter
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_community.document_loaders.html import UnstructuredHTMLLoader
from langchain_core.messages import SystemMessage
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.utils.function_calling import convert_to_openai_function, convert_to_openai_tool

# Notion API client, authenticated with the integration token taken from the environment.
notion = notion_client.Client(auth=os.environ['NOTION_TOKEN'])
# ID of the database that is queried for not-yet-checked clipped items.
CLIP = os.environ['CLIP_DATABASE_ID']
# ID of the database whose schema supplies the tag/type options and that receives the new pages.
INPUT = os.environ['INPUT_DATABASE_ID']

In [10]:
# Retrieve the schema of the target database once, then read the names of the
# options currently defined on its "Tags" and "Type" multi-select properties.
input_database = notion.databases.retrieve(database_id=INPUT)
_schema = input_database['properties']
tags = list(map(itemgetter('name'), _schema['Tags']['multi_select']['options']))
types = list(map(itemgetter('name'), _schema['Type']['multi_select']['options']))

In [11]:
# Schema handed to the chat model as an OpenAI function definition.
# The class docstring and every Field `description` end up in the generated
# function schema (see the cell output), so they are prompt text, not mere docs.
class PageProperties(BaseModel):
    """Extract web page data"""
    # Required long and short forms of the page title.
    title: str = Field(..., description='page title (long ver.)')
    short_title: str = Field(..., description='page title (short ver.)')
    # Optional publication date parts; declared without a default.
    year: Optional[int] = Field(description='Posted year of the page')
    month: Optional[int] = Field(description='Posted month of the page')
    # date: Optional[datetime.date] = Field(description='Posted date of the page in format "%Y/%m/%d"')
    # `enum` is filled with the option names read from the Notion database schema.
    # NOTE(review): in the generated schema `enum` is attached to the array itself,
    # not to its items, while the description invites new values - confirm this is intended.
    types: list[str] = Field(..., description='page types in list. You can create new.', enum=types)
    tags: list[str] = Field(..., description='tags in list. You can create new in Camel case.', enum=tags)
    # Required summary; the description asks the model to write it in Japanese.
    summary: str = Field(..., description='page content in Japanese.')
    # code: Optional[str] = Field(description='link url to the Code. (Github, GitLab, ...)')
    # pdf: Optional[str] = Field(description='link url to the PDF.')

# Convert the model class into an OpenAI function schema; the bare name on the
# last line displays the resulting dict as the cell output.
extract_func = convert_to_openai_function(PageProperties)
extract_func

{'name': 'PageProperties',
 'description': 'Extract web page data',
 'parameters': {'type': 'object',
  'properties': {'title': {'description': 'page title (long ver.)',
    'type': 'string'},
   'short_title': {'description': 'page title (short ver.)', 'type': 'string'},
   'year': {'description': 'Posted year of the page', 'type': 'integer'},
   'month': {'description': 'Posted month of the page', 'type': 'integer'},
   'types': {'description': 'page types in list. You can create new.',
    'enum': ['Tweet',
     'Announce',
     'Slides',
     'Medium',
     'Web',
     'Qiita',
     'Zenn',
     'Notion',
     'note',
     'Book',
     'Paper',
     'GitHub',
     'HuggingFace',
     'Demo',
     'KAKEN',
     'Course',
     'Tutorial',
     'Resources',
     'GPTs',
     'Thought',
     'Youtube',
     'Webinar',
     'Kaggle',
     'Service',
     'MemberOnly',
     'Note',
     'Blog'],
    'type': 'array',
    'items': {'type': 'string'}},
   'tags': {'description': 'tags in li

In [12]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def get_tweet_content_and_links(tweet_url):
    """Render a tweet page in headless Chrome and return its text and links.

    Args:
        tweet_url: URL of the tweet page to open.

    Returns:
        A ``(tweet_text, tweet_links)`` tuple. ``tweet_text`` is the text of the
        first ``div`` with ``data-testid="tweetText"`` (``""`` when there is
        none); ``tweet_links`` lists the ``href`` values of the anchors inside
        that element.

    Raises:
        selenium.common.exceptions.TimeoutException: if no ``article`` element
            becomes visible within 15 seconds.
    """
    options = Options()
    for flag in ('--headless', '--disable-gpu', '--no-sandbox'):
        options.add_argument(flag)

    # No explicit ChromeDriver path is passed here.
    driver = webdriver.Chrome(options=options)
    try:
        # Open the URL.
        driver.get(tweet_url)

        # Wait (at most 15 seconds) until an <article> element is visible.
        WebDriverWait(driver, 15).until(
            EC.visibility_of_element_located((By.TAG_NAME, 'article'))
        )

        # Grab the rendered HTML while the browser is still alive.
        page_source = driver.page_source
    finally:
        # Always shut the browser down, even when loading or waiting fails;
        # otherwise every failed URL leaves a Chrome process behind.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    # Extract the tweet body and the links it contains.
    tweet_text = ""
    tweet_links = []
    tweet_container = soup.find("div", {"data-testid": "tweetText"})
    if tweet_container:
        tweet_text = tweet_container.get_text()
        # href=True skips anchors without an href instead of raising KeyError.
        tweet_links = [a['href'] for a in tweet_container.find_all("a", href=True)]

    return tweet_text, tweet_links

In [13]:
def extract_urls(url):
    """Download ``url`` and collect every anchor element that carries an ``href``.

    Args:
        url: Address of the page to fetch.

    Returns:
        The result of ``find_all('a', href=True)`` on the parsed page, i.e. the
        matching anchor tags themselves (not plain URL strings).
    """
    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
    page = requests.get(url, headers={'User-Agent': user_agent})
    parsed = BeautifulSoup(page.text, 'html.parser')
    return parsed.find_all('a', href=True)

def extract_text(url):
    """Fetch ``url`` and return its text as extracted by ``UnstructuredHTMLLoader``.

    The downloaded HTML is written to ``.temp.html`` in the current working
    directory (overwritten on every call) and then loaded from that path.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``page_content`` of the first loaded document, with each blank-line
        separator (``"\\n\\n"``) collapsed to a single newline.
    """
    response = requests.get(url)
    # Write as UTF-8 explicitly: open() otherwise uses the locale's preferred
    # encoding, which can fail or garble non-ASCII pages on non-UTF-8 systems.
    with open('.temp.html', 'w', encoding='utf-8') as f:
        f.write(response.text)
    loader = UnstructuredHTMLLoader(".temp.html")
    docs = loader.load()
    content = docs[0].page_content.replace('\n\n', '\n')
    return content

In [14]:
def load_web_text(url: str):
    """Return the text content for ``url``, choosing the scraper by URL prefix.

    Tweet URLs (``https://x.com/`` and the legacy ``https://twitter.com/``
    host) are rendered through ``get_tweet_content_and_links``; every other
    URL goes through ``extract_text``.

    Args:
        url: Address of the page to read.

    Returns:
        The extracted text. Links found inside a tweet are not returned.
    """
    tweet_prefixes = ('https://x.com/', 'https://twitter.com/')
    if url.startswith(tweet_prefixes):
        # Only the text is used downstream; the tweet's links are discarded.
        content, _links = get_tweet_content_and_links(url)
        return content
    return extract_text(url)

def create_page(data):
    """Build the Notion "create page" payload for one extracted web page.

    Args:
        data: Mapping with two keys:
            ``'url'``  - source URL stored in the ``URL`` property;
            ``'page'`` - dict of extracted fields (``title``, ``short_title``,
            ``types``, ``tags``, ``summary`` and optionally ``year``/``month``).
            ``types`` and ``tags`` may each be a list or a single value.

    Returns:
        A dict with ``parent`` and ``properties`` targeting the ``INPUT``
        database. A ``Year`` multi-select is added only when a year is known:
        it always holds the plain year and, for years from 2023 on with a known
        month, additionally a ``YYYY.MM`` option placed first.
    """
    url = data['url']
    x = data['page']

    def _as_options(value):
        # Accept either a list of names or a single name.
        names = value if isinstance(value, list) else [value]
        return [{'name': name} for name in names]

    page = {
        "parent": {"database_id": INPUT},
        "properties": {
            'Status': {'status': {'name': 'Not started'}},
            'Import': {'checkbox': True},
            'URL': {'url': url},
            'Issue': {'title': [{'text': {'content': x['short_title']}}]},
            'Type': {"multi_select": _as_options(x['types'])},
            'Tags': {"multi_select": _as_options(x['tags'])},
            'Abstract': {'rich_text': [{'text': {'content': x['summary']}}]},
            'Title': {'rich_text': [{'text': {'content': x['title']}}]},
        }
    }

    year = x.get('year')
    if year:
        year_options = []
        month = x.get('month')
        # The month-level option is only added for 2023 onwards, and only when
        # the month is actually known (a missing or None month used to either
        # drop the year entirely or yield an option such as "2023.None").
        if year >= 2023 and month:
            year_options.append({'name': f"{str(year).zfill(4)}.{str(month).zfill(2)}"})
        year_options.append({'name': str(year)})
        page['properties']['Year'] = {"multi_select": year_options}
    return page

# Chat model with the extraction schema bound as an OpenAI function.
model = ChatOpenAI(model='gpt-3.5-turbo')
model_with_func = model.bind_functions([extract_func])

# System prompt for the extraction call.
system_images_extract = '''\
You are a data extractor from a web page.
'''

# Human message template; the placeholders are filled by the first chain step.
template = '''\
## SourceURL
{source_url}

## WebText
{web_text}
'''

chat_template = ChatPromptTemplate.from_messages([
    SystemMessage(content=system_images_extract),
    HumanMessagePromptTemplate.from_template(template),
])

# Pipeline: URL -> {source_url, web_text} -> {url, page} -> Notion page payload.
# Only the two keys the prompt template reads are computed; a former 'urls'
# entry re-downloaded every page and its result was never consumed.
chain = (
    {
        'source_url': RunnablePassthrough(),
        'web_text': RunnableLambda(load_web_text),
    }
    # Keeps the left-hand side a Runnable: without it the next `|` would be a
    # plain dict union of the two literals instead of a chain step.
    | RunnablePassthrough()
    | {
        'url': itemgetter('source_url'),
        'page': chat_template | model_with_func | JsonOutputFunctionsParser(),
    }
    | RunnableLambda(create_page)
)

In [15]:
from tqdm import tqdm
# Fetch the clipped items whose "Check" box is still unticked.
items = notion.databases.query(database_id=CLIP, filter={
    "property": "Check", "checkbox": {"equals": False}
})

# Convert each clipped item into a page in the INPUT database, carrying its
# child blocks over, and remove the clipped item once the new page exists.
for item in tqdm(items['results']):
    url = None
    try:
        url = item['properties']['URL']['url']
        page = chain.invoke(url)
        blocks = notion.blocks.children.list(block_id=item['id'])
        # NOTE(review): blocks are re-submitted exactly as returned by the API.
        # The recorded run shows validation errors for image blocks without an
        # `external` source - presumably Notion-hosted uploads; consider
        # converting or dropping such blocks before creating the page.
        page['children'] = blocks['results']
        notion.pages.create(**page)
        # Reached only when the page was created successfully.
        notion.blocks.delete(block_id=item['id'])
    except Exception as e:
        # Best-effort batch: keep going, but report which item failed and with
        # what kind of error. tqdm.write keeps the progress bar intact.
        tqdm.write(f"[{item.get('id')}] {url}: {type(e).__name__}: {e}")

  1%|▏         | 1/69 [00:06<07:41,  6.79s/it]

body failed validation: body.children[0].image.external should be defined, instead was `undefined`.


  3%|▎         | 2/69 [00:30<18:35, 16.64s/it]

body failed validation: body.children[24].image.external.url.length should be ≤ `2000`, instead was `10738`.


  4%|▍         | 3/69 [00:33<11:23, 10.36s/it]

'>=' not supported between instances of 'NoneType' and 'int'


  6%|▌         | 4/69 [00:40<09:42,  8.97s/it]

'>=' not supported between instances of 'NoneType' and 'int'


  7%|▋         | 5/69 [00:56<12:25, 11.64s/it]

Message: 
Stacktrace:
#0 0x5575ba9bb5e3 <unknown>
#1 0x5575ba67e0b7 <unknown>
#2 0x5575ba6cbf53 <unknown>
#3 0x5575ba6cc051 <unknown>
#4 0x5575ba7119c4 <unknown>
#5 0x5575ba6f2f1d <unknown>
#6 0x5575ba70eb3d <unknown>
#7 0x5575ba6f2cc3 <unknown>
#8 0x5575ba6be0e4 <unknown>
#9 0x5575ba6bf0ae <unknown>
#10 0x5575ba981ce1 <unknown>
#11 0x5575ba985b7e <unknown>
#12 0x5575ba96f4b5 <unknown>
#13 0x5575ba9867d6 <unknown>
#14 0x5575ba952dbf <unknown>
#15 0x5575ba9a9748 <unknown>
#16 0x5575ba9a9917 <unknown>
#17 0x5575ba9ba773 <unknown>
#18 0x7ff3bf3bd609 start_thread



  9%|▊         | 6/69 [01:05<11:23, 10.84s/it]

body failed validation: body.children[0].image.external should be defined, instead was `undefined`.


 10%|█         | 7/69 [01:10<09:08,  8.84s/it]

body failed validation: body.children[0].image.external should be defined, instead was `undefined`.


 12%|█▏        | 8/69 [01:17<08:16,  8.13s/it]

Content creation Failed. Fix the following: 
Invalid image url.
Invalid image url.
Invalid image url.
Invalid image url.
Invalid image url.


 14%|█▍        | 10/69 [01:35<08:28,  8.62s/it]

Content creation Failed. Fix the following: 
Invalid image url.
Invalid image url.


 46%|████▋     | 32/69 [05:04<04:56,  8.01s/it]

body failed validation: body.children[0].image.external should be defined, instead was `undefined`.


 54%|█████▎    | 37/69 [05:37<03:38,  6.84s/it]

body failed validation: body.children[0].image.external should be defined, instead was `undefined`.


 64%|██████▍   | 44/69 [06:31<03:10,  7.64s/it]

body failed validation: body.children[0].image.external should be defined, instead was `undefined`.


 65%|██████▌   | 45/69 [06:43<03:32,  8.86s/it]

body failed validation: body.children[20].image.external should be defined, instead was `undefined`.


 67%|██████▋   | 46/69 [06:45<02:37,  6.85s/it]

Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 24343 tokens (23173 in the messages, 1170 in the functions). Please reduce the length of the messages or functions.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


 68%|██████▊   | 47/69 [06:56<02:54,  7.94s/it]

body failed validation: body.children[0].image.external should be defined, instead was `undefined`.


 72%|███████▏  | 50/69 [07:39<03:55, 12.37s/it]

body failed validation: body.children[35].image.external should be defined, instead was `undefined`.


 80%|███████▉  | 55/69 [08:20<01:55,  8.26s/it]

body failed validation: body.children[0].image.external should be defined, instead was `undefined`.


 96%|█████████▌| 66/69 [09:47<00:19,  6.37s/it]

Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 29798 tokens (28628 in the messages, 1170 in the functions). Please reduce the length of the messages or functions.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


 97%|█████████▋| 67/69 [09:50<00:10,  5.31s/it]

Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 41518 tokens (40348 in the messages, 1170 in the functions). Please reduce the length of the messages or functions.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


100%|██████████| 69/69 [10:09<00:00,  8.84s/it]
