In [1]:
from dotenv import load_dotenv, find_dotenv; load_dotenv(find_dotenv())
import os, notion_client, requests, datetime, pytz
from bs4 import BeautifulSoup
from typing import Optional
from operator import itemgetter
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_community.document_loaders.html import UnstructuredHTMLLoader
from langchain_core.messages import SystemMessage
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.utils.function_calling import convert_to_openai_function, convert_to_openai_tool

notion = notion_client.Client(auth=os.environ['NOTION_TOKEN'])
CLIP = os.environ['CLIP_DATABASE_ID']
INPUT = os.environ['INPUT_DATABASE_ID']

In [2]:
input_database = notion.databases.retrieve(database_id=INPUT)
tags = [o['name'] for o in input_database['properties']['Tags']['multi_select']['options']]
types = [o['name'] for o in input_database['properties']['Type']['multi_select']['options']]

In [3]:
class PageProperties(BaseModel):
    """Extract web page data"""
    title: str = Field(..., description='page title (long ver.)')
    short_title: str = Field(..., description='page title (short ver.)')
    year: Optional[int] = Field(description='Posted year of the page')
    month: Optional[int] = Field(description='Posted month of the page')
    # date: Optional[datetime.date] = Field(description='Posted date of the page in format "%Y/%m/%d"')
    types: list[str] = Field(..., description='page types in list.', enum=types)
    tags: list[str] = Field(..., description='tags in list.', enum=tags)
    summary: str = Field(..., description='detailed summary in Japanese.')
    # code: Optional[str] = Field(description='link url to the Code. (Github, GitLab, ...)')
    # pdf: Optional[str] = Field(description='link url to the PDF.')

extract_func = convert_to_openai_function(PageProperties)
extract_func

{'name': 'PageProperties',
 'description': 'Extract web page data',
 'parameters': {'type': 'object',
  'properties': {'title': {'description': 'page title (long ver.)',
    'type': 'string'},
   'short_title': {'description': 'page title (short ver.)', 'type': 'string'},
   'year': {'description': 'Posted year of the page', 'type': 'integer'},
   'month': {'description': 'Posted month of the page', 'type': 'integer'},
   'types': {'description': 'page types in list.',
    'enum': ['Tweet',
     'Announce',
     'Slides',
     'Medium',
     'Web',
     'Qiita',
     'Zenn',
     'Notion',
     'note',
     'Book',
     'Paper',
     'GitHub',
     'HuggingFace',
     'Demo',
     'KAKEN',
     'Course',
     'Tutorial',
     'Resources',
     'GPTs',
     'Thought',
     'Youtube',
     'Webinar',
     'Kaggle',
     'Service',
     'MemberOnly',
     'Blog'],
    'type': 'array',
    'items': {'type': 'string'}},
   'tags': {'description': 'tags in list.',
    'enum': ['LLM',
     'L

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def get_tweet_content_and_links(tweet_url):
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')

    # ChromeDriverのパスを指定
    driver = webdriver.Chrome(options=options)
    
    # URLにアクセス
    driver.get(tweet_url)

    # articleタグが読み込まれるまで待機（最大15秒）
    WebDriverWait(driver, 15).until(EC.visibility_of_element_located((By.TAG_NAME, 'article')))

    # ページのソースを取得して解析
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    
    # Tweet本文とリンクを取得
    tweet_text = ""
    tweet_links = []
    tweet_container = soup.find("div", {"data-testid": "tweetText"})
    if tweet_container:
        tweet_text = tweet_container.get_text()
        links = tweet_container.find_all("a")
        for link in links:
            tweet_links.append(link['href'])
    
    # Tweet投稿日時を取得
    datetime_as = soup.findAll('a', attrs={'aria-label': True, 'role': 'link'})

    # 日時の抽出
    dt_jst = None
    for datetime_a in datetime_as:
        if datetime_a:
            time_tag = datetime_a.find('time')
            if time_tag:
                time_tag_datetime = time_tag.get('datetime')
                dt_utc = datetime.datetime.fromisoformat(time_tag_datetime.replace("Z", "+00:00"))
                dt_utc = dt_utc.replace(tzinfo=pytz.utc)
                jst = pytz.timezone('Asia/Tokyo')
                dt_jst = dt_utc.astimezone(jst)

    driver.quit()
    
    return tweet_text, tweet_links, dt_jst

In [5]:
def extract_urls(url):
    headers = {
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
    }
    response = requests.request("GET", url, headers=headers)
    data = BeautifulSoup(response.text, 'html.parser')
    urls = data.find_all('a', href=True)
    return urls

def extract_text(url):
    response = requests.get(url)
    with open('.temp.html','w') as f:
        f.write(response.text)
    loader = UnstructuredHTMLLoader(".temp.html")
    docs = loader.load()
    content = docs[0].page_content.replace('\n\n', '\n')
    return content

In [6]:
def load_web_text(url: str):
    if url.startswith('https://x.com/'):
        content, links, dt_jst = get_tweet_content_and_links(url)
        if dt_jst:
            content = f"{content}\ndatetime: {str(dt_jst)}"
    else:
        content = extract_text(url)
        # links = extract_urls(url)
    return content

def create_page(data):
    url = data['url']
    x = data['page']
    types = [{'name': typ} for typ in x['types']] if isinstance(x['types'], list) else [{'name': x['types']}]
    tags = [{'name': tag} for tag in x['tags']] if isinstance(x['tags'], list) else [{'name': x['tags']}]
    page = {
        "parent": { "database_id": INPUT},
        "properties": {
            'Status': {'status': {'name': 'Not started'}},
            'Import': {'checkbox': True},
            'URL': {'url': url},
            'Issue': {'title': [{'text': {'content': x['short_title']}}]},
            'Type': {"multi_select": types},
            'Tags': {"multi_select": tags},
            'Abstract': {'rich_text': [{'text': {'content': x['summary']}}]},
            'Title': {'rich_text': [{'text': {'content': x['title']}}]},
        }
    }
    if 'year' in x and x['year']:
        year = []
        if x['year'] >= 2023:
            if 'month' in x:
                year.append({'name': f"{str(x['year']).zfill(4)}.{str(x['month']).zfill(2)}"}),
                year.append({'name': str(x['year'])})
        else:
            year.append({'name': str(x['year'])})
        if year:
            page['properties']['Year'] = {"multi_select": year}
    return page

model = ChatOpenAI(model='gpt-3.5-turbo')
model_with_func = model.bind_functions([extract_func])

system_images_extract = '''\
You are a data extractor from a web page.
'''

template = '''\
## SourceURL
{source_url}

## WebText
{web_text}
'''

chat_template = ChatPromptTemplate.from_messages([
    SystemMessage(content=system_images_extract),
    HumanMessagePromptTemplate.from_template(template),
])

chain = {
    'source_url': RunnablePassthrough(),
    'web_text': RunnablePassthrough() | RunnableLambda(load_web_text),
    'urls': RunnablePassthrough() | RunnableLambda(extract_urls),
} | RunnablePassthrough() \
| {
    'url': itemgetter('source_url'),
    'page': RunnablePassthrough() | chat_template | model_with_func | JsonOutputFunctionsParser()
}\
| RunnableLambda(create_page)

In [7]:
from tqdm import tqdm
items = notion.databases.query(database_id=CLIP, filter={
    "property": "Check","checkbox":{"equals": False}
})
for item in tqdm(items['results']):
    try:
        url = item['properties']['URL']['url']
        page = chain.invoke(url)
        blocks = notion.blocks.children.list(block_id=item['id'])
        page['children'] = blocks['results']
        notion.pages.create(**page)
        notion.blocks.delete(block_id=item['id'])
    except Exception as e:
        print('Skipping:', e)

  0%|          | 0/29 [00:00<?, ?it/s]

datetime attribute: 2024-05-29 19:00:02+09:00


  3%|▎         | 1/29 [00:07<03:38,  7.79s/it]

datetime attribute: 2024-05-29 22:08:41+09:00


  7%|▋         | 2/29 [00:18<04:15,  9.47s/it]

datetime attribute: 2024-05-29 21:53:17+09:00


 10%|█         | 3/29 [00:31<04:43, 10.91s/it]

datetime attribute: 2024-05-29 09:25:47+09:00


 17%|█▋        | 5/29 [00:56<05:00, 12.50s/it]

Skipping: body failed validation: body.children[28].code.rich_text[0].text.content.length should be ≤ `2000`, instead was `2048`.
datetime attribute: 2024-05-28 18:22:27+09:00


 24%|██▍       | 7/29 [01:11<03:37,  9.87s/it]

datetime attribute: 2024-05-27 15:11:25+09:00


 28%|██▊       | 8/29 [01:21<03:26,  9.83s/it]

datetime attribute: 2024-05-27 07:57:00+09:00


 31%|███       | 9/29 [01:28<03:02,  9.13s/it]

datetime attribute: 2024-05-27 14:36:39+09:00


 34%|███▍      | 10/29 [01:39<03:01,  9.56s/it]

datetime attribute: 2024-05-27 07:51:15+09:00


 41%|████▏     | 12/29 [01:53<02:19,  8.22s/it]

Skipping: body failed validation: body.children[0].image.external should be defined, instead was `undefined`.


 45%|████▍     | 13/29 [02:20<03:40, 13.76s/it]

Skipping: body failed validation: body.children[24].image.external.url.length should be ≤ `2000`, instead was `10738`.


 48%|████▊     | 14/29 [02:24<02:43, 10.88s/it]

Skipping: body failed validation: body.children[0].image.external should be defined, instead was `undefined`.


 52%|█████▏    | 15/29 [02:40<02:53, 12.36s/it]

Skipping: Message: 
Stacktrace:
#0 0x56053e05b5e3 <unknown>
#1 0x56053dd1e0b7 <unknown>
#2 0x56053dd6bf53 <unknown>
#3 0x56053dd6c051 <unknown>
#4 0x56053ddb19c4 <unknown>
#5 0x56053dd92f1d <unknown>
#6 0x56053ddaeb3d <unknown>
#7 0x56053dd92cc3 <unknown>
#8 0x56053dd5e0e4 <unknown>
#9 0x56053dd5f0ae <unknown>
#10 0x56053e021ce1 <unknown>
#11 0x56053e025b7e <unknown>
#12 0x56053e00f4b5 <unknown>
#13 0x56053e0267d6 <unknown>
#14 0x56053dff2dbf <unknown>
#15 0x56053e049748 <unknown>
#16 0x56053e049917 <unknown>
#17 0x56053e05a773 <unknown>
#18 0x7fd8e732f609 start_thread



 55%|█████▌    | 16/29 [02:50<02:30, 11.55s/it]

Skipping: body failed validation: body.children[0].image.external should be defined, instead was `undefined`.


 59%|█████▊    | 17/29 [02:57<02:04, 10.37s/it]

Skipping: body failed validation: body.children[0].image.external should be defined, instead was `undefined`.


 62%|██████▏   | 18/29 [03:05<01:44,  9.54s/it]

Skipping: Content creation Failed. Fix the following: 
Invalid image url.
Invalid image url.
Invalid image url.
Invalid image url.
Invalid image url.


 66%|██████▌   | 19/29 [03:11<01:25,  8.53s/it]

Skipping: Content creation Failed. Fix the following: 
Invalid image url.
Invalid image url.


 69%|██████▉   | 20/29 [03:15<01:03,  7.07s/it]

Skipping: body failed validation: body.children[0].image.external should be defined, instead was `undefined`.


 72%|███████▏  | 21/29 [03:23<01:00,  7.55s/it]

Skipping: Could not parse function call: 'function_call'


 76%|███████▌  | 22/29 [03:30<00:50,  7.22s/it]

Skipping: body failed validation: body.children[0].image.external should be defined, instead was `undefined`.


 79%|███████▉  | 23/29 [04:06<01:35, 15.99s/it]

Skipping: body failed validation: body.children[20].image.external should be defined, instead was `undefined`.


 83%|████████▎ | 24/29 [04:19<01:15, 15.10s/it]

Skipping: Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 23985 tokens (23173 in the messages, 812 in the functions). Please reduce the length of the messages or functions.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


 86%|████████▌ | 25/29 [04:33<00:58, 14.56s/it]

Skipping: body failed validation: body.children[0].image.external should be defined, instead was `undefined`.


Failed to batch ingest runs: LangSmithConnectionError('Connection error caused failure to POST https://api.smith.langchain.com/runs/batch  in LangSmith API. Please confirm your internet connection.. SSLError(MaxRetryError("HTTPSConnectionPool(host=\'api.smith.langchain.com\', port=443): Max retries exceeded with url: /runs/batch (Caused by SSLError(SSLEOFError(8, \'EOF occurred in violation of protocol (_ssl.c:2396)\')))"))')
Failed to batch ingest runs: LangSmithConnectionError('Connection error caused failure to POST https://api.smith.langchain.com/runs/batch  in LangSmith API. Please confirm your internet connection.. SSLError(MaxRetryError("HTTPSConnectionPool(host=\'api.smith.langchain.com\', port=443): Max retries exceeded with url: /runs/batch (Caused by SSLError(SSLEOFError(8, \'EOF occurred in violation of protocol (_ssl.c:2396)\')))"))')
 90%|████████▉ | 26/29 [04:39<00:36, 12.06s/it]

Skipping: body failed validation: body.children[35].image.external should be defined, instead was `undefined`.


Failed to batch ingest runs: LangSmithConnectionError('Connection error caused failure to POST https://api.smith.langchain.com/runs/batch  in LangSmith API. Please confirm your internet connection.. SSLError(MaxRetryError("HTTPSConnectionPool(host=\'api.smith.langchain.com\', port=443): Max retries exceeded with url: /runs/batch (Caused by SSLError(SSLEOFError(8, \'EOF occurred in violation of protocol (_ssl.c:2396)\')))"))')
 93%|█████████▎| 27/29 [04:44<00:20, 10.15s/it]

Skipping: body failed validation: body.children[0].image.external should be defined, instead was `undefined`.


 97%|█████████▋| 28/29 [04:46<00:07,  7.50s/it]

Skipping: Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 29388 tokens (28576 in the messages, 812 in the functions). Please reduce the length of the messages or functions.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


100%|██████████| 29/29 [04:47<00:00,  9.92s/it]

Skipping: Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 41117 tokens (40305 in the messages, 812 in the functions). Please reduce the length of the messages or functions.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}





Failed to batch ingest runs: LangSmithConnectionError('Connection error caused failure to POST https://api.smith.langchain.com/runs/batch  in LangSmith API. Please confirm your internet connection.. SSLError(MaxRetryError("HTTPSConnectionPool(host=\'api.smith.langchain.com\', port=443): Max retries exceeded with url: /runs/batch (Caused by SSLError(SSLEOFError(8, \'EOF occurred in violation of protocol (_ssl.c:2396)\')))"))')
Failed to batch ingest runs: LangSmithConnectionError('Connection error caused failure to POST https://api.smith.langchain.com/runs/batch  in LangSmith API. Please confirm your internet connection.. SSLError(MaxRetryError("HTTPSConnectionPool(host=\'api.smith.langchain.com\', port=443): Max retries exceeded with url: /runs/batch (Caused by SSLError(SSLEOFError(8, \'EOF occurred in violation of protocol (_ssl.c:2396)\')))"))')
Failed to batch ingest runs: LangSmithConnectionError('Connection error caused failure to POST https://api.smith.langchain.com/runs/batch  i