In [1]:
import pandas as pd
from tqdm import tqdm
from notion.client import NotionClient
from notion.block import TextBlock, PageBlock

from pathlib import Path
import requests
from bs4 import BeautifulSoup

from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

def _extract_abstract(text):
    """Pull the abstract paragraph out of plain text extracted from a paper.

    Collection starts on the row after a row that reads exactly ``ABSTRACT``
    (surrounding whitespace ignored) and stops at a ``Keywords`` row or at the
    first blank row that follows some collected text.

    Args:
        text: Full extracted text, rows separated by ``'\\n'``.

    Returns:
        The abstract rows joined with single spaces, or ``''`` when no
        ``ABSTRACT`` heading is present.
    """
    collected = []
    in_abstract = False
    for raw_row in text.split('\n'):
        row = raw_row.strip()

        if row == 'ABSTRACT':
            in_abstract = True
            continue
        if not in_abstract:
            continue

        # The terminating heading itself is not part of the abstract.
        if row == 'Keywords':
            break
        if row == '':
            if collected:
                # A blank row after real text closes the paragraph.
                break
            # Blank rows between the heading and the paragraph are skipped so
            # they do not end the abstract before it has started.
            continue

        collected.append(row)

    # Rows are layout line breaks, so they are joined with a space; joining
    # without a separator would glue the words across each line break.
    return ' '.join(collected)


def get_abstract_from_pdf(pdf_path):
    """Extract the abstract of a paper from its PDF file.

    Every page of the PDF is converted to text with pdfminer and the rows
    between the ``ABSTRACT`` heading and the ``Keywords`` heading (or the
    closing blank row) are returned as one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The abstract text, or ``''`` if no ``ABSTRACT`` heading was found.
    """
    output_string = StringIO()
    with open(pdf_path, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    return _extract_abstract(output_string.getvalue())

In [2]:
# Layout of a generated paper page: every section heading is immediately
# followed by one empty entry. The page-building loop turns each entry,
# headings and empty strings alike, into its own text block.
paper_page_template = [
    entry
    for heading in (
        'Tags',
        '著者',
        'PDF URL',
        '概要',
        '新規性・差分',
        '結果',
        'コメント',
        '関連リンク',
    )
    for entry in (heading, '')
]

In [3]:
# `token_info` is not on the default import path: '../secrets' (relative to
# the working directory) must be appended to sys.path BEFORE the import, so
# the order of the next three lines matters.
import sys
sys.path.append('../secrets')
import token_info

# Credential read from the token_info module and handed to NotionClient below.
# NOTE(review): presumably the Notion `token_v2` session cookie — confirm.
token_v2 = token_info.token_v2
# URL of the Notion page whose block is fetched below; later cells add the
# generated paper pages as children of `page`.
page_url = 'https://www.notion.so/myaun/EDM2020-proceedings-81c7b43901384889ab66943fb747f5b8'

client = NotionClient(token_v2=token_v2)
page = client.get_block(page_url)

In [4]:
# retrieve paper list
#
# Scrapes the proceedings page into `rows`, one
# [title, pdf_url, authors, paper_type] entry per linked table cell.

target_url = 'https://educationaldatamining.org/edm2020/proceedings/'
# Fail fast on a hung connection or an HTTP error instead of parsing an error
# page into an empty (or wrong) paper list.
r = requests.get(target_url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')
tables = soup.find_all('table')
# NOTE(review): the [1:6] slice presumably selects the five <h3> headings that
# label the paper tables, in the same order as `tables` — confirm on the page.
paper_type_list = [heading.text for heading in soup.find_all('h3')[1:6]]

rows = []
for i, tab in enumerate(tables):
    papers = tab.find_all('td')
    paper_type = paper_type_list[i]
    for p in papers:
        link = p.a
        # A cell without a link (or with a link lacking href) has no PDF to
        # download; skip it rather than crash on `None.text`.
        if link is None or not link.get('href'):
            continue

        title = link.text
        pdf_url = link.get('href')
        # Everything before the first '.' of the cell text is taken as authors.
        # NOTE(review): author initials such as "J. Smith" would be cut short.
        authors = p.text.split('.')[0]

        rows.append([title, pdf_url, authors, paper_type])
        
# make notion pages
#
# For every scraped paper: download the PDF, extract its abstract, and add a
# child page under `page` filled in from `paper_page_template`.
# NOTE(review): add_new is called unconditionally, so re-running this cell
# presumably creates a second copy of every page — confirm before re-running.

df_papers = pd.DataFrame(rows, columns=['title', 'pdf_url', 'authors', 'paper_type'])

fixed_tag = '#EDM2020'

# The download directory may not exist on a fresh checkout; create it once
# up front so write_bytes cannot fail on a missing parent.
pdf_dir = Path('../paper_pdf')
pdf_dir.mkdir(parents=True, exist_ok=True)

for i, (title, pdf_url, authors, paper_type) in tqdm(enumerate(df_papers.values), total=len(df_papers)):

    filename = pdf_dir / f'{i}.pdf'
    # Stop on HTTP errors so an error page is never saved as a .pdf and then
    # fed to the PDF parser; the timeout prevents one stalled download from
    # hanging the whole loop.
    response = requests.get(pdf_url, timeout=60)
    response.raise_for_status()
    filename.write_bytes(response.content)

    abst = get_abstract_from_pdf(pdf_path=str(filename))

    paper_page = page.children.add_new(PageBlock, title=f'[paper] {title}')
    for t in paper_page_template:
        if t == 'Tags':
            paper_page.children.add_new(TextBlock, title=f'{t}: {fixed_tag} #{paper_type}')
        elif t == '著者':
            paper_page.children.add_new(TextBlock, title=f'{t}: {authors}')
        elif t == 'PDF URL':
            paper_page.children.add_new(TextBlock, title=f'{t}: {pdf_url}')
        elif t == '概要':
            # Heading block first, then the extracted abstract directly below.
            paper_page.children.add_new(TextBlock, title=t)
            paper_page.children.add_new(TextBlock, title=abst)
        else:
            paper_page.children.add_new(TextBlock, title=t)

100%|██████████| 117/117 [24:08<00:00, 12.38s/it]
