# 対象ページから論文の一覧を取得して整形する

In [None]:
from urllib import request
from typing import List
import re

from bs4 import BeautifulSoup
import pandas as pd

In [None]:
def parse_author_text(text: str) -> List[List[str]]:
    """Parse an author line into ``[name, affiliation]`` pairs.

    The ``Authors:`` label is removed, the remainder is split on semicolons,
    and each entry is separated into the author's name and the text inside
    its parentheses (the affiliation).

    Args:
        text (str): Raw author line, e.g.
            ``' Authors: Jan Van Haaren (KU Leuven); Jesse Davis (KU Leuven)'``.

    Returns:
        List[List[str]]: One ``[name, affiliation]`` list per author, in input
        order. ``affiliation`` is ``None`` when the entry has no parenthesised
        part. Entries that are empty after stripping (empty input, trailing
        semicolon) are skipped.

    Examples:
        >>> parse_author_text(' Authors: Pieter Robberechts (KU Leuven)*; Jesse Davis (KU Leuven)')
        [['Pieter Robberechts', 'KU Leuven'], ['Jesse Davis', 'KU Leuven']]
    """
    # Drop the "Authors:" label together with any whitespace around it.
    text = re.sub(r"\s*Authors:\s*", "", text)

    parsed_authors = []
    for author in text.split(";"):
        author = author.strip()
        if not author:
            # Nothing between separators: do not emit a bogus ['', None] pair.
            continue
        # The name is whatever precedes the first parenthesised group; markers
        # after the closing parenthesis (such as '*') are discarded with it.
        author_name = re.sub(r"\s*\(.+\).*", "", author)
        # Greedy on purpose: nested parentheses such as "(Univ (Dept))" are
        # kept whole, from the first "(" to the last ")".
        affiliation = re.search(r"\((.+)\)", author)
        author_belong = affiliation.group(1) if affiliation else None
        parsed_authors.append([author_name, author_belong])
    return parsed_authors

# Manual check of the parser against the sample line from its docstring.
sample_author_line = ' Authors: Pieter Robberechts (KU Leuven)*; Jan Van Haaren (KU Leuven); Jesse Davis (KU Leuven)'
parse_author_text(sample_author_line)
    

In [None]:
url = 'https://kdd.org/kdd2021/accepted-papers/index'
# Open the response as a context manager so it is closed even when parsing
# raises; previously an exception in BeautifulSoup() skipped response.close().
with request.urlopen(url) as response:
    # NOTE(review): no parser is named here, so bs4 chooses one from whatever
    # is installed - consider pinning it so the tree is the same everywhere.
    soup = BeautifulSoup(response)

In [None]:
documents = []
# Entries are read from the third 'justify-content-between' div (index 2);
# every 'media-body' div inside it contributes one row.
paper_section = soup.find_all('div', class_='justify-content-between')[2]
for media_body in paper_section.find_all('div', class_='media-body'):
    lines = media_body.text.split("\n")
    # Lines 1 and 3 of the element text become the first two fields of the row.
    row = [lines[1], lines[3]]
    # Flatten each parsed [name, affiliation] pair onto the end of the row.
    for name_and_affiliation in parse_author_text(lines[3]):
        row.extend(name_and_affiliation)
    documents.append(row)

In [None]:
# Length of the longest row collected in `documents`.
max(map(len, documents))

In [None]:
# Each row is [title, author_text] followed by one (name, affiliation) pair per
# author, so the widest row decides how many author column pairs are needed.
# The count is derived from the data rather than hard-coded: the DataFrame
# constructor raises when the number of column names differs from the row width.
widest_row = max((len(doc) for doc in documents), default=2)
author_count = (widest_row - 2) // 2
author_columns = [
    f"{field}_{i}"
    for i in range(author_count)
    for field in ("author_name", "author_belong")
]
pd.DataFrame(
    documents,
    columns=["title", "author_text"] + author_columns,
).to_csv("../data/raw/kdd2021_ads_authors.csv", index=False)