# 2023年SIGIRのpaper title / abstractを取得する

- タイトルにrecommendという文字が含まれるものに限る

In [1]:
import os
import time
from pathlib import Path

from bs4 import BeautifulSoup
import requests
from pydantic.dataclasses import dataclass
from openai import OpenAI
from dotenv import load_dotenv
from jinja2 import Template
from tqdm.notebook import tqdm
import polars as pl

# Load environment variables from the .env file at "../.env" (relative to the
# working directory). Presumably this supplies OPENAI_API_KEY, which is read
# from os.environ when the OpenAI client is created -- TODO confirm.
load_dotenv("../.env")

# Sampling temperature, seed and model name passed to every chat-completion
# request made in this notebook.
TEMPERATURE = 0.0
SEED = 42
MODEL = "gpt-3.5-turbo"

# Directory that the per-track CSV files are written to.
OUTPUT_DIRPATH = Path("../data/processed/2023_sigir")

In [2]:
# Create the output directory. parents=True also creates any missing
# intermediate directories (e.g. ../data/processed) instead of raising
# FileNotFoundError on a fresh checkout; exist_ok=True keeps re-runs harmless.
OUTPUT_DIRPATH.mkdir(parents=True, exist_ok=True)

## Full Paper

In [3]:
# Download the accepted full-papers page and parse it with BeautifulSoup.
url = "https://sigir.org/sigir2023/program/accepted-papers/full-papers/"
# requests has no default timeout; set one so the cell cannot hang forever
# when the server does not respond.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx rather than parsing an error page as the paper list.
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")

In [13]:
@dataclass
class Paper:
    """One accepted paper scraped from a SIGIR 2023 accepted-papers page.

    The scraping loop in this notebook fills ``title`` and ``author``; the
    translation loop later assigns ``title_ja``. ``abstract`` and
    ``abstract_ja`` keep their empty-string defaults in the code visible here.
    """

    title: str  # text of the paragraph's <strong> tag, leading bullet/spaces stripped
    author: str  # last line of the paragraph's text
    abstract: str = ""  # not populated by the visible code
    title_ja: str = ""  # Japanese translation of the title
    abstract_ja: str = ""  # not populated by the visible code

# Collect every listed paper whose title mentions "recommend".
papers = []

container = soup.find("div", class_="flex_column")
if container is None:
    # find() returns None when nothing matches; report that clearly instead of
    # failing with an AttributeError on None.find_all.
    raise RuntimeError(
        "div.flex_column not found in the downloaded page; "
        "the page layout may have changed"
    )

for element in container.find_all("p"):
    strong = element.find("strong")
    if strong is None:
        # A paragraph without a <strong> tag cannot yield a title; skip it
        # rather than crashing on None.text.
        continue
    # lstrip removes any leading run of bullet ("●") and space characters.
    title = strong.text.lstrip("● ")
    # The author list is taken from the last line of the paragraph text.
    author = element.text.split("\n")[-1]
    # Only keep papers whose title contains "recommend" (case-insensitive).
    if "recommend" in title.lower():
        papers.append(Paper(title=title, author=author))

print(len(papers))

65


In [15]:
# Translate each paper title into Japanese with the OpenAI chat API.

system_content_template = Template('''あなたの仕事は英語の論文を日本語に翻訳することです。
与えられた英語を日本語に翻訳してください
''')

user_content_template = Template('''以下に入力された英語の文章を日本語に翻訳してください

"""入力"""
{{ input }}
''')

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# The system prompt takes no variables, so render it once instead of once per paper.
system_message = {"role": "system", "content": system_content_template.render()}

for paper in tqdm(papers):
    messages = [
        system_message,
        {"role": "user", "content": user_content_template.render(input=paper.title)},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        temperature=TEMPERATURE,
        seed=SEED,
        model=MODEL,
    )
    # message.content is Optional in the OpenAI SDK; fall back to "" so that
    # title_ja always stays a string, as declared on Paper.
    paper.title_ja = chat_completion.choices[0].message.content or ""
    # Pause between requests (presumably to stay clear of API rate limits).
    time.sleep(1)

  0%|          | 0/65 [00:00<?, ?it/s]

In [16]:
# Build a DataFrame from the collected Paper objects and save it as CSV
# inside the output directory.
csv_path = OUTPUT_DIRPATH / "full_paper.csv"
df_papers = pl.DataFrame(papers)
df_papers.write_csv(csv_path)

## Short Paper

In [17]:
# Download the accepted short-papers page and parse it with BeautifulSoup.
url = "https://sigir.org/sigir2023/program/accepted-papers/short-papers/"
# requests has no default timeout; set one so the cell cannot hang forever
# when the server does not respond.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx rather than parsing an error page as the paper list.
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")

@dataclass
class Paper:
    """One accepted paper scraped from a SIGIR 2023 accepted-papers page.

    Same field layout as the ``Paper`` declared in the Full Paper section.
    NOTE(review): presumably re-declared so this section can be run on its
    own -- confirm before removing the duplicate.
    """

    title: str  # text of the paragraph's <strong> tag, leading bullet/spaces stripped
    author: str  # last line of the paragraph's text
    abstract: str = ""  # not populated by the visible code
    title_ja: str = ""  # Japanese translation of the title
    abstract_ja: str = ""  # not populated by the visible code

# Collect every listed paper whose title mentions "recommend".
papers = []

container = soup.find("div", class_="flex_column")
if container is None:
    # find() returns None when nothing matches; report that clearly instead of
    # failing with an AttributeError on None.find_all.
    raise RuntimeError(
        "div.flex_column not found in the downloaded page; "
        "the page layout may have changed"
    )

for element in container.find_all("p"):
    strong = element.find("strong")
    if strong is None:
        # A paragraph without a <strong> tag cannot yield a title; skip it
        # rather than crashing on None.text.
        continue
    # lstrip removes any leading run of bullet ("●") and space characters.
    title = strong.text.lstrip("● ")
    # The author list is taken from the last line of the paragraph text.
    author = element.text.split("\n")[-1]
    # Only keep papers whose title contains "recommend" (case-insensitive).
    if "recommend" in title.lower():
        papers.append(Paper(title=title, author=author))

print(len(papers))

31


In [19]:
# Translate each paper title into Japanese with the OpenAI chat API.

system_content_template = Template('''あなたの仕事は英語の論文を日本語に翻訳することです。
与えられた英語を日本語に翻訳してください
''')

user_content_template = Template('''以下に入力された英語の文章を日本語に翻訳してください

"""入力"""
{{ input }}
''')

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# The system prompt takes no variables, so render it once instead of once per paper.
system_message = {"role": "system", "content": system_content_template.render()}

for paper in tqdm(papers):
    messages = [
        system_message,
        {"role": "user", "content": user_content_template.render(input=paper.title)},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        temperature=TEMPERATURE,
        seed=SEED,
        model=MODEL,
    )
    # message.content is Optional in the OpenAI SDK; fall back to "" so that
    # title_ja always stays a string, as declared on Paper.
    paper.title_ja = chat_completion.choices[0].message.content or ""
    # Pause between requests (presumably to stay clear of API rate limits).
    time.sleep(1)

  0%|          | 0/31 [00:00<?, ?it/s]

In [20]:
# Build a DataFrame from the collected Paper objects and save it as CSV
# inside the output directory.
csv_path = OUTPUT_DIRPATH / "short_paper.csv"
df_papers = pl.DataFrame(papers)
df_papers.write_csv(csv_path)

## Industrial Paper

In [21]:
# Download the accepted industrial-track (SIRIP) page and parse it with BeautifulSoup.
url = "https://sigir.org/sigir2023/program/accepted-papers/sirip-industrial-track/"
# requests has no default timeout; set one so the cell cannot hang forever
# when the server does not respond.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx rather than parsing an error page as the paper list.
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")

@dataclass
class Paper:
    """One accepted paper scraped from a SIGIR 2023 accepted-papers page.

    Same field layout as the ``Paper`` declared in the earlier sections.
    NOTE(review): presumably re-declared so this section can be run on its
    own -- confirm before removing the duplicate.
    """

    title: str  # text of the paragraph's <strong> tag, leading bullet/spaces stripped
    author: str  # last line of the paragraph's text
    abstract: str = ""  # not populated by the visible code
    title_ja: str = ""  # Japanese translation of the title
    abstract_ja: str = ""  # not populated by the visible code

# Collect every listed paper whose title mentions "recommend".
papers = []

container = soup.find("div", class_="flex_column")
if container is None:
    # find() returns None when nothing matches; report that clearly instead of
    # failing with an AttributeError on None.find_all.
    raise RuntimeError(
        "div.flex_column not found in the downloaded page; "
        "the page layout may have changed"
    )

for element in container.find_all("p"):
    strong = element.find("strong")
    if strong is None:
        # A paragraph without a <strong> tag cannot yield a title; skip it
        # rather than crashing on None.text.
        continue
    # lstrip removes any leading run of bullet ("●") and space characters.
    title = strong.text.lstrip("● ")
    # The author list is taken from the last line of the paragraph text.
    author = element.text.split("\n")[-1]
    # Only keep papers whose title contains "recommend" (case-insensitive).
    if "recommend" in title.lower():
        papers.append(Paper(title=title, author=author))

print(len(papers))

12


In [22]:
# Translate each paper title into Japanese with the OpenAI chat API.

system_content_template = Template('''あなたの仕事は英語の論文を日本語に翻訳することです。
与えられた英語を日本語に翻訳してください
''')

user_content_template = Template('''以下に入力された英語の文章を日本語に翻訳してください

"""入力"""
{{ input }}
''')

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# The system prompt takes no variables, so render it once instead of once per paper.
system_message = {"role": "system", "content": system_content_template.render()}

for paper in tqdm(papers):
    messages = [
        system_message,
        {"role": "user", "content": user_content_template.render(input=paper.title)},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        temperature=TEMPERATURE,
        seed=SEED,
        model=MODEL,
    )
    # message.content is Optional in the OpenAI SDK; fall back to "" so that
    # title_ja always stays a string, as declared on Paper.
    paper.title_ja = chat_completion.choices[0].message.content or ""
    # Pause between requests (presumably to stay clear of API rate limits).
    time.sleep(1)

  0%|          | 0/12 [00:00<?, ?it/s]

In [23]:
# Build a DataFrame from the collected Paper objects and save it as CSV
# inside the output directory.
csv_path = OUTPUT_DIRPATH / "industrial_paper.csv"
df_papers = pl.DataFrame(papers)
df_papers.write_csv(csv_path)