# 2023年RecSysのpaper title / abstractを取得する

In [57]:
import os
import time
from pathlib import Path

from bs4 import BeautifulSoup
import requests
from pydantic.dataclasses import dataclass
from openai import OpenAI
from dotenv import load_dotenv
from jinja2 import Template
from tqdm.notebook import tqdm
import polars as pl

# Read environment variables from the .env file one directory up.
load_dotenv("../.env")

# Parameters passed to every chat-completion request in this notebook.
MODEL = "gpt-3.5-turbo"
TEMPERATURE = 0.0
SEED = 42

# Directory that receives the generated CSV files.
OUTPUT_DIRPATH = Path("..") / "data" / "processed" / "2023-recsys"

In [58]:
# Create the output directory together with any missing parent directories;
# without parents=True, mkdir raises FileNotFoundError when the parent
# directory does not exist yet.
OUTPUT_DIRPATH.mkdir(parents=True, exist_ok=True)

## Long paper

In [2]:
# Download the RecSys 2023 accepted-contributions page and parse its HTML.
url = "https://recsys.acm.org/recsys23/accepted-contributions/#content-tab-1-0-tab"
# Without a timeout, requests waits indefinitely on a stalled connection.
r = requests.get(url, timeout=30)
# Fail here on an HTTP error status instead of silently parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")

In [48]:
@dataclass
class Paper:
    """One paper entry scraped from the conference page, plus its Japanese translation."""
    # Fields filled in when the page is parsed.
    title: str
    author: str
    abstract: str
    # Japanese translations; empty until the translation loop assigns them.
    title_ja: str = ""
    abstract_ja: str = ""

def _strip_prefix(text, prefix):
    """Return *text* without a leading *prefix*; unchanged if it does not start with it."""
    return text[len(prefix):] if text.startswith(prefix) else text


# Collect one Paper per <li> found under the element with id "content-tab-1-0".
papers = []
for element in soup.find(id="content-tab-1-0").find_all("li"):
    # Non-empty text lines of the entry; the code relies on the order
    # title line, "by <authors>" line, abstract line.
    texts = [text for text in element.text.split("\n") if text]
    # str.lstrip("RES") removes any leading run of the characters R/E/S, so it
    # also ate the start of titles such as "RESSequential ..." -> "equential ...".
    # Remove only the exact leading "RES" tag instead.
    title = _strip_prefix(texts[0], "RES")
    # Same problem with lstrip("by "): it also stripped leading 'b'/'y' from the
    # first author name. Spaces around the "by " marker are still dropped, as
    # the old lstrip call did.
    author = _strip_prefix(texts[1].lstrip(" "), "by ").lstrip(" ")
    abstract = texts[2]
    papers.append(Paper(title=title, author=author, abstract=abstract))

In [53]:
# Translate each paper's title and abstract into Japanese.

system_content_template = Template('''あなたの仕事は英語の論文を日本語に翻訳することです。
与えられた英語を日本語に翻訳してください
''')

user_content_template = Template('''以下に入力された英語の文章を日本語に翻訳してください

"""入力"""
{{ input }}
''')

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# The system prompt has no placeholders, so render it once rather than per request.
system_content = system_content_template.render()


def _translate(text):
    """Send *text* to the chat model and return the content of the first choice.

    Sleeps for one second after each request, matching the pause the loop
    previously performed after every call.
    """
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_content},
            {"role": "user", "content": user_content_template.render(input=text)},
        ],
        temperature=TEMPERATURE,
        seed=SEED,
        model=MODEL,
    )
    time.sleep(1)
    return chat_completion.choices[0].message.content


# Title and abstract are translated with separate requests, title first.
for paper in tqdm(papers):
    paper.title_ja = _translate(paper.title)
    paper.abstract_ja = _translate(paper.abstract)

  0%|          | 0/46 [00:00<?, ?it/s]

In [62]:
# Turn the translated long papers into a DataFrame and save it as CSV.
long_papers_csv = OUTPUT_DIRPATH / "long_papers.csv"
df_papers = pl.DataFrame(papers)
df_papers.write_csv(long_papers_csv)

## Short paper

In [63]:
def _strip_prefix(text, prefix):
    """Return *text* without a leading *prefix*; unchanged if it does not start with it."""
    return text[len(prefix):] if text.startswith(prefix) else text


# Collect one Paper per <li> found under the element with id "content-tab-1-1".
papers = []
for element in soup.find(id="content-tab-1-1").find_all("li"):
    # Non-empty text lines of the entry; the code relies on the order
    # title line, "by <authors>" line, abstract line.
    texts = [text for text in element.text.split("\n") if text]
    # str.lstrip("RES") removes any leading run of the characters R/E/S, so it
    # also ate the start of titles such as "RESSequential ..." -> "equential ...".
    # Remove only the exact leading "RES" tag instead.
    title = _strip_prefix(texts[0], "RES")
    # Same problem with lstrip("by "): it also stripped leading 'b'/'y' from the
    # first author name. Spaces around the "by " marker are still dropped, as
    # the old lstrip call did.
    author = _strip_prefix(texts[1].lstrip(" "), "by ").lstrip(" ")
    abstract = texts[2]
    papers.append(Paper(title=title, author=author, abstract=abstract))

In [68]:
# Translate each paper's title and abstract into Japanese.

system_content_template = Template('''あなたの仕事は英語の論文を日本語に翻訳することです。
与えられた英語を日本語に翻訳してください
''')

user_content_template = Template('''以下に入力された英語の文章を日本語に翻訳してください

"""入力"""
{{ input }}
''')

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# The system prompt has no placeholders, so render it once rather than per request.
system_content = system_content_template.render()


def _translate(text):
    """Send *text* to the chat model and return the content of the first choice.

    Sleeps for one second after each request, matching the pause the loop
    previously performed after every call.
    """
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_content},
            {"role": "user", "content": user_content_template.render(input=text)},
        ],
        temperature=TEMPERATURE,
        seed=SEED,
        model=MODEL,
    )
    time.sleep(1)
    return chat_completion.choices[0].message.content


# Title and abstract are translated with separate requests, title first.
for paper in tqdm(papers):
    paper.title_ja = _translate(paper.title)
    paper.abstract_ja = _translate(paper.abstract)

  0%|          | 0/48 [00:00<?, ?it/s]

In [69]:
# Turn the translated short papers into a DataFrame and save it as CSV.
short_papers_csv = OUTPUT_DIRPATH / "short_papers.csv"
df_papers = pl.DataFrame(papers)
df_papers.write_csv(short_papers_csv)

## Industry Paper

In [73]:
def _strip_prefix(text, prefix):
    """Return *text* without a leading *prefix*; unchanged if it does not start with it."""
    return text[len(prefix):] if text.startswith(prefix) else text


# Collect one Paper per <li> found under the element with id "content-tab-1-6".
papers = []
for element in soup.find(id="content-tab-1-6").find_all("li"):
    # Non-empty text lines of the entry; the code relies on the order
    # title line, "by <authors>" line, abstract line.
    texts = [text for text in element.text.split("\n") if text]
    # str.lstrip("IND") removes any leading run of the characters I/N/D, so it
    # also ate the start of titles such as "INDInvestigating ..." ->
    # "nvestigating ...". Remove only the exact leading "IND" tag instead.
    title = _strip_prefix(texts[0], "IND")
    # Same problem with lstrip("by "): it also stripped leading 'b'/'y' from the
    # first author name. Spaces around the "by " marker are still dropped, as
    # the old lstrip call did.
    author = _strip_prefix(texts[1].lstrip(" "), "by ").lstrip(" ")
    abstract = texts[2]
    papers.append(Paper(title=title, author=author, abstract=abstract))

In [76]:
# Translate each paper's title and abstract into Japanese.

system_content_template = Template('''あなたの仕事は英語の論文を日本語に翻訳することです。
与えられた英語を日本語に翻訳してください
''')

user_content_template = Template('''以下に入力された英語の文章を日本語に翻訳してください

"""入力"""
{{ input }}
''')

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# The system prompt has no placeholders, so render it once rather than per request.
system_content = system_content_template.render()


def _translate(text):
    """Send *text* to the chat model and return the content of the first choice.

    Sleeps for one second after each request, matching the pause the loop
    previously performed after every call.
    """
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_content},
            {"role": "user", "content": user_content_template.render(input=text)},
        ],
        temperature=TEMPERATURE,
        seed=SEED,
        model=MODEL,
    )
    time.sleep(1)
    return chat_completion.choices[0].message.content


# Title and abstract are translated with separate requests, title first.
for paper in tqdm(papers):
    paper.title_ja = _translate(paper.title)
    paper.abstract_ja = _translate(paper.abstract)

  0%|          | 0/30 [00:00<?, ?it/s]

In [77]:
# Turn the translated industry papers into a DataFrame and save it as CSV.
industry_papers_csv = OUTPUT_DIRPATH / "industory_papers.csv"
df_papers = pl.DataFrame(papers)
df_papers.write_csv(industry_papers_csv)