In [6]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time  # time 모듈을 사용하여 지연을 추가
import random  # 랜덤한 지연을 주기 위해 사용
from tqdm import tqdm

# CVPR 2024 논문 목록 페이지 URL 설정
cvpr_url = "https://openaccess.thecvf.com/CVPR2024?day=all"

# 페이지 요청
response = requests.get(cvpr_url)
if response.status_code != 200:
    raise Exception("Failed to load page {}".format(cvpr_url))

# BeautifulSoup으로 HTML 파싱
soup = BeautifulSoup(response.text, 'html.parser')

In [69]:
# 논문 정보를 담을 리스트 초기화
papers = []

# 논문 정보가 담긴 태그 추출 (예시: 논문 제목은 h5 태그, 저자 정보와 초록이 따로 있는 경우 등)
for i, paper in tqdm(enumerate(soup.find_all('dt'))):

    # 각 논문 상세 페이지로 이동하여 초록 정보 크롤링
    paper_url = "https://openaccess.thecvf.com" + paper.find('a')['href']
    
    # 요청 사이에 지연을 추가 (1초에서 3초 사이의 랜덤 지연)
    time.sleep(random.uniform(1, 3))
    
    paper_response = requests.get(paper_url)
    if paper_response.status_code != 200:
        print(f"Failed to retrieve paper at {paper_url}")
        continue
    
    paper_soup = BeautifulSoup(paper_response.text, 'html.parser')
    
    # 초록 정보 추출 (예시로 div 태그에서 abstract를 추출)
    abstract = paper_soup.find('div', {'id': 'abstract'}).text.strip()
    title = paper_soup.find('div', {'id': 'papertitle'}).text.strip()
    url = "https://openaccess.thecvf.com" + paper_soup.find('a', string='pdf')['href'].strip()
    
    # 논문 정보 저장
    papers.append({
        'title': title,
        'abstract': abstract,
        'url': paper_url
    })

    if (i != 0) and (i % 200 == 0):
        # 200개씩 끊어서 논문 정보 저장
        df = pd.DataFrame(papers)
        df.to_csv('cvpr2024_papers-{}.csv'.format(str(i)), index=False)
        papers = []


# DataFrame으로 변환 후 CSV 저장
df = pd.DataFrame(papers)
df.to_csv('cvpr2024_papers.csv', index=False)

print("CVPR 2024 논문 정보가 CSV 파일로 저장되었습니다.")


382it [16:49,  2.58s/it]

Failed to retrieve paper at https://openaccess.thecvf.com/content/CVPR2024/html/Chen_SportsSloMo_A_New_Benchmark_and_Baselines_for_Human-centric_Video_Frame_CVPR_2024_paper.html


1199it [55:25,  2.81s/it]

Failed to retrieve paper at https://openaccess.thecvf.com/content/CVPR2024/html/Soucek_GenHowTo_Learning_to_Generate_Actions_and_State_Transformations_from_Instructional_CVPR_2024_paper.html


2716it [2:06:21,  2.79s/it]

CVPR 2024 논문 정보가 CSV 파일로 저장되었습니다.





In [3]:
# Sort paper lists
plist = [i for i in os.listdir('./') if 'cvpr2024' in i]
plist.sort(key=lambda x: int(x.split('.')[0].split('-')[-1]))

plist

['cvpr2024_papers-200.csv',
 'cvpr2024_papers-400.csv',
 'cvpr2024_papers-600.csv',
 'cvpr2024_papers-800.csv',
 'cvpr2024_papers-1000.csv',
 'cvpr2024_papers-1200.csv',
 'cvpr2024_papers-1400.csv',
 'cvpr2024_papers-1600.csv',
 'cvpr2024_papers-1800.csv',
 'cvpr2024_papers-2000.csv',
 'cvpr2024_papers-2200.csv',
 'cvpr2024_papers-2400.csv',
 'cvpr2024_papers-2600.csv',
 'cvpr2024_papers-2800.csv']

In [4]:
for i, f in enumerate(plist):
    f_ = pd.read_csv(f)
    if i == 0:
        merged_f = f_
    else:
        merged_f = pd.concat([merged_f, f_], axis=0)

merged_f

Unnamed: 0,title,abstract,url
0,Unmixing Diffusion for Self-Supervised Hypersp...,Hyperspectral images (HSIs) have extensive app...,https://openaccess.thecvf.com/content/CVPR2024...
1,Seeing the World through Your Eyes,The reflective nature of the human eye is an u...,https://openaccess.thecvf.com/content/CVPR2024...
2,DPMesh: Exploiting Diffusion Prior for Occlude...,The recovery of occluded human meshes poses ch...,https://openaccess.thecvf.com/content/CVPR2024...
3,Ungeneralizable Examples,The training of contemporary deep learning mod...,https://openaccess.thecvf.com/content/CVPR2024...
4,LaneCPP: Continuous 3D Lane Detection using Ph...,Monocular 3D lane detection has become a funda...,https://openaccess.thecvf.com/content/CVPR2024...
...,...,...,...
110,Language-driven Object Fusion into Neural Radi...,Neural radiance field (NeRF) is an emerging te...,https://openaccess.thecvf.com/content/CVPR2024...
111,Adaptive Hyper-graph Aggregation for Modality-...,In Federated Learning (FL) the issue of statis...,https://openaccess.thecvf.com/content/CVPR2024...
112,SPIN: Simultaneous Perception Interaction and ...,While there has been remarkable progress recen...,https://openaccess.thecvf.com/content/CVPR2024...
113,DREAM: Diffusion Rectification and Estimation-...,We present DREAM a novel training framework re...,https://openaccess.thecvf.com/content/CVPR2024...


In [5]:
merged_f.to_csv('cvpr2024.csv', index=False)