# In [None]:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# author : Liu Kun
# date   : 2021-09-01 00:00:01

import os
import json
import requests
from bs4 import BeautifulSoup


def request_paper_list(url):
    """Scrape one paper listing page and return a record for every paper.

    The parser looks for the first ``<dl>`` on the page and reads it as
    repeating groups of one ``<dt>`` (title link) followed by two ``<dd>``
    elements (author line, then the line holding the pdf link and a ``<div>``
    with extra information). A leading ``<dd>`` whose text contains "back" is
    treated as a navigation entry and dropped before pairing.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list of dicts with the keys ``web_url``, ``title``, ``author``,
        ``pdf_url`` and ``infos``. The list is empty when the page contains
        no ``<dl>``. Entries without a title link or a pdf link are skipped.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    from urllib.parse import urljoin

    def _one_line(text):
        # Collapse a tag's text onto a single line.
        return text.strip().replace("\n", "").replace("\r", "")

    # Fetch the page; a timeout keeps a stalled server from hanging the crawl
    # and an HTTP error is reported instead of parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Links are resolved against the final page address so that absolute
    # hrefs, root-relative hrefs and hrefs without a leading slash all work.
    page_url = response.url or url
    # Load the page with BeautifulSoup.
    soup = BeautifulSoup(response.content, features="lxml")
    # Collects the extracted paper records.
    paper_list = []
    dl = soup.find("dl")
    if dl is not None:
        dt_list = dl.find_all("dt")
        dd_list = dl.find_all("dd")
        # Drop the leading "Back" navigation entry, if any.
        if dd_list and "back" in dd_list[0].text.strip().lower():
            dd_list.pop(0)
        # dd elements alternate: author line, then link/info line.
        for title_row, author_row, link_row in zip(dt_list, dd_list[::2], dd_list[1::2]):
            title_link = title_row.find("a")
            pdf_link = link_row.find("a")
            info_div = link_row.find("div")
            if title_link is None or pdf_link is None:
                # Without both links the entry can neither be identified nor
                # downloaded, so it is left out rather than aborting the page.
                continue
            infos = ""
            if info_div is not None:
                infos = _one_line(info_div.text).replace("[bibtex]", "")
            paper_list.append({
                "web_url": urljoin(page_url, title_link["href"]),
                "title": title_link.text.strip(),
                "author": _one_line(author_row.text),
                "pdf_url": urljoin(page_url, pdf_link["href"]),
                "infos": infos,
            })
    print("共爬取到论文信息%d条" % len(paper_list))
    return paper_list


def log(papers, logfile):
    """Append each paper record to ``logfile`` as one JSON object per line.

    Args:
        papers: Iterable of JSON-serialisable records.
        logfile: Path of the file to append to; it is created if missing.
    """
    with open(logfile, "a+") as handle:
        handle.writelines(json.dumps(record) + "\n" for record in papers)


def downloader(url, folder):
    """Download ``url`` into ``folder`` unless the target file already exists.

    The local file name is the last ``/``-separated component of ``url``.

    Args:
        url: Address of the file to download.
        folder: Existing directory in which the file is stored.

    Returns:
        The path of the local file (already present or freshly downloaded).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status. Nothing is stored under
            the final name in that case, so a later run retries the download.
    """
    filename = url.split("/")[-1]
    file = os.path.join(folder, filename)
    if os.path.exists(file):
        print(f"skip {file} @ {url}")
        return file
    print(f"download {file} from {url}")
    r = requests.get(url, timeout=60)
    # Refuse to save an error page under the paper's name: the existence
    # check above would otherwise skip the broken file on every later run.
    r.raise_for_status()
    # Write to a side file first and rename it into place, so an interrupted
    # write never leaves a truncated file that looks complete.
    partial = file + ".part"
    with open(partial, "wb") as f:
        f.write(r.content)
    os.replace(partial, file)
    return file


if __name__ == '__main__':
    # Load the crawl plan from menu.json. Each entry provides "path" (output
    # folder), "logfile" (name of the JSON-lines file inside that folder) and
    # "links" (listing pages to scrape).
    with open("menu.json", "r") as f:
        menu_list = json.load(f)
    # Phase 1: scrape the paper metadata of every listing page.
    for item in menu_list:
        # Show which entry is being processed.
        print(item)
        sub_path = item["path"]
        # Create the output folder if it does not exist yet.
        os.makedirs(sub_path, exist_ok=True)
        log_file = os.path.join(sub_path, item["logfile"])
        for target_url in item["links"]:
            # Scrape the paper links of this page.
            papers = request_paper_list(target_url)
            # Append the records to the folder's log file.
            log(papers, log_file)
    # Phase 2: download the pdfs folder by folder from the recorded metadata.
    for item in menu_list:
        paper_path = item["path"]
        log_file = os.path.join(paper_path, item["logfile"])
        with open(log_file, "r") as f:
            for line in f:
                # Tolerate blank lines in the log file.
                if not line.strip():
                    continue
                obj = json.loads(line)
                try:
                    downloader(obj["pdf_url"], paper_path)
                except requests.RequestException as err:
                    # One unreachable paper must not abort the whole batch;
                    # report it and carry on with the next one.
                    print(f"failed {obj['pdf_url']}: {err}")