In [1]:
import html
import json
import os
import time

import bs4
import requests

获取所有的问题列表

In [2]:
def get_problem_set():
    """Download the full Codeforces problem set through the public API.

    Returns:
        dict: The decoded JSON payload of ``problemset.problems``. Cells in
        this notebook read the problem list from ``["result"]["problems"]``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status code.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # A timeout keeps a stalled connection from hanging the kernel forever.
    response = requests.get(
        "https://codeforces.com/api/problemset.problems", timeout=30
    )
    # Fail loudly on an HTTP error instead of trying to decode an error page.
    response.raise_for_status()
    return response.json()

In [3]:
# Fetch the problem set once; later cells read problem_set["result"]["problems"].
problem_set = get_problem_set()

In [4]:
# Report how many problems the API returned.
print(f"总问题数：{len(problem_set['result']['problems'])}")

总问题数：8989


In [5]:
problem_set["result"]["problems"][0]

{'contestId': 1886,
 'index': 'F',
 'name': 'Diamond Theft',
 'type': 'PROGRAMMING',
 'tags': ['data structures', 'greedy']}

In [6]:
# Collect every distinct tag that appears on at least one problem.
tag_set = {
    tag
    for problem in problem_set["result"]["problems"]
    for tag in problem["tags"]
}

In [7]:
tag_set

{'*special',
 '2-sat',
 'binary search',
 'bitmasks',
 'brute force',
 'chinese remainder theorem',
 'combinatorics',
 'constructive algorithms',
 'data structures',
 'dfs and similar',
 'divide and conquer',
 'dp',
 'dsu',
 'expression parsing',
 'fft',
 'flows',
 'games',
 'geometry',
 'graph matchings',
 'graphs',
 'greedy',
 'hashing',
 'implementation',
 'interactive',
 'math',
 'matrices',
 'meet-in-the-middle',
 'number theory',
 'probabilities',
 'schedules',
 'shortest paths',
 'sortings',
 'string suffix structures',
 'strings',
 'ternary search',
 'trees',
 'two pointers'}

In [6]:
# Root directory for the scraped dataset. Written as a raw string because "\C"
# and "\D" are not valid escape sequences; the plain literal only works by
# accident and triggers an invalid-escape warning on current Python versions.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
dataset_dir = r"D:\Code\DataSet\CodeForceDataSet-raw"

获取所有的比赛列表

In [7]:
# Timeout + status check so a stalled or rejected request surfaces right here
# instead of hanging the kernel or failing later on a missing "result" key.
contest_list_http_response = requests.get(
    "https://codeforces.com/api/contest.list", timeout=30
)
contest_list_http_response.raise_for_status()
contest_list_response = contest_list_http_response.json()
# The contest entries sit under the "result" key of the payload.
contest_list = contest_list_response["result"]

In [8]:
len(contest_list)

1796

In [9]:
contest_list[100]

{'id': 1791,
 'name': 'Codeforces Round 849 (Div. 4)',
 'type': 'ICPC',
 'phase': 'FINISHED',
 'frozen': False,
 'durationSeconds': 8700,
 'startTimeSeconds': 1675434900,
 'relativeTimeSeconds': 21814452}

只为已经结束的比赛创建目录

In [42]:
# Create one directory per finished contest, named after the contest id.
finished_contest_ids = [
    str(contest["id"]) for contest in contest_list if contest["phase"] == "FINISHED"
]
for finished_contest_id in finished_contest_ids:
    os.makedirs(os.path.join(dataset_dir, finished_contest_id), exist_ok=True)

获取某个比赛的提交情况

In [10]:
# Contest used as the running example in the cells below.
contest_id = 1800

In [11]:
# Download every submission of the contest (a large response). The timeout and
# status check make a stalled or rejected request fail here rather than in a
# later cell.
contest_status_http_response = requests.get(
    f"https://codeforces.com/api/contest.status?contestId={contest_id}",
    timeout=120,
)
contest_status_http_response.raise_for_status()
contest_status_response = contest_status_http_response.json()

In [12]:
# The submission records sit under the "result" key of the API payload.
contest_status_list = contest_status_response["result"]

In [13]:
len(contest_status_list)

292930

In [14]:
contest_status_list[10000]

{'id': 221039648,
 'contestId': 1800,
 'creationTimeSeconds': 1693368652,
 'relativeTimeSeconds': 2147483647,
 'problem': {'contestId': 1800,
  'index': 'A',
  'name': 'Is It a Cat?',
  'type': 'PROGRAMMING',
  'rating': 800,
  'tags': ['implementation', 'strings']},
 'author': {'contestId': 1800,
  'members': [{'handle': 'nvqminh301207'}],
  'participantType': 'PRACTICE',
  'ghost': False,
  'startTimeSeconds': 1677767700},
 'programmingLanguage': 'GNU C++14',
 'verdict': 'WRONG_ANSWER',
 'testset': 'TESTS',
 'passedTestCount': 1,
 'timeConsumedMillis': 61,
 'memoryConsumedBytes': 0}

获取该场比赛所有提交记录中使用过的编程语言

In [15]:
# Distinct programming languages seen across all submissions of the contest.
programming_language_set = {
    submission["programmingLanguage"] for submission in contest_status_list
}

In [16]:
programming_language_set

{'C# 10',
 'C# 8',
 'Clang++17 Diagnostics',
 'Clang++20 Diagnostics',
 'D',
 'Delphi',
 'FPC',
 'GNU C++14',
 'GNU C++17',
 'GNU C++17 (64)',
 'GNU C++20 (64)',
 'GNU C11',
 'Go',
 'Haskell',
 'Java 11',
 'Java 17',
 'Java 21',
 'Java 8',
 'JavaScript',
 'Kotlin 1.6',
 'Kotlin 1.7',
 'MS C++ 2017',
 'Mono C#',
 'Node.js',
 'Ocaml',
 'PHP',
 'PascalABC.NET',
 'Perl',
 'PyPy 2',
 'PyPy 3',
 'PyPy 3-64',
 'Python 2',
 'Python 3',
 'Ruby 3',
 'Rust 2021',
 'Scala'}

获取该场比赛的所有题目编号

In [19]:
# Distinct problem indices (e.g. "A", "C1") that received at least one submission.
contest_problem_index_set = set(
    map(lambda entry: entry["problem"]["index"], contest_status_list)
)

In [20]:
contest_problem_index_set

{'A', 'B', 'C1', 'C2', 'D', 'E1', 'E2', 'F', 'G'}

获取某个题目的题面

In [125]:
from bs4 import BeautifulSoup
from tomd import Tomd
import re

In [111]:
# Index of the problem (within the contest) whose statement is scraped below.
problem_idx = "A"

In [133]:
# Problem page URL built from the selected contest id and problem index.
problem_page_url = f"https://codeforces.com/contest/{contest_id}/problem/{problem_idx}"

In [134]:
# Fetch the problem page; fail fast on a timeout or an HTTP error status so the
# parsing cells below never run against an error page.
problem_page_content = requests.get(problem_page_url, timeout=30)
problem_page_content.raise_for_status()

In [135]:
# Parse the raw page bytes with the lxml parser.
soup = BeautifulSoup(problem_page_content.content, "lxml")

In [136]:
# First <div class="problem-statement"> on the page; IndexError if none exists.
problem_statement = soup.find_all(
    name="div", attrs={"class": "problem-statement"}, limit=1
)[0]

In [137]:
problem_statement

<div class="problem-statement"><div class="header"><div class="title">F. Diamond Theft</div><div class="time-limit"><div class="property-title">time limit per test</div>2 seconds</div><div class="memory-limit"><div class="property-title">memory limit per test</div>256 megabytes</div><div class="input-file"><div class="property-title">input</div>standard input</div><div class="output-file"><div class="property-title">output</div>standard output</div></div><div><p>Monocarp is the most famous thief in Berland. This time, he decided to steal two diamonds. Unfortunately for Monocarp, there are $$$n$$$ cameras monitoring the diamonds. Each camera has two parameters, $$$t_i$$$ and $$$s_i$$$. The first parameter determines whether the camera is monitoring the first diamond only ($$$t_i=1$$$), the second diamond only ($$$t_i=2$$$), or both diamonds ($$$t_i=3$$$). The second parameter determines the number of seconds the camera will be disabled after it is hacked.</p><p>Every second, Monocarp ca

In [138]:
# Title, time limit and memory limit: for each one, take the last child node of
# the first <div> on the page that carries the matching class.
problem_statement_dict = {
    header_field: soup.find_all(name="div", attrs={"class": header_field})[0].contents[-1]
    for header_field in ("title", "time-limit", "memory-limit")
}

In [139]:
def _section_markdown(section_tag):
    """Convert one statement section to Markdown; a missing section gives ""."""
    # soup.find() returns None for an absent section (e.g. a problem without a
    # note); str(None) would hand the literal text "None" to the converter.
    if section_tag is None:
        return ""
    return Tomd(str(section_tag)).markdown


# The statement body is the first <div> sibling that follows the header block.
problem_statement_dict["problem-description"] = _section_markdown(
    soup.find("div", class_="header").find_next_sibling("div")
)
# The remaining sections are located by their own CSS class.
for section_class in ("input-specification", "output-specification", "note"):
    problem_statement_dict[section_class] = _section_markdown(
        soup.find("div", class_=section_class)
    )

In [140]:
# Collapse every "$$$" delimiter to a single "$" and trim surrounding
# whitespace in each Markdown section.
problem_statement_dict.update(
    {
        section_key: problem_statement_dict[section_key].replace("$$$", "$").strip()
        for section_key in (
            "problem-description",
            "input-specification",
            "output-specification",
            "note",
        )
    }
)

In [141]:
problem_statement_dict

{'title': 'F. Diamond Theft',
 'time-limit': '2 seconds',
 'memory-limit': '256 megabytes',
 'problem-description': "Monocarp is the most famous thief in Berland. This time, he decided to steal two diamonds. Unfortunately for Monocarp, there are $n$ cameras monitoring the diamonds. Each camera has two parameters, $t_i$ and $s_i$. The first parameter determines whether the camera is monitoring the first diamond only ($t_i=1$), the second diamond only ($t_i=2$), or both diamonds ($t_i=3$). The second parameter determines the number of seconds the camera will be disabled after it is hacked.\n\nEvery second, Monocarp can perform one of the following three actions: \n -  do nothing; -  choose a camera and hack it; if Monocarp hacks the $i$-th camera, it will be disabled for the next $s_i$ seconds (if the current second is the $T$-th one, the camera will be disabled from the $(T+1)$-th to the $(T+s_i)$-th second, inclusive); -  steal a diamond if all cameras monitoring it are currently disab

In [150]:
def strip_html_tags(input_str):
    """Turn an HTML fragment into plain text.

    ``<br>`` / ``<br/>`` tags and closing ``</p>`` / ``</div>`` tags become
    newlines, every remaining tag is removed, leading whitespace is stripped,
    and HTML character references (``&lt;``, ``&amp;``, ...) are decoded.

    Args:
        input_str (str): HTML markup, e.g. a serialized ``<pre>`` element.

    Returns:
        str: The text content with tags removed and entities decoded.
    """
    # Line-breaking markup becomes a real newline before the tags are dropped.
    text = re.sub(r'<br\s*?/?>', '\n', input_str)
    text = re.sub(r'</p>', '\n', text)
    text = re.sub(r'</div>', '\n', text)

    # Remove every remaining tag, then drop leading whitespace.
    text = re.sub(r'<.*?>', '', text).lstrip()

    # Decode entities last: serialized markup escapes "<", ">" and "&", and
    # decoding earlier would let an escaped "<...>" be mistaken for a tag.
    return html.unescape(text)

In [151]:
# Sample inputs: plain text of the <pre> inside every <div class="input">.
problem_statement_dict["demo-input"] = [
    strip_html_tags(str(input_div.find("pre")))
    for input_div in soup.find_all("div", class_="input")
]
# Sample outputs: same extraction for every <div class="output">.
problem_statement_dict["demo-output"] = [
    strip_html_tags(str(output_div.find("pre")))
    for output_div in soup.find_all("div", class_="output")
]

In [152]:
problem_statement_dict

{'title': 'F. Diamond Theft',
 'time-limit': '2 seconds',
 'memory-limit': '256 megabytes',
 'problem-description': "Monocarp is the most famous thief in Berland. This time, he decided to steal two diamonds. Unfortunately for Monocarp, there are $n$ cameras monitoring the diamonds. Each camera has two parameters, $t_i$ and $s_i$. The first parameter determines whether the camera is monitoring the first diamond only ($t_i=1$), the second diamond only ($t_i=2$), or both diamonds ($t_i=3$). The second parameter determines the number of seconds the camera will be disabled after it is hacked.\n\nEvery second, Monocarp can perform one of the following three actions: \n -  do nothing; -  choose a camera and hack it; if Monocarp hacks the $i$-th camera, it will be disabled for the next $s_i$ seconds (if the current second is the $T$-th one, the camera will be disabled from the $(T+1)$-th to the $(T+s_i)$-th second, inclusive); -  steal a diamond if all cameras monitoring it are currently disab

创建问题的文件夹

In [90]:
# One sub-directory per problem index underneath the contest's own directory.
contest_root_dir = os.path.join(dataset_dir, str(contest_id))
for contest_problem_index in contest_problem_index_set:
    os.makedirs(os.path.join(contest_root_dir, contest_problem_index), exist_ok=True)

In [91]:
# Id of the single submission fetched in the next cells.
submission_id = "222908221"

In [92]:
# Fetch the submission page; fail fast on a timeout or an HTTP error status
# instead of parsing an error page in the next cells.
submission_response = requests.get(
    f"https://codeforces.com/contest/{contest_id}/submission/{submission_id}",
    timeout=30,
)
submission_response.raise_for_status()

In [93]:
# Parse the submission page with the "html.parser" backend (rebinds `soup`).
soup = bs4.BeautifulSoup(submission_response.content, "html.parser")

In [98]:
# Look up the <pre id="program-source-text"> element and display its text.
pre_tag = soup.find("pre", {"id": "program-source-text"})
pre_tag.text

'//Vaidehi Desai\r\n//C++ 17\r\n\r\n#include <bits/stdc++.h>\r\nusing namespace std;\r\ntypedef long long int ll;\r\n#define all(x) x.begin(), x.end()\r\n\r\nvoid solve(ll tc)\r\n{\r\n    ll n, m;\r\n    cin >> n >> m;\r\n    if(m == 1) {\r\n        cout << 0 << endl;\r\n        for(ll i = 0; i < n; i++) \r\n            cout << 0 << endl;\r\n    } else {\r\n        if(n >= m) \r\n            cout << m << endl;\r\n        else\r\n            cout << n + 1 << endl;\r\n        int st = 0;\r\n        for(int j = 0; j < n; j++) {\r\n            for(int i = st; i < m; i++) {\r\n                cout << i % m << " ";\r\n            }\r\n            for(int i = 0; i < st; i++) {\r\n                cout << i % m << " ";\r\n            }\r\n            st++;\r\n            cout << endl;\r\n        }\r\n    }\r\n}\r\n\r\nsigned main()\r\n{\r\n    #ifdef Vaidehi\r\n        freopen("error.txt", "w", stderr);\r\n    #endif  \r\n    ios_base::sync_with_stdio(false);\r\n    cin.tie(NULL);\r\n    cout.t

创建提交文件

In [97]:
for contest_status in contest_status_list:
    problem_index = contest_status["problem"]["index"]
    contest_status_dir = os.path.join(dataset_dir, str(contest_id), problem_index)

    # Write the problem metadata once per problem directory.
    problem_json_path = os.path.join(contest_status_dir, "problem.json")
    if not os.path.exists(problem_json_path):
        with open(problem_json_path, "w") as f:
            json.dump(contest_status["problem"], f)

    contest_status_id = contest_status["id"]
    contest_status_file_name = os.path.join(contest_status_dir, f"{contest_status_id}.json")

    # Fetch the submission page to obtain the submitted source code.
    submission_response = requests.get(
        f"https://codeforces.com/contest/{contest_id}/submission/{contest_status_id}",
        timeout=30,
    )
    # A rejected request (e.g. HTTP 403) carries no source block; stop with a
    # clear error instead of an AttributeError on a missing tag further down.
    submission_response.raise_for_status()
    soup = bs4.BeautifulSoup(submission_response.content, "html.parser")
    pre_tag = soup.find("pre", {"id": "program-source-text"})
    if pre_tag is None:
        raise ValueError(
            f"no program-source-text block found for submission {contest_status_id}"
        )

    # The source is stored on the in-memory record as well as in the JSON file.
    contest_status["code"] = pre_tag.text
    # Context manager so the file is flushed and closed right after the dump.
    with open(contest_status_file_name, "w") as f:
        json.dump(contest_status, f)
    # Pause between requests.
    time.sleep(0.5)

    # Only the first submission is processed; remove this to process them all.
    break

In [80]:
submission_response

<Response [403]>

获取题目测试用例

In [198]:
# Switch to a different contest and submission for the cells below.
# Note: contest_id is rebound as a string here (it was the int 1800 earlier).
contest_id = "1"
submission_id = "150671782"

In [199]:
# Fetch the submission page; fail fast on a timeout or an HTTP error status.
submission_url = f"https://codeforces.com/contest/{contest_id}/submission/{submission_id}"
submission_response = requests.get(submission_url, timeout=30)
submission_response.raise_for_status()

In [200]:
# Raw response body as bytes (not decoded text).
html_str = submission_response.content

In [201]:
# Save the page for offline inspection; binary mode because html_str is bytes.
with open("submission_html.html", "wb") as f:
    f.write(html_str)

In [179]:
# The token is embedded in the page as name='csrf_token' value='...'.
pattern = r"name='csrf_token' value='(.*?)'"
# html_str holds bytes (response.content) and a str pattern cannot be applied
# to bytes, so decode first; already-decoded text is used as is.
html_text = html_str.decode("utf-8", errors="replace") if isinstance(html_str, bytes) else html_str
csrf_match = re.search(pattern, html_text)
if csrf_match is None:
    # Give a clear error instead of an AttributeError on None.group().
    raise ValueError("csrf_token not found in the submission page")
csrf_token = csrf_match.group(1)

In [180]:
csrf_token

'df211c55860c58dcda44845095c93139'

In [181]:
# POST the submission id and the CSRF token as form fields to the submitSource
# endpoint.
# NOTE(review): requests.post() runs outside any Session, so cookies set by the
# earlier page fetch are not sent along with the token — confirm whether the
# endpoint needs them.
get_test_url = "https://codeforces.com/data/submitSource"
res = requests.post(get_test_url, data={"submissionId": submission_id, "csrf_token": csrf_token})


In [182]:
res.content

b'{"error":"forbidden"}'

In [202]:
import requests
import re

submission_id = "150671782"

# A Session carries the cookies from the page fetch over to the follow-up POST.
with requests.Session() as session:
    # Fetch the submission page.
    submission_url = f"https://codeforces.com/contest/{contest_id}/submission/{submission_id}"
    submission_response = session.get(submission_url, timeout=30)
    # Decode the body so the str regex below can be applied to it.
    html_str = submission_response.content.decode("utf-8")
    # Extract the CSRF token embedded in the page.
    pattern = r"name='csrf_token' value='(.*?)'"
    csrf_match = re.search(pattern, html_str)
    if csrf_match is None:
        # Give a clear error instead of an AttributeError on None.group().
        raise ValueError("csrf_token not found in the submission page")
    csrf_token = csrf_match.group(1)
    # Reuse the same session for the POST that carries the token.
    get_test_url = "https://codeforces.com/data/submitSource"
    # Request headers: a browser-like User-Agent plus the token.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
        "X-Csrf-Token": csrf_token
    }
    data = {"submissionId": submission_id, "csrf_token": csrf_token}
    print(f"data: {data}")
    # Send the fields form-encoded, as in the earlier attempt. Passing
    # json=json.dumps(data) serialized the dict twice, so the request body was a
    # single JSON string literal rather than the two fields.
    res = session.post(get_test_url, headers=headers, data=data, timeout=30)
    # Show the raw response body.
    print(res.content)

data: {'submissionId': '150671782', 'csrf_token': 'ac5dd3d0e4e675805d4c94b287a3b5c1'}
b'{"error":"forbidden"}'
