Skip to content

Commit 4df955d

Browse files
committed
Add script to crawl and parse problems
1 parent c2da648 commit 4df955d

File tree

3 files changed

+755
-0
lines changed

3 files changed

+755
-0
lines changed

main.py

Lines changed: 353 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,353 @@
1+
import getpass
2+
import http.client
3+
import http.cookiejar
4+
import json
5+
import os
6+
import pickle
7+
import socket
8+
import sys
9+
import tempfile
10+
import traceback
11+
from enum import Enum, auto
12+
13+
import requests
14+
from PIL import Image
15+
from selenium import webdriver
16+
from selenium.common.exceptions import WebDriverException
17+
from selenium.webdriver.common.by import By
18+
from selenium.webdriver.remote.command import Command
19+
from selenium.webdriver.support import expected_conditions as EC
20+
from selenium.webdriver.support.ui import WebDriverWait
21+
from typing import NamedTuple, List, Tuple, Dict, Any, Optional, Union
22+
23+
24+
def save_screenshot_of(elem, browser):
    r"""Take a screenshot of the page and crop it down to a single element.

    :param elem: Element to capture. Its ``location`` (keys ``x``, ``y``) and ``size`` (keys ``width``, ``height``)
        mappings define the crop box.
    :param browser: Driver object whose ``save_screenshot`` method writes the page screenshot to a given path.
    :return: The cropped image.
    """
    location = elem.location
    size = elem.size
    left = location['x']
    top = location['y']
    right = left + size['width']
    bottom = top + size['height']

    # The context manager removes the temporary file even when taking or decoding the screenshot fails.
    # Selenium writes PNG data and warns about file names that do not end in ``.png``.
    with tempfile.NamedTemporaryFile(suffix='.png') as f:
        browser.save_screenshot(f.name)
        with Image.open(f.name) as page:
            cropped = page.crop((left, top, right, bottom))
            # Make sure the pixel data is in memory before the backing file is deleted.
            cropped.load()
    return cropped
40+
41+
42+
def check_alive(browser):
    r"""Report whether the browser session still responds to commands.

    :param browser: Driver object exposing ``execute``.
    :return: ``True`` if the status command completes; ``False`` if it raises a socket error,
        ``http.client.CannotSendRequest`` or ``WebDriverException``.
    """
    try:
        browser.execute(Command.STATUS)
    except (socket.error, http.client.CannotSendRequest, WebDriverException):
        return False
    else:
        return True
48+
49+
50+
def update_cookie(username, password):
    r"""Log in to LeetCode through a Chrome window and save the session cookies to ``cookies/<username>.dat``.

    The browser is always closed when the function exits, whether login succeeded or not.

    :param username: Account name typed into the login field; also used to name the cookie file.
    :param password: Password typed into the password field.
    """
    # Timeout (in seconds) for the explicit waits below; large enough to be effectively unbounded.
    # NOTE(review): presumably this leaves room for manual steps during login (e.g. a captcha) -- confirm.
    wait_timeout = 123456789
    sign_in_button = (By.CSS_SELECTOR, 'button[data-cy="sign-in-btn"]')

    options = webdriver.ChromeOptions()
    # `options=` replaces the deprecated `chrome_options=` keyword.
    browser = webdriver.Chrome(options=options)
    try:
        browser.set_window_position(0, 0)
        browser.set_window_size(800, 600)
        browser.switch_to.window(browser.window_handles[0])
        browser.get('https://leetcode.com/accounts/login/')
        browser.implicitly_wait(10)

        WebDriverWait(browser, wait_timeout).until(EC.visibility_of_element_located(sign_in_button))

        # `find_element(By..., ...)` is used instead of the `find_element_by_*` helpers, which newer Selenium
        # releases no longer provide.
        elem = browser.find_element(By.CSS_SELECTOR, 'input[name="login"]')
        elem.clear()
        elem.send_keys(username)

        elem = browser.find_element(By.CSS_SELECTOR, 'input[type="password"]')
        elem.clear()
        elem.send_keys(password)

        print("User credentials filled")

        # The button is clicked through JavaScript rather than `elem.click()`.
        elem = browser.find_element(*sign_in_button)
        browser.execute_script("arguments[0].click();", elem)

        # Wait for the avatar image; presumably it only shows up once the user is logged in.
        WebDriverWait(browser, wait_timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'img.avatar'))
        )

        jar = http.cookiejar.LWPCookieJar()
        for cookie in browser.get_cookies():
            if cookie['name'] == 'WBStorage':
                continue
            jar.set_cookie(http.cookiejar.Cookie(
                version=0,
                name=cookie['name'],
                value=cookie['value'],
                port='80',
                port_specified=False,
                domain=cookie['domain'],
                domain_specified=True,
                domain_initial_dot=False,
                path=cookie['path'],
                path_specified=True,
                secure=cookie['secure'],
                expires=cookie.get('expiry', 0),
                discard=False,
                comment=None,
                comment_url=None,
                rest={}
            ))

        # `jar.save` does not create missing directories, so make sure the target folder exists.
        os.makedirs('cookies', exist_ok=True)
        cookie_path = f'cookies/{username}.dat'
        jar.save(cookie_path, ignore_discard=True, ignore_expires=True)

        print(f'Cookies saved to `{cookie_path}`')
    finally:
        # Close the browser on every exit path so that a failed login does not leave a Chrome process behind.
        browser.quit()
111+
112+
113+
class Problem(NamedTuple):
    r"""Raw contents of one contest problem, before any parsing."""

    # Address of the problem page.
    url: str
    # Display name of the problem.
    name: str
    # Problem statement, including examples and constraints.
    statement: str
    # Raw examples, consisting of inputs and outputs (and potentially explanations).
    examples: List[str]
    # Template code, in lines.
    code: List[str]
119+
120+
121+
def get_problems(contest_url: str, cookie_path: str = "cookies/huzecong.dat") -> List[Problem]:
    r"""Crawl every problem listed on a contest page.

    :param contest_url: Address of the contest page holding the question list.
    :param cookie_path: LWP cookie file whose cookies are added to the browser session. Defaults to the path that
        used to be hard-coded here.
    :return: One :class:`Problem` per link in the contest's question list, in page order.
    """
    browser = webdriver.Chrome()
    try:
        browser.set_window_position(0, 0)
        browser.set_window_size(800, 600)
        browser.switch_to.window(browser.window_handles[0])
        browser.implicitly_wait(10)
        # Cookies are added after this first load, so they take effect for the page loads that follow.
        browser.get(contest_url)

        cookie_jar = http.cookiejar.LWPCookieJar(cookie_path)
        cookie_jar.load(ignore_discard=True, ignore_expires=True)
        for c in cookie_jar:
            cookie = {"name": c.name, 'value': c.value, 'path': c.path}
            # WebDriver expects `expiry` to be a number; leave it out for cookies that carry no expiry time
            # instead of sending `None`.
            if c.expires:
                cookie['expiry'] = c.expires
            browser.add_cookie(cookie)

        # `find_element(s)(By..., ...)` is used instead of the `find_element(s)_by_*` helpers, which newer Selenium
        # releases no longer provide.
        question_list = browser.find_element(By.CSS_SELECTOR, "ul.contest-question-list")
        links = question_list.find_elements(By.TAG_NAME, "a")
        # Collect plain strings first: the link elements belong to the contest page and we navigate away below.
        problem_paths = [(link.get_attribute("href"), link.text) for link in links]

        parsed_problems = []
        for problem_url, problem_name in problem_paths:
            browser.get(problem_url)
            statement = browser.find_element(By.CSS_SELECTOR, "div.question-content").text
            # Examples are taken from the `<pre>` blocks without a class attribute; empty ones are dropped.
            examples = [
                pre.text for pre in browser.find_elements(By.CSS_SELECTOR, "pre:not([class])") if pre.text]
            # Each line of the code editor is its own `pre.CodeMirror-line` element.
            code = [pre.text for pre in browser.find_elements(By.CSS_SELECTOR, "pre.CodeMirror-line")]
            parsed_problems.append(Problem(problem_url, problem_name, statement, examples, code))

        return parsed_problems
    finally:
        # Close the browser on every exit path so that no Chrome process is left behind.
        browser.quit()
149+
150+
151+
class ProblemType(Enum):
    r"""Kinds of problems, distinguished by how the solution has to be driven."""

    Normal = auto()
    # Input is a tree, requires constructing `TreeNode` structures.
    Tree = auto()
    # Requires constructing the class and calling methods.
    Interactive = auto()
155+
156+
157+
class FunctionSignature(NamedTuple):
    r"""Signature of one function found in the template code."""

    name: str
    # List of (type, name) pairs, one per argument.
    arguments: List[Tuple[str, str]]
    return_type: str
161+
162+
163+
class Example(NamedTuple):
    r"""One parsed example: input values and the expected output."""

    # Input values, keyed by a string.
    input: Dict[str, Any]
    # Expected output value.
    output: Any
166+
167+
168+
class ProblemSignature(NamedTuple):
    r"""Parsed form of a problem that has a single function."""

    # Signature of the function.
    function: FunctionSignature
    # Parsed examples for that function.
    examples: List[Example]
171+
172+
173+
class InteractiveExample(NamedTuple):
    r"""One function call inside an example of an interactive problem."""

    # Name of the function that is called.
    function: str
    # Input values of the call, keyed by a string.
    input: Dict[str, Any]
    # Output of the call; the annotation allows ``None``.
    output: Optional[Any]
177+
178+
179+
class InteractiveProblemSignature(NamedTuple):
    r"""Parsed form of an interactive problem, i.e. a class with several functions."""

    # Name of the class in the template code.
    class_name: str
    # Signatures of the functions in that class.
    functions: List[FunctionSignature]
    # One list of calls per example.
    examples: List[List[InteractiveExample]]
183+
184+
185+
def parse_vardef(s: str) -> Tuple[str, str]:
    r"""Given a variable definition, return the type and identifier name. For instance:
    ``TreeNode *node`` should return ``TreeNode *`` and ``node``.

    The identifier is the longest run of letters, digits and underscores at the end of the string; everything
    before it is the type.

    :param s: The string to parse.
    :return: A tuple of (type, name). The type is an empty string if there is none (e.g., for a constructor).
    """
    s = s.strip()
    split_at = len(s)
    # Walk backwards over identifier characters. Each character is tested with `isalnum()` / `== "_"` rather than
    # `isidentifier()`: a lone digit is not a valid identifier, which would cut names such as `nums1` short.
    while split_at > 0 and (s[split_at - 1].isalnum() or s[split_at - 1] == "_"):
        split_at -= 1
    type_name = s[:split_at].strip()
    identifier = s[split_at:]
    return type_name, identifier
198+
199+
200+
def find_functions(code: List[str]) -> Tuple[str, List[FunctionSignature]]:
    r"""Find functions in the solution class, and parse their signatures.

    :param code: Lines of the template code.
    :return: A tuple of two elements:
        - The class name (in most cases it's "Solution" but in interactive problems it might not).
        - A list of function signatures, indicating the functions in the solution class.
    :raises ValueError: If no ``class ... {`` line is found, or no ``};`` line follows it.
    """
    start_line = next(
        (idx for idx, line in enumerate(code) if line.startswith("class ") and line.endswith(" {")), None)
    if start_line is None:
        raise ValueError("No `class ... {` line found in the template code")
    class_name = code[start_line][len("class "):-len(" {")].strip()
    # Only look for the closing line after the class header, so that a `};` earlier in the code is not picked up.
    end_line = code.index("};", start_line + 1)

    signatures = []
    for line in code[(start_line + 1):end_line]:
        # A very heuristic way to find function beginnings.
        if not (line.startswith(" ") and line.endswith("{")):
            continue
        bracket_pos = line.find("(")
        close_pos = line.find(")", bracket_pos + 1)
        if bracket_pos == -1 or close_pos == -1:
            # An indented line that opens a brace but has no parameter list is not a function header.
            continue
        return_type, func_name = parse_vardef(line[:bracket_pos])
        # Blank pieces are dropped so that a function without parameters gets an empty argument list rather than
        # a single argument with empty type and name.
        # NOTE: splitting on "," does not handle argument types that themselves contain commas.
        args_str = line[(bracket_pos + 1):close_pos].split(",")
        arguments = [parse_vardef(arg) for arg in args_str if arg.strip()]
        signatures.append(FunctionSignature(func_name, arguments, return_type))
    return class_name, signatures
222+
223+
224+
def parse_value(s: str) -> Tuple[Any, str]:
    r"""Parse a JSON value from the beginning of the string, and return the remaining part of the string.

    :param s: String that starts (after optional whitespace) with a JSON value.
    :return: A tuple of (parsed JSON object, remaining unparsed string). The remainder is stripped of surrounding
        whitespace and is empty if the whole string was consumed.
    :raises json.JSONDecodeError: If the string does not start with a valid JSON value.
    """
    # `raw_decode` parses one value and reports where it ended, so the prefix is parsed once instead of being
    # re-parsed after a failed full parse. It does not skip leading whitespace, hence the `lstrip`.
    text = s.lstrip()
    obj, end_pos = json.JSONDecoder().raw_decode(text)
    return obj, text[end_pos:].strip()
236+
237+
238+
def parse_problem(problem: Problem) -> Union[ProblemSignature, InteractiveProblemSignature]:
    r"""Parse the problem given the raw contents crawled from the web.

    The template code decides how the examples are read. A class with more than one function is treated as an
    interactive problem, whose examples list function names, argument lists and return values. Otherwise the class
    must be ``Solution`` with a single function, whose examples give ``name = value`` pairs.

    :param problem: The crawled problem; its ``code`` and ``examples`` are read.
    :return: An :class:`InteractiveProblemSignature` for interactive problems, a :class:`ProblemSignature` otherwise.
    :raises ValueError: If an example does not contain a section tag that is looked for.
    :raises AssertionError: If the template or an example does not have the expected shape.
    """

    def find_example_section(s: str, cur_tag: str, next_tag: str) -> str:
        r"""Find the part in the example that is between two tags. If ``next_tag`` does not exist, then find the part
        until the end.
        """
        tag_pos = s.find(cur_tag)
        if tag_pos == -1:
            # Without this check, the `-1` from `find` would silently shift the slice to a meaningless offset.
            raise ValueError(f"Section tag {cur_tag!r} not found in example: {s!r}")
        start_pos = tag_pos + len(cur_tag)
        end_pos = s.find(next_tag, start_pos)
        if end_pos == -1:
            return s[start_pos:].strip()
        return s[start_pos:end_pos].strip()

    # Parse function signature from code.
    class_name, func_signatures = find_functions(problem.code)
    assert len(func_signatures) > 0, "No functions found in the template code"

    if len(func_signatures) > 1:
        # More than one function in the class: treat it as an interactive problem.
        func_map: Dict[str, FunctionSignature] = {signature.name: signature for signature in func_signatures}
        interactive_examples: List[List[InteractiveExample]] = []
        for example in problem.examples:
            # The section tags are matched without a trailing colon in this branch.
            input_str = find_example_section(example, "Input", "Output")
            output_str = find_example_section(example, "Output", "Explanation")

            # The input holds two JSON values: the called function names, then one argument list per call.
            functions, input_str = parse_value(input_str)
            arg_vals, input_str = parse_value(input_str)
            assert len(input_str) == 0, f"Unparsed example input: {input_str!r}"
            ret_vals, output_str = parse_value(output_str)
            assert len(output_str) == 0, f"Unparsed example output: {output_str!r}"

            cur_examples = [
                InteractiveExample(
                    function=func,
                    input={arg_name: val for (_, arg_name), val in zip(func_map[func].arguments, args)},
                    output=ret)
                for func, args, ret in zip(functions, arg_vals, ret_vals)
            ]
            interactive_examples.append(cur_examples)

        return InteractiveProblemSignature(class_name, func_signatures, interactive_examples)

    assert class_name == "Solution", f"Expected class `Solution`, found `{class_name}`"

    func_signature = func_signatures[0]
    examples: List[Example] = []
    for example in problem.examples:
        input_str = find_example_section(example, "Input:", "Output:")
        output_str = find_example_section(example, "Output:", "Explanation:")

        input_vals = {}
        for idx, (_, name) in enumerate(func_signature.arguments):
            if idx > 0 and input_str.startswith(","):
                input_str = input_str[1:].strip()
            prefix = f"{name} = "
            if input_str.startswith(prefix):
                input_str = input_str[len(prefix):].strip()
            else:
                # Only the first argument may come without its `name = ` prefix.
                assert idx == 0, f"Expected `{prefix}` at the start of {input_str!r}"
            input_val, input_str = parse_value(input_str)
            input_vals[name] = input_val
        assert len(input_str) == 0, f"Unparsed example input: {input_str!r}"

        output_val, output_str = parse_value(output_str)
        assert len(output_str) == 0, f"Unparsed example output: {output_str!r}"

        examples.append(Example(input_vals, output_val))

    return ProblemSignature(func_signature, examples)
309+
310+
311+
def generate_code(signature: ProblemSignature) -> Tuple[str, str]:
    r"""Generate code given the signature. Code consists of two parts:

    - Code for the solution class. This is basically the template as-is, but could also include the statement in
      comments.
    - Code for testing the solution. This includes test functions for each example, and also the main function where
      the test functions are called and results are compared.

    :param signature: The parsed problem signature to generate code for.
    :return: A tuple of two strings, corresponding to code for the solution class, and code for testing.
    :raises NotImplementedError: Always; code generation has not been written yet.
    """
    # Fail explicitly: silently returning `None` would only surface later as an unpacking error in the caller.
    raise NotImplementedError("generate_code is not implemented yet")
321+
322+
323+
def create_project(project_name: str, problems: List[Problem]) -> None:
    r"""Create a project folder with one generated ``.cpp`` file per problem.

    :param project_name: Folder to write into; created if it does not exist.
    :param problems: The crawled problems. The file for each problem is named after its index in this list.
    """
    os.makedirs(project_name, exist_ok=True)
    for idx, problem in enumerate(problems):
        problem_signature = parse_problem(problem)
        solution_code, test_code = generate_code(problem_signature)
        # The file has to be opened for writing; the default read mode fails on a file that does not exist yet.
        with open(os.path.join(project_name, f"{idx}.cpp"), "w", encoding="utf-8") as f:
            # NOTE(review): both generated parts are written to the same file, solution first -- confirm that the
            # test code is not meant to live in a separate file.
            f.write(solution_code)
            f.write("\n\n")
            f.write(test_code)
331+
332+
333+
def main():
    r"""Refresh the login cookies, crawl a contest and create a project for it.

    Command line: ``USERNAME [CONTEST_NAME]``. The password is read interactively. The contest name defaults to
    ``weekly-contest-163``.
    """
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an `IndexError` traceback.
        sys.exit(f"Usage: {sys.argv[0]} USERNAME [CONTEST_NAME]")
    username = sys.argv[1]
    contest_name = sys.argv[2] if len(sys.argv) > 2 else "weekly-contest-163"
    password = getpass.getpass()

    try:
        print(f"Updating cookie for account `{username}`")
        update_cookie(username, password)
    except WebDriverException as e:
        # A failed cookie update is reported but does not stop the run; crawling is still attempted.
        traceback.print_exc()
        print(e.__class__.__name__ + ': ' + str(e))

    problems = get_problems(f"https://leetcode.com/contest/{contest_name}")
    # Save the raw info just in case.
    with open(f"{contest_name}.pkl", "wb") as f:
        pickle.dump(problems, f)
    create_project(contest_name, problems)


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)